tcp: "Reasonably" reduce IW after SYN retransmission timeout
author Sepherosa Ziehau <sephe@dragonflybsd.org>
Thu, 12 Apr 2012 06:48:56 +0000 (14:48 +0800)
committer Sepherosa Ziehau <sephe@dragonflybsd.org>
Thu, 12 Apr 2012 06:48:56 +0000 (14:48 +0800)
sys/netinet/tcp_input.c
sys/netinet/tcp_subr.c
sys/netinet/tcp_syncache.c
sys/netinet/tcp_timer.c
sys/netinet/tcp_var.h

index e414183..ecfbd2a 100644 (file)
@@ -3210,15 +3210,7 @@ tcp_established(struct tcpcb *tp)
        tp->t_state = TCPS_ESTABLISHED;
        tcp_callout_reset(tp, tp->tt_keep, tcp_getkeepidle(tp), tcp_timer_keep);
 
-       if (tp->t_flags & TF_SYN_WASLOST) {
-               /*
-                * RFC3390:
-                * "If the SYN or SYN/ACK is lost, the initial window used by
-                *  a sender after a correctly transmitted SYN MUST be one
-                *  segment consisting of MSS bytes."
-                */
-               tp->snd_cwnd = tp->t_maxseg;
-
+       if (tp->t_rxtsyn > 0) {
                /*
                 * RFC6298:
                 * "If the timer expires awaiting the ACK of a SYN segment
index c3d2452..ad57ee8 100644 (file)
@@ -1963,7 +1963,36 @@ u_long
 tcp_initial_window(const struct tcpcb *tp)
 {
        if (tcp_do_rfc3390) {
-               return min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380));
+               /*
+                * RFC3390:
+                * "If the SYN or SYN/ACK is lost, the initial window
+                *  used by a sender after a correctly transmitted SYN
+                *  MUST be one segment consisting of MSS bytes."
+                *
+                * However, we do something a little bit more aggressive
+                * than RFC3390 here:
+                * - The IW is reduced only if the time spent in SYN or
+                *   SYN|ACK retransmission is >= 3 seconds.  We do this
+                *   mainly because when RFC3390 was published, the initial
+                *   RTO was still 3 seconds (the threshold we test here),
+                *   while after RFC6298, the initial RTO is 1 second.  This
+                *   behaviour probably still falls within the spirit of
+                *   RFC3390.
+                * - When IW is reduced, 2*MSS is used instead of 1*MSS,
+                *   mainly to avoid the sender and receiver deadlocking
+                *   until the delayed ACK timer expires.  Even RFC2581 does
+                *   not try to reduce IW upon SYN or SYN|ACK retransmission
+                *   timeout.
+                *
+                * See also:
+                * http://tools.ietf.org/html/draft-ietf-tcpm-initcwnd-03
+                */
+               if (tp->t_rxtsyn >= TCPTV_RTOBASE3) {
+                       return (2 * tp->t_maxseg);
+               } else {
+                       return min(4 * tp->t_maxseg,
+                                  max(2 * tp->t_maxseg, 4380));
+               }
        } else {
                /*
                 * Even RFC2581 (back to 1999) allows 2*SMSS IW.
index 62ba0e7..142fbce 100644 (file)
@@ -240,6 +240,15 @@ static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
 
 #define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)
 
+static __inline int
+syncache_rto(int slot)
+{
+       if (tcp_low_rtobase)
+               return (TCPTV_RTOBASE * tcp_syn_backoff_low[slot]);
+       else
+               return (TCPTV_RTOBASE * tcp_syn_backoff[slot]);
+}
+
 static __inline void
 syncache_timeout(struct tcp_syncache_percpu *syncache_percpu,
                 struct syncache *sc, int slot)
@@ -248,17 +257,16 @@ syncache_timeout(struct tcp_syncache_percpu *syncache_percpu,
 
        if (slot > 0) {
                /*
-                * Record that SYN|ACK was lost.
+                * Record the time that we spent in SYN|ACK
+                * retransmission.
+                *
                 * Needed by RFC3390 and RFC6298.
                 */
-               sc->sc_flags |= SCF_SYN_WASLOST;
+               sc->sc_rxtused += syncache_rto(slot - 1);
        }
        sc->sc_rxtslot = slot;
 
-       if (tcp_low_rtobase)
-               rto = TCPTV_RTOBASE * tcp_syn_backoff_low[slot];
-       else
-               rto = TCPTV_RTOBASE * tcp_syn_backoff[slot];
+       rto = syncache_rto(slot);
        sc->sc_rxttime = ticks + rto;
 
        TAILQ_INSERT_TAIL(&syncache_percpu->timerq[slot], sc, sc_timerq);
@@ -863,14 +871,13 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
        }
        if (sc->sc_flags & SCF_SACK_PERMITTED)
                tp->t_flags |= TF_SACK_PERMITTED;
-       if (sc->sc_flags & SCF_SYN_WASLOST)
-               tp->t_flags |= TF_SYN_WASLOST;
 
 #ifdef TCP_SIGNATURE
        if (sc->sc_flags & SCF_SIGNATURE)
                tp->t_flags |= TF_SIGNATURE;
 #endif /* TCP_SIGNATURE */
 
+       tp->t_rxtsyn = sc->sc_rxtused;
        tcp_mss(tp, sc->sc_peer_mss);
 
        /*
index b658907..aecddc9 100644 (file)
@@ -534,15 +534,15 @@ tcp_timer_rexmt_handler(struct tcpcb *tp)
                tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
                tcp_save_congestion_state(tp);
                tp->t_flags &= ~(TF_FASTREXMT | TF_EARLYREXMT);
-
-               if (tp->t_state == TCPS_SYN_SENT ||
-                   tp->t_state == TCPS_SYN_RECEIVED) {
-                       /*
-                        * Record that SYN or SYN|ACK was lost.
-                        * Needed by RFC3390 and RFC6298.
-                        */
-                       tp->t_flags |= TF_SYN_WASLOST;
-               }
+       }
+       if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
+               /*
+                * Record the time that we spent in SYN or SYN|ACK
+                * retransmission.
+                *
+                * Needed by RFC3390 and RFC6298.
+                */
+               tp->t_rxtsyn += tp->t_rxtcur;
        }
        /* Throw away SACK blocks on a RTO, as specified by RFC2018. */
        tcp_sack_cleanup(&tp->scb);
index a448a68..1d00ee4 100644 (file)
@@ -164,7 +164,7 @@ struct tcpcb {
 #define TF_SIGNATURE   0x00004000      /* require MD5 digests (RFC2385) */
 #define TF_FASTKEEP    0x00008000      /* use a faster tcp_keepidle */
 #define        TF_MORETOCOME   0x00010000      /* More data to be appended to sock */
-#define        TF_SYN_WASLOST  0x00020000      /* SYN or SYN|ACK was lost */
+#define        TF_UNUSED00     0x00020000      /* unused */
 #define        TF_LASTIDLE     0x00040000      /* connection was previously idle */
 #define        TF_RXWIN0SENT   0x00080000      /* sent a receiver win 0 in response */
 #define        TF_FASTRECOVERY 0x00100000      /* in NewReno Fast Recovery */
@@ -277,6 +277,8 @@ struct tcpcb {
        int     t_keepintvl;            /* time between keepalive probes */
        int     t_keepcnt;              /* maximum number of keepalive probes */
        int     t_maxidle;              /* time to drop after starting probes */
+
+       int     t_rxtsyn;               /* time spent in SYN or SYN|ACK rexmt */
 };
 
 #define        IN_FASTRECOVERY(tp)     (tp->t_flags & TF_FASTRECOVERY)
@@ -466,11 +468,12 @@ struct syncache {
 #define SCF_NOOPT              0x01            /* no TCP options */
 #define SCF_WINSCALE           0x02            /* negotiated window scaling */
 #define SCF_TIMESTAMP          0x04            /* negotiated timestamps */
-#define SCF_SYN_WASLOST                0x08            /* SYN|ACK was lost */
+#define SCF_UNUSED             0x08            /* unused */
 #define SCF_UNREACH            0x10            /* icmp unreachable received */
 #define        SCF_SACK_PERMITTED      0x20            /* saw SACK permitted option */
 #define SCF_SIGNATURE          0x40            /* send MD5 digests */
 #define SCF_MARKER             0x80            /* not a real entry */
+       int             sc_rxtused;             /* time spent in SYN|ACK rxt */
        TAILQ_ENTRY(syncache) sc_hash;
        TAILQ_ENTRY(syncache) sc_timerq;
 };