tcp: Implement part of Eifel Response Algorithm (RFC4015)
author: Sepherosa Ziehau <sephe@dragonflybsd.org>
Sat, 28 Apr 2012 02:36:17 +0000 (10:36 +0800)
committer: Sepherosa Ziehau <sephe@dragonflybsd.org>
Mon, 30 Apr 2012 03:15:18 +0000 (11:15 +0800)
It adapts the retransmission timer to avoid further spurious timeouts.

sys/netinet/tcp_input.c
sys/netinet/tcp_timer.c
sys/netinet/tcp_var.h
usr.bin/netstat/inet.c

index 2bd4c82..c8e5192 100644 (file)
@@ -241,7 +241,7 @@ static void  tcp_pulloutofband(struct socket *,
                     struct tcphdr *, struct mbuf *, int);
 static int      tcp_reass(struct tcpcb *, struct tcphdr *, int *,
                     struct mbuf *);
-static void     tcp_xmit_timer(struct tcpcb *, int);
+static void     tcp_xmit_timer(struct tcpcb *, int, tcp_seq);
 static void     tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *, int);
 static void     tcp_sack_rexmt(struct tcpcb *, struct tcphdr *);
 static int      tcp_rmx_msl(const struct tcpcb *);
@@ -1195,11 +1195,13 @@ after_listen:
                                 */
                                if ((to.to_flags & TOF_TS) && to.to_tsecr) {
                                        tcp_xmit_timer(tp,
-                                                      ticks - to.to_tsecr + 1);
+                                           ticks - to.to_tsecr + 1,
+                                           th->th_ack);
                                } else if (tp->t_rtttime &&
                                           SEQ_GT(th->th_ack, tp->t_rtseq)) {
                                        tcp_xmit_timer(tp,
-                                                      ticks - tp->t_rtttime);
+                                           ticks - tp->t_rtttime,
+                                           th->th_ack);
                                }
                                tcp_xmit_bandwidth_limit(tp, th->th_ack);
                                acked = th->th_ack - tp->snd_una;
@@ -2097,9 +2099,9 @@ process_ACK:
                 * timestamps of 0.
                 */
                if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0))
-                       tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
+                       tcp_xmit_timer(tp, ticks - to.to_tsecr + 1, th->th_ack);
                else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
-                       tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+                       tcp_xmit_timer(tp, ticks - tp->t_rtttime, th->th_ack);
                tcp_xmit_bandwidth_limit(tp, th->th_ack);
 
                /*
@@ -2750,13 +2752,34 @@ tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off)
  * and update averages and current timeout.
  */
 static void
-tcp_xmit_timer(struct tcpcb *tp, int rtt)
+tcp_xmit_timer(struct tcpcb *tp, int rtt, tcp_seq ack)
 {
-       int delta;
+       int rebaserto = 0;
 
        tcpstat.tcps_rttupdated++;
        tp->t_rttupdated++;
-       if (tp->t_srtt != 0) {
+       if ((tp->t_flags & TF_REBASERTO) && SEQ_GT(ack, tp->snd_max_prev)) {
+#ifdef DEBUG_EIFEL_RESPONSE
+               kprintf("srtt/rttvar, prev %d/%d, cur %d/%d, ",
+                   tp->t_srtt_prev, tp->t_rttvar_prev,
+                   tp->t_srtt, tp->t_rttvar);
+#endif
+
+               tcpstat.tcps_eifelresponse++;
+               rebaserto = 1;
+               tp->t_flags &= ~TF_REBASERTO;
+               tp->t_srtt = max(tp->t_srtt_prev, (rtt << TCP_RTT_SHIFT));
+               tp->t_rttvar = max(tp->t_rttvar_prev,
+                   (rtt << (TCP_RTTVAR_SHIFT - 1)));
+               if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+                       tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
+
+#ifdef DEBUG_EIFEL_RESPONSE
+               kprintf("new %d/%d ", tp->t_srtt, tp->t_rttvar);
+#endif
+       } else if (tp->t_srtt != 0) {
+               int delta;
+
                /*
                 * srtt is stored as fixed point with 5 bits after the
                 * binary point (i.e., scaled by 8).  The following magic
@@ -2800,6 +2823,13 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
        tp->t_rtttime = 0;
        tp->t_rxtshift = 0;
 
+#ifdef DEBUG_EIFEL_RESPONSE
+       if (rebaserto) {
+               kprintf("| rxtcur prev %d, old %d, ",
+                   tp->t_rxtcur_prev, tp->t_rxtcur);
+       }
+#endif
+
        /*
         * the retransmit should happen at rtt + 4 * rttvar.
         * Because of the way we do the smoothing, srtt and rttvar
@@ -2814,6 +2844,30 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
        TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
                      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
 
+       if (rebaserto) {
+               if (tp->t_rxtcur < tp->t_rxtcur_prev + tcp_eifel_rtoinc) {
+                       /*
+                        * RFC4015 requires that the new RTO is at least
+                        * 2*G (tcp_eifel_rtoinc) greater than the RTO
+                        * (t_rxtcur_prev) when the spurious retransmit
+                        * timeout happens.
+                        *
+                        * The above condition could be true, if the SRTT
+                        * and RTTVAR used to calculate t_rxtcur_prev
+                        * resulted in a value less than t_rttmin.  So
+                        * simply increasing SRTT by tcp_eifel_rtoinc when
+                        * preparing for the Eifel response in
+                        * tcp_save_congestion_state() could not ensure
+                        * that the new RTO will be tcp_eifel_rtoinc greater
+                        * than t_rxtcur_prev.
+                        */
+                       tp->t_rxtcur = tp->t_rxtcur_prev + tcp_eifel_rtoinc;
+               }
+#ifdef DEBUG_EIFEL_RESPONSE
+               kprintf("new %d\n", tp->t_rxtcur);
+#endif
+       }
+
        /*
         * We received an ack for a packet that wasn't retransmitted;
         * it is probably safe to discard any error indications we've
index 8f1f98a..d6362a1 100644 (file)
@@ -191,6 +191,15 @@ int        tcp_keepcnt = TCPTV_KEEPCNT;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW,
     &tcp_keepcnt, 0, "Maximum number of keepalive probes to be sent");
 
+static int tcp_do_eifel_response = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, eifel_response, CTLFLAG_RW,
+    &tcp_do_eifel_response, 0, "Eifel response algorithm (RFC 4015)");
+
+int tcp_eifel_rtoinc = 2;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, eifel_rtoinc, CTLTYPE_INT|CTLFLAG_RW,
+    &tcp_eifel_rtoinc, 0, sysctl_msec_to_ticks, "I",
+    "Eifel response RTO increment");
+
 /* max idle time in persist */
 int    tcp_maxpersistidle;
 
@@ -457,6 +466,14 @@ tcp_save_congestion_state(struct tcpcb *tp)
        tp->snd_wacked_prev = tp->snd_wacked;
        tp->snd_ssthresh_prev = tp->snd_ssthresh;
        tp->snd_recover_prev = tp->snd_recover;
+
+       tp->t_rxtcur_prev = tp->t_rxtcur;
+       tp->t_srtt_prev = tp->t_srtt +
+           (tcp_eifel_rtoinc << TCP_RTT_SHIFT);
+       tp->t_rttvar_prev = tp->t_rttvar;
+       tp->snd_max_prev = tp->snd_max;
+       tp->t_flags &= ~TF_REBASERTO;
+
        if (IN_FASTRECOVERY(tp))
                tp->t_flags |= TF_WASFRECOVERY;
        else
@@ -486,6 +503,8 @@ tcp_revert_congestion_state(struct tcpcb *tp)
        } else {
                ++tcpstat.tcps_sndrtobad;
                tp->snd_last = ticks;
+               if (tcp_do_eifel_response)
+                       tp->t_flags |= TF_REBASERTO;
        }
        tp->t_badrxtwin = 0;
        tp->t_rxtshift = 0;
index 342f5f3..1b0e78d 100644 (file)
@@ -88,6 +88,7 @@ extern int tcp_do_smartsack;
 extern int tcp_do_rescuesack;
 extern int tcp_aggressive_rescuesack;
 extern int tcp_aggregate_acks;
+extern int tcp_eifel_rtoinc;
 
 /* TCP segment queue entry */
 struct tseg_qent {
@@ -166,7 +167,7 @@ struct tcpcb {
 #define TF_SIGNATURE   0x00004000      /* require MD5 digests (RFC2385) */
 #define TF_SACKRESCUED 0x00008000      /* sent rescue SACK recovery data */
 #define        TF_MORETOCOME   0x00010000      /* More data to be appended to sock */
-#define        TF_UNUSED00     0x00020000      /* unused */
+#define        TF_REBASERTO    0x00020000      /* Recalculate RTO based on new RTT */
 #define        TF_LASTIDLE     0x00040000      /* connection was previously idle */
 #define        TF_RXWIN0SENT   0x00080000      /* sent a receiver win 0 in response */
 #define        TF_FASTRECOVERY 0x00100000      /* in NewReno Fast Recovery */
@@ -242,6 +243,10 @@ struct tcpcb {
        tcp_seq last_ack_sent;
 
 /* experimental */
+       int     t_srtt_prev;            /* adjusted SRTT prior to retransmit */
+       int     t_rttvar_prev;          /* RTTVAR prior to retransmit */
+       int     t_rxtcur_prev;          /* rexmt timeout prior to retransmit */
+       tcp_seq snd_max_prev;           /* SND_MAX prior to retransmit */
        u_long  snd_cwnd_prev;          /* cwnd prior to retransmit */
        u_long  snd_wacked_prev;        /* prior bytes acked in send window */
        u_long  snd_ssthresh_prev;      /* ssthresh prior to retransmit */
@@ -355,7 +360,7 @@ struct tcp_stats {
        u_long  tcps_sndidle;           /* sending idle detected */
        u_long  tcps_sackrescue;        /* SACK rescue data packets sent */
        u_long  tcps_sackrescue_try;    /* SACK rescues attempted */
-       u_long  tcps_unused00;          /* unused */
+       u_long  tcps_eifelresponse;     /* Eifel responses */
 
        u_long  tcps_rcvtotal;          /* total packets received */
        u_long  tcps_rcvpack;           /* packets received in sequence */
index 470428f..aa5e7c2 100644 (file)
@@ -453,7 +453,8 @@ tcp_stats(u_long off __unused, const char *name, int af1 __unused)
        p2a(tcps_sndfastrexmit, tcps_sndearlyrexmit,
                "\t\t%lu Fast Retransmit%s (%lu early)\n");
        p(tcps_sndlimited, "\t\t%lu packet%s sent by Limited Transmit\n");
-       p(tcps_sndrtobad, "\t\t%lu spurious RTO retransmit%s\n");
+       p2(tcps_sndrtobad, tcps_eifelresponse,
+               "\t\t%lu spurious RTO retransmit%s (%lu Eifel-response%s)\n");
        p2a(tcps_sndfastrexmitbad, tcps_sndearlyrexmitbad,
                "\t\t%lu spurious Fast Retransmit%s (%lu early)\n");
        p2a(tcps_eifeldetected, tcps_rttcantdetect,