tcp: Balance aggressiveness of SACK rescue retransmission
author Sepherosa Ziehau <sephe@dragonflybsd.org>
Thu, 26 Apr 2012 08:09:51 +0000 (16:09 +0800)
committer Sepherosa Ziehau <sephe@dragonflybsd.org>
Thu, 26 Apr 2012 08:39:12 +0000 (16:39 +0800)
This commit follows the idea of sustaining ACK clocking whenever
possible to avoid timeout retransmission during fast recovery, which
is mentioned both in RFC3517 and in the "Rescue Retransmission for SACK"
draft.

- Be a little bit more aggressive in NextSeg()

  The main problem of the "Rescue Retransmission for SACK" draft is that
  it is too conservative about how many rescue retransmissions can happen
  during fast recovery, which in some situations is not enough to
  sustain the ACK clock.

  Our aggressive SACK rescue retransmission variant tries to send out
  one rescue segment if no other segments could be sent according
  to RFC3517, so that the ACK clock is kept ticking.

- Be conservative in sending out rescue segments.

  The idea of SACK rescue retransmission is just to sustain the ACK clock.
  As long as there are segments sent (either new segments or retransmissions)
  during SACK-based fast recovery, the ACK clock will be sustained.  So
  a rescue segment will not be sent in this situation.

SACK rescue retransmission statistics are updated more accurately to
reflect what has happened.

The aggressive variant of SACK rescue retransmission could be disabled
by setting sysctl net.inet.tcp.rescuesack_agg to 0; it is enabled by
default.

sys/netinet/tcp_input.c
sys/netinet/tcp_sack.c
sys/netinet/tcp_var.h
usr.bin/netstat/inet.c

index 5bd794c..3e30024 100644 (file)
@@ -193,6 +193,10 @@ int tcp_do_rescuesack = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack, CTLFLAG_RW,
     &tcp_do_rescuesack, 0, "Rescue retransmission for SACK");
 
+int tcp_aggressive_rescuesack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack_agg, CTLFLAG_RW,
+    &tcp_aggressive_rescuesack, 0, "Aggressive rescue retransmission for SACK");
+
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
     "TCP Segment Reassembly Queue");
 
@@ -3108,6 +3112,7 @@ tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
        u_long ocwnd = tp->snd_cwnd;
        uint32_t pipe;
        int nseg = 0;           /* consecutive new segments */
+       int nseg_rexmt = 0;     /* retransmitted segments */
 #define MAXBURST 4             /* limit burst of new packets on partial ack */
 
        tp->t_rtttime = 0;
@@ -3125,8 +3130,24 @@ tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
                        break;
                }
 
+               /*
+                * If the next transmission is a rescue retransmission,
+                * we check whether we have already sent some data
+                * (either new segments or retransmitted segments)
+                * into the network or not.  Since the idea of rescue
+                * retransmission is to sustain ACK clock, as long as
+                * some segments are in the network, ACK clock will be
+                * kept ticking.
+                */
+               if (rescue && (nseg_rexmt > 0 || nseg > 0)) {
+                       tp->rexmt_high = old_rexmt_high;
+                       break;
+               }
+
                if (nextrexmt == tp->snd_max)
                        ++nseg;
+               else
+                       ++nseg_rexmt;
                tp->snd_nxt = nextrexmt;
                tp->snd_cwnd = nextrexmt - tp->snd_una + seglen;
                old_snd_max = tp->snd_max;
@@ -3147,13 +3168,20 @@ tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
                tcpstat.tcps_sndsackbyte += sent;
 
                if (rescue) {
+                       tcpstat.tcps_sackrescue++;
                        tp->rexmt_rescue = tp->snd_nxt;
                        tp->t_flags |= TF_SACKRESCUED;
                        break;
                }
                if (SEQ_LT(nextrexmt, old_snd_max) &&
-                   SEQ_LT(tp->rexmt_high, tp->snd_nxt))
+                   SEQ_LT(tp->rexmt_high, tp->snd_nxt)) {
                        tp->rexmt_high = seq_min(tp->snd_nxt, old_snd_max);
+                       if ((tp->t_flags & TF_SACKRESCUED) &&
+                           SEQ_LT(tp->rexmt_rescue, tp->rexmt_high)) {
+                               /* Drag RescueRxt along with HighRxt */
+                               tp->rexmt_rescue = tp->rexmt_high;
+                       }
+               }
        }
        if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
                tp->snd_nxt = old_snd_nxt;
index 2940158..4d65e30 100644 (file)
@@ -325,15 +325,24 @@ void
 tcp_sack_update_scoreboard(struct tcpcb *tp, struct tcpopt *to)
 {
        struct scoreboard *scb = &tp->scb;
+       int rexmt_high_update = 0;
 
        tcp_sack_ack_blocks(scb, tp->snd_una);
        tcp_sack_add_blocks(tp, to);
        update_lostseq(scb, tp->snd_una, tp->t_maxseg);
-       if (SEQ_LT(tp->rexmt_high, tp->snd_una))
+       if (SEQ_LT(tp->rexmt_high, tp->snd_una)) {
                tp->rexmt_high = tp->snd_una;
-       if ((tp->t_flags & TF_SACKRESCUED) &&
-           SEQ_LT(tp->rexmt_rescue, tp->snd_una))
-               tp->t_flags &= ~TF_SACKRESCUED;
+               rexmt_high_update = 1;
+       }
+       if (tp->t_flags & TF_SACKRESCUED) {
+               if (SEQ_LT(tp->rexmt_rescue, tp->snd_una)) {
+                       tp->t_flags &= ~TF_SACKRESCUED;
+               } else if (rexmt_high_update &&
+                   SEQ_LT(tp->rexmt_rescue, tp->rexmt_high)) {
+                       /* Drag RescueRxt along with HighRxt */
+                       tp->rexmt_rescue = tp->rexmt_high;
+               }
+       }
 }
 
 /*
@@ -579,17 +588,50 @@ sendunsacked:
        if (lastblock != NULL && SEQ_LT(torexmt, lastblock->sblk_end))
                goto sendunsacked;
 
+       /* Rescue retransmission */
        if (tcp_do_rescuesack) {
                tcpstat.tcps_sackrescue_try++;
-               if (lastblock == NULL)
-                       tcpstat.tcps_sackrescue_smart++;
+               if (tp->t_flags & TF_SACKRESCUED) {
+                       if (!tcp_aggressive_rescuesack)
+                               return FALSE;
 
-               if (tp->t_flags & TF_SACKRESCUED)
-                       return FALSE;
+                       /*
+                        * Aggressive variant of the rescue retransmission.
+                        *
+                        * The idea of the rescue retransmission is to sustain
+                        * the ACK clock thus to avoid timeout retransmission.
+                        *
+                        * Under some situations, the conservative approach
+                        * suggested in the draft
+                        * http://tools.ietf.org/html/
+                        * draft-nishida-tcpm-rescue-retransmission-00
+                        * could not sustain ACK clock, since it only allows
+                        * one rescue retransmission before a cumulative ACK
+                        * covers the segment transmitted by rescue
+                        * retransmission.
+                        *
+                        * We try to locate the next unSACKed segment which
+                        * follows the previously sent rescue segment.  If
+                        * there is no such segment, we loop back to the first
+                        * unacknowledged segment.
+                        */
+
+                       /*
+                        * Skip SACKed data, but here we follow
+                        * the last transmitted rescue segment.
+                        */
+                       torexmt = tp->rexmt_rescue;
+                       tcp_sack_skip_sacked(scb, &torexmt);
+                       if (torexmt == tp->snd_max) {
+                               /* Nothing left to retransmit; restart */
+                               torexmt = tp->snd_una;
+                       }
+               }
                *rescue = TRUE;
-               tcpstat.tcps_sackrescue++;
                goto sendunsacked;
        } else if (tcp_do_smartsack && lastblock == NULL) {
+               tcpstat.tcps_sackrescue_try++;
+               *rescue = TRUE;
                goto sendunsacked;
        }
 
index 0aa5ddb..d509a92 100644 (file)
@@ -86,6 +86,7 @@ extern int tcp_low_rtobase;
 extern int tcp_do_sack;
 extern int tcp_do_smartsack;
 extern int tcp_do_rescuesack;
+extern int tcp_aggressive_rescuesack;
 extern int tcp_aggregate_acks;
 
 /* TCP segment queue entry */
@@ -355,7 +356,7 @@ struct tcp_stats {
        u_long  tcps_sndidle;           /* sending idle detected */
        u_long  tcps_sackrescue;        /* SACK rescue data packets sent */
        u_long  tcps_sackrescue_try;    /* SACK rescues attempted */
-       u_long  tcps_sackrescue_smart;  /* Smart SACK can send rescue data */
+       u_long  tcps_unused00;          /* unused */
 
        u_long  tcps_rcvtotal;          /* total packets received */
        u_long  tcps_rcvpack;           /* packets received in sequence */
index 1a77448..470428f 100644 (file)
@@ -446,10 +446,10 @@ tcp_stats(u_long off __unused, const char *name, int af1 __unused)
                "\t\t%lu data packet%s (%lu byte%s)\n");
        p2(tcps_sndrexmitpack, tcps_sndrexmitbyte,
                "\t\t%lu data packet%s (%lu byte%s) retransmitted\n");
-       p2(tcps_sndsackpack, tcps_sndsackbyte,
-               "\t\t%lu data packet%s (%lu byte%s) sent by SACK recovery\n");
        p2(tcps_sndsackrtopack, tcps_sndsackrtobyte,
                "\t\t%lu data packet%s (%lu byte%s) retransmitted by SACK\n");
+       p2(tcps_sndsackpack, tcps_sndsackbyte,
+               "\t\t%lu data packet%s (%lu byte%s) sent by SACK recovery\n");
        p2a(tcps_sndfastrexmit, tcps_sndearlyrexmit,
                "\t\t%lu Fast Retransmit%s (%lu early)\n");
        p(tcps_sndlimited, "\t\t%lu packet%s sent by Limited Transmit\n");
@@ -540,7 +540,6 @@ tcp_stats(u_long off __unused, const char *name, int af1 __unused)
 
        p2(tcps_sackrescue, tcps_sackrescue_try,
            "\t%lu SACK rescue%s (of %lu attempt%s)\n");
-       p(tcps_sackrescue_smart, "\t\t%lu Smart SACK capable rescue%s\n");
 
        free(stattmp);
 #undef p