From: Sepherosa Ziehau Date: Tue, 8 May 2012 09:56:00 +0000 (+0800) Subject: tcp/sack: Implement RFC3517bis X-Git-Tag: v3.2.0~990 X-Git-Url: https://gitweb.dragonflybsd.org/~tuxillo/dragonfly.git/commitdiff_plain/ffe35e178bc55ec440e7874c5669359c15c4b982 tcp/sack: Implement RFC3517bis http://tools.ietf.org/html/draft-ietf-tcpm-3517bis-02, which will become "Standards Track" soon. net.inet.tcp.rfc3517bis sysctl node is added to enable this update. It is off by default. --- diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 538cc8f145..47aac5cacc 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -197,6 +197,10 @@ int tcp_aggressive_rescuesack = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack_agg, CTLFLAG_RW, &tcp_aggressive_rescuesack, 0, "Aggressive rescue retransmission for SACK"); +int tcp_do_rfc3517bis = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3517bis, CTLFLAG_RW, + &tcp_do_rfc3517bis, 0, "Enable RFC3517 update"); + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); @@ -249,6 +253,7 @@ static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, static void tcp_xmit_timer(struct tcpcb *, int, tcp_seq); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *, int); static void tcp_sack_rexmt(struct tcpcb *, struct tcphdr *); +static boolean_t tcp_sack_limitedxmit(struct tcpcb *); static int tcp_rmx_msl(const struct tcpcb *); static void tcp_established(struct tcpcb *); @@ -282,6 +287,15 @@ do { \ (SEQ_LT(tp->snd_wl2, th->th_ack) || \ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)))) +#define iceildiv(n, d) (((n)+(d)-1) / (d)) +#define need_early_retransmit(tp, ownd) \ + (tcp_do_early_retransmit && \ + (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) && \ + ownd < (4 * tp->t_maxseg) && \ + tp->t_dupacks + 1 >= iceildiv(ownd, tp->t_maxseg) && \ + (!TCP_DO_SACK(tp) || ownd <= tp->t_maxseg || \ + tcp_sack_has_sacked(&tp->scb, ownd - tp->t_maxseg))) + static int 
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) { @@ -1884,16 +1898,40 @@ after_listen: if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (TCP_DO_SACK(tp)) tcp_sack_update_scoreboard(tp, &to); - if (tlen != 0 || tiwin != tp->snd_wnd) { - tp->t_dupacks = 0; - break; - } - tcpstat.tcps_rcvdupack++; if (!tcp_callout_active(tp, tp->tt_rexmt) || th->th_ack != tp->snd_una) { + tcpstat.tcps_rcvdupack++; tp->t_dupacks = 0; break; } + if (tlen != 0 || tiwin != tp->snd_wnd) { + if (!tcp_do_rfc3517bis || + !TCP_DO_SACK(tp) || + (to.to_flags & + (TOF_SACK | TOF_SACK_REDUNDANT)) + != TOF_SACK) { + tp->t_dupacks = 0; + break; + } + /* + * Update window information. + */ + if (tiwin != tp->snd_wnd && + acceptable_window_update(tp, th, tiwin)) { + /* keep track of pure window updates */ + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && + tiwin > tp->snd_wnd) + tcpstat.tcps_rcvwinupd++; + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + } + } + tcpstat.tcps_rcvdupack++; + /* * We have outstanding data (other than * a window probe), this is a completely @@ -1990,6 +2028,21 @@ fastretransmit: else tp->snd_cwnd += tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); + } else if (tcp_do_rfc3517bis && TCP_DO_SACK(tp)) { + if (tcp_sack_islost(&tp->scb, tp->snd_una)) + goto fastretransmit; + if (tcp_do_limitedtransmit) { + /* outstanding data */ + uint32_t ownd = + tp->snd_max - tp->snd_una; + + if (!tcp_sack_limitedxmit(tp) && + need_early_retransmit(tp, ownd)) { + ++tcpstat.tcps_sndearlyrexmit; + tp->t_flags |= TF_EARLYREXMT; + goto fastretransmit; + } + } } else if (tcp_do_limitedtransmit) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; @@ -1998,8 +2051,6 @@ fastretransmit: uint32_t ownd = tp->snd_max - tp->snd_una; u_int sent; -#define iceildiv(n, d) (((n)+(d)-1) / (d)) - KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("dupacks not 1 or 2")); @@ 
-2031,22 +2082,16 @@ fastretransmit: } else if (sent > 0) { ++tp->snd_limited; ++tcpstat.tcps_sndlimited; - } else if (tcp_do_early_retransmit && - (tcp_do_eifel_detect && - (tp->t_flags & TF_RCVD_TSTMP)) && - ownd < 4 * tp->t_maxseg && - tp->t_dupacks + 1 >= - iceildiv(ownd, tp->t_maxseg) && - (!TCP_DO_SACK(tp) || - ownd <= tp->t_maxseg || - tcp_sack_has_sacked(&tp->scb, - ownd - tp->t_maxseg))) { + } else if (need_early_retransmit(tp, ownd)) { ++tcpstat.tcps_sndearlyrexmit; tp->t_flags |= TF_EARLYREXMT; goto fastretransmit; } } - goto drop; + if (tlen != 0) + break; + else + goto drop; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una")); @@ -3266,6 +3311,47 @@ tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th) tp->snd_cwnd = ocwnd; } +static boolean_t +tcp_sack_limitedxmit(struct tcpcb *tp) +{ + tcp_seq oldsndnxt = tp->snd_nxt; + tcp_seq oldsndmax = tp->snd_max; + u_long ocwnd = tp->snd_cwnd; + uint32_t pipe; + boolean_t ret = FALSE; + + tp->rexmt_high = tp->snd_una - 1; + pipe = tcp_sack_compute_pipe(tp); + while ((tcp_seq_diff_t)(ocwnd - pipe) >= (tcp_seq_diff_t)tp->t_maxseg) { + uint32_t sent; + tcp_seq next; + int error; + + next = tp->snd_nxt = tp->snd_max; + tp->snd_cwnd = tp->snd_nxt - tp->snd_una + tp->t_maxseg; + + error = tcp_output(tp); + if (error) + break; + + sent = tp->snd_nxt - next; + if (sent <= 0) + break; + pipe += sent; + ++tcpstat.tcps_sndlimited; + ret = TRUE; + } + + if (SEQ_LT(oldsndnxt, oldsndmax)) { + KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una), + ("snd_una moved in other threads")); + tp->snd_nxt = oldsndnxt; + } + tp->snd_cwnd = ocwnd; + + return ret; +} + /* * Reset idle time and keep-alive timer, typically called when a valid * tcp packet is received but may also be called when FASTKEEP is set diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c index ea29afb565..c5450414e6 100644 --- a/sys/netinet/tcp_sack.c +++ b/sys/netinet/tcp_sack.c @@ -469,13 +469,19 @@ update_lostseq(struct scoreboard *scb, tcp_seq snd_una, 
u_int maxseg, struct sackblock *sb; int nsackblocks = 0; int bytes_sacked = 0; + int rxtthresh_bytes; + + if (tcp_do_rfc3517bis) + rxtthresh_bytes = (rxtthresh - 1) * maxseg; + else + rxtthresh_bytes = rxtthresh * maxseg; sb = TAILQ_LAST(&scb->sackblocks, sackblock_list); while (sb != NULL) { ++nsackblocks; bytes_sacked += sb->sblk_end - sb->sblk_start; if (nsackblocks == rxtthresh || - bytes_sacked >= rxtthresh * maxseg) { + bytes_sacked >= rxtthresh_bytes) { scb->lostseq = sb->sblk_start; return; } @@ -487,8 +493,8 @@ update_lostseq(struct scoreboard *scb, tcp_seq snd_una, u_int maxseg, /* * Return whether the given sequence number is considered lost. */ -static boolean_t -scb_islost(struct scoreboard *scb, tcp_seq seqnum) +boolean_t +tcp_sack_islost(struct scoreboard *scb, tcp_seq seqnum) { return SEQ_LT(seqnum, scb->lostseq); } @@ -577,7 +583,7 @@ tcp_sack_nextseg(struct tcpcb *tp, tcp_seq *nextrexmt, uint32_t *plen, *rescue = FALSE; if (lastblock != NULL) { if (SEQ_LT(torexmt, lastblock->sblk_end) && - scb_islost(scb, torexmt)) { + tcp_sack_islost(scb, torexmt)) { sendunsacked: *nextrexmt = torexmt; /* If the left-hand edge has been SACKed, pull it in. 
*/ @@ -603,7 +609,7 @@ sendunsacked: goto sendunsacked; /* Rescue retransmission */ - if (tcp_do_rescuesack) { + if (tcp_do_rescuesack || tcp_do_rfc3517bis) { tcpstat.tcps_sackrescue_try++; if (tp->t_flags & TF_SACKRESCUED) { if (!tcp_aggressive_rescuesack) diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index dc4d8dee77..c982949116 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -87,6 +87,7 @@ extern int tcp_do_sack; extern int tcp_do_smartsack; extern int tcp_do_rescuesack; extern int tcp_aggressive_rescuesack; +extern int tcp_do_rfc3517bis; extern int tcp_aggregate_acks; extern int tcp_eifel_rtoinc; @@ -653,6 +654,8 @@ uint32_t tcp_sack_compute_pipe(struct tcpcb *tp); boolean_t tcp_sack_nextseg(struct tcpcb *tp, tcp_seq *nextrexmt, uint32_t *len, boolean_t *rescue); +boolean_t + tcp_sack_islost(struct scoreboard *scb, tcp_seq seq); #ifdef later void tcp_sack_revert_scoreboard(struct scoreboard *scb, tcp_seq snd_una, u_int maxseg);