X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/blobdiff_plain/073ec6c4ee6f7a0907c7b2434b6b064a77617f80..ba0d6f9911cce18ff8daa525328a9d8ffa315f33:/sys/netinet/tcp_input.c

diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index c8e5192fd2..d687b68750 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -176,6 +176,15 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc, CTLFLAG_RW,
     &tcp_do_abc, 0,
     "TCP Appropriate Byte Counting (RFC 3465)");
 
+/*
+ * The following value actually takes range [25ms, 250ms],
+ * given that most modern systems use 1ms ~ 10ms as the unit
+ * of timestamp option.
+ */
+static u_int tcp_paws_tolerance = 25;
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, paws_tolerance, CTLFLAG_RW,
+    &tcp_paws_tolerance, 0, "RFC1323 PAWS tolerance");
+
 /*
  * Define as tunable for easy testing with SACK on and off.
  * Warning: do not change setting in the middle of an existing active TCP flow,
@@ -193,10 +202,18 @@ int tcp_do_rescuesack = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack, CTLFLAG_RW,
     &tcp_do_rescuesack, 0, "Rescue retransmission for SACK");
 
-int tcp_aggressive_rescuesack = 1;
+int tcp_aggressive_rescuesack = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack_agg, CTLFLAG_RW,
     &tcp_aggressive_rescuesack, 0, "Aggressive rescue retransmission for SACK");
 
+int tcp_do_rfc3517bis = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3517bis, CTLFLAG_RW,
+    &tcp_do_rfc3517bis, 0, "Enable RFC3517 update");
+
+int tcp_rfc3517bis_rxt = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3517bis_rxt, CTLFLAG_RW,
+    &tcp_rfc3517bis_rxt, 0, "Enable RFC3517 retransmit update");
+
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
     "TCP Segment Reassembly Queue");
 
@@ -236,16 +253,24 @@ int tcp_sosend_async = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosend_async, CTLFLAG_RW,
     &tcp_sosend_async, 0, "TCP asynchronized pru_send");
 
-static void	 tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
+static int tcp_ignore_redun_dsack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, ignore_redun_dsack, CTLFLAG_RW,
+    &tcp_ignore_redun_dsack, 0, "Ignore redundant DSACK");
+
+static void	 tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t,
+		    tcp_seq);
 static void	 tcp_pulloutofband(struct socket *, struct tcphdr *,
 		     struct mbuf *, int);
 static int	 tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
 static void	 tcp_xmit_timer(struct tcpcb *, int, tcp_seq);
 static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *, int);
-static void	 tcp_sack_rexmt(struct tcpcb *, struct tcphdr *);
+static void	 tcp_sack_rexmt(struct tcpcb *);
+static boolean_t tcp_sack_limitedxmit(struct tcpcb *);
 static int	 tcp_rmx_msl(const struct tcpcb *);
 static void	 tcp_established(struct tcpcb *);
+static boolean_t tcp_fast_recovery(struct tcpcb *, tcp_seq,
+		    const struct tcpopt *);
 
 /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
  */
 #ifdef INET6
@@ -277,6 +302,67 @@ do { \
 	 (SEQ_LT(tp->snd_wl2, th->th_ack) || \
 	  (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))
 
+#define	iceildiv(n, d)		(((n)+(d)-1) / (d))
+#define need_early_retransmit(tp, ownd) \
+    (tcp_do_early_retransmit && \
+     (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) && \
+     ownd < ((tp->t_rxtthresh + 1) * tp->t_maxseg) && \
+     tp->t_dupacks + 1 >= iceildiv(ownd, tp->t_maxseg) && \
+     (!TCP_DO_SACK(tp) || ownd <= tp->t_maxseg || \
+      tcp_sack_has_sacked(&tp->scb, ownd - tp->t_maxseg)))
+
+/*
+ * Returns TRUE if this segment can be merged with the last
+ * pending segment in the reassembly queue and this segment
+ * does not overlap with the pending segment immediately
+ * preceding the last pending segment.
+ */
+static __inline boolean_t
+tcp_paws_canreasslast(const struct tcpcb *tp, const struct tcphdr *th, int tlen)
+{
+	const struct tseg_qent *last, *prev;
+
+	last = TAILQ_LAST(&tp->t_segq, tsegqe_head);
+	if (last == NULL)
+		return FALSE;
+
+	/* This segment comes immediately after the last pending segment */
+	if (last->tqe_th->th_seq + last->tqe_len == th->th_seq)
+		return TRUE;
+
+	if (th->th_seq + tlen != last->tqe_th->th_seq)
+		return FALSE;
+	/* This segment comes immediately before the last pending segment */
+
+	prev = TAILQ_PREV(last, tsegqe_head, tqe_q);
+	if (prev == NULL) {
+		/*
+		 * No pending preceding segment, we assume this segment
+		 * could be reassembled.
+		 */
+		return TRUE;
+	}
+
+	/* This segment does not overlap with the preceding segment */
+	if (SEQ_GEQ(th->th_seq, prev->tqe_th->th_seq + prev->tqe_len))
+		return TRUE;
+
+	return FALSE;
+}
+
+static __inline void
+tcp_ncr_update_rxtthresh(struct tcpcb *tp)
+{
+	int old_rxtthresh = tp->t_rxtthresh;
+	uint32_t ownd = tp->snd_max - tp->snd_una;
+
+	tp->t_rxtthresh = max(3, ((ownd / tp->t_maxseg) >> 1));
+	if (tp->t_rxtthresh != old_rxtthresh) {
+		tcp_sack_update_lostseq(&tp->scb, tp->snd_una,
+		    tp->t_maxseg, tp->t_rxtthresh);
+	}
+}
+
 static int
 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 {
@@ -324,7 +410,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 	/*
 	 * Find a segment which begins after this one does.
 	 */
-	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
+	TAILQ_FOREACH(q, &tp->t_segq, tqe_q) {
 		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
 			break;
 		p = q;
@@ -341,7 +427,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 		/* conversion to int (in i) handles seq wraparound */
 		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
 		if (i > 0) {		/* overlaps preceding segment */
-			tp->t_flags |= (TF_DUPSEG | TF_ENCLOSESEG);
+			tp->sack_flags |=
+			    (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG);
 			/* enclosing block starts w/ preceding segment */
 			tp->encloseblk.rblk_start = p->tqe_th->th_seq;
 			if (i >= *tlenp) {
@@ -387,13 +474,14 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 		if (i <= 0)
 			break;
 
-		if (!(tp->t_flags & TF_DUPSEG)) {	/* first time through */
-			tp->t_flags |= (TF_DUPSEG | TF_ENCLOSESEG);
+		if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
+			/* first time through */
+			tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG);
 			tp->encloseblk = tp->reportblk;
 			/* report trailing duplicate D-SACK segment */
 			tp->reportblk.rblk_start = q->tqe_th->th_seq;
 		}
-		if ((tp->t_flags & TF_ENCLOSESEG) &&
+		if ((tp->sack_flags & TSACK_F_ENCLOSESEG) &&
 		    SEQ_GT(qend_sack, tp->encloseblk.rblk_end)) {
 			/* extend enclosing block if one exists */
 			tp->encloseblk.rblk_end = qend_sack;
@@ -405,8 +493,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 			break;
 		}
 
-		nq = LIST_NEXT(q, tqe_q);
-		LIST_REMOVE(q, tqe_q);
+		nq = TAILQ_NEXT(q, tqe_q);
+		TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
 		m_freem(q->tqe_m);
 		kfree(q, M_TSEGQ);
 		atomic_add_int(&tcp_reass_qsize, -1);
@@ -432,15 +520,15 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 		 * When not reporting a duplicate segment, use
 		 * the larger enclosing block as the SACK block.
 		 */
-		if (!(tp->t_flags & TF_DUPSEG))
+		if (!(tp->sack_flags & TSACK_F_DUPSEG))
 			tp->reportblk.rblk_end = tend_sack;
-		LIST_REMOVE(q, tqe_q);
+		TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
 		kfree(q, M_TSEGQ);
 		atomic_add_int(&tcp_reass_qsize, -1);
 	}
 
 	if (p == NULL) {
-		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+		TAILQ_INSERT_HEAD(&tp->t_segq, te, tqe_q);
 	} else {
 		/* check if can coalesce with preceding segment */
 		if (p->tqe_th->th_seq + p->tqe_len == th->th_seq) {
@@ -451,12 +539,12 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 			 * When not reporting a duplicate segment, use
 			 * the larger enclosing block as the SACK block.
 			 */
-			if (!(tp->t_flags & TF_DUPSEG))
+			if (!(tp->sack_flags & TSACK_F_DUPSEG))
 				tp->reportblk.rblk_start = p->tqe_th->th_seq;
 			kfree(te, M_TSEGQ);
 			atomic_add_int(&tcp_reass_qsize, -1);
 		} else {
-			LIST_INSERT_AFTER(p, te, tqe_q);
+			TAILQ_INSERT_AFTER(&tp->t_segq, p, te, tqe_q);
 		}
 	}
 
@@ -467,20 +555,20 @@ present:
 	 */
 	if (!TCPS_HAVEESTABLISHED(tp->t_state))
 		return (0);
-	q = LIST_FIRST(&tp->t_segq);
+	q = TAILQ_FIRST(&tp->t_segq);
 	if (q == NULL || q->tqe_th->th_seq != tp->rcv_nxt)
 		return (0);
 	tp->rcv_nxt += q->tqe_len;
-	if (!(tp->t_flags & TF_DUPSEG)) {
+	if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
 		/* no SACK block to report since ACK advanced */
 		tp->reportblk.rblk_start = tp->reportblk.rblk_end;
 	}
 	/* no enclosing block to report since ACK advanced */
-	tp->t_flags &= ~TF_ENCLOSESEG;
+	tp->sack_flags &= ~TSACK_F_ENCLOSESEG;
 	flags = q->tqe_th->th_flags & TH_FIN;
-	LIST_REMOVE(q, tqe_q);
-	KASSERT(LIST_EMPTY(&tp->t_segq) ||
-	    LIST_FIRST(&tp->t_segq)->tqe_th->th_seq != tp->rcv_nxt,
+	TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
+	KASSERT(TAILQ_EMPTY(&tp->t_segq) ||
+	    TAILQ_FIRST(&tp->t_segq)->tqe_th->th_seq != tp->rcv_nxt,
 	    ("segment not coalesced"));
 	if (so->so_state & SS_CANTRCVMORE) {
 		m_freem(q->tqe_m);
@@ -545,7 +633,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 	int thflags;
 	struct socket *so = NULL;
 	int todrop, acked;
-	boolean_t ourfinisacked, needoutput = FALSE;
+	boolean_t ourfinisacked, needoutput = FALSE, delayed_dupack = FALSE;
+	tcp_seq th_dupack = 0; /* XXX gcc warning */
 	u_long tiwin;
 	int recvwin;
 	struct tcpopt to;		/* options in this segment */
@@ -1033,7 +1122,7 @@ findpcb:
 		 * for syncache, or perform t/tcp connection.
 		 */
 		if (so->so_qlen <= so->so_qlimit) {
-			tcp_dooptions(&to, optp, optlen, TRUE);
+			tcp_dooptions(&to, optp, optlen, TRUE, th->th_ack);
 			if (!syncache_add(&inc, &to, th, so, m))
 				goto drop;
 
@@ -1085,7 +1174,7 @@ after_listen:
 	 * Process options.
 	 * XXX this is tradtitional behavior, may need to be cleaned up.
 	 */
-	tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) != 0);
+	tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) != 0, th->th_ack);
 	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
 		if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) {
 			tp->t_flags |= TF_RCVD_SCALE;
@@ -1170,7 +1259,7 @@ after_listen:
 		 */
 		if (tcp_do_eifel_detect &&
 		    (to.to_flags & TOF_TS) && to.to_tsecr &&
-		    (tp->t_flags & TF_FIRSTACCACK)) {
+		    (tp->rxt_flags & TRXT_F_FIRSTACCACK)) {
 			/* Eifel detection applicable. */
 			if (to.to_tsecr < tp->t_rexmtTS) {
 				tcp_revert_congestion_state(tp);
@@ -1184,8 +1273,8 @@ after_listen:
 				tcp_revert_congestion_state(tp);
 				++tcpstat.tcps_rttdetected;
 			}
-			tp->t_flags &= ~(TF_FIRSTACCACK |
-			    TF_FASTREXMT | TF_EARLYREXMT);
+			tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
+			    TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
 			/*
 			 * Recalculate the retransmit timer / rtt.
 			 *
@@ -1251,7 +1340,7 @@ after_listen:
 		}
 	} else if (tiwin == tp->snd_wnd &&
 	    th->th_ack == tp->snd_una &&
-	    LIST_EMPTY(&tp->t_segq) &&
+	    TAILQ_EMPTY(&tp->t_segq) &&
 	    tlen <= ssb_space(&so->so_rcv)) {
 		u_long newsize = 0;	/* automatic sockbuf scaling */
 		/*
@@ -1630,7 +1719,6 @@ after_listen:
 	 */
 	if ((to.to_flags & TOF_TS) && tp->ts_recent != 0 &&
 	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
-		/* Check to see if ts_recent is over 24 days old.  */
 		if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
 			/*
 			 * Invalidate ts_recent.  If this segment updates
 			 * ts_recent, the age will be reset later and ts_recent
 			 * will get a valid value.  If it does not, setting
 			 * ts_recent to zero will at least satisfy the
 			 * requirement that zero be placed in the timestamp
 			 * echo reply when ts_recent isn't valid.  The
 			 * age isn't reset until we get a valid ts_recent
 			 * because we don't want out-of-order segments to be
 			 * dropped when ts_recent is old.
 			 */
 			tp->ts_recent = 0;
+		} else if (tcp_paws_tolerance && tlen != 0 &&
+		    tp->t_state == TCPS_ESTABLISHED &&
+		    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK&&
+		    !(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)) &&
+		    th->th_ack == tp->snd_una &&
+		    tiwin == tp->snd_wnd &&
+		    TSTMP_GEQ(to.to_tsval + tcp_paws_tolerance, tp->ts_recent)&&
+		    (th->th_seq == tp->rcv_nxt ||
+		     (SEQ_GT(th->th_seq, tp->rcv_nxt) &&
+		      tcp_paws_canreasslast(tp, th, tlen)))) {
+			/*
+			 * This tends to prevent valid new segments from being
+			 * dropped by the reordered segments sent by the fast
+			 * retransmission algorithm on the sending side, i.e.
+			 * the fast retransmitted segment w/ larger timestamp
+			 * arrives earlier than the previously sent new segments
+			 * w/ smaller timestamp.
+			 *
+			 * If the following conditions are met, the segment is
+			 * accepted:
+			 * - The segment contains data
+			 * - The connection is established
+			 * - The header does not contain important flags
+			 * - SYN or FIN is not needed
+			 * - It does not acknowledge new data
+			 * - Receive window is not changed
+			 * - The timestamp is within "acceptable" range
+			 * - The new segment is what we are expecting or
+			 *   the new segment could be merged w/ the last
+			 *   pending segment on the reassembly queue
+			 */
+			tcpstat.tcps_pawsaccept++;
 			tcpstat.tcps_pawsdrop++;
 		} else {
 			tcpstat.tcps_rcvduppack++;
 			tcpstat.tcps_rcvdupbyte += tlen;
@@ -1676,7 +1797,8 @@ after_listen:
 			    th->th_seq + tlen, thflags);
 		if (SEQ_GT(tp->reportblk.rblk_end, tp->rcv_nxt))
 			tp->reportblk.rblk_end = tp->rcv_nxt;
-		tp->t_flags |= (TF_DUPSEG | TF_SACKLEFT | TF_ACKNOW);
+		tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_SACKLEFT);
+		tp->t_flags |= TF_ACKNOW;
 	}
 	if (thflags & TH_SYN) {
 		thflags &= ~TH_SYN;
@@ -1879,148 +2001,30 @@ after_listen:
 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 			if (TCP_DO_SACK(tp))
 				tcp_sack_update_scoreboard(tp, &to);
-			if (tlen != 0 || tiwin != tp->snd_wnd) {
-				tp->t_dupacks = 0;
-				break;
-			}
-			tcpstat.tcps_rcvdupack++;
 			if (!tcp_callout_active(tp, tp->tt_rexmt) ||
 			    th->th_ack != tp->snd_una) {
+				if (tlen == 0 && tiwin == tp->snd_wnd)
+					tcpstat.tcps_rcvdupack++;
 				tp->t_dupacks = 0;
 				break;
 			}
-			/*
-			 * We have outstanding data (other than
-			 * a window probe), this is a completely
-			 * duplicate ack (ie, window info didn't
-			 * change), the ack is the biggest we've
-			 * seen and we've seen exactly our rexmt
-			 * threshhold of them, so assume a packet
-			 * has been dropped and retransmit it.
-			 * Kludge snd_nxt & the congestion
-			 * window so we send only this one
-			 * packet.
-			 */
-			if (IN_FASTRECOVERY(tp)) {
-				if (TCP_DO_SACK(tp)) {
-					/* No artifical cwnd inflation. */
-					tcp_sack_rexmt(tp, th);
+			if (tlen != 0 || tiwin != tp->snd_wnd) {
+				if (!tcp_do_rfc3517bis ||
+				    !TCP_DO_SACK(tp) ||
+				    (to.to_flags &
+				     (TOF_SACK | TOF_SACK_REDUNDANT))
+				     != TOF_SACK) {
+					tp->t_dupacks = 0;
 				} else {
-					/*
-					 * Dup acks mean that packets
-					 * have left the network
-					 * (they're now cached at the
-					 * receiver) so bump cwnd by
-					 * the amount in the receiver
-					 * to keep a constant cwnd
-					 * packets in the network.
-					 */
-					tp->snd_cwnd += tp->t_maxseg;
-					tcp_output(tp);
+					delayed_dupack = TRUE;
+					th_dupack = th->th_ack;
 				}
-			} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
-				tp->t_dupacks = 0;
 				break;
-			} else if (++tp->t_dupacks == tcprexmtthresh) {
-				tcp_seq old_snd_nxt;
-				u_int win;
-
-fastretransmit:
-				if (tcp_do_eifel_detect &&
-				    (tp->t_flags & TF_RCVD_TSTMP)) {
-					tcp_save_congestion_state(tp);
-					tp->t_flags |= TF_FASTREXMT;
-				}
-				/*
-				 * We know we're losing at the current
-				 * window size, so do congestion avoidance:
-				 * set ssthresh to half the current window
-				 * and pull our congestion window back to the
-				 * new ssthresh.
-				 */
-				win = min(tp->snd_wnd, tp->snd_cwnd) / 2 /
-				    tp->t_maxseg;
-				if (win < 2)
-					win = 2;
-				tp->snd_ssthresh = win * tp->t_maxseg;
-				ENTER_FASTRECOVERY(tp);
-				tp->snd_recover = tp->snd_max;
-				tcp_callout_stop(tp, tp->tt_rexmt);
-				tp->t_rtttime = 0;
-				old_snd_nxt = tp->snd_nxt;
-				tp->snd_nxt = th->th_ack;
-				tp->snd_cwnd = tp->t_maxseg;
-				tcp_output(tp);
-				++tcpstat.tcps_sndfastrexmit;
-				tp->snd_cwnd = tp->snd_ssthresh;
-				tp->rexmt_high = tp->snd_nxt;
-				tp->t_flags &= ~TF_SACKRESCUED;
-				if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
-					tp->snd_nxt = old_snd_nxt;
-				KASSERT(tp->snd_limited <= 2,
-				    ("tp->snd_limited too big"));
-				if (TCP_DO_SACK(tp))
-					tcp_sack_rexmt(tp, th);
-				else
-					tp->snd_cwnd += tp->t_maxseg *
-					    (tp->t_dupacks - tp->snd_limited);
-			} else if (tcp_do_limitedtransmit) {
-				u_long oldcwnd = tp->snd_cwnd;
-				tcp_seq oldsndmax = tp->snd_max;
-				tcp_seq oldsndnxt = tp->snd_nxt;
-				/* outstanding data */
-				uint32_t ownd = tp->snd_max - tp->snd_una;
-				u_int sent;
-
-#define iceildiv(n, d)		(((n)+(d)-1) / (d))
-
-				KASSERT(tp->t_dupacks == 1 ||
-				    tp->t_dupacks == 2,
-				    ("dupacks not 1 or 2"));
-				if (tp->t_dupacks == 1)
-					tp->snd_limited = 0;
-				tp->snd_nxt = tp->snd_max;
-				tp->snd_cwnd = ownd +
-				    (tp->t_dupacks - tp->snd_limited) *
-				    tp->t_maxseg;
-				tcp_output(tp);
-
-				if (SEQ_LT(oldsndnxt, oldsndmax)) {
-					KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
-					    ("snd_una moved in other threads"));
-					tp->snd_nxt = oldsndnxt;
-				}
-				tp->snd_cwnd = oldcwnd;
-				sent = tp->snd_max - oldsndmax;
-				if (sent > tp->t_maxseg) {
-					KASSERT((tp->t_dupacks == 2 &&
-					    tp->snd_limited == 0) ||
-					    (sent == tp->t_maxseg + 1 &&
-					    tp->t_flags & TF_SENTFIN),
-					    ("sent too much"));
-					KASSERT(sent <= tp->t_maxseg * 2,
-					    ("sent too many segments"));
-					tp->snd_limited = 2;
-					tcpstat.tcps_sndlimited += 2;
-				} else if (sent > 0) {
-					++tp->snd_limited;
-					++tcpstat.tcps_sndlimited;
-				} else if (tcp_do_early_retransmit &&
-				    (tcp_do_eifel_detect &&
-				    (tp->t_flags & TF_RCVD_TSTMP)) &&
-				    ownd < 4 * tp->t_maxseg &&
-				    tp->t_dupacks + 1 >=
-				    iceildiv(ownd, tp->t_maxseg) &&
-				    (!TCP_DO_SACK(tp) ||
-				    ownd <= tp->t_maxseg ||
-				    tcp_sack_has_sacked(&tp->scb,
-				    ownd - tp->t_maxseg))) {
-					++tcpstat.tcps_sndearlyrexmit;
-					tp->t_flags |= TF_EARLYREXMT;
-					goto fastretransmit;
-				}
 			}
-			goto drop;
+			if (tcp_fast_recovery(tp, th->th_ack, &to))
+				goto drop;
+			else
+				break;
 		}
 		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
 		    ("th_ack <= snd_una"));
@@ -2063,7 +2067,7 @@ process_ACK:
 		if (tcp_do_eifel_detect && acked > 0 &&
 		    (to.to_flags & TOF_TS) && (to.to_tsecr != 0) &&
-		    (tp->t_flags & TF_FIRSTACCACK)) {
+		    (tp->rxt_flags & TRXT_F_FIRSTACCACK)) {
 			/* Eifel detection applicable. */
 			if (to.to_tsecr < tp->t_rexmtTS) {
 				++tcpstat.tcps_eifeldetected;
@@ -2112,7 +2116,8 @@
 		goto step6;
 
 	/* Stop looking for an acceptable ACK since one was received.
 	 */
-	tp->t_flags &= ~(TF_FIRSTACCACK | TF_FASTREXMT | TF_EARLYREXMT);
+	tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
+	    TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
 
 	if (acked > so->so_snd.ssb_cc) {
 		tp->snd_wnd -= so->so_snd.ssb_cc;
@@ -2173,7 +2178,7 @@ process_ACK:
 			} else {
 				if (TCP_DO_SACK(tp)) {
 					tp->snd_max_rexmt = tp->snd_max;
-					tcp_sack_rexmt(tp, th);
+					tcp_sack_rexmt(tp);
 				} else {
 					tcp_newreno_partial_ack(tp, th, acked);
 				}
@@ -2392,7 +2397,7 @@ dodata:	/* XXX */
 		 * fast retransmit can work).
 		 */
 		if (th->th_seq == tp->rcv_nxt &&
-		    LIST_EMPTY(&tp->t_segq) &&
+		    TAILQ_EMPTY(&tp->t_segq) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			if (DELAY_ACK(tp)) {
 				tcp_callout_reset(tp, tp->tt_delack,
@@ -2414,7 +2419,7 @@ dodata:	/* XXX */
 			}
 			sorwakeup(so);
 		} else {
-			if (!(tp->t_flags & TF_DUPSEG)) {
+			if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
 				/* Initialize SACK report block. */
 				tp->reportblk.rblk_start = th->th_seq;
 				tp->reportblk.rblk_end = TCP_SACK_BLKEND(
@@ -2506,6 +2511,12 @@ dodata:	/* XXX */
 		tcp_trace(TA_INPUT, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 
+	/*
+	 * Delayed duplicate ACK processing
+	 */
+	if (delayed_dupack && tcp_fast_recovery(tp, th_dupack, &to))
+		needoutput = FALSE;
+
 	/*
 	 * Return any desired output.
 	 */
@@ -2612,7 +2623,8 @@ drop:
  * Parse TCP options and place in tcpopt.
  */
 static void
-tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn)
+tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn,
+    tcp_seq ack)
 {
 	int opt, optlen, i;
 
@@ -2694,6 +2706,10 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn)
 				break;
 			}
 		}
+		if ((to->to_flags & TOF_SACK) &&
+		    tcp_sack_ndsack_blocks(to->to_sackblocks,
+		    to->to_nsackblocks, ack))
+			to->to_flags |= TOF_DSACK;
 		break;
 #ifdef TCP_SIGNATURE
 	/*
@@ -2758,7 +2774,8 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt, tcp_seq ack)
 	tcpstat.tcps_rttupdated++;
 	tp->t_rttupdated++;
 
-	if ((tp->t_flags & TF_REBASERTO) && SEQ_GT(ack, tp->snd_max_prev)) {
+	if ((tp->rxt_flags & TRXT_F_REBASERTO) &&
+	    SEQ_GT(ack, tp->snd_max_prev)) {
 #ifdef DEBUG_EIFEL_RESPONSE
 		kprintf("srtt/rttvar, prev %d/%d, cur %d/%d, ",
 		    tp->t_srtt_prev, tp->t_rttvar_prev,
@@ -2767,7 +2784,7 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt, tcp_seq ack)
 		tcpstat.tcps_eifelresponse++;
 		rebaserto = 1;
 
-		tp->t_flags &= ~TF_REBASERTO;
+		tp->rxt_flags &= ~TRXT_F_REBASERTO;
 		tp->t_srtt = max(tp->t_srtt_prev, (rtt << TCP_RTT_SHIFT));
 		tp->t_rttvar = max(tp->t_rttvar_prev,
 		    (rtt << (TCP_RTTVAR_SHIFT - 1)));
@@ -3027,11 +3044,11 @@ tcp_mss(struct tcpcb *tp, int offer)
 		mss -= TCPOLEN_TSTAMP_APPA;
 
 #if	(MCLBYTES & (MCLBYTES - 1)) == 0
-	if (mss > MCLBYTES)
-		mss &= ~(MCLBYTES-1);
+	if (mss > MCLBYTES)
+		mss &= ~(MCLBYTES-1);
 #else
-	if (mss > MCLBYTES)
-		mss = mss / MCLBYTES * MCLBYTES;
+	if (mss > MCLBYTES)
+		mss = mss / MCLBYTES * MCLBYTES;
 #endif
 	/*
 	 * If there's a pipesize, change the socket buffer
@@ -3152,7 +3169,7 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th, int acked)
  * except when retransmitting snd_una.
  */
 static void
-tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
+tcp_sack_rexmt(struct tcpcb *tp)
 {
 	tcp_seq old_snd_nxt = tp->snd_nxt;
 	u_long ocwnd = tp->snd_cwnd;
@@ -3171,7 +3188,7 @@ tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
 		int error;
 
 		old_rexmt_high = tp->rexmt_high;
-		if (!tcp_sack_nextseg(tp, &nextrexmt, &seglen, &rescue)) {
+		if (!tcp_sack_nextseg(tp, &nextrexmt, &seglen, &rescue)) {
 			tp->rexmt_high = old_rexmt_high;
 			break;
 		}
@@ -3216,13 +3233,14 @@ tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
 		if (rescue) {
 			tcpstat.tcps_sackrescue++;
 			tp->rexmt_rescue = tp->snd_nxt;
-			tp->t_flags |= TF_SACKRESCUED;
+			tp->sack_flags |= TSACK_F_SACKRESCUED;
 			break;
 		}
 		if (SEQ_LT(nextrexmt, old_snd_max) &&
 		    SEQ_LT(tp->rexmt_high, tp->snd_nxt)) {
 			tp->rexmt_high = seq_min(tp->snd_nxt, old_snd_max);
-			if ((tp->t_flags & TF_SACKRESCUED) &&
+			if (tcp_aggressive_rescuesack &&
+			    (tp->sack_flags & TSACK_F_SACKRESCUED) &&
 			    SEQ_LT(tp->rexmt_rescue, tp->rexmt_high)) {
 				/* Drag RescueRxt along with HighRxt */
 				tp->rexmt_rescue = tp->rexmt_high;
@@ -3234,6 +3252,51 @@ tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
 	tp->snd_cwnd = ocwnd;
 }
 
+/*
+ * Return TRUE if some new segments are sent
+ */
+static boolean_t
+tcp_sack_limitedxmit(struct tcpcb *tp)
+{
+	tcp_seq oldsndnxt = tp->snd_nxt;
+	tcp_seq oldsndmax = tp->snd_max;
+	u_long ocwnd = tp->snd_cwnd;
+	uint32_t pipe, sent;
+	boolean_t ret = FALSE;
+	tcp_seq_diff_t cwnd_left;
+	tcp_seq next;
+
+	tp->rexmt_high = tp->snd_una - 1;
+	pipe = tcp_sack_compute_pipe(tp);
+	cwnd_left = (tcp_seq_diff_t)(ocwnd - pipe);
+	if (cwnd_left < (tcp_seq_diff_t)tp->t_maxseg)
+		return FALSE;
+
+	next = tp->snd_nxt = tp->snd_max;
+	tp->snd_cwnd = tp->snd_nxt - tp->snd_una +
+	    rounddown(cwnd_left, tp->t_maxseg);
+
+	tcp_output(tp);
+
+	sent = tp->snd_nxt - next;
+	if (sent > 0) {
+		tcpstat.tcps_sndlimited += howmany(sent, tp->t_maxseg);
+		ret = TRUE;
+	}
+
+	if (SEQ_LT(oldsndnxt, oldsndmax)) {
+		KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
+		    ("snd_una moved in other threads"));
+		tp->snd_nxt = oldsndnxt;
+	}
+	tp->snd_cwnd = ocwnd;
+
+	if (ret && TCP_DO_NCR(tp))
+		tcp_ncr_update_rxtthresh(tp);
+
+	return ret;
+}
+
 /*
  * Reset idle time and keep-alive timer, typically called when a valid
  * tcp packet is received but may also be called when FASTKEEP is set
@@ -3309,3 +3372,180 @@ tcp_established(struct tcpcb *tp)
 		tp->t_rxtcur = TCPTV_RTOBASE3;
 	}
 }
+
+/*
+ * Returns TRUE if the ACK should be dropped
+ */
+static boolean_t
+tcp_fast_recovery(struct tcpcb *tp, tcp_seq th_ack, const struct tcpopt *to)
+{
+	boolean_t fast_sack_rexmt = TRUE;
+
+	tcpstat.tcps_rcvdupack++;
+
+	/*
+	 * We have outstanding data (other than a window probe),
+	 * this is a completely duplicate ack (ie, window info
+	 * didn't change), the ack is the biggest we've seen and
+	 * we've seen exactly our rexmt threshold of them, so
+	 * assume a packet has been dropped and retransmit it.
+	 * Kludge snd_nxt & the congestion window so we send only
+	 * this one packet.
+	 */
+	if (IN_FASTRECOVERY(tp)) {
+		if (TCP_DO_SACK(tp)) {
+			/* No artificial cwnd inflation. */
+			tcp_sack_rexmt(tp);
+		} else {
+			/*
+			 * Dup acks mean that packets have left
+			 * the network (they're now cached at the
+			 * receiver) so bump cwnd by the amount in
+			 * the receiver to keep a constant cwnd
+			 * packets in the network.
+			 */
+			tp->snd_cwnd += tp->t_maxseg;
+			tcp_output(tp);
+		}
+		return TRUE;
+	} else if (SEQ_LT(th_ack, tp->snd_recover)) {
+		tp->t_dupacks = 0;
+		return FALSE;
+	} else if (tcp_ignore_redun_dsack && TCP_DO_SACK(tp) &&
+	    (to->to_flags & (TOF_DSACK | TOF_SACK_REDUNDANT)) ==
+	    (TOF_DSACK | TOF_SACK_REDUNDANT)) {
+		/*
+		 * If the ACK carries DSACK and other SACK blocks
+		 * carry information that we have already known,
+		 * don't count this ACK as duplicate ACK.  This
+		 * prevents spurious early retransmit and fast
+		 * retransmit.  This also meets the requirement of
+		 * RFC3042 that new segments should not be sent if
+		 * the SACK blocks do not contain new information
+		 * (XXX we actually loosen the requirement that only
+		 * DSACK is checked here).
+		 *
+		 * This kind of ACK is usually sent after spurious
+		 * retransmit.
+		 */
+		/* Do nothing; don't change t_dupacks */
+		return TRUE;
+	} else if (tp->t_dupacks == 0 && TCP_DO_NCR(tp)) {
+		tcp_ncr_update_rxtthresh(tp);
+	}
+
+	if (++tp->t_dupacks == tp->t_rxtthresh) {
+		tcp_seq old_snd_nxt;
+		u_int win;
+
+fastretransmit:
+		if (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) {
+			tcp_save_congestion_state(tp);
+			tp->rxt_flags |= TRXT_F_FASTREXMT;
+		}
+		/*
+		 * We know we're losing at the current window size,
+		 * so do congestion avoidance: set ssthresh to half
+		 * the current window and pull our congestion window
+		 * back to the new ssthresh.
+		 */
+		win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+		if (win < 2)
+			win = 2;
+		tp->snd_ssthresh = win * tp->t_maxseg;
+		ENTER_FASTRECOVERY(tp);
+		tp->snd_recover = tp->snd_max;
+		tcp_callout_stop(tp, tp->tt_rexmt);
+		tp->t_rtttime = 0;
+		old_snd_nxt = tp->snd_nxt;
+		tp->snd_nxt = th_ack;
+		tp->snd_cwnd = tp->t_maxseg;
+		tcp_output(tp);
+		++tcpstat.tcps_sndfastrexmit;
+		tp->snd_cwnd = tp->snd_ssthresh;
+		tp->rexmt_high = tp->snd_nxt;
+		tp->sack_flags &= ~TSACK_F_SACKRESCUED;
+		if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
+			tp->snd_nxt = old_snd_nxt;
+		KASSERT(tp->snd_limited <= 2, ("tp->snd_limited too big"));
+		if (TCP_DO_SACK(tp)) {
+			if (fast_sack_rexmt)
+				tcp_sack_rexmt(tp);
+		} else {
+			tp->snd_cwnd += tp->t_maxseg *
+			    (tp->t_dupacks - tp->snd_limited);
+		}
+	} else if ((tcp_do_rfc3517bis && TCP_DO_SACK(tp)) || TCP_DO_NCR(tp)) {
+		/*
+		 * RFC3517bis recommends reducing the byte threshold,
+		 * and entering fast retransmit if IsLost(snd_una).  However,
+		 * if we use IsLost(snd_una) based fast retransmit here,
+		 * segment reordering will cause spurious retransmit.  So
+		 * we defer the IsLost(snd_una) based fast retransmit until
+		 * the extended limited transmit can't send any segments and
+		 * early retransmit can't be done.
+		 */
+		if (tcp_rfc3517bis_rxt && tcp_do_rfc3517bis &&
+		    tcp_sack_islost(&tp->scb, tp->snd_una))
+			goto fastretransmit;
+
+		if (tcp_do_limitedtransmit || TCP_DO_NCR(tp)) {
+			if (!tcp_sack_limitedxmit(tp)) {
+				/* outstanding data */
+				uint32_t ownd = tp->snd_max - tp->snd_una;
+
+				if (need_early_retransmit(tp, ownd)) {
+					++tcpstat.tcps_sndearlyrexmit;
+					tp->rxt_flags |= TRXT_F_EARLYREXMT;
+					goto fastretransmit;
+				} else if (tcp_do_rfc3517bis &&
+				    tcp_sack_islost(&tp->scb, tp->snd_una)) {
+					fast_sack_rexmt = FALSE;
+					goto fastretransmit;
+				}
+			}
+		}
+	} else if (tcp_do_limitedtransmit) {
+		u_long oldcwnd = tp->snd_cwnd;
+		tcp_seq oldsndmax = tp->snd_max;
+		tcp_seq oldsndnxt = tp->snd_nxt;
+		/* outstanding data */
+		uint32_t ownd = tp->snd_max - tp->snd_una;
+		u_int sent;
+
+		KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2,
+		    ("dupacks not 1 or 2"));
+		if (tp->t_dupacks == 1)
+			tp->snd_limited = 0;
+		tp->snd_nxt = tp->snd_max;
+		tp->snd_cwnd = ownd +
+		    (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg;
+		tcp_output(tp);
+
+		if (SEQ_LT(oldsndnxt, oldsndmax)) {
+			KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
+			    ("snd_una moved in other threads"));
+			tp->snd_nxt = oldsndnxt;
+		}
+		tp->snd_cwnd = oldcwnd;
+		sent = tp->snd_max - oldsndmax;
+		if (sent > tp->t_maxseg) {
+			KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) ||
+			    (sent == tp->t_maxseg + 1 &&
+			     (tp->t_flags & TF_SENTFIN)),
+			    ("sent too much"));
+			KASSERT(sent <= tp->t_maxseg * 2,
+			    ("sent too many segments"));
+			tp->snd_limited = 2;
+			tcpstat.tcps_sndlimited += 2;
+		} else if (sent > 0) {
+			++tp->snd_limited;
+			++tcpstat.tcps_sndlimited;
+		} else if (need_early_retransmit(tp, ownd)) {
+			++tcpstat.tcps_sndearlyrexmit;
+			tp->rxt_flags |= TRXT_F_EARLYREXMT;
+			goto fastretransmit;
+		}
+	}
+	return TRUE;
+}
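The patch above replaces the old hard-coded early-retransmit window test (ownd < 4 * t_maxseg) with the need_early_retransmit() macro and scales the duplicate-ACK threshold with tcp_ncr_update_rxtthresh(). The stand-alone sketch below is not part of the kernel source; it is a hypothetical user-space illustration of those two arithmetic checks with made-up example numbers (the 1448-byte MSS and the segment counts are assumptions chosen only for the demonstration).

/* Sketch: the two threshold computations used by the patch, in isolation. */
#include <stdio.h>
#include <stdint.h>

#define iceildiv(n, d)	(((n) + (d) - 1) / (d))	/* ceiling division, as in the diff */

/* NCR-style threshold: half the outstanding segments, never below 3. */
static unsigned int
ncr_rxtthresh(uint32_t ownd, uint32_t maxseg)
{
	unsigned int thresh = (ownd / maxseg) >> 1;

	return (thresh > 3 ? thresh : 3);
}

int
main(void)
{
	uint32_t maxseg = 1448;		/* hypothetical MSS */
	uint32_t ownd = 3 * maxseg;	/* three segments outstanding */
	int dupacks = 2;

	/*
	 * Early retransmit eligibility: with a window too small to ever
	 * produce three duplicate ACKs, dupacks + 1 >= ceil(ownd / maxseg)
	 * is the trigger used by need_early_retransmit().
	 */
	if (dupacks + 1 >= iceildiv(ownd, maxseg))
		printf("early retransmit eligible (dupacks=%d)\n", dupacks);

	/* NCR: with 20 segments outstanding the dupack threshold becomes 10. */
	printf("NCR rxtthresh = %u\n", ncr_rxtthresh(20 * maxseg, maxseg));
	return (0);
}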