diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 2edd2ae951..d687b68750 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -67,7 +67,6 @@
  * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.38 2003/05/21 04:46:41 cjc Exp $
  */
 
-#include "opt_ipfw.h"		/* for ipfw_fwd */
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
@@ -168,11 +167,6 @@ int tcp_aggregate_acks = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, aggregate_acks, CTLFLAG_RW,
     &tcp_aggregate_acks, 0, "Aggregate built-up acks into one ack");
 
-int tcp_do_rfc3390 = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
-    &tcp_do_rfc3390, 0,
-    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
-
 static int tcp_do_eifel_detect = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, eifel, CTLFLAG_RW,
     &tcp_do_eifel_detect, 0, "Eifel detection algorithm (RFC 3522)");
@@ -182,6 +176,15 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc, CTLFLAG_RW,
     &tcp_do_abc, 0,
     "TCP Appropriate Byte Counting (RFC 3465)");
 
+/*
+ * The following value actually takes range [25ms, 250ms],
+ * given that most modern systems use 1ms ~ 10ms as the unit
+ * of timestamp option.
+ */
+static u_int tcp_paws_tolerance = 25;
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, paws_tolerance, CTLFLAG_RW,
+    &tcp_paws_tolerance, 0, "RFC1323 PAWS tolerance");
+
 /*
  * Define as tunable for easy testing with SACK on and off.
  * Warning: do not change setting in the middle of an existing active TCP flow,
@@ -195,6 +198,22 @@ int tcp_do_smartsack = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, smartsack, CTLFLAG_RW,
     &tcp_do_smartsack, 0, "Enable Smart SACK Algorithms");
 
+int tcp_do_rescuesack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack, CTLFLAG_RW,
+    &tcp_do_rescuesack, 0, "Rescue retransmission for SACK");
+
+int tcp_aggressive_rescuesack = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack_agg, CTLFLAG_RW,
+    &tcp_aggressive_rescuesack, 0, "Aggressive rescue retransmission for SACK");
+
+int tcp_do_rfc3517bis = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3517bis, CTLFLAG_RW,
+    &tcp_do_rfc3517bis, 0, "Enable RFC3517 update");
+
+int tcp_rfc3517bis_rxt = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3517bis_rxt, CTLFLAG_RW,
+    &tcp_rfc3517bis_rxt, 0, "Enable RFC3517 retransmit update");
+
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
     "TCP Segment Reassembly Queue");
@@ -226,23 +245,32 @@ int tcp_autorcvbuf_max = 2*1024*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
     &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
 
-int tcp_sosnd_agglim = 2;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosnd_agglim, CTLFLAG_RW,
-    &tcp_sosnd_agglim, 0, "TCP sosend mbuf aggregation limit");
+int tcp_sosend_agglim = 2;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosend_agglim, CTLFLAG_RW,
+    &tcp_sosend_agglim, 0, "TCP sosend mbuf aggregation limit");
 
-int tcp_sosnd_async = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosnd_async, CTLFLAG_RW,
-    &tcp_sosnd_async, 0, "TCP asynchronized pru_send");
+int tcp_sosend_async = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosend_async, CTLFLAG_RW,
+    &tcp_sosend_async, 0, "TCP asynchronized pru_send");
 
-static void	 tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
+static int tcp_ignore_redun_dsack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, ignore_redun_dsack, CTLFLAG_RW,
+    &tcp_ignore_redun_dsack, 0, "Ignore redundant DSACK");
DSACK"); + +static void tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t, + tcp_seq); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); -static void tcp_xmit_timer(struct tcpcb *, int); +static void tcp_xmit_timer(struct tcpcb *, int, tcp_seq); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *, int); -static void tcp_sack_rexmt(struct tcpcb *, struct tcphdr *); +static void tcp_sack_rexmt(struct tcpcb *); +static boolean_t tcp_sack_limitedxmit(struct tcpcb *); static int tcp_rmx_msl(const struct tcpcb *); +static void tcp_established(struct tcpcb *); +static boolean_t tcp_fast_recovery(struct tcpcb *, tcp_seq, + const struct tcpopt *); /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 @@ -274,6 +302,67 @@ do { \ (SEQ_LT(tp->snd_wl2, th->th_ack) || \ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)))) +#define iceildiv(n, d) (((n)+(d)-1) / (d)) +#define need_early_retransmit(tp, ownd) \ + (tcp_do_early_retransmit && \ + (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) && \ + ownd < ((tp->t_rxtthresh + 1) * tp->t_maxseg) && \ + tp->t_dupacks + 1 >= iceildiv(ownd, tp->t_maxseg) && \ + (!TCP_DO_SACK(tp) || ownd <= tp->t_maxseg || \ + tcp_sack_has_sacked(&tp->scb, ownd - tp->t_maxseg))) + +/* + * Returns TRUE, if this segment can be merged with the last + * pending segment in the reassemble queue and this segment + * does not overlap with the pending segment immediately + * preceeding the last pending segment. + */ +static __inline boolean_t +tcp_paws_canreasslast(const struct tcpcb *tp, const struct tcphdr *th, int tlen) +{ + const struct tseg_qent *last, *prev; + + last = TAILQ_LAST(&tp->t_segq, tsegqe_head); + if (last == NULL) + return FALSE; + + /* This segment comes immediately after the last pending segment */ + if (last->tqe_th->th_seq + last->tqe_len == th->th_seq) + return TRUE; + + if (th->th_seq + tlen != last->tqe_th->th_seq) + return FALSE; + /* This segment comes immediately before the last pending segment */ + + prev = TAILQ_PREV(last, tsegqe_head, tqe_q); + if (prev == NULL) { + /* + * No pending preceeding segment, we assume this segment + * could be reassembled. + */ + return TRUE; + } + + /* This segment does not overlap with the preceeding segment */ + if (SEQ_GEQ(th->th_seq, prev->tqe_th->th_seq + prev->tqe_len)) + return TRUE; + + return FALSE; +} + +static __inline void +tcp_ncr_update_rxtthresh(struct tcpcb *tp) +{ + int old_rxtthresh = tp->t_rxtthresh; + uint32_t ownd = tp->snd_max - tp->snd_una; + + tp->t_rxtthresh = max(3, ((ownd / tp->t_maxseg) >> 1)); + if (tp->t_rxtthresh != old_rxtthresh) { + tcp_sack_update_lostseq(&tp->scb, tp->snd_una, + tp->t_maxseg, tp->t_rxtthresh); + } +} + static int tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) { @@ -321,7 +410,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) /* * Find a segment which begins after this one does. 
*/ - LIST_FOREACH(q, &tp->t_segq, tqe_q) { + TAILQ_FOREACH(q, &tp->t_segq, tqe_q) { if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) break; p = q; @@ -338,13 +427,15 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) /* conversion to int (in i) handles seq wraparound */ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { /* overlaps preceding segment */ - tp->t_flags |= (TF_DUPSEG | TF_ENCLOSESEG); + tp->sack_flags |= + (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG); /* enclosing block starts w/ preceding segment */ tp->encloseblk.rblk_start = p->tqe_th->th_seq; if (i >= *tlenp) { /* preceding encloses incoming segment */ - tp->encloseblk.rblk_end = p->tqe_th->th_seq + - p->tqe_len; + tp->encloseblk.rblk_end = TCP_SACK_BLKEND( + p->tqe_th->th_seq + p->tqe_len, + p->tqe_th->th_flags); tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); @@ -362,8 +453,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) *tlenp -= i; th->th_seq += i; /* incoming segment end is enclosing block end */ - tp->encloseblk.rblk_end = th->th_seq + *tlenp + - ((th->th_flags & TH_FIN) != 0); + tp->encloseblk.rblk_end = TCP_SACK_BLKEND( + th->th_seq + *tlenp, th->th_flags); /* trim end of reported D-SACK block */ tp->reportblk.rblk_end = th->th_seq; } @@ -378,20 +469,22 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) while (q) { tcp_seq_diff_t i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; tcp_seq qend = q->tqe_th->th_seq + q->tqe_len; + tcp_seq qend_sack = TCP_SACK_BLKEND(qend, q->tqe_th->th_flags); struct tseg_qent *nq; if (i <= 0) break; - if (!(tp->t_flags & TF_DUPSEG)) { /* first time through */ - tp->t_flags |= (TF_DUPSEG | TF_ENCLOSESEG); + if (!(tp->sack_flags & TSACK_F_DUPSEG)) { + /* first time through */ + tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG); tp->encloseblk = tp->reportblk; /* report trailing duplicate D-SACK segment */ tp->reportblk.rblk_start = q->tqe_th->th_seq; } - if ((tp->t_flags & TF_ENCLOSESEG) && - SEQ_GT(qend, tp->encloseblk.rblk_end)) { + if ((tp->sack_flags & TSACK_F_ENCLOSESEG) && + SEQ_GT(qend_sack, tp->encloseblk.rblk_end)) { /* extend enclosing block if one exists */ - tp->encloseblk.rblk_end = qend; + tp->encloseblk.rblk_end = qend_sack; } if (i < q->tqe_len) { q->tqe_th->th_seq += i; @@ -400,8 +493,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) break; } - nq = LIST_NEXT(q, tqe_q); - LIST_REMOVE(q, tqe_q); + nq = TAILQ_NEXT(q, tqe_q); + TAILQ_REMOVE(&tp->t_segq, q, tqe_q); m_freem(q->tqe_m); kfree(q, M_TSEGQ); atomic_add_int(&tcp_reass_qsize, -1); @@ -416,25 +509,26 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) /* check if can coalesce with following segment */ if (q != NULL && (th->th_seq + *tlenp == q->tqe_th->th_seq)) { tcp_seq tend = te->tqe_th->th_seq + te->tqe_len; + tcp_seq tend_sack = TCP_SACK_BLKEND(tend, te->tqe_th->th_flags); te->tqe_len += q->tqe_len; if (q->tqe_th->th_flags & TH_FIN) te->tqe_th->th_flags |= TH_FIN; m_cat(te->tqe_m, q->tqe_m); - tp->encloseblk.rblk_end = tend; + tp->encloseblk.rblk_end = tend_sack; /* * When not reporting a duplicate segment, use * the larger enclosing block as the SACK block. 
*/ - if (!(tp->t_flags & TF_DUPSEG)) - tp->reportblk.rblk_end = tend; - LIST_REMOVE(q, tqe_q); + if (!(tp->sack_flags & TSACK_F_DUPSEG)) + tp->reportblk.rblk_end = tend_sack; + TAILQ_REMOVE(&tp->t_segq, q, tqe_q); kfree(q, M_TSEGQ); atomic_add_int(&tcp_reass_qsize, -1); } if (p == NULL) { - LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); + TAILQ_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { /* check if can coalesce with preceding segment */ if (p->tqe_th->th_seq + p->tqe_len == th->th_seq) { @@ -445,12 +539,12 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) * When not reporting a duplicate segment, use * the larger enclosing block as the SACK block. */ - if (!(tp->t_flags & TF_DUPSEG)) + if (!(tp->sack_flags & TSACK_F_DUPSEG)) tp->reportblk.rblk_start = p->tqe_th->th_seq; kfree(te, M_TSEGQ); atomic_add_int(&tcp_reass_qsize, -1); } else { - LIST_INSERT_AFTER(p, te, tqe_q); + TAILQ_INSERT_AFTER(&tp->t_segq, p, te, tqe_q); } } @@ -461,20 +555,20 @@ present: */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); - q = LIST_FIRST(&tp->t_segq); + q = TAILQ_FIRST(&tp->t_segq); if (q == NULL || q->tqe_th->th_seq != tp->rcv_nxt) return (0); tp->rcv_nxt += q->tqe_len; - if (!(tp->t_flags & TF_DUPSEG)) { + if (!(tp->sack_flags & TSACK_F_DUPSEG)) { /* no SACK block to report since ACK advanced */ tp->reportblk.rblk_start = tp->reportblk.rblk_end; } /* no enclosing block to report since ACK advanced */ - tp->t_flags &= ~TF_ENCLOSESEG; + tp->sack_flags &= ~TSACK_F_ENCLOSESEG; flags = q->tqe_th->th_flags & TH_FIN; - LIST_REMOVE(q, tqe_q); - KASSERT(LIST_EMPTY(&tp->t_segq) || - LIST_FIRST(&tp->t_segq)->tqe_th->th_seq != tp->rcv_nxt, + TAILQ_REMOVE(&tp->t_segq, q, tqe_q); + KASSERT(TAILQ_EMPTY(&tp->t_segq) || + TAILQ_FIRST(&tp->t_segq)->tqe_th->th_seq != tp->rcv_nxt, ("segment not coalesced")); if (so->so_state & SS_CANTRCVMORE) { m_freem(q->tqe_m); @@ -539,7 +633,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto) int thflags; struct socket *so = NULL; int todrop, acked; - boolean_t ourfinisacked, needoutput = FALSE; + boolean_t ourfinisacked, needoutput = FALSE, delayed_dupack = FALSE; + tcp_seq th_dupack = 0; /* XXX gcc warning */ u_long tiwin; int recvwin; struct tcpopt to; /* options in this segment */ @@ -844,12 +939,6 @@ findpcb: if (tp->t_state <= TCPS_CLOSED) goto drop; - /* Unscale the window into a 32-bit value. */ - if (!(thflags & TH_SYN)) - tiwin = th->th_win << tp->snd_scale; - else - tiwin = th->th_win; - so = inp->inp_socket; #ifdef TCPDEBUG @@ -933,14 +1022,7 @@ findpcb: tp->snd_up = tp->snd_una; tp->snd_max = tp->snd_nxt = tp->iss + 1; tp->last_ack_sent = tp->rcv_nxt; -/* - * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled - * until the _second_ ACK is received: - * rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window. - * rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale, - * move to ESTAB, set snd_wnd to tiwin. - */ - tp->snd_wnd = tiwin; /* unscaled */ + goto after_listen; } if (thflags & TH_RST) { @@ -1040,51 +1122,15 @@ findpcb: * for syncache, or perform t/tcp connection. */ if (so->so_qlen <= so->so_qlimit) { - tcp_dooptions(&to, optp, optlen, TRUE); - if (!syncache_add(&inc, &to, th, &so, m)) + tcp_dooptions(&to, optp, optlen, TRUE, th->th_ack); + if (!syncache_add(&inc, &to, th, so, m)) goto drop; /* * Entry added to syncache, mbuf used to * send SYN,ACK packet. */ - if (so == NULL) - return(IPPROTO_DONE); - - /* - * We must be in the correct protocol thread for - * this connection. 
-			 */
-			KKASSERT(so->so_port == &curthread->td_msgport);
-
-			inp = so->so_pcb;
-			tp = intotcpcb(inp);
-			tp->snd_wnd = tiwin;
-			tp->t_starttime = ticks;
-			tp->t_state = TCPS_ESTABLISHED;
-
-			/*
-			 * If there is a FIN, or if there is data and the
-			 * connection is local, then delay SYN,ACK(SYN) in
-			 * the hope of piggy-backing it on a response
-			 * segment.  Otherwise must send ACK now in case
-			 * the other side is slow starting.
-			 */
-			if (DELAY_ACK(tp) &&
-			    ((thflags & TH_FIN) ||
-			     (tlen != 0 &&
-			      ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
-			       (!isipv6 && in_localaddr(inp->inp_faddr)))))) {
-				tcp_callout_reset(tp, tp->tt_delack,
-				    tcp_delacktime, tcp_timer_delack);
-				tp->t_flags |= TF_NEEDSYN;
-			} else {
-				tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
-			}
-
-			tcpstat.tcps_connects++;
-			soisconnected(so);
-			goto trimthenstep6;
+			return(IPPROTO_DONE);
 		}
 		goto drop;
 	}
@@ -1099,6 +1145,12 @@ after_listen:
 	KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN state"));
 	KKASSERT(so->so_port == &curthread->td_msgport);
 
+	/* Unscale the window into a 32-bit value. */
+	if (!(thflags & TH_SYN))
+		tiwin = th->th_win << tp->snd_scale;
+	else
+		tiwin = th->th_win;
+
 	/*
 	 * This is the second part of the MSS DoS prevention code (after
 	 * minmss on the sending side) and it deals with too many too small
@@ -1122,19 +1174,26 @@ after_listen:
 	 * Process options.
 	 * XXX this is traditional behavior, may need to be cleaned up.
 	 */
-	tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) != 0);
+	tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) != 0, th->th_ack);
 	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
-		if (to.to_flags & TOF_SCALE) {
+		if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) {
 			tp->t_flags |= TF_RCVD_SCALE;
-			tp->requested_s_scale = to.to_requested_s_scale;
+			tp->snd_scale = to.to_requested_s_scale;
 		}
+
+		/*
+		 * Initial send window; will be updated upon next ACK
+		 */
+		tp->snd_wnd = th->th_win;
+
 		if (to.to_flags & TOF_TS) {
 			tp->t_flags |= TF_RCVD_TSTMP;
 			tp->ts_recent = to.to_tsval;
 			tp->ts_recent_age = ticks;
 		}
-		if (to.to_flags & TOF_MSS)
-			tcp_mss(tp, to.to_mss);
+		if (!(to.to_flags & TOF_MSS))
+			to.to_mss = 0;
+		tcp_mss(tp, to.to_mss);
 		/*
 		 * Only set the TF_SACK_PERMITTED per-connection flag
 		 * if we got a SACK_PERMITTED option from the other side
@@ -1200,19 +1259,22 @@
 			 */
 			if (tcp_do_eifel_detect &&
 			    (to.to_flags & TOF_TS) && to.to_tsecr &&
-			    (tp->t_flags & TF_FIRSTACCACK)) {
+			    (tp->rxt_flags & TRXT_F_FIRSTACCACK)) {
 				/* Eifel detection applicable. */
 				if (to.to_tsecr < tp->t_rexmtTS) {
 					tcp_revert_congestion_state(tp);
 					++tcpstat.tcps_eifeldetected;
+					if (tp->t_rxtshift != 1 ||
+					    ticks >= tp->t_badrxtwin)
+						++tcpstat.tcps_rttcantdetect;
 				}
 			} else if (tp->t_rxtshift == 1 &&
 				   ticks < tp->t_badrxtwin) {
 				tcp_revert_congestion_state(tp);
 				++tcpstat.tcps_rttdetected;
 			}
-			tp->t_flags &= ~(TF_FIRSTACCACK |
-			    TF_FASTREXMT | TF_EARLYREXMT);
+			tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
+			    TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
 			/*
 			 * Recalculate the retransmit timer / rtt.
 			 *
@@ -1222,11 +1284,13 @@
 			 */
 			if ((to.to_flags & TOF_TS) && to.to_tsecr) {
 				tcp_xmit_timer(tp,
-				    ticks - to.to_tsecr + 1);
+				    ticks - to.to_tsecr + 1,
+				    th->th_ack);
 			} else if (tp->t_rtttime &&
 				   SEQ_GT(th->th_ack, tp->t_rtseq)) {
 				tcp_xmit_timer(tp,
-				    ticks - tp->t_rtttime);
+				    ticks - tp->t_rtttime,
+				    th->th_ack);
 			}
 			tcp_xmit_bandwidth_limit(tp, th->th_ack);
 			acked = th->th_ack - tp->snd_una;
@@ -1276,7 +1340,7 @@
 		} else if (tiwin == tp->snd_wnd &&
 		    th->th_ack == tp->snd_una &&
-		    LIST_EMPTY(&tp->t_segq) &&
+		    TAILQ_EMPTY(&tp->t_segq) &&
 		    tlen <= ssb_space(&so->so_rcv)) {
 			u_long newsize = 0;	/* automatic sockbuf scaling */
 			/*
@@ -1472,7 +1536,6 @@
 		}
 		if (!(thflags & TH_SYN))
 			goto drop;
-		tp->snd_wnd = th->th_win;	/* initial send window */
 
 		tp->irs = th->th_seq;
 		tcp_rcvseqinit(tp);
@@ -1482,10 +1545,8 @@
 			soisconnected(so);
 			/* Do window scaling on this connection? */
 			if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
-			    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
-				tp->snd_scale = tp->requested_s_scale;
+			    (TF_RCVD_SCALE | TF_REQ_SCALE))
 				tp->rcv_scale = tp->request_r_scale;
-			}
 			tp->rcv_adv += tp->rcv_wnd;
 			tp->snd_una++;		/* SYN is acked */
 			tcp_callout_stop(tp, tp->tt_rexmt);
@@ -1511,10 +1572,7 @@
 				tp->t_flags &= ~TF_NEEDFIN;
 				thflags &= ~TH_SYN;
 			} else {
-				tp->t_state = TCPS_ESTABLISHED;
-				tcp_callout_reset(tp, tp->tt_keep,
-				    tcp_getkeepidle(tp),
-				    tcp_timer_keep);
+				tcp_established(tp);
 			}
 		} else {
 			/*
@@ -1529,7 +1587,6 @@
 			tp->t_state = TCPS_SYN_RECEIVED;
 		}
 
-trimthenstep6:
 	/*
 	 * Advance th->th_seq to correspond to first data byte.
 	 * If data, trim to stay within window,
@@ -1662,7 +1719,6 @@
 	 */
 	if ((to.to_flags & TOF_TS) && tp->ts_recent != 0 &&
 	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
-
 		/* Check to see if ts_recent is over 24 days old.  */
 		if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
 			/*
@@ -1677,6 +1733,39 @@
 			 * dropped when ts_recent is old.
 			 */
 			tp->ts_recent = 0;
+		} else if (tcp_paws_tolerance && tlen != 0 &&
+		    tp->t_state == TCPS_ESTABLISHED &&
+		    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+		    !(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)) &&
+		    th->th_ack == tp->snd_una &&
+		    tiwin == tp->snd_wnd &&
+		    TSTMP_GEQ(to.to_tsval + tcp_paws_tolerance, tp->ts_recent) &&
+		    (th->th_seq == tp->rcv_nxt ||
+		     (SEQ_GT(th->th_seq, tp->rcv_nxt) &&
+		      tcp_paws_canreasslast(tp, th, tlen)))) {
+			/*
+			 * This tends to prevent valid new segments from
+			 * being dropped when segments are reordered by the
+			 * fast retransmission algorithm on the sending side,
+			 * i.e. the fast retransmitted segment w/ larger
+			 * timestamp arrives earlier than the previously
+			 * sent new segments w/ smaller timestamp.
+			 *
+			 * If the following conditions are met, the segment
+			 * is accepted:
+			 * - The segment contains data
+			 * - The connection is established
+			 * - The header does not contain important flags
+			 * - SYN or FIN is not needed
+			 * - It does not acknowledge new data
+			 * - Receive window is not changed
+			 * - The timestamp is within "acceptable" range
+			 * - The new segment is what we are expecting or
+			 *   the new segment could be merged w/ the last
+			 *   pending segment on the reassembly queue
+			 */
+			tcpstat.tcps_pawsaccept++;
 		} else {
 			tcpstat.tcps_rcvduppack++;
 			tcpstat.tcps_rcvdupbyte += tlen;
 			tcpstat.tcps_pawsdrop++;
@@ -1704,12 +1793,12 @@
 		if (TCP_DO_SACK(tp)) {
 			/* Report duplicate segment at head of packet.
*/ tp->reportblk.rblk_start = th->th_seq; - tp->reportblk.rblk_end = th->th_seq + tlen; - if (thflags & TH_FIN) - ++tp->reportblk.rblk_end; + tp->reportblk.rblk_end = TCP_SACK_BLKEND( + th->th_seq + tlen, thflags); if (SEQ_GT(tp->reportblk.rblk_end, tp->rcv_nxt)) tp->reportblk.rblk_end = tp->rcv_nxt; - tp->t_flags |= (TF_DUPSEG | TF_SACKLEFT | TF_ACKNOW); + tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_SACKLEFT); + tp->t_flags |= TF_ACKNOW; } if (thflags & TH_SYN) { thflags &= ~TH_SYN; @@ -1871,10 +1960,8 @@ trimthenstep6: soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == - (TF_RCVD_SCALE | TF_REQ_SCALE)) { - tp->snd_scale = tp->requested_s_scale; + (TF_RCVD_SCALE | TF_REQ_SCALE)) tp->rcv_scale = tp->request_r_scale; - } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED @@ -1885,10 +1972,7 @@ trimthenstep6: tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { - tp->t_state = TCPS_ESTABLISHED; - tcp_callout_reset(tp, tp->tt_keep, - tcp_getkeepidle(tp), - tcp_timer_keep); + tcp_established(tp); } /* * If segment contains data or ACK, will call tcp_reass() @@ -1917,153 +2001,30 @@ trimthenstep6: if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (TCP_DO_SACK(tp)) tcp_sack_update_scoreboard(tp, &to); - if (tlen != 0 || tiwin != tp->snd_wnd) { - tp->t_dupacks = 0; - break; - } - tcpstat.tcps_rcvdupack++; if (!tcp_callout_active(tp, tp->tt_rexmt) || th->th_ack != tp->snd_una) { + if (tlen == 0 && tiwin == tp->snd_wnd) + tcpstat.tcps_rcvdupack++; tp->t_dupacks = 0; break; } - /* - * We have outstanding data (other than - * a window probe), this is a completely - * duplicate ack (ie, window info didn't - * change), the ack is the biggest we've - * seen and we've seen exactly our rexmt - * threshhold of them, so assume a packet - * has been dropped and retransmit it. - * Kludge snd_nxt & the congestion - * window so we send only this one - * packet. - */ - if (IN_FASTRECOVERY(tp)) { - if (TCP_DO_SACK(tp)) { - /* No artifical cwnd inflation. */ - tcp_sack_rexmt(tp, th); + if (tlen != 0 || tiwin != tp->snd_wnd) { + if (!tcp_do_rfc3517bis || + !TCP_DO_SACK(tp) || + (to.to_flags & + (TOF_SACK | TOF_SACK_REDUNDANT)) + != TOF_SACK) { + tp->t_dupacks = 0; } else { - /* - * Dup acks mean that packets - * have left the network - * (they're now cached at the - * receiver) so bump cwnd by - * the amount in the receiver - * to keep a constant cwnd - * packets in the network. - */ - tp->snd_cwnd += tp->t_maxseg; - tcp_output(tp); + delayed_dupack = TRUE; + th_dupack = th->th_ack; } - } else if (SEQ_LT(th->th_ack, tp->snd_recover)) { - tp->t_dupacks = 0; break; - } else if (++tp->t_dupacks == tcprexmtthresh) { - tcp_seq old_snd_nxt; - u_int win; - -fastretransmit: - if (tcp_do_eifel_detect && - (tp->t_flags & TF_RCVD_TSTMP)) { - tcp_save_congestion_state(tp); - tp->t_flags |= TF_FASTREXMT; - } - /* - * We know we're losing at the current - * window size, so do congestion avoidance: - * set ssthresh to half the current window - * and pull our congestion window back to the - * new ssthresh. 
- */ - win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / - tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_ssthresh = win * tp->t_maxseg; - ENTER_FASTRECOVERY(tp); - tp->snd_recover = tp->snd_max; - tcp_callout_stop(tp, tp->tt_rexmt); - tp->t_rtttime = 0; - old_snd_nxt = tp->snd_nxt; - tp->snd_nxt = th->th_ack; - tp->snd_cwnd = tp->t_maxseg; - tcp_output(tp); - ++tcpstat.tcps_sndfastrexmit; - tp->snd_cwnd = tp->snd_ssthresh; - tp->rexmt_high = tp->snd_nxt; - if (SEQ_GT(old_snd_nxt, tp->snd_nxt)) - tp->snd_nxt = old_snd_nxt; - KASSERT(tp->snd_limited <= 2, - ("tp->snd_limited too big")); - if (TCP_DO_SACK(tp)) - tcp_sack_rexmt(tp, th); - else - tp->snd_cwnd += tp->t_maxseg * - (tp->t_dupacks - tp->snd_limited); - } else if (tcp_do_limitedtransmit) { - u_long oldcwnd = tp->snd_cwnd; - tcp_seq oldsndmax = tp->snd_max; - tcp_seq oldsndnxt = tp->snd_nxt; - /* outstanding data */ - uint32_t ownd = tp->snd_max - tp->snd_una; - u_int sent; - -#define iceildiv(n, d) (((n)+(d)-1) / (d)) - - KASSERT(tp->t_dupacks == 1 || - tp->t_dupacks == 2, - ("dupacks not 1 or 2")); - if (tp->t_dupacks == 1) - tp->snd_limited = 0; - tp->snd_nxt = tp->snd_max; - tp->snd_cwnd = ownd + - (tp->t_dupacks - tp->snd_limited) * - tp->t_maxseg; - tcp_output(tp); - - /* - * Other acks may have been processed, - * snd_nxt cannot be reset to a value less - * then snd_una. - */ - if (SEQ_LT(oldsndnxt, oldsndmax)) { - if (SEQ_GT(oldsndnxt, tp->snd_una)) - tp->snd_nxt = oldsndnxt; - else - tp->snd_nxt = tp->snd_una; - } - tp->snd_cwnd = oldcwnd; - sent = tp->snd_max - oldsndmax; - if (sent > tp->t_maxseg) { - KASSERT((tp->t_dupacks == 2 && - tp->snd_limited == 0) || - (sent == tp->t_maxseg + 1 && - tp->t_flags & TF_SENTFIN), - ("sent too much")); - KASSERT(sent <= tp->t_maxseg * 2, - ("sent too many segments")); - tp->snd_limited = 2; - tcpstat.tcps_sndlimited += 2; - } else if (sent > 0) { - ++tp->snd_limited; - ++tcpstat.tcps_sndlimited; - } else if (tcp_do_early_retransmit && - (tcp_do_eifel_detect && - (tp->t_flags & TF_RCVD_TSTMP)) && - ownd < 4 * tp->t_maxseg && - tp->t_dupacks + 1 >= - iceildiv(ownd, tp->t_maxseg) && - (!TCP_DO_SACK(tp) || - ownd <= tp->t_maxseg || - tcp_sack_has_sacked(&tp->scb, - ownd - tp->t_maxseg))) { - ++tcpstat.tcps_sndearlyrexmit; - tp->t_flags |= TF_EARLYREXMT; - goto fastretransmit; - } } - goto drop; + if (tcp_fast_recovery(tp, th->th_ack, &to)) + goto drop; + else + break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una")); @@ -2095,10 +2056,8 @@ fastretransmit: tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == - (TF_RCVD_SCALE | TF_REQ_SCALE)) { - tp->snd_scale = tp->requested_s_scale; + (TF_RCVD_SCALE | TF_REQ_SCALE)) tp->rcv_scale = tp->request_r_scale; - } } process_ACK: @@ -2108,12 +2067,12 @@ process_ACK: if (tcp_do_eifel_detect && acked > 0 && (to.to_flags & TOF_TS) && (to.to_tsecr != 0) && - (tp->t_flags & TF_FIRSTACCACK)) { + (tp->rxt_flags & TRXT_F_FIRSTACCACK)) { /* Eifel detection applicable. */ if (to.to_tsecr < tp->t_rexmtTS) { ++tcpstat.tcps_eifeldetected; tcp_revert_congestion_state(tp); - if (tp->t_rxtshift == 1 && + if (tp->t_rxtshift != 1 || ticks >= tp->t_badrxtwin) ++tcpstat.tcps_rttcantdetect; } @@ -2144,9 +2103,9 @@ process_ACK: * timestamps of 0. 
	 */
	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0))
-		tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
+		tcp_xmit_timer(tp, ticks - to.to_tsecr + 1, th->th_ack);
	else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
-		tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+		tcp_xmit_timer(tp, ticks - tp->t_rtttime, th->th_ack);
	tcp_xmit_bandwidth_limit(tp, th->th_ack);
 
	/*
@@ -2157,7 +2116,8 @@
		goto step6;
 
	/* Stop looking for an acceptable ACK since one was received. */
-	tp->t_flags &= ~(TF_FIRSTACCACK | TF_FASTREXMT | TF_EARLYREXMT);
+	tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
+	    TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
 
	if (acked > so->so_snd.ssb_cc) {
		tp->snd_wnd -= so->so_snd.ssb_cc;
@@ -2172,13 +2132,8 @@
	/*
	 * Update window information.
-	 * Don't look at window if no ACK:
-	 * TAC's send garbage on first SYN.
	 */
-	if (SEQ_LT(tp->snd_wl1, th->th_seq) ||
-	    (tp->snd_wl1 == th->th_seq &&
-	     (SEQ_LT(tp->snd_wl2, th->th_ack) ||
-	      (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)))) {
+	if (acceptable_window_update(tp, th, tiwin)) {
		/* keep track of pure window updates */
		if (tlen == 0 && tp->snd_wl2 == th->th_ack &&
		    tiwin > tp->snd_wnd)
@@ -2223,7 +2178,7 @@
	} else {
		if (TCP_DO_SACK(tp)) {
			tp->snd_max_rexmt = tp->snd_max;
-			tcp_sack_rexmt(tp, th);
+			tcp_sack_rexmt(tp);
		} else {
			tcp_newreno_partial_ack(tp, th, acked);
		}
@@ -2442,7 +2397,7 @@ dodata:	/* XXX */
	 * fast retransmit can work).
	 */
	if (th->th_seq == tp->rcv_nxt &&
-	    LIST_EMPTY(&tp->t_segq) &&
+	    TAILQ_EMPTY(&tp->t_segq) &&
	    TCPS_HAVEESTABLISHED(tp->t_state)) {
		if (DELAY_ACK(tp)) {
			tcp_callout_reset(tp, tp->tt_delack,
@@ -2464,11 +2419,11 @@ dodata:	/* XXX */
		}
		sorwakeup(so);
	} else {
-		if (!(tp->t_flags & TF_DUPSEG)) {
+		if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
			/* Initialize SACK report block. */
			tp->reportblk.rblk_start = th->th_seq;
-			tp->reportblk.rblk_end = th->th_seq + tlen +
-			    ((thflags & TH_FIN) != 0);
+			tp->reportblk.rblk_end = TCP_SACK_BLKEND(
+			    th->th_seq + tlen, thflags);
		}
		thflags = tcp_reass(tp, th, &tlen, m);
		tp->t_flags |= TF_ACKNOW;
@@ -2556,11 +2511,18 @@ dodata:	/* XXX */
		tcp_trace(TA_INPUT, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 
+	/*
+	 * Delayed duplicate ACK processing
+	 */
+	if (delayed_dupack && tcp_fast_recovery(tp, th_dupack, &to))
+		needoutput = FALSE;
+
	/*
	 * Return any desired output.
	 */
	if (needoutput || (tp->t_flags & TF_ACKNOW))
		tcp_output(tp);
+	tcp_sack_report_cleanup(tp);
	return(IPPROTO_DONE);
 
 dropafterack:
@@ -2592,6 +2554,7 @@ dropafterack:
	m_freem(m);
	tp->t_flags |= TF_ACKNOW;
	tcp_output(tp);
+	tcp_sack_report_cleanup(tp);
	return(IPPROTO_DONE);
 
 dropwithreset:
@@ -2638,6 +2601,8 @@ dropwithreset:
		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq + tlen,
			    (tcp_seq)0, TH_RST | TH_ACK);
	}
+	if (tp != NULL)
+		tcp_sack_report_cleanup(tp);
	return(IPPROTO_DONE);
 
 drop:
@@ -2649,6 +2614,8 @@ drop:
		tcp_trace(TA_DROP, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
 #endif
	m_freem(m);
+	if (tp != NULL)
+		tcp_sack_report_cleanup(tp);
	return(IPPROTO_DONE);
 }
 
@@ -2656,7 +2623,8 @@ drop:
 /*
  * Parse TCP options and place in tcpopt.
  */
 static void
-tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn)
+tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn,
+    tcp_seq ack)
 {
	int opt, optlen, i;
 
@@ -2725,7 +2693,23 @@
				r->rblk_start = ntohl(r->rblk_start);
				r->rblk_end = ntohl(r->rblk_end);
+
+				if (SEQ_LEQ(r->rblk_end, r->rblk_start)) {
+					/*
+					 * Invalid SACK block; discard all
+					 * SACK blocks
+					 */
+					tcpstat.tcps_rcvbadsackopt++;
+					to->to_nsackblocks = 0;
+					to->to_sackblocks = NULL;
+					to->to_flags &= ~TOF_SACK;
+					break;
+				}
			}
+			if ((to->to_flags & TOF_SACK) &&
+			    tcp_sack_ndsack_blocks(to->to_sackblocks,
+			    to->to_nsackblocks, ack))
+				to->to_flags |= TOF_DSACK;
			break;
 #ifdef TCP_SIGNATURE
		/*
@@ -2784,13 +2768,35 @@
  * and update averages and current timeout.
  */
 static void
-tcp_xmit_timer(struct tcpcb *tp, int rtt)
+tcp_xmit_timer(struct tcpcb *tp, int rtt, tcp_seq ack)
 {
-	int delta;
+	int rebaserto = 0;
 
	tcpstat.tcps_rttupdated++;
	tp->t_rttupdated++;
-	if (tp->t_srtt != 0) {
+	if ((tp->rxt_flags & TRXT_F_REBASERTO) &&
+	    SEQ_GT(ack, tp->snd_max_prev)) {
+#ifdef DEBUG_EIFEL_RESPONSE
+		kprintf("srtt/rttvar, prev %d/%d, cur %d/%d, ",
+		    tp->t_srtt_prev, tp->t_rttvar_prev,
+		    tp->t_srtt, tp->t_rttvar);
+#endif
+
+		tcpstat.tcps_eifelresponse++;
+		rebaserto = 1;
+		tp->rxt_flags &= ~TRXT_F_REBASERTO;
+		tp->t_srtt = max(tp->t_srtt_prev, (rtt << TCP_RTT_SHIFT));
+		tp->t_rttvar = max(tp->t_rttvar_prev,
+		    (rtt << (TCP_RTTVAR_SHIFT - 1)));
+		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
+
+#ifdef DEBUG_EIFEL_RESPONSE
+		kprintf("new %d/%d ", tp->t_srtt, tp->t_rttvar);
+#endif
+	} else if (tp->t_srtt != 0) {
+		int delta;
+
		/*
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 32).  The following magic
@@ -2834,6 +2840,13 @@
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;
 
+#ifdef DEBUG_EIFEL_RESPONSE
+	if (rebaserto) {
+		kprintf("| rxtcur prev %d, old %d, ",
+		    tp->t_rxtcur_prev, tp->t_rxtcur);
+	}
+#endif
+
	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
@@ -2848,6 +2861,30 @@
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
 
+	if (rebaserto) {
+		if (tp->t_rxtcur < tp->t_rxtcur_prev + tcp_eifel_rtoinc) {
+			/*
+			 * RFC4015 requires that the new RTO is at least
+			 * 2*G (tcp_eifel_rtoinc) greater than the RTO
+			 * (t_rxtcur_prev) when the spurious retransmit
+			 * timeout happens.
+			 *
+			 * The above condition could be true if the SRTT
+			 * and RTTVAR used to calculate t_rxtcur_prev
+			 * resulted in a value less than t_rttmin.  So
+			 * simply increasing SRTT by tcp_eifel_rtoinc when
+			 * preparing for the Eifel response in
+			 * tcp_save_congestion_state() could not ensure
+			 * that the new RTO will be tcp_eifel_rtoinc greater
+			 * than t_rxtcur_prev.
+ */ + tp->t_rxtcur = tp->t_rxtcur_prev + tcp_eifel_rtoinc; + } +#ifdef DEBUG_EIFEL_RESPONSE + kprintf("new %d\n", tp->t_rxtcur); +#endif + } + /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've @@ -3007,11 +3044,11 @@ tcp_mss(struct tcpcb *tp, int offer) mss -= TCPOLEN_TSTAMP_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 - if (mss > MCLBYTES) - mss &= ~(MCLBYTES-1); + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); #else - if (mss > MCLBYTES) - mss = mss / MCLBYTES * MCLBYTES; + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; #endif /* * If there's a pipesize, change the socket buffer @@ -3050,13 +3087,11 @@ tcp_mss(struct tcpcb *tp, int offer) } /* - * Set the slow-start flight size depending on whether this - * is a local network or not. + * Set the slow-start flight size + * + * NOTE: t_maxseg must have been configured! */ - if (tcp_do_rfc3390) - tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); - else - tp->snd_cwnd = mss; + tp->snd_cwnd = tcp_initial_window(tp); if (rt->rt_rmx.rmx_ssthresh) { /* @@ -3134,51 +3169,134 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th, int acked) * except when retransmitting snd_una. */ static void -tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th) +tcp_sack_rexmt(struct tcpcb *tp) { - uint32_t pipe, seglen; - tcp_seq nextrexmt; - boolean_t lostdup; tcp_seq old_snd_nxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; + uint32_t pipe; int nseg = 0; /* consecutive new segments */ + int nseg_rexmt = 0; /* retransmitted segments */ #define MAXBURST 4 /* limit burst of new packets on partial ack */ tp->t_rtttime = 0; pipe = tcp_sack_compute_pipe(tp); while ((tcp_seq_diff_t)(ocwnd - pipe) >= (tcp_seq_diff_t)tp->t_maxseg && - (!tcp_do_smartsack || nseg < MAXBURST) && - tcp_sack_nextseg(tp, &nextrexmt, &seglen, &lostdup)) { - uint32_t sent; - tcp_seq old_snd_max; + (!tcp_do_smartsack || nseg < MAXBURST)) { + tcp_seq old_snd_max, old_rexmt_high, nextrexmt; + uint32_t sent, seglen; + boolean_t rescue; int error; + old_rexmt_high = tp->rexmt_high; + if (!tcp_sack_nextseg(tp, &nextrexmt, &seglen, &rescue)) { + tp->rexmt_high = old_rexmt_high; + break; + } + + /* + * If the next tranmission is a rescue retranmission, + * we check whether we have already sent some data + * (either new segments or retransmitted segments) + * into the the network or not. Since the idea of rescue + * retransmission is to sustain ACK clock, as long as + * some segments are in the network, ACK clock will be + * kept ticking. 
+ */ + if (rescue && (nseg_rexmt > 0 || nseg > 0)) { + tp->rexmt_high = old_rexmt_high; + break; + } + if (nextrexmt == tp->snd_max) ++nseg; + else + ++nseg_rexmt; tp->snd_nxt = nextrexmt; tp->snd_cwnd = nextrexmt - tp->snd_una + seglen; old_snd_max = tp->snd_max; if (nextrexmt == tp->snd_una) tcp_callout_stop(tp, tp->tt_rexmt); error = tcp_output(tp); - if (error != 0) + if (error != 0) { + tp->rexmt_high = old_rexmt_high; break; + } sent = tp->snd_nxt - nextrexmt; - if (sent <= 0) + if (sent <= 0) { + tp->rexmt_high = old_rexmt_high; break; - if (!lostdup) - pipe += sent; + } + pipe += sent; tcpstat.tcps_sndsackpack++; tcpstat.tcps_sndsackbyte += sent; + + if (rescue) { + tcpstat.tcps_sackrescue++; + tp->rexmt_rescue = tp->snd_nxt; + tp->sack_flags |= TSACK_F_SACKRESCUED; + break; + } if (SEQ_LT(nextrexmt, old_snd_max) && - SEQ_LT(tp->rexmt_high, tp->snd_nxt)) + SEQ_LT(tp->rexmt_high, tp->snd_nxt)) { tp->rexmt_high = seq_min(tp->snd_nxt, old_snd_max); + if (tcp_aggressive_rescuesack && + (tp->sack_flags & TSACK_F_SACKRESCUED) && + SEQ_LT(tp->rexmt_rescue, tp->rexmt_high)) { + /* Drag RescueRxt along with HighRxt */ + tp->rexmt_rescue = tp->rexmt_high; + } + } } if (SEQ_GT(old_snd_nxt, tp->snd_nxt)) tp->snd_nxt = old_snd_nxt; tp->snd_cwnd = ocwnd; } +/* + * Return TRUE, if some new segments are sent + */ +static boolean_t +tcp_sack_limitedxmit(struct tcpcb *tp) +{ + tcp_seq oldsndnxt = tp->snd_nxt; + tcp_seq oldsndmax = tp->snd_max; + u_long ocwnd = tp->snd_cwnd; + uint32_t pipe, sent; + boolean_t ret = FALSE; + tcp_seq_diff_t cwnd_left; + tcp_seq next; + + tp->rexmt_high = tp->snd_una - 1; + pipe = tcp_sack_compute_pipe(tp); + cwnd_left = (tcp_seq_diff_t)(ocwnd - pipe); + if (cwnd_left < (tcp_seq_diff_t)tp->t_maxseg) + return FALSE; + + next = tp->snd_nxt = tp->snd_max; + tp->snd_cwnd = tp->snd_nxt - tp->snd_una + + rounddown(cwnd_left, tp->t_maxseg); + + tcp_output(tp); + + sent = tp->snd_nxt - next; + if (sent > 0) { + tcpstat.tcps_sndlimited += howmany(sent, tp->t_maxseg); + ret = TRUE; + } + + if (SEQ_LT(oldsndnxt, oldsndmax)) { + KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una), + ("snd_una moved in other threads")); + tp->snd_nxt = oldsndnxt; + } + tp->snd_cwnd = ocwnd; + + if (ret && TCP_DO_NCR(tp)) + tcp_ncr_update_rxtthresh(tp); + + return ret; +} + /* * Reset idle time and keep-alive timer, typically called when a valid * tcp packet is received but may also be called when FASTKEEP is set @@ -3204,7 +3322,7 @@ tcp_timer_keep_activity(struct tcpcb *tp, int thflags) tp->t_rcvtime = ticks; tp->t_flags &= ~TF_KEEPALIVE; tcp_callout_reset(tp, tp->tt_keep, - tcp_getkeepidle(tp), + tp->t_keepidle, tcp_timer_keep); } } @@ -3235,3 +3353,199 @@ tcp_rmx_msl(const struct tcpcb *tp) return msl; } + +static void +tcp_established(struct tcpcb *tp) +{ + tp->t_state = TCPS_ESTABLISHED; + tcp_callout_reset(tp, tp->tt_keep, tp->t_keepidle, tcp_timer_keep); + + if (tp->t_rxtsyn > 0) { + /* + * RFC6298: + * "If the timer expires awaiting the ACK of a SYN segment + * and the TCP implementation is using an RTO less than 3 + * seconds, the RTO MUST be re-initialized to 3 seconds + * when data transmission begins" + */ + if (tp->t_rxtcur < TCPTV_RTOBASE3) + tp->t_rxtcur = TCPTV_RTOBASE3; + } +} + +/* + * Returns TRUE, if the ACK should be dropped + */ +static boolean_t +tcp_fast_recovery(struct tcpcb *tp, tcp_seq th_ack, const struct tcpopt *to) +{ + boolean_t fast_sack_rexmt = TRUE; + + tcpstat.tcps_rcvdupack++; + + /* + * We have outstanding data (other than a window probe), + * this is a completely duplicate 
+	 * didn't change), the ack is the biggest we've seen and
+	 * we've seen exactly our rexmt threshold of them, so
+	 * assume a packet has been dropped and retransmit it.
+	 * Kludge snd_nxt & the congestion window so we send only
+	 * this one packet.
+	 */
+	if (IN_FASTRECOVERY(tp)) {
+		if (TCP_DO_SACK(tp)) {
+			/* No artificial cwnd inflation. */
+			tcp_sack_rexmt(tp);
+		} else {
+			/*
+			 * Dup acks mean that packets have left
+			 * the network (they're now cached at the
+			 * receiver) so bump cwnd by the amount in
+			 * the receiver to keep a constant cwnd
+			 * packets in the network.
+			 */
+			tp->snd_cwnd += tp->t_maxseg;
+			tcp_output(tp);
+		}
+		return TRUE;
+	} else if (SEQ_LT(th_ack, tp->snd_recover)) {
+		tp->t_dupacks = 0;
+		return FALSE;
+	} else if (tcp_ignore_redun_dsack && TCP_DO_SACK(tp) &&
+	    (to->to_flags & (TOF_DSACK | TOF_SACK_REDUNDANT)) ==
+	    (TOF_DSACK | TOF_SACK_REDUNDANT)) {
+		/*
+		 * If the ACK carries DSACK and other SACK blocks
+		 * carry information that we have already known,
+		 * don't count this ACK as duplicate ACK.  This
+		 * prevents spurious early retransmit and fast
+		 * retransmit.  This also meets the requirement of
+		 * RFC3042 that new segments should not be sent if
+		 * the SACK blocks do not contain new information
+		 * (XXX we actually loosen the requirement that only
+		 * DSACK is checked here).
+		 *
+		 * This kind of ACK is usually sent after spurious
+		 * retransmit.
+		 */
+		/* Do nothing; don't change t_dupacks */
+		return TRUE;
+	} else if (tp->t_dupacks == 0 && TCP_DO_NCR(tp)) {
+		tcp_ncr_update_rxtthresh(tp);
+	}
+
+	if (++tp->t_dupacks == tp->t_rxtthresh) {
+		tcp_seq old_snd_nxt;
+		u_int win;
+
+fastretransmit:
+		if (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) {
+			tcp_save_congestion_state(tp);
+			tp->rxt_flags |= TRXT_F_FASTREXMT;
+		}
+		/*
+		 * We know we're losing at the current window size,
+		 * so do congestion avoidance: set ssthresh to half
+		 * the current window and pull our congestion window
+		 * back to the new ssthresh.
+		 */
+		win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+		if (win < 2)
+			win = 2;
+		tp->snd_ssthresh = win * tp->t_maxseg;
+		ENTER_FASTRECOVERY(tp);
+		tp->snd_recover = tp->snd_max;
+		tcp_callout_stop(tp, tp->tt_rexmt);
+		tp->t_rtttime = 0;
+		old_snd_nxt = tp->snd_nxt;
+		tp->snd_nxt = th_ack;
+		tp->snd_cwnd = tp->t_maxseg;
+		tcp_output(tp);
+		++tcpstat.tcps_sndfastrexmit;
+		tp->snd_cwnd = tp->snd_ssthresh;
+		tp->rexmt_high = tp->snd_nxt;
+		tp->sack_flags &= ~TSACK_F_SACKRESCUED;
+		if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
+			tp->snd_nxt = old_snd_nxt;
+		KASSERT(tp->snd_limited <= 2, ("tp->snd_limited too big"));
+		if (TCP_DO_SACK(tp)) {
+			if (fast_sack_rexmt)
+				tcp_sack_rexmt(tp);
+		} else {
+			tp->snd_cwnd += tp->t_maxseg *
+			    (tp->t_dupacks - tp->snd_limited);
+		}
+	} else if ((tcp_do_rfc3517bis && TCP_DO_SACK(tp)) || TCP_DO_NCR(tp)) {
+		/*
+		 * RFC3517bis recommends reducing the byte threshold,
+		 * and entering fast retransmit if IsLost(snd_una).  However,
+		 * if we use IsLost(snd_una) based fast retransmit here,
+		 * segment reordering will cause spurious retransmit.  So
+		 * we defer the IsLost(snd_una) based fast retransmit until
+		 * the extended limited transmit can't send any segments and
+		 * early retransmit can't be done.
+ */ + if (tcp_rfc3517bis_rxt && tcp_do_rfc3517bis && + tcp_sack_islost(&tp->scb, tp->snd_una)) + goto fastretransmit; + + if (tcp_do_limitedtransmit || TCP_DO_NCR(tp)) { + if (!tcp_sack_limitedxmit(tp)) { + /* outstanding data */ + uint32_t ownd = tp->snd_max - tp->snd_una; + + if (need_early_retransmit(tp, ownd)) { + ++tcpstat.tcps_sndearlyrexmit; + tp->rxt_flags |= TRXT_F_EARLYREXMT; + goto fastretransmit; + } else if (tcp_do_rfc3517bis && + tcp_sack_islost(&tp->scb, tp->snd_una)) { + fast_sack_rexmt = FALSE; + goto fastretransmit; + } + } + } + } else if (tcp_do_limitedtransmit) { + u_long oldcwnd = tp->snd_cwnd; + tcp_seq oldsndmax = tp->snd_max; + tcp_seq oldsndnxt = tp->snd_nxt; + /* outstanding data */ + uint32_t ownd = tp->snd_max - tp->snd_una; + u_int sent; + + KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, + ("dupacks not 1 or 2")); + if (tp->t_dupacks == 1) + tp->snd_limited = 0; + tp->snd_nxt = tp->snd_max; + tp->snd_cwnd = ownd + + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; + tcp_output(tp); + + if (SEQ_LT(oldsndnxt, oldsndmax)) { + KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una), + ("snd_una moved in other threads")); + tp->snd_nxt = oldsndnxt; + } + tp->snd_cwnd = oldcwnd; + sent = tp->snd_max - oldsndmax; + if (sent > tp->t_maxseg) { + KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || + (sent == tp->t_maxseg + 1 && + (tp->t_flags & TF_SENTFIN)), + ("sent too much")); + KASSERT(sent <= tp->t_maxseg * 2, + ("sent too many segments")); + tp->snd_limited = 2; + tcpstat.tcps_sndlimited += 2; + } else if (sent > 0) { + ++tp->snd_limited; + ++tcpstat.tcps_sndlimited; + } else if (need_early_retransmit(tp, ownd)) { + ++tcpstat.tcps_sndearlyrexmit; + tp->rxt_flags |= TRXT_F_EARLYREXMT; + goto fastretransmit; + } + } + return TRUE; +}