tcp/sack: Use RFC3517bis IsLost(snd_una) as fallback of early retransmit
[dragonfly.git] / sys / netinet / tcp_input.c
index c8e5192..d687b68 100644 (file)
@@ -176,6 +176,15 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc, CTLFLAG_RW,
     &tcp_do_abc, 0,
     "TCP Appropriate Byte Counting (RFC 3465)");
 
+/*
+ * The following value actually takes range [25ms, 250ms],
+ * given that most modern systems use 1ms ~ 10ms as the unit
+ * of timestamp option.
+ */
+static u_int tcp_paws_tolerance = 25;
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, paws_tolerance, CTLFLAG_RW,
+    &tcp_paws_tolerance, 0, "RFC1323 PAWS tolerance");
+
 /*
  * Define as tunable for easy testing with SACK on and off.
  * Warning:  do not change setting in the middle of an existing active TCP flow,
@@ -193,10 +202,18 @@ int tcp_do_rescuesack = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack, CTLFLAG_RW,
     &tcp_do_rescuesack, 0, "Rescue retransmission for SACK");
 
-int tcp_aggressive_rescuesack = 1;
+int tcp_aggressive_rescuesack = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack_agg, CTLFLAG_RW,
     &tcp_aggressive_rescuesack, 0, "Aggressive rescue retransmission for SACK");
 
+int tcp_do_rfc3517bis = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3517bis, CTLFLAG_RW,
+    &tcp_do_rfc3517bis, 0, "Enable RFC3517 update");
+
+int tcp_rfc3517bis_rxt = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3517bis_rxt, CTLFLAG_RW,
+    &tcp_rfc3517bis_rxt, 0, "Enable RFC3517 retransmit update");
+
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
     "TCP Segment Reassembly Queue");
 
@@ -236,16 +253,24 @@ int tcp_sosend_async = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosend_async, CTLFLAG_RW,
     &tcp_sosend_async, 0, "TCP asynchronized pru_send");
 
-static void     tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
+static int tcp_ignore_redun_dsack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, ignore_redun_dsack, CTLFLAG_RW,
+    &tcp_ignore_redun_dsack, 0, "Ignore redundant DSACK");
+
+static void     tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t,
+                   tcp_seq);
 static void     tcp_pulloutofband(struct socket *,
                     struct tcphdr *, struct mbuf *, int);
 static int      tcp_reass(struct tcpcb *, struct tcphdr *, int *,
                     struct mbuf *);
 static void     tcp_xmit_timer(struct tcpcb *, int, tcp_seq);
 static void     tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *, int);
-static void     tcp_sack_rexmt(struct tcpcb *, struct tcphdr *);
+static void     tcp_sack_rexmt(struct tcpcb *);
+static boolean_t tcp_sack_limitedxmit(struct tcpcb *);
 static int      tcp_rmx_msl(const struct tcpcb *);
 static void     tcp_established(struct tcpcb *);
+static boolean_t tcp_fast_recovery(struct tcpcb *, tcp_seq,
+                    const struct tcpopt *);
 
 /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
 #ifdef INET6
@@ -277,6 +302,67 @@ do { \
       (SEQ_LT(tp->snd_wl2, th->th_ack) ||                              \
        (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))
 
+#define        iceildiv(n, d)          (((n)+(d)-1) / (d))
+#define need_early_retransmit(tp, ownd) \
+    (tcp_do_early_retransmit && \
+     (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) && \
+     ownd < ((tp->t_rxtthresh + 1) * tp->t_maxseg) && \
+     tp->t_dupacks + 1 >= iceildiv(ownd, tp->t_maxseg) && \
+     (!TCP_DO_SACK(tp) || ownd <= tp->t_maxseg || \
+      tcp_sack_has_sacked(&tp->scb, ownd - tp->t_maxseg)))
+
+/*
+ * Returns TRUE, if this segment can be merged with the last
+ * pending segment in the reassemble queue and this segment
+ * does not overlap with the pending segment immediately
+ * preceeding the last pending segment.
+ */
+static __inline boolean_t
+tcp_paws_canreasslast(const struct tcpcb *tp, const struct tcphdr *th, int tlen)
+{
+       const struct tseg_qent *last, *prev;
+
+       last = TAILQ_LAST(&tp->t_segq, tsegqe_head);
+       if (last == NULL)
+               return FALSE;
+
+       /* This segment comes immediately after the last pending segment */
+       if (last->tqe_th->th_seq + last->tqe_len == th->th_seq)
+               return TRUE;
+
+       if (th->th_seq + tlen != last->tqe_th->th_seq)
+               return FALSE;
+       /* This segment comes immediately before the last pending segment */
+
+       prev = TAILQ_PREV(last, tsegqe_head, tqe_q);
+       if (prev == NULL) {
+               /*
+                * No pending preceeding segment, we assume this segment
+                * could be reassembled.
+                */
+               return TRUE;
+       }
+
+       /* This segment does not overlap with the preceeding segment */
+       if (SEQ_GEQ(th->th_seq, prev->tqe_th->th_seq + prev->tqe_len))
+               return TRUE;
+
+       return FALSE;
+}
+
+static __inline void
+tcp_ncr_update_rxtthresh(struct tcpcb *tp)
+{
+       int old_rxtthresh = tp->t_rxtthresh;
+       uint32_t ownd = tp->snd_max - tp->snd_una;
+
+       tp->t_rxtthresh = max(3, ((ownd / tp->t_maxseg) >> 1));
+       if (tp->t_rxtthresh != old_rxtthresh) {
+               tcp_sack_update_lostseq(&tp->scb, tp->snd_una,
+                   tp->t_maxseg, tp->t_rxtthresh);
+       }
+}
+
 static int
 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 {
@@ -324,7 +410,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
        /*
         * Find a segment which begins after this one does.
         */
-       LIST_FOREACH(q, &tp->t_segq, tqe_q) {
+       TAILQ_FOREACH(q, &tp->t_segq, tqe_q) {
                if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
                        break;
                p = q;
@@ -341,7 +427,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                /* conversion to int (in i) handles seq wraparound */
                i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
                if (i > 0) {            /* overlaps preceding segment */
-                       tp->t_flags |= (TF_DUPSEG | TF_ENCLOSESEG);
+                       tp->sack_flags |=
+                           (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG);
                        /* enclosing block starts w/ preceding segment */
                        tp->encloseblk.rblk_start = p->tqe_th->th_seq;
                        if (i >= *tlenp) {
@@ -387,13 +474,14 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 
                if (i <= 0)
                        break;
-               if (!(tp->t_flags & TF_DUPSEG)) {    /* first time through */
-                       tp->t_flags |= (TF_DUPSEG | TF_ENCLOSESEG);
+               if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
+                       /* first time through */
+                       tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG);
                        tp->encloseblk = tp->reportblk;
                        /* report trailing duplicate D-SACK segment */
                        tp->reportblk.rblk_start = q->tqe_th->th_seq;
                }
-               if ((tp->t_flags & TF_ENCLOSESEG) &&
+               if ((tp->sack_flags & TSACK_F_ENCLOSESEG) &&
                    SEQ_GT(qend_sack, tp->encloseblk.rblk_end)) {
                        /* extend enclosing block if one exists */
                        tp->encloseblk.rblk_end = qend_sack;
@@ -405,8 +493,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                        break;
                }
 
-               nq = LIST_NEXT(q, tqe_q);
-               LIST_REMOVE(q, tqe_q);
+               nq = TAILQ_NEXT(q, tqe_q);
+               TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
                m_freem(q->tqe_m);
                kfree(q, M_TSEGQ);
                atomic_add_int(&tcp_reass_qsize, -1);
@@ -432,15 +520,15 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                 * When not reporting a duplicate segment, use
                 * the larger enclosing block as the SACK block.
                 */
-               if (!(tp->t_flags & TF_DUPSEG))
+               if (!(tp->sack_flags & TSACK_F_DUPSEG))
                        tp->reportblk.rblk_end = tend_sack;
-               LIST_REMOVE(q, tqe_q);
+               TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
                kfree(q, M_TSEGQ);
                atomic_add_int(&tcp_reass_qsize, -1);
        }
 
        if (p == NULL) {
-               LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+               TAILQ_INSERT_HEAD(&tp->t_segq, te, tqe_q);
        } else {
                /* check if can coalesce with preceding segment */
                if (p->tqe_th->th_seq + p->tqe_len == th->th_seq) {
@@ -451,12 +539,12 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                         * When not reporting a duplicate segment, use
                         * the larger enclosing block as the SACK block.
                         */
-                       if (!(tp->t_flags & TF_DUPSEG))
+                       if (!(tp->sack_flags & TSACK_F_DUPSEG))
                                tp->reportblk.rblk_start = p->tqe_th->th_seq;
                        kfree(te, M_TSEGQ);
                        atomic_add_int(&tcp_reass_qsize, -1);
                } else {
-                       LIST_INSERT_AFTER(p, te, tqe_q);
+                       TAILQ_INSERT_AFTER(&tp->t_segq, p, te, tqe_q);
                }
        }
 
@@ -467,20 +555,20 @@ present:
         */
        if (!TCPS_HAVEESTABLISHED(tp->t_state))
                return (0);
-       q = LIST_FIRST(&tp->t_segq);
+       q = TAILQ_FIRST(&tp->t_segq);
        if (q == NULL || q->tqe_th->th_seq != tp->rcv_nxt)
                return (0);
        tp->rcv_nxt += q->tqe_len;
-       if (!(tp->t_flags & TF_DUPSEG)) {
+       if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
                /* no SACK block to report since ACK advanced */
                tp->reportblk.rblk_start = tp->reportblk.rblk_end;
        }
        /* no enclosing block to report since ACK advanced */
-       tp->t_flags &= ~TF_ENCLOSESEG;
+       tp->sack_flags &= ~TSACK_F_ENCLOSESEG;
        flags = q->tqe_th->th_flags & TH_FIN;
-       LIST_REMOVE(q, tqe_q);
-       KASSERT(LIST_EMPTY(&tp->t_segq) ||
-               LIST_FIRST(&tp->t_segq)->tqe_th->th_seq != tp->rcv_nxt,
+       TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
+       KASSERT(TAILQ_EMPTY(&tp->t_segq) ||
+               TAILQ_FIRST(&tp->t_segq)->tqe_th->th_seq != tp->rcv_nxt,
                ("segment not coalesced"));
        if (so->so_state & SS_CANTRCVMORE) {
                m_freem(q->tqe_m);
@@ -545,7 +633,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
        int thflags;
        struct socket *so = NULL;
        int todrop, acked;
-       boolean_t ourfinisacked, needoutput = FALSE;
+       boolean_t ourfinisacked, needoutput = FALSE, delayed_dupack = FALSE;
+       tcp_seq th_dupack = 0; /* XXX gcc warning */
        u_long tiwin;
        int recvwin;
        struct tcpopt to;               /* options in this segment */
@@ -1033,7 +1122,7 @@ findpcb:
                 * for syncache, or perform t/tcp connection.
                 */
                if (so->so_qlen <= so->so_qlimit) {
-                       tcp_dooptions(&to, optp, optlen, TRUE);
+                       tcp_dooptions(&to, optp, optlen, TRUE, th->th_ack);
                        if (!syncache_add(&inc, &to, th, so, m))
                                goto drop;
 
@@ -1085,7 +1174,7 @@ after_listen:
         * Process options.
         * XXX this is tradtitional behavior, may need to be cleaned up.
         */
-       tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) != 0);
+       tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) != 0, th->th_ack);
        if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
                if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) {
                        tp->t_flags |= TF_RCVD_SCALE;
@@ -1170,7 +1259,7 @@ after_listen:
                                 */
                                if (tcp_do_eifel_detect &&
                                    (to.to_flags & TOF_TS) && to.to_tsecr &&
-                                   (tp->t_flags & TF_FIRSTACCACK)) {
+                                   (tp->rxt_flags & TRXT_F_FIRSTACCACK)) {
                                        /* Eifel detection applicable. */
                                        if (to.to_tsecr < tp->t_rexmtTS) {
                                                tcp_revert_congestion_state(tp);
@@ -1184,8 +1273,8 @@ after_listen:
                                        tcp_revert_congestion_state(tp);
                                        ++tcpstat.tcps_rttdetected;
                                }
-                               tp->t_flags &= ~(TF_FIRSTACCACK |
-                                                TF_FASTREXMT | TF_EARLYREXMT);
+                               tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
+                                   TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
                                /*
                                 * Recalculate the retransmit timer / rtt.
                                 *
@@ -1251,7 +1340,7 @@ after_listen:
                        }
                } else if (tiwin == tp->snd_wnd &&
                    th->th_ack == tp->snd_una &&
-                   LIST_EMPTY(&tp->t_segq) &&
+                   TAILQ_EMPTY(&tp->t_segq) &&
                    tlen <= ssb_space(&so->so_rcv)) {
                        u_long newsize = 0;     /* automatic sockbuf scaling */
                        /*
@@ -1630,7 +1719,6 @@ after_listen:
         */
        if ((to.to_flags & TOF_TS) && tp->ts_recent != 0 &&
            TSTMP_LT(to.to_tsval, tp->ts_recent)) {
-
                /* Check to see if ts_recent is over 24 days old.  */
                if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
                        /*
@@ -1645,6 +1733,39 @@ after_listen:
                         * dropped when ts_recent is old.
                         */
                        tp->ts_recent = 0;
+               } else if (tcp_paws_tolerance && tlen != 0 &&
+                   tp->t_state == TCPS_ESTABLISHED &&
+                   (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK&&
+                   !(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)) &&
+                   th->th_ack == tp->snd_una &&
+                   tiwin == tp->snd_wnd &&
+                   TSTMP_GEQ(to.to_tsval + tcp_paws_tolerance, tp->ts_recent)&&
+                   (th->th_seq == tp->rcv_nxt ||
+                    (SEQ_GT(th->th_seq, tp->rcv_nxt) && 
+                     tcp_paws_canreasslast(tp, th, tlen)))) {
+                       /*
+                        * This tends to prevent valid new segments from being
+                        * dropped by the reordered segments sent by the fast
+                        * retransmission algorithm on the sending side, i.e.
+                        * the fast retransmitted segment w/ larger timestamp
+                        * arrives earlier than the previously sent new segments
+                        * w/ smaller timestamp.
+                        *
+                        * If following conditions are met, the segment is
+                        * accepted:
+                        * - The segment contains data
+                        * - The connection is established
+                        * - The header does not contain important flags
+                        * - SYN or FIN is not needed
+                        * - It does not acknowledge new data
+                        * - Receive window is not changed
+                        * - The timestamp is within "acceptable" range
+                        * - The new segment is what we are expecting or
+                        *   the new segment could be merged w/ the last
+                        *   pending segment on the reassemble queue
+                        */
+                       tcpstat.tcps_pawsaccept++;
+                       tcpstat.tcps_pawsdrop++;
                } else {
                        tcpstat.tcps_rcvduppack++;
                        tcpstat.tcps_rcvdupbyte += tlen;
@@ -1676,7 +1797,8 @@ after_listen:
                            th->th_seq + tlen, thflags);
                        if (SEQ_GT(tp->reportblk.rblk_end, tp->rcv_nxt))
                                tp->reportblk.rblk_end = tp->rcv_nxt;
-                       tp->t_flags |= (TF_DUPSEG | TF_SACKLEFT | TF_ACKNOW);
+                       tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_SACKLEFT);
+                       tp->t_flags |= TF_ACKNOW;
                }
                if (thflags & TH_SYN) {
                        thflags &= ~TH_SYN;
@@ -1879,148 +2001,30 @@ after_listen:
                if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
                        if (TCP_DO_SACK(tp))
                                tcp_sack_update_scoreboard(tp, &to);
-                       if (tlen != 0 || tiwin != tp->snd_wnd) {
-                               tp->t_dupacks = 0;
-                               break;
-                       }
-                       tcpstat.tcps_rcvdupack++;
                        if (!tcp_callout_active(tp, tp->tt_rexmt) ||
                            th->th_ack != tp->snd_una) {
+                               if (tlen == 0 && tiwin == tp->snd_wnd)
+                                       tcpstat.tcps_rcvdupack++;
                                tp->t_dupacks = 0;
                                break;
                        }
-                       /*
-                        * We have outstanding data (other than
-                        * a window probe), this is a completely
-                        * duplicate ack (ie, window info didn't
-                        * change), the ack is the biggest we've
-                        * seen and we've seen exactly our rexmt
-                        * threshhold of them, so assume a packet
-                        * has been dropped and retransmit it.
-                        * Kludge snd_nxt & the congestion
-                        * window so we send only this one
-                        * packet.
-                        */
-                       if (IN_FASTRECOVERY(tp)) {
-                               if (TCP_DO_SACK(tp)) {
-                                       /* No artifical cwnd inflation. */
-                                       tcp_sack_rexmt(tp, th);
+                       if (tlen != 0 || tiwin != tp->snd_wnd) {
+                               if (!tcp_do_rfc3517bis ||
+                                   !TCP_DO_SACK(tp) ||
+                                   (to.to_flags &
+                                    (TOF_SACK | TOF_SACK_REDUNDANT))
+                                    != TOF_SACK) {
+                                       tp->t_dupacks = 0;
                                } else {
-                                       /*
-                                        * Dup acks mean that packets
-                                        * have left the network
-                                        * (they're now cached at the
-                                        * receiver) so bump cwnd by
-                                        * the amount in the receiver
-                                        * to keep a constant cwnd
-                                        * packets in the network.
-                                        */
-                                       tp->snd_cwnd += tp->t_maxseg;
-                                       tcp_output(tp);
+                                       delayed_dupack = TRUE;
+                                       th_dupack = th->th_ack;
                                }
-                       } else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
-                               tp->t_dupacks = 0;
                                break;
-                       } else if (++tp->t_dupacks == tcprexmtthresh) {
-                               tcp_seq old_snd_nxt;
-                               u_int win;
-
-fastretransmit:
-                               if (tcp_do_eifel_detect &&
-                                   (tp->t_flags & TF_RCVD_TSTMP)) {
-                                       tcp_save_congestion_state(tp);
-                                       tp->t_flags |= TF_FASTREXMT;
-                               }
-                               /*
-                                * We know we're losing at the current
-                                * window size, so do congestion avoidance:
-                                * set ssthresh to half the current window
-                                * and pull our congestion window back to the
-                                * new ssthresh.
-                                */
-                               win = min(tp->snd_wnd, tp->snd_cwnd) / 2 /
-                                   tp->t_maxseg;
-                               if (win < 2)
-                                       win = 2;
-                               tp->snd_ssthresh = win * tp->t_maxseg;
-                               ENTER_FASTRECOVERY(tp);
-                               tp->snd_recover = tp->snd_max;
-                               tcp_callout_stop(tp, tp->tt_rexmt);
-                               tp->t_rtttime = 0;
-                               old_snd_nxt = tp->snd_nxt;
-                               tp->snd_nxt = th->th_ack;
-                               tp->snd_cwnd = tp->t_maxseg;
-                               tcp_output(tp);
-                               ++tcpstat.tcps_sndfastrexmit;
-                               tp->snd_cwnd = tp->snd_ssthresh;
-                               tp->rexmt_high = tp->snd_nxt;
-                               tp->t_flags &= ~TF_SACKRESCUED;
-                               if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
-                                       tp->snd_nxt = old_snd_nxt;
-                               KASSERT(tp->snd_limited <= 2,
-                                   ("tp->snd_limited too big"));
-                               if (TCP_DO_SACK(tp))
-                                       tcp_sack_rexmt(tp, th);
-                               else
-                                       tp->snd_cwnd += tp->t_maxseg *
-                                           (tp->t_dupacks - tp->snd_limited);
-                       } else if (tcp_do_limitedtransmit) {
-                               u_long oldcwnd = tp->snd_cwnd;
-                               tcp_seq oldsndmax = tp->snd_max;
-                               tcp_seq oldsndnxt = tp->snd_nxt;
-                               /* outstanding data */
-                               uint32_t ownd = tp->snd_max - tp->snd_una;
-                               u_int sent;
-
-#define        iceildiv(n, d)          (((n)+(d)-1) / (d))
-
-                               KASSERT(tp->t_dupacks == 1 ||
-                                       tp->t_dupacks == 2,
-                                   ("dupacks not 1 or 2"));
-                               if (tp->t_dupacks == 1)
-                                       tp->snd_limited = 0;
-                               tp->snd_nxt = tp->snd_max;
-                               tp->snd_cwnd = ownd +
-                                   (tp->t_dupacks - tp->snd_limited) *
-                                   tp->t_maxseg;
-                               tcp_output(tp);
-
-                               if (SEQ_LT(oldsndnxt, oldsndmax)) {
-                                       KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
-                                           ("snd_una moved in other threads"));
-                                       tp->snd_nxt = oldsndnxt;
-                               }
-                               tp->snd_cwnd = oldcwnd;
-                               sent = tp->snd_max - oldsndmax;
-                               if (sent > tp->t_maxseg) {
-                                       KASSERT((tp->t_dupacks == 2 &&
-                                                tp->snd_limited == 0) ||
-                                               (sent == tp->t_maxseg + 1 &&
-                                                tp->t_flags & TF_SENTFIN),
-                                           ("sent too much"));
-                                       KASSERT(sent <= tp->t_maxseg * 2,
-                                           ("sent too many segments"));
-                                       tp->snd_limited = 2;
-                                       tcpstat.tcps_sndlimited += 2;
-                               } else if (sent > 0) {
-                                       ++tp->snd_limited;
-                                       ++tcpstat.tcps_sndlimited;
-                               } else if (tcp_do_early_retransmit &&
-                                   (tcp_do_eifel_detect &&
-                                    (tp->t_flags & TF_RCVD_TSTMP)) &&
-                                   ownd < 4 * tp->t_maxseg &&
-                                   tp->t_dupacks + 1 >=
-                                     iceildiv(ownd, tp->t_maxseg) &&
-                                   (!TCP_DO_SACK(tp) ||
-                                    ownd <= tp->t_maxseg ||
-                                    tcp_sack_has_sacked(&tp->scb,
-                                                       ownd - tp->t_maxseg))) {
-                                       ++tcpstat.tcps_sndearlyrexmit;
-                                       tp->t_flags |= TF_EARLYREXMT;
-                                       goto fastretransmit;
-                               }
                        }
-                       goto drop;
+                       if (tcp_fast_recovery(tp, th->th_ack, &to))
+                               goto drop;
+                       else
+                               break;
                }
 
                KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una"));
@@ -2063,7 +2067,7 @@ process_ACK:
 
                if (tcp_do_eifel_detect && acked > 0 &&
                    (to.to_flags & TOF_TS) && (to.to_tsecr != 0) &&
-                   (tp->t_flags & TF_FIRSTACCACK)) {
+                   (tp->rxt_flags & TRXT_F_FIRSTACCACK)) {
                        /* Eifel detection applicable. */
                        if (to.to_tsecr < tp->t_rexmtTS) {
                                ++tcpstat.tcps_eifeldetected;
@@ -2112,7 +2116,8 @@ process_ACK:
                        goto step6;
 
                /* Stop looking for an acceptable ACK since one was received. */
-               tp->t_flags &= ~(TF_FIRSTACCACK | TF_FASTREXMT | TF_EARLYREXMT);
+               tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
+                   TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
 
                if (acked > so->so_snd.ssb_cc) {
                        tp->snd_wnd -= so->so_snd.ssb_cc;
@@ -2173,7 +2178,7 @@ process_ACK:
                        } else {
                                if (TCP_DO_SACK(tp)) {
                                        tp->snd_max_rexmt = tp->snd_max;
-                                       tcp_sack_rexmt(tp, th);
+                                       tcp_sack_rexmt(tp);
                                } else {
                                        tcp_newreno_partial_ack(tp, th, acked);
                                }
@@ -2392,7 +2397,7 @@ dodata:                                                   /* XXX */
                 * fast retransmit can work).
                 */
                if (th->th_seq == tp->rcv_nxt &&
-                   LIST_EMPTY(&tp->t_segq) &&
+                   TAILQ_EMPTY(&tp->t_segq) &&
                    TCPS_HAVEESTABLISHED(tp->t_state)) {
                        if (DELAY_ACK(tp)) {
                                tcp_callout_reset(tp, tp->tt_delack,
@@ -2414,7 +2419,7 @@ dodata:                                                   /* XXX */
                        }
                        sorwakeup(so);
                } else {
-                       if (!(tp->t_flags & TF_DUPSEG)) {
+                       if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
                                /* Initialize SACK report block. */
                                tp->reportblk.rblk_start = th->th_seq;
                                tp->reportblk.rblk_end = TCP_SACK_BLKEND(
@@ -2506,6 +2511,12 @@ dodata:                                                  /* XXX */
                tcp_trace(TA_INPUT, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 
+       /*
+        * Delayed duplicated ACK processing
+        */
+       if (delayed_dupack && tcp_fast_recovery(tp, th_dupack, &to))
+               needoutput = FALSE;
+
        /*
         * Return any desired output.
         */
@@ -2612,7 +2623,8 @@ drop:
  * Parse TCP options and place in tcpopt.
  */
 static void
-tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn)
+tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn,
+    tcp_seq ack)
 {
        int opt, optlen, i;
 
@@ -2694,6 +2706,10 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn)
                                        break;
                                }
                        }
+                       if ((to->to_flags & TOF_SACK) &&
+                           tcp_sack_ndsack_blocks(to->to_sackblocks,
+                           to->to_nsackblocks, ack))
+                               to->to_flags |= TOF_DSACK;
                        break;
 #ifdef TCP_SIGNATURE
                /*
@@ -2758,7 +2774,8 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt, tcp_seq ack)
 
        tcpstat.tcps_rttupdated++;
        tp->t_rttupdated++;
-       if ((tp->t_flags & TF_REBASERTO) && SEQ_GT(ack, tp->snd_max_prev)) {
+       if ((tp->rxt_flags & TRXT_F_REBASERTO) &&
+           SEQ_GT(ack, tp->snd_max_prev)) {
 #ifdef DEBUG_EIFEL_RESPONSE
                kprintf("srtt/rttvar, prev %d/%d, cur %d/%d, ",
                    tp->t_srtt_prev, tp->t_rttvar_prev,
@@ -2767,7 +2784,7 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt, tcp_seq ack)
 
                tcpstat.tcps_eifelresponse++;
                rebaserto = 1;
-               tp->t_flags &= ~TF_REBASERTO;
+               tp->rxt_flags &= ~TRXT_F_REBASERTO;
                tp->t_srtt = max(tp->t_srtt_prev, (rtt << TCP_RTT_SHIFT));
                tp->t_rttvar = max(tp->t_rttvar_prev,
                    (rtt << (TCP_RTTVAR_SHIFT - 1)));
@@ -3027,11 +3044,11 @@ tcp_mss(struct tcpcb *tp, int offer)
                mss -= TCPOLEN_TSTAMP_APPA;
 
 #if    (MCLBYTES & (MCLBYTES - 1)) == 0
-               if (mss > MCLBYTES)
-                       mss &= ~(MCLBYTES-1);
+       if (mss > MCLBYTES)
+               mss &= ~(MCLBYTES-1);
 #else
-               if (mss > MCLBYTES)
-                       mss = mss / MCLBYTES * MCLBYTES;
+       if (mss > MCLBYTES)
+               mss = mss / MCLBYTES * MCLBYTES;
 #endif
        /*
         * If there's a pipesize, change the socket buffer
@@ -3152,7 +3169,7 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th, int acked)
  * except when retransmitting snd_una.
  */
 static void
-tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
+tcp_sack_rexmt(struct tcpcb *tp)
 {
        tcp_seq old_snd_nxt = tp->snd_nxt;
        u_long ocwnd = tp->snd_cwnd;
@@ -3171,7 +3188,7 @@ tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
                int error;
 
                old_rexmt_high = tp->rexmt_high;
-               if (!tcp_sack_nextseg(tp, &nextrexmt, &seglen, &rescue)) {
+               if (!tcp_sack_nextseg(tp, &nextrexmt, &seglen, &rescue)) {
                        tp->rexmt_high = old_rexmt_high;
                        break;
                }
@@ -3216,13 +3233,14 @@ tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
                if (rescue) {
                        tcpstat.tcps_sackrescue++;
                        tp->rexmt_rescue = tp->snd_nxt;
-                       tp->t_flags |= TF_SACKRESCUED;
+                       tp->sack_flags |= TSACK_F_SACKRESCUED;
                        break;
                }
                if (SEQ_LT(nextrexmt, old_snd_max) &&
                    SEQ_LT(tp->rexmt_high, tp->snd_nxt)) {
                        tp->rexmt_high = seq_min(tp->snd_nxt, old_snd_max);
-                       if ((tp->t_flags & TF_SACKRESCUED) &&
+                       if (tcp_aggressive_rescuesack &&
+                           (tp->sack_flags & TSACK_F_SACKRESCUED) &&
                            SEQ_LT(tp->rexmt_rescue, tp->rexmt_high)) {
                                /* Drag RescueRxt along with HighRxt */
                                tp->rexmt_rescue = tp->rexmt_high;
@@ -3234,6 +3252,51 @@ tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
        tp->snd_cwnd = ocwnd;
 }
 
+/*
+ * Return TRUE, if some new segments are sent
+ */
+static boolean_t
+tcp_sack_limitedxmit(struct tcpcb *tp)
+{
+       tcp_seq oldsndnxt = tp->snd_nxt;
+       tcp_seq oldsndmax = tp->snd_max;
+       u_long ocwnd = tp->snd_cwnd;
+       uint32_t pipe, sent;
+       boolean_t ret = FALSE;
+       tcp_seq_diff_t cwnd_left;
+       tcp_seq next;
+
+       tp->rexmt_high = tp->snd_una - 1;
+       pipe = tcp_sack_compute_pipe(tp);
+       cwnd_left = (tcp_seq_diff_t)(ocwnd - pipe);
+       if (cwnd_left < (tcp_seq_diff_t)tp->t_maxseg)
+               return FALSE;
+
+       next = tp->snd_nxt = tp->snd_max;
+       tp->snd_cwnd = tp->snd_nxt - tp->snd_una +
+           rounddown(cwnd_left, tp->t_maxseg);
+
+       tcp_output(tp);
+
+       sent = tp->snd_nxt - next;
+       if (sent > 0) {
+               tcpstat.tcps_sndlimited += howmany(sent, tp->t_maxseg);
+               ret = TRUE;
+       }
+
+       if (SEQ_LT(oldsndnxt, oldsndmax)) {
+               KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
+                   ("snd_una moved in other threads"));
+               tp->snd_nxt = oldsndnxt;
+       }
+       tp->snd_cwnd = ocwnd;
+
+       if (ret && TCP_DO_NCR(tp))
+               tcp_ncr_update_rxtthresh(tp);
+
+       return ret;
+}
+
 /*
  * Reset idle time and keep-alive timer, typically called when a valid
  * tcp packet is received but may also be called when FASTKEEP is set
@@ -3309,3 +3372,180 @@ tcp_established(struct tcpcb *tp)
                        tp->t_rxtcur = TCPTV_RTOBASE3;
        }
 }
+
+/*
+ * Returns TRUE, if the ACK should be dropped
+ */
+static boolean_t
+tcp_fast_recovery(struct tcpcb *tp, tcp_seq th_ack, const struct tcpopt *to)
+{
+       boolean_t fast_sack_rexmt = TRUE;
+
+       tcpstat.tcps_rcvdupack++;
+
+       /*
+        * We have outstanding data (other than a window probe),
+        * this is a completely duplicate ack (ie, window info
+        * didn't change), the ack is the biggest we've seen and
+        * we've seen exactly our rexmt threshhold of them, so
+        * assume a packet has been dropped and retransmit it.
+        * Kludge snd_nxt & the congestion window so we send only
+        * this one packet.
+        */
+       if (IN_FASTRECOVERY(tp)) {
+               if (TCP_DO_SACK(tp)) {
+                       /* No artifical cwnd inflation. */
+                       tcp_sack_rexmt(tp);
+               } else {
+                       /*
+                        * Dup acks mean that packets have left
+                        * the network (they're now cached at the
+                        * receiver) so bump cwnd by the amount in
+                        * the receiver to keep a constant cwnd
+                        * packets in the network.
+                        */
+                       tp->snd_cwnd += tp->t_maxseg;
+                       tcp_output(tp);
+               }
+               return TRUE;
+       } else if (SEQ_LT(th_ack, tp->snd_recover)) {
+               tp->t_dupacks = 0;
+               return FALSE;
+       } else if (tcp_ignore_redun_dsack && TCP_DO_SACK(tp) &&
+           (to->to_flags & (TOF_DSACK | TOF_SACK_REDUNDANT)) ==
+           (TOF_DSACK | TOF_SACK_REDUNDANT)) {
+               /*
+                * If the ACK carries DSACK and other SACK blocks
+                * carry information that we have already known,
+                * don't count this ACK as duplicate ACK.  This
+                * prevents spurious early retransmit and fast
+                * retransmit.  This also meets the requirement of
+                * RFC3042 that new segments should not be sent if
+                * the SACK blocks do not contain new information
+                * (XXX we actually loosen the requirment that only
+                * DSACK is checked here).
+                *
+                * This kind of ACKs are usually sent after spurious
+                * retransmit.
+                */
+               /* Do nothing; don't change t_dupacks */
+               return TRUE;
+       } else if (tp->t_dupacks == 0 && TCP_DO_NCR(tp)) {
+               tcp_ncr_update_rxtthresh(tp);
+       }
+
+       if (++tp->t_dupacks == tp->t_rxtthresh) {
+               tcp_seq old_snd_nxt;
+               u_int win;
+
+fastretransmit:
+               if (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) {
+                       tcp_save_congestion_state(tp);
+                       tp->rxt_flags |= TRXT_F_FASTREXMT;
+               }
+               /*
+                * We know we're losing at the current window size,
+                * so do congestion avoidance: set ssthresh to half
+                * the current window and pull our congestion window
+                * back to the new ssthresh.
+                */
+               win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+               if (win < 2)
+                       win = 2;
+               tp->snd_ssthresh = win * tp->t_maxseg;
+               ENTER_FASTRECOVERY(tp);
+               tp->snd_recover = tp->snd_max;
+               tcp_callout_stop(tp, tp->tt_rexmt);
+               tp->t_rtttime = 0;
+               old_snd_nxt = tp->snd_nxt;
+               tp->snd_nxt = th_ack;
+               tp->snd_cwnd = tp->t_maxseg;
+               tcp_output(tp);
+               ++tcpstat.tcps_sndfastrexmit;
+               tp->snd_cwnd = tp->snd_ssthresh;
+               tp->rexmt_high = tp->snd_nxt;
+               tp->sack_flags &= ~TSACK_F_SACKRESCUED;
+               if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
+                       tp->snd_nxt = old_snd_nxt;
+               KASSERT(tp->snd_limited <= 2, ("tp->snd_limited too big"));
+               if (TCP_DO_SACK(tp)) {
+                       if (fast_sack_rexmt)
+                               tcp_sack_rexmt(tp);
+               } else {
+                       tp->snd_cwnd += tp->t_maxseg *
+                           (tp->t_dupacks - tp->snd_limited);
+               }
+       } else if ((tcp_do_rfc3517bis && TCP_DO_SACK(tp)) || TCP_DO_NCR(tp)) {
+               /*
+                * The RFC3517bis recommends to reduce the byte threshold,
+                * and enter fast retransmit if IsLost(snd_una).  However,
+                * if we use IsLost(snd_una) based fast retransmit here,
+                * segments reordering will cause spurious retransmit.  So
+                * we defer the IsLost(snd_una) based fast retransmit until
+                * the extended limited transmit can't send any segments and
+                * early retransmit can't be done.
+                */
+               if (tcp_rfc3517bis_rxt && tcp_do_rfc3517bis &&
+                   tcp_sack_islost(&tp->scb, tp->snd_una))
+                       goto fastretransmit;
+
+               if (tcp_do_limitedtransmit || TCP_DO_NCR(tp)) {
+                       if (!tcp_sack_limitedxmit(tp)) {
+                               /* outstanding data */
+                               uint32_t ownd = tp->snd_max - tp->snd_una;
+
+                               if (need_early_retransmit(tp, ownd)) {
+                                       ++tcpstat.tcps_sndearlyrexmit;
+                                       tp->rxt_flags |= TRXT_F_EARLYREXMT;
+                                       goto fastretransmit;
+                               } else if (tcp_do_rfc3517bis &&
+                                   tcp_sack_islost(&tp->scb, tp->snd_una)) {
+                                       fast_sack_rexmt = FALSE;
+                                       goto fastretransmit;
+                               }
+                       }
+               }
+       } else if (tcp_do_limitedtransmit) {
+               u_long oldcwnd = tp->snd_cwnd;
+               tcp_seq oldsndmax = tp->snd_max;
+               tcp_seq oldsndnxt = tp->snd_nxt;
+               /* outstanding data */
+               uint32_t ownd = tp->snd_max - tp->snd_una;
+               u_int sent;
+
+               KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2,
+                   ("dupacks not 1 or 2"));
+               if (tp->t_dupacks == 1)
+                       tp->snd_limited = 0;
+               tp->snd_nxt = tp->snd_max;
+               tp->snd_cwnd = ownd +
+                   (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg;
+               tcp_output(tp);
+
+               if (SEQ_LT(oldsndnxt, oldsndmax)) {
+                       KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
+                           ("snd_una moved in other threads"));
+                       tp->snd_nxt = oldsndnxt;
+               }
+               tp->snd_cwnd = oldcwnd;
+               sent = tp->snd_max - oldsndmax;
+               if (sent > tp->t_maxseg) {
+                       KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) ||
+                           (sent == tp->t_maxseg + 1 &&
+                            (tp->t_flags & TF_SENTFIN)),
+                           ("sent too much"));
+                       KASSERT(sent <= tp->t_maxseg * 2,
+                           ("sent too many segments"));
+                       tp->snd_limited = 2;
+                       tcpstat.tcps_sndlimited += 2;
+               } else if (sent > 0) {
+                       ++tp->snd_limited;
+                       ++tcpstat.tcps_sndlimited;
+               } else if (need_early_retransmit(tp, ownd)) {
+                       ++tcpstat.tcps_sndearlyrexmit;
+                       tp->rxt_flags |= TRXT_F_EARLYREXMT;
+                       goto fastretransmit;
+               }
+       }
+       return TRUE;
+}