tcp/sack: Use RFC3517bis IsLost(snd_una) as fallback of early retransmit
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 2edd2ae..d687b68 100644
@@ -67,7 +67,6 @@
  * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.38 2003/05/21 04:46:41 cjc Exp $
  */
 
-#include "opt_ipfw.h"          /* for ipfw_fwd         */
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
@@ -168,11 +167,6 @@ int tcp_aggregate_acks = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, aggregate_acks, CTLFLAG_RW,
     &tcp_aggregate_acks, 0, "Aggregate built-up acks into one ack");
 
-int tcp_do_rfc3390 = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
-    &tcp_do_rfc3390, 0,
-    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
-
 static int tcp_do_eifel_detect = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, eifel, CTLFLAG_RW,
     &tcp_do_eifel_detect, 0, "Eifel detection algorithm (RFC 3522)");
@@ -182,6 +176,15 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc, CTLFLAG_RW,
     &tcp_do_abc, 0,
     "TCP Appropriate Byte Counting (RFC 3465)");
 
+/*
+ * The following value effectively takes a range of [25ms, 250ms],
+ * given that most modern systems use 1ms ~ 10ms as the unit of
+ * the timestamp option.
+ */
+static u_int tcp_paws_tolerance = 25;
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, paws_tolerance, CTLFLAG_RW,
+    &tcp_paws_tolerance, 0, "RFC1323 PAWS tolerance");
+
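A quick illustration of that range, assuming the peer's timestamp granularity
is known (the helper name and values below are illustrative only, not part of
this change):

    /* Effective slack = sysctl value x peer's timestamp tick length. */
    static u_int
    paws_tolerance_ms(u_int tolerance, u_int ts_granularity_ms)
    {
            return (tolerance * ts_granularity_ms); /* 25*1ms .. 25*10ms */
    }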
 /*
  * Define as tunable for easy testing with SACK on and off.
  * Warning:  do not change setting in the middle of an existing active TCP flow,
@@ -195,6 +198,22 @@ int tcp_do_smartsack = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, smartsack, CTLFLAG_RW,
     &tcp_do_smartsack, 0, "Enable Smart SACK Algorithms");
 
+int tcp_do_rescuesack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack, CTLFLAG_RW,
+    &tcp_do_rescuesack, 0, "Rescue retransmission for SACK");
+
+int tcp_aggressive_rescuesack = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rescuesack_agg, CTLFLAG_RW,
+    &tcp_aggressive_rescuesack, 0, "Aggressive rescue retransmission for SACK");
+
+int tcp_do_rfc3517bis = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3517bis, CTLFLAG_RW,
+    &tcp_do_rfc3517bis, 0, "Enable RFC3517 update");
+
+int tcp_rfc3517bis_rxt = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3517bis_rxt, CTLFLAG_RW,
+    &tcp_rfc3517bis_rxt, 0, "Enable RFC3517 retransmit update");
+
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
     "TCP Segment Reassembly Queue");
 
@@ -226,23 +245,32 @@ int tcp_autorcvbuf_max = 2*1024*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
     &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
 
-int tcp_sosnd_agglim = 2;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosnd_agglim, CTLFLAG_RW,
-    &tcp_sosnd_agglim, 0, "TCP sosend mbuf aggregation limit");
+int tcp_sosend_agglim = 2;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosend_agglim, CTLFLAG_RW,
+    &tcp_sosend_agglim, 0, "TCP sosend mbuf aggregation limit");
 
-int tcp_sosnd_async = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosnd_async, CTLFLAG_RW,
-    &tcp_sosnd_async, 0, "TCP asynchronized pru_send");
+int tcp_sosend_async = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sosend_async, CTLFLAG_RW,
+    &tcp_sosend_async, 0, "TCP asynchronized pru_send");
 
-static void     tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
+static int tcp_ignore_redun_dsack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, ignore_redun_dsack, CTLFLAG_RW,
+    &tcp_ignore_redun_dsack, 0, "Ignore redundant DSACK");
+
+static void     tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t,
+                   tcp_seq);
 static void     tcp_pulloutofband(struct socket *,
                     struct tcphdr *, struct mbuf *, int);
 static int      tcp_reass(struct tcpcb *, struct tcphdr *, int *,
                     struct mbuf *);
-static void     tcp_xmit_timer(struct tcpcb *, int);
+static void     tcp_xmit_timer(struct tcpcb *, int, tcp_seq);
 static void     tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *, int);
-static void     tcp_sack_rexmt(struct tcpcb *, struct tcphdr *);
+static void     tcp_sack_rexmt(struct tcpcb *);
+static boolean_t tcp_sack_limitedxmit(struct tcpcb *);
 static int      tcp_rmx_msl(const struct tcpcb *);
+static void     tcp_established(struct tcpcb *);
+static boolean_t tcp_fast_recovery(struct tcpcb *, tcp_seq,
+                    const struct tcpopt *);
 
 /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
 #ifdef INET6
@@ -274,6 +302,67 @@ do { \
       (SEQ_LT(tp->snd_wl2, th->th_ack) ||                              \
        (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))
 
+#define        iceildiv(n, d)          (((n)+(d)-1) / (d))
+#define need_early_retransmit(tp, ownd) \
+    (tcp_do_early_retransmit && \
+     (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) && \
+     ownd < ((tp->t_rxtthresh + 1) * tp->t_maxseg) && \
+     tp->t_dupacks + 1 >= iceildiv(ownd, tp->t_maxseg) && \
+     (!TCP_DO_SACK(tp) || ownd <= tp->t_maxseg || \
+      tcp_sack_has_sacked(&tp->scb, ownd - tp->t_maxseg)))
+
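The macro is easier to follow with concrete numbers.  A standalone sketch,
assuming the default t_rxtthresh of 3 and hypothetical segment sizes:

    #include <stdio.h>

    #define iceildiv(n, d)  (((n)+(d)-1) / (d))

    int
    main(void)
    {
            unsigned maxseg = 1448, rxtthresh = 3;
            unsigned ownd = 2 * maxseg;     /* two segments outstanding */
            unsigned dupacks = 1;           /* first duplicate ACK */

            /* window too small to ever produce rxtthresh dup ACKs, and
             * enough dup ACKs have arrived: early retransmit may fire */
            printf("early rexmt: %d\n",
                ownd < (rxtthresh + 1) * maxseg &&
                dupacks + 1 >= iceildiv(ownd, maxseg));     /* prints 1 */
            return (0);
    }

(The real macro additionally requires timestamps, so Eifel detection can undo
a spurious retransmit, and on SACK connections that all but one outstanding
segment has been SACKed.)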
+/*
+ * Returns TRUE if this segment can be merged with the last
+ * pending segment in the reassembly queue and this segment
+ * does not overlap with the pending segment immediately
+ * preceding the last pending segment.
+ */
+static __inline boolean_t
+tcp_paws_canreasslast(const struct tcpcb *tp, const struct tcphdr *th, int tlen)
+{
+       const struct tseg_qent *last, *prev;
+
+       last = TAILQ_LAST(&tp->t_segq, tsegqe_head);
+       if (last == NULL)
+               return FALSE;
+
+       /* This segment comes immediately after the last pending segment */
+       if (last->tqe_th->th_seq + last->tqe_len == th->th_seq)
+               return TRUE;
+
+       if (th->th_seq + tlen != last->tqe_th->th_seq)
+               return FALSE;
+       /* This segment comes immediately before the last pending segment */
+
+       prev = TAILQ_PREV(last, tsegqe_head, tqe_q);
+       if (prev == NULL) {
+               /*
+                * No pending preceding segment; we assume this segment
+                * could be reassembled.
+                */
+               return TRUE;
+       }
+
+       /* This segment does not overlap with the preceding segment */
+       if (SEQ_GEQ(th->th_seq, prev->tqe_th->th_seq + prev->tqe_len))
+               return TRUE;
+
+       return FALSE;
+}
+
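A toy model of the three cases the function distinguishes, with hypothetical
sequence numbers and wraparound ignored:

    #include <assert.h>
    #include <stdbool.h>

    /* prev = [100, 200), last = [300, 400) in sequence space */
    static bool
    canreass(unsigned prev_end, unsigned last_seq, unsigned last_end,
        unsigned seq, unsigned tlen)
    {
            if (seq == last_end)            /* fits right after "last" */
                    return (true);
            if (seq + tlen != last_seq)     /* not right before "last" */
                    return (false);
            return (seq >= prev_end);       /* no overlap with "prev" */
    }

    int
    main(void)
    {
            assert(canreass(200, 300, 400, 400, 50));   /* after last */
            assert(canreass(200, 300, 400, 250, 50));   /* before last */
            assert(!canreass(200, 300, 400, 150, 150)); /* overlaps prev */
            return (0);
    }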
+static __inline void
+tcp_ncr_update_rxtthresh(struct tcpcb *tp)
+{
+       int old_rxtthresh = tp->t_rxtthresh;
+       uint32_t ownd = tp->snd_max - tp->snd_una;
+
+       tp->t_rxtthresh = max(3, ((ownd / tp->t_maxseg) >> 1));
+       if (tp->t_rxtthresh != old_rxtthresh) {
+               tcp_sack_update_lostseq(&tp->scb, tp->snd_una,
+                   tp->t_maxseg, tp->t_rxtthresh);
+       }
+}
+
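The effect of the update, sketched with hypothetical numbers: the duplicate
ACK threshold tracks half of the outstanding window with the usual floor of
3, in the spirit of TCP-NCR (RFC 4653):

    #include <stdio.h>

    static unsigned
    ncr_rxtthresh(unsigned ownd, unsigned maxseg)
    {
            unsigned half = (ownd / maxseg) >> 1;

            return (half > 3 ? half : 3);   /* max(3, segs / 2) */
    }

    int
    main(void)
    {
            /* 20 segments outstanding -> threshold of 10 dup ACKs */
            printf("%u\n", ncr_rxtthresh(20 * 1448, 1448));
            return (0);
    }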
 static int
 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 {
@@ -321,7 +410,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
        /*
         * Find a segment which begins after this one does.
         */
-       LIST_FOREACH(q, &tp->t_segq, tqe_q) {
+       TAILQ_FOREACH(q, &tp->t_segq, tqe_q) {
                if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
                        break;
                p = q;
@@ -338,13 +427,15 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                /* conversion to int (in i) handles seq wraparound */
                i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
                if (i > 0) {            /* overlaps preceding segment */
-                       tp->t_flags |= (TF_DUPSEG | TF_ENCLOSESEG);
+                       tp->sack_flags |=
+                           (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG);
                        /* enclosing block starts w/ preceding segment */
                        tp->encloseblk.rblk_start = p->tqe_th->th_seq;
                        if (i >= *tlenp) {
                                /* preceding encloses incoming segment */
-                               tp->encloseblk.rblk_end = p->tqe_th->th_seq +
-                                   p->tqe_len;
+                               tp->encloseblk.rblk_end = TCP_SACK_BLKEND(
+                                   p->tqe_th->th_seq + p->tqe_len,
+                                   p->tqe_th->th_flags);
                                tcpstat.tcps_rcvduppack++;
                                tcpstat.tcps_rcvdupbyte += *tlenp;
                                m_freem(m);
@@ -362,8 +453,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                        *tlenp -= i;
                        th->th_seq += i;
                        /* incoming segment end is enclosing block end */
-                       tp->encloseblk.rblk_end = th->th_seq + *tlenp +
-                           ((th->th_flags & TH_FIN) != 0);
+                       tp->encloseblk.rblk_end = TCP_SACK_BLKEND(
+                           th->th_seq + *tlenp, th->th_flags);
                        /* trim end of reported D-SACK block */
                        tp->reportblk.rblk_end = th->th_seq;
                }
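TCP_SACK_BLKEND itself is not shown in this diff; judging from the open-coded
expressions it replaces (th_seq + tlen, plus one when TH_FIN is set), it
presumably reduces to the sketch below -- an assumption, since the real
definition lives in tcp_var.h:

    /* Presumed shape of the macro: count the FIN's sequence slot. */
    #define TCP_SACK_BLKEND(end, thflags) \
            ((end) + (((thflags) & TH_FIN) != 0))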
@@ -378,20 +469,22 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
        while (q) {
                tcp_seq_diff_t i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
                tcp_seq qend = q->tqe_th->th_seq + q->tqe_len;
+               tcp_seq qend_sack = TCP_SACK_BLKEND(qend, q->tqe_th->th_flags);
                struct tseg_qent *nq;
 
                if (i <= 0)
                        break;
-               if (!(tp->t_flags & TF_DUPSEG)) {    /* first time through */
-                       tp->t_flags |= (TF_DUPSEG | TF_ENCLOSESEG);
+               if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
+                       /* first time through */
+                       tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG);
                        tp->encloseblk = tp->reportblk;
                        /* report trailing duplicate D-SACK segment */
                        tp->reportblk.rblk_start = q->tqe_th->th_seq;
                }
-               if ((tp->t_flags & TF_ENCLOSESEG) &&
-                   SEQ_GT(qend, tp->encloseblk.rblk_end)) {
+               if ((tp->sack_flags & TSACK_F_ENCLOSESEG) &&
+                   SEQ_GT(qend_sack, tp->encloseblk.rblk_end)) {
                        /* extend enclosing block if one exists */
-                       tp->encloseblk.rblk_end = qend;
+                       tp->encloseblk.rblk_end = qend_sack;
                }
                if (i < q->tqe_len) {
                        q->tqe_th->th_seq += i;
@@ -400,8 +493,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                        break;
                }
 
-               nq = LIST_NEXT(q, tqe_q);
-               LIST_REMOVE(q, tqe_q);
+               nq = TAILQ_NEXT(q, tqe_q);
+               TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
                m_freem(q->tqe_m);
                kfree(q, M_TSEGQ);
                atomic_add_int(&tcp_reass_qsize, -1);
@@ -416,25 +509,26 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
        /* check if can coalesce with following segment */
        if (q != NULL && (th->th_seq + *tlenp == q->tqe_th->th_seq)) {
                tcp_seq tend = te->tqe_th->th_seq + te->tqe_len;
+               tcp_seq tend_sack = TCP_SACK_BLKEND(tend, te->tqe_th->th_flags);
 
                te->tqe_len += q->tqe_len;
                if (q->tqe_th->th_flags & TH_FIN)
                        te->tqe_th->th_flags |= TH_FIN;
                m_cat(te->tqe_m, q->tqe_m);
-               tp->encloseblk.rblk_end = tend;
+               tp->encloseblk.rblk_end = tend_sack;
                /*
                 * When not reporting a duplicate segment, use
                 * the larger enclosing block as the SACK block.
                 */
-               if (!(tp->t_flags & TF_DUPSEG))
-                       tp->reportblk.rblk_end = tend;
-               LIST_REMOVE(q, tqe_q);
+               if (!(tp->sack_flags & TSACK_F_DUPSEG))
+                       tp->reportblk.rblk_end = tend_sack;
+               TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
                kfree(q, M_TSEGQ);
                atomic_add_int(&tcp_reass_qsize, -1);
        }
 
        if (p == NULL) {
-               LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+               TAILQ_INSERT_HEAD(&tp->t_segq, te, tqe_q);
        } else {
                /* check if can coalesce with preceding segment */
                if (p->tqe_th->th_seq + p->tqe_len == th->th_seq) {
@@ -445,12 +539,12 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                         * When not reporting a duplicate segment, use
                         * the larger enclosing block as the SACK block.
                         */
-                       if (!(tp->t_flags & TF_DUPSEG))
+                       if (!(tp->sack_flags & TSACK_F_DUPSEG))
                                tp->reportblk.rblk_start = p->tqe_th->th_seq;
                        kfree(te, M_TSEGQ);
                        atomic_add_int(&tcp_reass_qsize, -1);
                } else {
-                       LIST_INSERT_AFTER(p, te, tqe_q);
+                       TAILQ_INSERT_AFTER(&tp->t_segq, p, te, tqe_q);
                }
        }
 
@@ -461,20 +555,20 @@ present:
         */
        if (!TCPS_HAVEESTABLISHED(tp->t_state))
                return (0);
-       q = LIST_FIRST(&tp->t_segq);
+       q = TAILQ_FIRST(&tp->t_segq);
        if (q == NULL || q->tqe_th->th_seq != tp->rcv_nxt)
                return (0);
        tp->rcv_nxt += q->tqe_len;
-       if (!(tp->t_flags & TF_DUPSEG)) {
+       if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
                /* no SACK block to report since ACK advanced */
                tp->reportblk.rblk_start = tp->reportblk.rblk_end;
        }
        /* no enclosing block to report since ACK advanced */
-       tp->t_flags &= ~TF_ENCLOSESEG;
+       tp->sack_flags &= ~TSACK_F_ENCLOSESEG;
        flags = q->tqe_th->th_flags & TH_FIN;
-       LIST_REMOVE(q, tqe_q);
-       KASSERT(LIST_EMPTY(&tp->t_segq) ||
-               LIST_FIRST(&tp->t_segq)->tqe_th->th_seq != tp->rcv_nxt,
+       TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
+       KASSERT(TAILQ_EMPTY(&tp->t_segq) ||
+               TAILQ_FIRST(&tp->t_segq)->tqe_th->th_seq != tp->rcv_nxt,
                ("segment not coalesced"));
        if (so->so_state & SS_CANTRCVMORE) {
                m_freem(q->tqe_m);
@@ -539,7 +633,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
        int thflags;
        struct socket *so = NULL;
        int todrop, acked;
-       boolean_t ourfinisacked, needoutput = FALSE;
+       boolean_t ourfinisacked, needoutput = FALSE, delayed_dupack = FALSE;
+       tcp_seq th_dupack = 0; /* XXX gcc warning */
        u_long tiwin;
        int recvwin;
        struct tcpopt to;               /* options in this segment */
@@ -844,12 +939,6 @@ findpcb:
        if (tp->t_state <= TCPS_CLOSED)
                goto drop;
 
-       /* Unscale the window into a 32-bit value. */
-       if (!(thflags & TH_SYN))
-               tiwin = th->th_win << tp->snd_scale;
-       else
-               tiwin = th->th_win;
-
        so = inp->inp_socket;
 
 #ifdef TCPDEBUG
@@ -933,14 +1022,7 @@ findpcb:
                                tp->snd_up = tp->snd_una;
                                tp->snd_max = tp->snd_nxt = tp->iss + 1;
                                tp->last_ack_sent = tp->rcv_nxt;
-/*
- * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled
- * until the _second_ ACK is received:
- *    rcv SYN (set wscale opts)         --> send SYN/ACK, set snd_wnd = window.
- *    rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale,
- *       move to ESTAB, set snd_wnd to tiwin.
- */
-                               tp->snd_wnd = tiwin;    /* unscaled */
+
                                goto after_listen;
                        }
                        if (thflags & TH_RST) {
@@ -1040,51 +1122,15 @@ findpcb:
                 * for syncache, or perform t/tcp connection.
                 */
                if (so->so_qlen <= so->so_qlimit) {
-                       tcp_dooptions(&to, optp, optlen, TRUE);
-                       if (!syncache_add(&inc, &to, th, &so, m))
+                       tcp_dooptions(&to, optp, optlen, TRUE, th->th_ack);
+                       if (!syncache_add(&inc, &to, th, so, m))
                                goto drop;
 
                        /*
                         * Entry added to syncache, mbuf used to
                         * send SYN,ACK packet.
                         */
-                       if (so == NULL)
-                               return(IPPROTO_DONE);
-
-                       /*
-                        * We must be in the correct protocol thread for
-                        * this connection.
-                        */
-                       KKASSERT(so->so_port == &curthread->td_msgport);
-
-                       inp = so->so_pcb;
-                       tp = intotcpcb(inp);
-                       tp->snd_wnd = tiwin;
-                       tp->t_starttime = ticks;
-                       tp->t_state = TCPS_ESTABLISHED;
-
-                       /*
-                        * If there is a FIN, or if there is data and the
-                        * connection is local, then delay SYN,ACK(SYN) in
-                        * the hope of piggy-backing it on a response
-                        * segment.  Otherwise must send ACK now in case
-                        * the other side is slow starting.
-                        */
-                       if (DELAY_ACK(tp) &&
-                           ((thflags & TH_FIN) ||
-                            (tlen != 0 &&
-                             ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
-                              (!isipv6 && in_localaddr(inp->inp_faddr)))))) {
-                               tcp_callout_reset(tp, tp->tt_delack,
-                                   tcp_delacktime, tcp_timer_delack);
-                               tp->t_flags |= TF_NEEDSYN;
-                       } else {
-                               tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
-                       }
-
-                       tcpstat.tcps_connects++;
-                       soisconnected(so);
-                       goto trimthenstep6;
+                       return(IPPROTO_DONE);
                }
                goto drop;
        }
@@ -1099,6 +1145,12 @@ after_listen:
        KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN state"));
        KKASSERT(so->so_port == &curthread->td_msgport);
 
+       /* Unscale the window into a 32-bit value. */
+       if (!(thflags & TH_SYN))
+               tiwin = th->th_win << tp->snd_scale;
+       else
+               tiwin = th->th_win;
+
        /*
         * This is the second part of the MSS DoS prevention code (after
         * minmss on the sending side) and it deals with too many too small
@@ -1122,19 +1174,26 @@ after_listen:
         * Process options.
         * XXX this is traditional behavior, may need to be cleaned up.
         */
-       tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) != 0);
+       tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) != 0, th->th_ack);
        if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
-               if (to.to_flags & TOF_SCALE) {
+               if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) {
                        tp->t_flags |= TF_RCVD_SCALE;
-                       tp->requested_s_scale = to.to_requested_s_scale;
+                       tp->snd_scale = to.to_requested_s_scale;
                }
+
+               /*
+                * Initial send window; will be updated upon next ACK
+                */
+               tp->snd_wnd = th->th_win;
+
                if (to.to_flags & TOF_TS) {
                        tp->t_flags |= TF_RCVD_TSTMP;
                        tp->ts_recent = to.to_tsval;
                        tp->ts_recent_age = ticks;
                }
-               if (to.to_flags & TOF_MSS)
-                       tcp_mss(tp, to.to_mss);
+               if (!(to.to_flags & TOF_MSS))
+                       to.to_mss = 0;
+               tcp_mss(tp, to.to_mss);
                /*
                 * Only set the TF_SACK_PERMITTED per-connection flag
                 * if we got a SACK_PERMITTED option from the other side
@@ -1200,19 +1259,22 @@ after_listen:
                                 */
                                if (tcp_do_eifel_detect &&
                                    (to.to_flags & TOF_TS) && to.to_tsecr &&
-                                   (tp->t_flags & TF_FIRSTACCACK)) {
+                                   (tp->rxt_flags & TRXT_F_FIRSTACCACK)) {
                                        /* Eifel detection applicable. */
                                        if (to.to_tsecr < tp->t_rexmtTS) {
                                                tcp_revert_congestion_state(tp);
                                                ++tcpstat.tcps_eifeldetected;
+                                               if (tp->t_rxtshift != 1 ||
+                                                   ticks >= tp->t_badrxtwin)
+                                                       ++tcpstat.tcps_rttcantdetect;
                                        }
                                } else if (tp->t_rxtshift == 1 &&
                                           ticks < tp->t_badrxtwin) {
                                        tcp_revert_congestion_state(tp);
                                        ++tcpstat.tcps_rttdetected;
                                }
-                               tp->t_flags &= ~(TF_FIRSTACCACK |
-                                                TF_FASTREXMT | TF_EARLYREXMT);
+                               tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
+                                   TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
                                /*
                                 * Recalculate the retransmit timer / rtt.
                                 *
@@ -1222,11 +1284,13 @@ after_listen:
                                 */
                                if ((to.to_flags & TOF_TS) && to.to_tsecr) {
                                        tcp_xmit_timer(tp,
-                                                      ticks - to.to_tsecr + 1);
+                                           ticks - to.to_tsecr + 1,
+                                           th->th_ack);
                                } else if (tp->t_rtttime &&
                                           SEQ_GT(th->th_ack, tp->t_rtseq)) {
                                        tcp_xmit_timer(tp,
-                                                      ticks - tp->t_rtttime);
+                                           ticks - tp->t_rtttime,
+                                           th->th_ack);
                                }
                                tcp_xmit_bandwidth_limit(tp, th->th_ack);
                                acked = th->th_ack - tp->snd_una;
@@ -1276,7 +1340,7 @@ after_listen:
                        }
                } else if (tiwin == tp->snd_wnd &&
                    th->th_ack == tp->snd_una &&
-                   LIST_EMPTY(&tp->t_segq) &&
+                   TAILQ_EMPTY(&tp->t_segq) &&
                    tlen <= ssb_space(&so->so_rcv)) {
                        u_long newsize = 0;     /* automatic sockbuf scaling */
                        /*
@@ -1472,7 +1536,6 @@ after_listen:
                }
                if (!(thflags & TH_SYN))
                        goto drop;
-               tp->snd_wnd = th->th_win;       /* initial send window */
 
                tp->irs = th->th_seq;
                tcp_rcvseqinit(tp);
@@ -1482,10 +1545,8 @@ after_listen:
                        soisconnected(so);
                        /* Do window scaling on this connection? */
                        if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
-                           (TF_RCVD_SCALE | TF_REQ_SCALE)) {
-                               tp->snd_scale = tp->requested_s_scale;
+                           (TF_RCVD_SCALE | TF_REQ_SCALE))
                                tp->rcv_scale = tp->request_r_scale;
-                       }
                        tp->rcv_adv += tp->rcv_wnd;
                        tp->snd_una++;          /* SYN is acked */
                        tcp_callout_stop(tp, tp->tt_rexmt);
@@ -1511,10 +1572,7 @@ after_listen:
                                tp->t_flags &= ~TF_NEEDFIN;
                                thflags &= ~TH_SYN;
                        } else {
-                               tp->t_state = TCPS_ESTABLISHED;
-                               tcp_callout_reset(tp, tp->tt_keep,
-                                                 tcp_getkeepidle(tp),
-                                                 tcp_timer_keep);
+                               tcp_established(tp);
                        }
                } else {
                        /*
@@ -1529,7 +1587,6 @@ after_listen:
                        tp->t_state = TCPS_SYN_RECEIVED;
                }
 
-trimthenstep6:
                /*
                 * Advance th->th_seq to correspond to first data byte.
                 * If data, trim to stay within window,
@@ -1662,7 +1719,6 @@ trimthenstep6:
         */
        if ((to.to_flags & TOF_TS) && tp->ts_recent != 0 &&
            TSTMP_LT(to.to_tsval, tp->ts_recent)) {
-
                /* Check to see if ts_recent is over 24 days old.  */
                if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
                        /*
@@ -1677,6 +1733,39 @@ trimthenstep6:
                         * dropped when ts_recent is old.
                         */
                        tp->ts_recent = 0;
+               } else if (tcp_paws_tolerance && tlen != 0 &&
+                   tp->t_state == TCPS_ESTABLISHED &&
+                   (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK&&
+                   !(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)) &&
+                   th->th_ack == tp->snd_una &&
+                   tiwin == tp->snd_wnd &&
+                   TSTMP_GEQ(to.to_tsval + tcp_paws_tolerance, tp->ts_recent)&&
+                   (th->th_seq == tp->rcv_nxt ||
+                    (SEQ_GT(th->th_seq, tp->rcv_nxt) && 
+                     tcp_paws_canreasslast(tp, th, tlen)))) {
+                       /*
+                        * This tends to prevent valid new segments from being
+                        * dropped by the reordered segments sent by the fast
+                        * retransmission algorithm on the sending side, i.e.
+                        * the fast retransmitted segment w/ larger timestamp
+                        * arrives earlier than the previously sent new segments
+                        * w/ smaller timestamp.
+                        *
+                        * If the following conditions are met, the segment is
+                        * accepted:
+                        * - The segment contains data
+                        * - The connection is established
+                        * - The header does not contain important flags
+                        * - SYN or FIN is not needed
+                        * - It does not acknowledge new data
+                        * - Receive window is not changed
+                        * - The timestamp is within "acceptable" range
+                        * - The new segment is what we are expecting or
+                        *   the new segment could be merged w/ the last
+                        *   pending segment on the reassembly queue
+                        */
+                       tcpstat.tcps_pawsaccept++;
                } else {
                        tcpstat.tcps_rcvduppack++;
                        tcpstat.tcps_rcvdupbyte += tlen;
@@ -1704,12 +1793,12 @@ trimthenstep6:
                if (TCP_DO_SACK(tp)) {
                        /* Report duplicate segment at head of packet. */
                        tp->reportblk.rblk_start = th->th_seq;
-                       tp->reportblk.rblk_end = th->th_seq + tlen;
-                       if (thflags & TH_FIN)
-                               ++tp->reportblk.rblk_end;
+                       tp->reportblk.rblk_end = TCP_SACK_BLKEND(
+                           th->th_seq + tlen, thflags);
                        if (SEQ_GT(tp->reportblk.rblk_end, tp->rcv_nxt))
                                tp->reportblk.rblk_end = tp->rcv_nxt;
-                       tp->t_flags |= (TF_DUPSEG | TF_SACKLEFT | TF_ACKNOW);
+                       tp->sack_flags |= (TSACK_F_DUPSEG | TSACK_F_SACKLEFT);
+                       tp->t_flags |= TF_ACKNOW;
                }
                if (thflags & TH_SYN) {
                        thflags &= ~TH_SYN;
@@ -1871,10 +1960,8 @@ trimthenstep6:
                soisconnected(so);
                /* Do window scaling? */
                if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
-                   (TF_RCVD_SCALE | TF_REQ_SCALE)) {
-                       tp->snd_scale = tp->requested_s_scale;
+                   (TF_RCVD_SCALE | TF_REQ_SCALE))
                        tp->rcv_scale = tp->request_r_scale;
-               }
                /*
                 * Make transitions:
                 *      SYN-RECEIVED  -> ESTABLISHED
@@ -1885,10 +1972,7 @@ trimthenstep6:
                        tp->t_state = TCPS_FIN_WAIT_1;
                        tp->t_flags &= ~TF_NEEDFIN;
                } else {
-                       tp->t_state = TCPS_ESTABLISHED;
-                       tcp_callout_reset(tp, tp->tt_keep,
-                                         tcp_getkeepidle(tp),
-                                         tcp_timer_keep);
+                       tcp_established(tp);
                }
                /*
                 * If segment contains data or ACK, will call tcp_reass()
@@ -1917,153 +2001,30 @@ trimthenstep6:
                if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
                        if (TCP_DO_SACK(tp))
                                tcp_sack_update_scoreboard(tp, &to);
-                       if (tlen != 0 || tiwin != tp->snd_wnd) {
-                               tp->t_dupacks = 0;
-                               break;
-                       }
-                       tcpstat.tcps_rcvdupack++;
                        if (!tcp_callout_active(tp, tp->tt_rexmt) ||
                            th->th_ack != tp->snd_una) {
+                               if (tlen == 0 && tiwin == tp->snd_wnd)
+                                       tcpstat.tcps_rcvdupack++;
                                tp->t_dupacks = 0;
                                break;
                        }
-                       /*
-                        * We have outstanding data (other than
-                        * a window probe), this is a completely
-                        * duplicate ack (ie, window info didn't
-                        * change), the ack is the biggest we've
-                        * seen and we've seen exactly our rexmt
-                        * threshhold of them, so assume a packet
-                        * has been dropped and retransmit it.
-                        * Kludge snd_nxt & the congestion
-                        * window so we send only this one
-                        * packet.
-                        */
-                       if (IN_FASTRECOVERY(tp)) {
-                               if (TCP_DO_SACK(tp)) {
-                                       /* No artifical cwnd inflation. */
-                                       tcp_sack_rexmt(tp, th);
+                       if (tlen != 0 || tiwin != tp->snd_wnd) {
+                               if (!tcp_do_rfc3517bis ||
+                                   !TCP_DO_SACK(tp) ||
+                                   (to.to_flags &
+                                    (TOF_SACK | TOF_SACK_REDUNDANT))
+                                    != TOF_SACK) {
+                                       tp->t_dupacks = 0;
                                } else {
-                                       /*
-                                        * Dup acks mean that packets
-                                        * have left the network
-                                        * (they're now cached at the
-                                        * receiver) so bump cwnd by
-                                        * the amount in the receiver
-                                        * to keep a constant cwnd
-                                        * packets in the network.
-                                        */
-                                       tp->snd_cwnd += tp->t_maxseg;
-                                       tcp_output(tp);
+                                       delayed_dupack = TRUE;
+                                       th_dupack = th->th_ack;
                                }
-                       } else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
-                               tp->t_dupacks = 0;
                                break;
-                       } else if (++tp->t_dupacks == tcprexmtthresh) {
-                               tcp_seq old_snd_nxt;
-                               u_int win;
-
-fastretransmit:
-                               if (tcp_do_eifel_detect &&
-                                   (tp->t_flags & TF_RCVD_TSTMP)) {
-                                       tcp_save_congestion_state(tp);
-                                       tp->t_flags |= TF_FASTREXMT;
-                               }
-                               /*
-                                * We know we're losing at the current
-                                * window size, so do congestion avoidance:
-                                * set ssthresh to half the current window
-                                * and pull our congestion window back to the
-                                * new ssthresh.
-                                */
-                               win = min(tp->snd_wnd, tp->snd_cwnd) / 2 /
-                                   tp->t_maxseg;
-                               if (win < 2)
-                                       win = 2;
-                               tp->snd_ssthresh = win * tp->t_maxseg;
-                               ENTER_FASTRECOVERY(tp);
-                               tp->snd_recover = tp->snd_max;
-                               tcp_callout_stop(tp, tp->tt_rexmt);
-                               tp->t_rtttime = 0;
-                               old_snd_nxt = tp->snd_nxt;
-                               tp->snd_nxt = th->th_ack;
-                               tp->snd_cwnd = tp->t_maxseg;
-                               tcp_output(tp);
-                               ++tcpstat.tcps_sndfastrexmit;
-                               tp->snd_cwnd = tp->snd_ssthresh;
-                               tp->rexmt_high = tp->snd_nxt;
-                               if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
-                                       tp->snd_nxt = old_snd_nxt;
-                               KASSERT(tp->snd_limited <= 2,
-                                   ("tp->snd_limited too big"));
-                               if (TCP_DO_SACK(tp))
-                                       tcp_sack_rexmt(tp, th);
-                               else
-                                       tp->snd_cwnd += tp->t_maxseg *
-                                           (tp->t_dupacks - tp->snd_limited);
-                       } else if (tcp_do_limitedtransmit) {
-                               u_long oldcwnd = tp->snd_cwnd;
-                               tcp_seq oldsndmax = tp->snd_max;
-                               tcp_seq oldsndnxt = tp->snd_nxt;
-                               /* outstanding data */
-                               uint32_t ownd = tp->snd_max - tp->snd_una;
-                               u_int sent;
-
-#define        iceildiv(n, d)          (((n)+(d)-1) / (d))
-
-                               KASSERT(tp->t_dupacks == 1 ||
-                                       tp->t_dupacks == 2,
-                                   ("dupacks not 1 or 2"));
-                               if (tp->t_dupacks == 1)
-                                       tp->snd_limited = 0;
-                               tp->snd_nxt = tp->snd_max;
-                               tp->snd_cwnd = ownd +
-                                   (tp->t_dupacks - tp->snd_limited) *
-                                   tp->t_maxseg;
-                               tcp_output(tp);
-
-                               /*
-                                * Other acks may have been processed,
-                                * snd_nxt cannot be reset to a value less
-                                * then snd_una.
-                                */
-                               if (SEQ_LT(oldsndnxt, oldsndmax)) {
-                                   if (SEQ_GT(oldsndnxt, tp->snd_una))
-                                       tp->snd_nxt = oldsndnxt;
-                                   else
-                                       tp->snd_nxt = tp->snd_una;
-                               }
-                               tp->snd_cwnd = oldcwnd;
-                               sent = tp->snd_max - oldsndmax;
-                               if (sent > tp->t_maxseg) {
-                                       KASSERT((tp->t_dupacks == 2 &&
-                                                tp->snd_limited == 0) ||
-                                               (sent == tp->t_maxseg + 1 &&
-                                                tp->t_flags & TF_SENTFIN),
-                                           ("sent too much"));
-                                       KASSERT(sent <= tp->t_maxseg * 2,
-                                           ("sent too many segments"));
-                                       tp->snd_limited = 2;
-                                       tcpstat.tcps_sndlimited += 2;
-                               } else if (sent > 0) {
-                                       ++tp->snd_limited;
-                                       ++tcpstat.tcps_sndlimited;
-                               } else if (tcp_do_early_retransmit &&
-                                   (tcp_do_eifel_detect &&
-                                    (tp->t_flags & TF_RCVD_TSTMP)) &&
-                                   ownd < 4 * tp->t_maxseg &&
-                                   tp->t_dupacks + 1 >=
-                                     iceildiv(ownd, tp->t_maxseg) &&
-                                   (!TCP_DO_SACK(tp) ||
-                                    ownd <= tp->t_maxseg ||
-                                    tcp_sack_has_sacked(&tp->scb,
-                                                       ownd - tp->t_maxseg))) {
-                                       ++tcpstat.tcps_sndearlyrexmit;
-                                       tp->t_flags |= TF_EARLYREXMT;
-                                       goto fastretransmit;
-                               }
                        }
-                       goto drop;
+                       if (tcp_fast_recovery(tp, th->th_ack, &to))
+                               goto drop;
+                       else
+                               break;
                }
 
                KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una"));
@@ -2095,10 +2056,8 @@ fastretransmit:
                        tp->snd_una++;
                        /* Do window scaling? */
                        if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
-                           (TF_RCVD_SCALE | TF_REQ_SCALE)) {
-                               tp->snd_scale = tp->requested_s_scale;
+                           (TF_RCVD_SCALE | TF_REQ_SCALE))
                                tp->rcv_scale = tp->request_r_scale;
-                       }
                }
 
 process_ACK:
@@ -2108,12 +2067,12 @@ process_ACK:
 
                if (tcp_do_eifel_detect && acked > 0 &&
                    (to.to_flags & TOF_TS) && (to.to_tsecr != 0) &&
-                   (tp->t_flags & TF_FIRSTACCACK)) {
+                   (tp->rxt_flags & TRXT_F_FIRSTACCACK)) {
                        /* Eifel detection applicable. */
                        if (to.to_tsecr < tp->t_rexmtTS) {
                                ++tcpstat.tcps_eifeldetected;
                                tcp_revert_congestion_state(tp);
-                               if (tp->t_rxtshift == 1 &&
+                               if (tp->t_rxtshift != 1 ||
                                    ticks >= tp->t_badrxtwin)
                                        ++tcpstat.tcps_rttcantdetect;
                        }
@@ -2144,9 +2103,9 @@ process_ACK:
                 * timestamps of 0.
                 */
                if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0))
-                       tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
+                       tcp_xmit_timer(tp, ticks - to.to_tsecr + 1, th->th_ack);
                else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
-                       tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+                       tcp_xmit_timer(tp, ticks - tp->t_rtttime, th->th_ack);
                tcp_xmit_bandwidth_limit(tp, th->th_ack);
 
                /*
@@ -2157,7 +2116,8 @@ process_ACK:
                        goto step6;
 
                /* Stop looking for an acceptable ACK since one was received. */
-               tp->t_flags &= ~(TF_FIRSTACCACK | TF_FASTREXMT | TF_EARLYREXMT);
+               tp->rxt_flags &= ~(TRXT_F_FIRSTACCACK |
+                   TRXT_F_FASTREXMT | TRXT_F_EARLYREXMT);
 
                if (acked > so->so_snd.ssb_cc) {
                        tp->snd_wnd -= so->so_snd.ssb_cc;
@@ -2172,13 +2132,8 @@ process_ACK:
 
                /*
                 * Update window information.
-                * Don't look at window if no ACK:
-                * TAC's send garbage on first SYN.
                 */
-               if (SEQ_LT(tp->snd_wl1, th->th_seq) ||
-                   (tp->snd_wl1 == th->th_seq &&
-                    (SEQ_LT(tp->snd_wl2, th->th_ack) ||
-                     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)))) {
+               if (acceptable_window_update(tp, th, tiwin)) {
                        /* keep track of pure window updates */
                        if (tlen == 0 && tp->snd_wl2 == th->th_ack &&
                            tiwin > tp->snd_wnd)
@@ -2223,7 +2178,7 @@ process_ACK:
                        } else {
                                if (TCP_DO_SACK(tp)) {
                                        tp->snd_max_rexmt = tp->snd_max;
-                                       tcp_sack_rexmt(tp, th);
+                                       tcp_sack_rexmt(tp);
                                } else {
                                        tcp_newreno_partial_ack(tp, th, acked);
                                }
@@ -2442,7 +2397,7 @@ dodata:                                                   /* XXX */
                 * fast retransmit can work).
                 */
                if (th->th_seq == tp->rcv_nxt &&
-                   LIST_EMPTY(&tp->t_segq) &&
+                   TAILQ_EMPTY(&tp->t_segq) &&
                    TCPS_HAVEESTABLISHED(tp->t_state)) {
                        if (DELAY_ACK(tp)) {
                                tcp_callout_reset(tp, tp->tt_delack,
@@ -2464,11 +2419,11 @@ dodata:                                                 /* XXX */
                        }
                        sorwakeup(so);
                } else {
-                       if (!(tp->t_flags & TF_DUPSEG)) {
+                       if (!(tp->sack_flags & TSACK_F_DUPSEG)) {
                                /* Initialize SACK report block. */
                                tp->reportblk.rblk_start = th->th_seq;
-                               tp->reportblk.rblk_end = th->th_seq + tlen +
-                                   ((thflags & TH_FIN) != 0);
+                               tp->reportblk.rblk_end = TCP_SACK_BLKEND(
+                                   th->th_seq + tlen, thflags);
                        }
                        thflags = tcp_reass(tp, th, &tlen, m);
                        tp->t_flags |= TF_ACKNOW;
@@ -2556,11 +2511,18 @@ dodata:                                                 /* XXX */
                tcp_trace(TA_INPUT, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 
+       /*
+        * Delayed duplicate ACK processing
+        */
+       if (delayed_dupack && tcp_fast_recovery(tp, th_dupack, &to))
+               needoutput = FALSE;
+
        /*
         * Return any desired output.
         */
        if (needoutput || (tp->t_flags & TF_ACKNOW))
                tcp_output(tp);
+       tcp_sack_report_cleanup(tp);
        return(IPPROTO_DONE);
 
 dropafterack:
@@ -2592,6 +2554,7 @@ dropafterack:
        m_freem(m);
        tp->t_flags |= TF_ACKNOW;
        tcp_output(tp);
+       tcp_sack_report_cleanup(tp);
        return(IPPROTO_DONE);
 
 dropwithreset:
@@ -2638,6 +2601,8 @@ dropwithreset:
                tcp_respond(tp, mtod(m, void *), th, m, th->th_seq + tlen,
                            (tcp_seq)0, TH_RST | TH_ACK);
        }
+       if (tp != NULL)
+               tcp_sack_report_cleanup(tp);
        return(IPPROTO_DONE);
 
 drop:
@@ -2649,6 +2614,8 @@ drop:
                tcp_trace(TA_DROP, ostate, tp, tcp_saveipgen, &tcp_savetcp, 0);
 #endif
        m_freem(m);
+       if (tp != NULL)
+               tcp_sack_report_cleanup(tp);
        return(IPPROTO_DONE);
 }
 
@@ -2656,7 +2623,8 @@ drop:
  * Parse TCP options and place in tcpopt.
  */
 static void
-tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn)
+tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn,
+    tcp_seq ack)
 {
        int opt, optlen, i;
 
@@ -2725,7 +2693,23 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn)
 
                                r->rblk_start = ntohl(r->rblk_start);
                                r->rblk_end = ntohl(r->rblk_end);
+
+                               if (SEQ_LEQ(r->rblk_end, r->rblk_start)) {
+                                       /*
+                                        * Invalid SACK block; discard all
+                                        * SACK blocks
+                                        */
+                                       tcpstat.tcps_rcvbadsackopt++;
+                                       to->to_nsackblocks = 0;
+                                       to->to_sackblocks = NULL;
+                                       to->to_flags &= ~TOF_SACK;
+                                       break;
+                               }
                        }
+                       if ((to->to_flags & TOF_SACK) &&
+                           tcp_sack_ndsack_blocks(to->to_sackblocks,
+                           to->to_nsackblocks, ack))
+                               to->to_flags |= TOF_DSACK;
                        break;
 #ifdef TCP_SIGNATURE
                /*
@@ -2784,13 +2768,35 @@ tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off)
  * and update averages and current timeout.
  */
 static void
-tcp_xmit_timer(struct tcpcb *tp, int rtt)
+tcp_xmit_timer(struct tcpcb *tp, int rtt, tcp_seq ack)
 {
-       int delta;
+       int rebaserto = 0;
 
        tcpstat.tcps_rttupdated++;
        tp->t_rttupdated++;
-       if (tp->t_srtt != 0) {
+       if ((tp->rxt_flags & TRXT_F_REBASERTO) &&
+           SEQ_GT(ack, tp->snd_max_prev)) {
+#ifdef DEBUG_EIFEL_RESPONSE
+               kprintf("srtt/rttvar, prev %d/%d, cur %d/%d, ",
+                   tp->t_srtt_prev, tp->t_rttvar_prev,
+                   tp->t_srtt, tp->t_rttvar);
+#endif
+
+               tcpstat.tcps_eifelresponse++;
+               rebaserto = 1;
+               tp->rxt_flags &= ~TRXT_F_REBASERTO;
+               tp->t_srtt = max(tp->t_srtt_prev, (rtt << TCP_RTT_SHIFT));
+               tp->t_rttvar = max(tp->t_rttvar_prev,
+                   (rtt << (TCP_RTTVAR_SHIFT - 1)));
+               if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+                       tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
+
+#ifdef DEBUG_EIFEL_RESPONSE
+               kprintf("new %d/%d ", tp->t_srtt, tp->t_rttvar);
+#endif
+       } else if (tp->t_srtt != 0) {
+               int delta;
+
                /*
                 * srtt is stored as fixed point with 5 bits after the
                 * binary point (i.e., scaled by 8).  The following magic
@@ -2834,6 +2840,13 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
        tp->t_rtttime = 0;
        tp->t_rxtshift = 0;
 
+#ifdef DEBUG_EIFEL_RESPONSE
+       if (rebaserto) {
+               kprintf("| rxtcur prev %d, old %d, ",
+                   tp->t_rxtcur_prev, tp->t_rxtcur);
+       }
+#endif
+
        /*
         * the retransmit should happen at rtt + 4 * rttvar.
         * Because of the way we do the smoothing, srtt and rttvar
@@ -2848,6 +2861,30 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
        TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
                      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
 
+       if (rebaserto) {
+               if (tp->t_rxtcur < tp->t_rxtcur_prev + tcp_eifel_rtoinc) {
+                       /*
+                        * RFC4015 requires that the new RTO is at least
+                        * 2*G (tcp_eifel_rtoinc) greater than the RTO
+                        * (t_rxtcur_prev) when the spurious retransmit
+                        * timeout happens.
+                        *
+                        * The above condition could be true if the SRTT
+                        * and RTTVAR used to calculate t_rxtcur_prev
+                        * resulted in a value less than t_rttmin.  So
+                        * simply increasing SRTT by tcp_eifel_rtoinc when
+                        * preparing for the Eifel response in
+                        * tcp_save_congestion_state() could not ensure
+                        * that the new RTO will be tcp_eifel_rtoinc greater
+                        * than t_rxtcur_prev.
+                        */
+                       tp->t_rxtcur = tp->t_rxtcur_prev + tcp_eifel_rtoinc;
+               }
+#ifdef DEBUG_EIFEL_RESPONSE
+               kprintf("new %d\n", tp->t_rxtcur);
+#endif
+       }
+
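A worked sketch of the clamp above, with illustrative numbers (not from the
commit):

    #include <stdio.h>

    int
    main(void)
    {
            /* spurious RTO fired with old RTO 300 ticks; the rebased
             * SRTT/RTTVAR make TCP_REXMTVAL() come out at 250 ticks */
            int rxtcur_prev = 300, rxtcur = 250, eifel_rtoinc = 2;

            if (rxtcur < rxtcur_prev + eifel_rtoinc)
                    rxtcur = rxtcur_prev + eifel_rtoinc;
            printf("%d\n", rxtcur);  /* 302: at least old RTO + 2*G */
            return (0);
    }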
        /*
         * We received an ack for a packet that wasn't retransmitted;
         * it is probably safe to discard any error indications we've
@@ -3007,11 +3044,11 @@ tcp_mss(struct tcpcb *tp, int offer)
                mss -= TCPOLEN_TSTAMP_APPA;
 
 #if    (MCLBYTES & (MCLBYTES - 1)) == 0
-               if (mss > MCLBYTES)
-                       mss &= ~(MCLBYTES-1);
+       if (mss > MCLBYTES)
+               mss &= ~(MCLBYTES-1);
 #else
-               if (mss > MCLBYTES)
-                       mss = mss / MCLBYTES * MCLBYTES;
+       if (mss > MCLBYTES)
+               mss = mss / MCLBYTES * MCLBYTES;
 #endif
        /*
         * If there's a pipesize, change the socket buffer
@@ -3050,13 +3087,11 @@ tcp_mss(struct tcpcb *tp, int offer)
        }
 
        /*
-        * Set the slow-start flight size depending on whether this
-        * is a local network or not.
+        * Set the slow-start flight size
+        *
+        * NOTE: t_maxseg must have been configured!
         */
-       if (tcp_do_rfc3390)
-               tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
-       else
-               tp->snd_cwnd = mss;
+       tp->snd_cwnd = tcp_initial_window(tp);
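tcp_initial_window() is introduced elsewhere in this commit series; judging
from the removed lines it presumably centralizes the RFC 3390 computation,
roughly the following sketch (min/max as in the kernel's <sys/param.h>):

    /* Sketch of the RFC 3390 initial window the removed code computed. */
    static u_long
    initial_window_rfc3390(u_long mss)
    {
            return (min(4 * mss, max(2 * mss, 4380)));
    }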
 
        if (rt->rt_rmx.rmx_ssthresh) {
                /*
@@ -3134,51 +3169,134 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th, int acked)
  * except when retransmitting snd_una.
  */
 static void
-tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
+tcp_sack_rexmt(struct tcpcb *tp)
 {
-       uint32_t pipe, seglen;
-       tcp_seq nextrexmt;
-       boolean_t lostdup;
        tcp_seq old_snd_nxt = tp->snd_nxt;
        u_long ocwnd = tp->snd_cwnd;
+       uint32_t pipe;
        int nseg = 0;           /* consecutive new segments */
+       int nseg_rexmt = 0;     /* retransmitted segments */
 #define MAXBURST 4             /* limit burst of new packets on partial ack */
 
        tp->t_rtttime = 0;
        pipe = tcp_sack_compute_pipe(tp);
        while ((tcp_seq_diff_t)(ocwnd - pipe) >= (tcp_seq_diff_t)tp->t_maxseg &&
-           (!tcp_do_smartsack || nseg < MAXBURST) &&
-           tcp_sack_nextseg(tp, &nextrexmt, &seglen, &lostdup)) {
-               uint32_t sent;
-               tcp_seq old_snd_max;
+           (!tcp_do_smartsack || nseg < MAXBURST)) {
+               tcp_seq old_snd_max, old_rexmt_high, nextrexmt;
+               uint32_t sent, seglen;
+               boolean_t rescue;
                int error;
 
+               old_rexmt_high = tp->rexmt_high;
+               if (!tcp_sack_nextseg(tp, &nextrexmt, &seglen, &rescue)) {
+                       tp->rexmt_high = old_rexmt_high;
+                       break;
+               }
+
+               /*
+                * If the next transmission is a rescue retransmission,
+                * we check whether we have already sent some data
+                * (either new segments or retransmitted segments)
+                * into the network or not.  Since the idea of rescue
+                * retransmission is to sustain the ACK clock, as long as
+                * some segments are in the network, the ACK clock will be
+                * kept ticking.
+                */
+               if (rescue && (nseg_rexmt > 0 || nseg > 0)) {
+                       tp->rexmt_high = old_rexmt_high;
+                       break;
+               }
+
                if (nextrexmt == tp->snd_max)
                        ++nseg;
+               else
+                       ++nseg_rexmt;
                tp->snd_nxt = nextrexmt;
                tp->snd_cwnd = nextrexmt - tp->snd_una + seglen;
                old_snd_max = tp->snd_max;
                if (nextrexmt == tp->snd_una)
                        tcp_callout_stop(tp, tp->tt_rexmt);
                error = tcp_output(tp);
-               if (error != 0)
+               if (error != 0) {
+                       tp->rexmt_high = old_rexmt_high;
                        break;
+               }
                sent = tp->snd_nxt - nextrexmt;
-               if (sent <= 0)
+               if (sent <= 0) {
+                       tp->rexmt_high = old_rexmt_high;
                        break;
-               if (!lostdup)
-                       pipe += sent;
+               }
+               pipe += sent;
                tcpstat.tcps_sndsackpack++;
                tcpstat.tcps_sndsackbyte += sent;
+
+               if (rescue) {
+                       tcpstat.tcps_sackrescue++;
+                       tp->rexmt_rescue = tp->snd_nxt;
+                       tp->sack_flags |= TSACK_F_SACKRESCUED;
+                       break;
+               }
                if (SEQ_LT(nextrexmt, old_snd_max) &&
-                   SEQ_LT(tp->rexmt_high, tp->snd_nxt))
+                   SEQ_LT(tp->rexmt_high, tp->snd_nxt)) {
                        tp->rexmt_high = seq_min(tp->snd_nxt, old_snd_max);
+                       if (tcp_aggressive_rescuesack &&
+                           (tp->sack_flags & TSACK_F_SACKRESCUED) &&
+                           SEQ_LT(tp->rexmt_rescue, tp->rexmt_high)) {
+                               /* Drag RescueRxt along with HighRxt */
+                               tp->rexmt_rescue = tp->rexmt_high;
+                       }
+               }
        }
        if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
                tp->snd_nxt = old_snd_nxt;
        tp->snd_cwnd = ocwnd;
 }
 
+/*
+ * Returns TRUE if some new segments were sent
+ */
+static boolean_t
+tcp_sack_limitedxmit(struct tcpcb *tp)
+{
+       tcp_seq oldsndnxt = tp->snd_nxt;
+       tcp_seq oldsndmax = tp->snd_max;
+       u_long ocwnd = tp->snd_cwnd;
+       uint32_t pipe, sent;
+       boolean_t ret = FALSE;
+       tcp_seq_diff_t cwnd_left;
+       tcp_seq next;
+
+       tp->rexmt_high = tp->snd_una - 1;
+       pipe = tcp_sack_compute_pipe(tp);
+       cwnd_left = (tcp_seq_diff_t)(ocwnd - pipe);
+       if (cwnd_left < (tcp_seq_diff_t)tp->t_maxseg)
+               return FALSE;
+
+       next = tp->snd_nxt = tp->snd_max;
+       tp->snd_cwnd = tp->snd_nxt - tp->snd_una +
+           rounddown(cwnd_left, tp->t_maxseg);
+
+       tcp_output(tp);
+
+       sent = tp->snd_nxt - next;
+       if (sent > 0) {
+               tcpstat.tcps_sndlimited += howmany(sent, tp->t_maxseg);
+               ret = TRUE;
+       }
+
+       if (SEQ_LT(oldsndnxt, oldsndmax)) {
+               KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
+                   ("snd_una moved in other threads"));
+               tp->snd_nxt = oldsndnxt;
+       }
+       tp->snd_cwnd = ocwnd;
+
+       if (ret && TCP_DO_NCR(tp))
+               tcp_ncr_update_rxtthresh(tp);
+
+       return ret;
+}
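
The headroom arithmetic in tcp_sack_limitedxmit() above can be checked in
isolation: the congestion window left over after subtracting the
SACK-computed pipe is rounded down to whole segments, and nothing is sent
unless at least one full segment fits.  A standalone sketch (illustrative
only; limitedxmit_budget and the ROUNDDOWN macro are stand-ins for the
kernel's rounddown()):

    #include <stdint.h>
    #include <stdio.h>

    #define ROUNDDOWN(x, y) (((x) / (y)) * (y))   /* mirrors rounddown() */

    static uint32_t
    limitedxmit_budget(uint32_t cwnd, uint32_t pipe, uint32_t maxseg)
    {
            /* Signed, so a pipe larger than cwnd yields a negative value. */
            int32_t cwnd_left = (int32_t)(cwnd - pipe);

            /* Not even one full segment of headroom: send nothing. */
            if (cwnd_left < (int32_t)maxseg)
                    return 0;
            return ROUNDDOWN((uint32_t)cwnd_left, maxseg);
    }

    int
    main(void)
    {
            /* cwnd 10000, 4500 bytes in pipe, 1460-byte segments -> 4380 */
            printf("%u\n", limitedxmit_budget(10000, 4500, 1460));
            /* only 1000 bytes of headroom, less than one segment -> 0 */
            printf("%u\n", limitedxmit_budget(10000, 9000, 1460));
            return 0;
    }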
+
 /*
  * Reset idle time and keep-alive timer, typically called when a valid
  * tcp packet is received but may also be called when FASTKEEP is set
@@ -3204,7 +3322,7 @@ tcp_timer_keep_activity(struct tcpcb *tp, int thflags)
                        tp->t_rcvtime = ticks;
                        tp->t_flags &= ~TF_KEEPALIVE;
                        tcp_callout_reset(tp, tp->tt_keep,
-                                         tcp_getkeepidle(tp),
+                                         tp->t_keepidle,
                                          tcp_timer_keep);
                }
        }
@@ -3235,3 +3353,199 @@ tcp_rmx_msl(const struct tcpcb *tp)
 
        return msl;
 }
+
+static void
+tcp_established(struct tcpcb *tp)
+{
+       tp->t_state = TCPS_ESTABLISHED;
+       tcp_callout_reset(tp, tp->tt_keep, tp->t_keepidle, tcp_timer_keep);
+
+       if (tp->t_rxtsyn > 0) {
+               /*
+                * RFC6298:
+                * "If the timer expires awaiting the ACK of a SYN segment
+                *  and the TCP implementation is using an RTO less than 3
+                *  seconds, the RTO MUST be re-initialized to 3 seconds
+                *  when data transmission begins"
+                */
+               if (tp->t_rxtcur < TCPTV_RTOBASE3)
+                       tp->t_rxtcur = TCPTV_RTOBASE3;
+       }
+}
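
To see the RFC6298 clamp in tcp_established() numerically, here is a small
userland sketch under the assumption of millisecond units (the kernel works
in ticks and uses TCPTV_RTOBASE3; established_rto is a hypothetical name):

    #include <stdio.h>

    #define RTOBASE3_MS 3000        /* stand-in for TCPTV_RTOBASE3 */

    static int
    established_rto(int rxtsyn, int rxtcur_ms)
    {
            /* SYN was retransmitted: RTO must be at least 3 seconds. */
            if (rxtsyn > 0 && rxtcur_ms < RTOBASE3_MS)
                    return RTOBASE3_MS;
            return rxtcur_ms;
    }

    int
    main(void)
    {
            printf("%d\n", established_rto(1, 1000));  /* 3000: clamped */
            printf("%d\n", established_rto(0, 1000));  /* 1000: unchanged */
            return 0;
    }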
+
+/*
+ * Returns TRUE if the ACK should be dropped.
+ */
+static boolean_t
+tcp_fast_recovery(struct tcpcb *tp, tcp_seq th_ack, const struct tcpopt *to)
+{
+       boolean_t fast_sack_rexmt = TRUE;
+
+       tcpstat.tcps_rcvdupack++;
+
+       /*
+        * We have outstanding data (other than a window probe),
+        * this is a completely duplicate ack (i.e. the window info
+        * didn't change), the ack is the biggest we've seen, and
+        * we've seen exactly our rexmt threshold of them, so
+        * assume a packet has been dropped and retransmit it.
+        * Kludge snd_nxt & the congestion window so we send only
+        * this one packet.
+        */
+       if (IN_FASTRECOVERY(tp)) {
+               if (TCP_DO_SACK(tp)) {
+                       /* No artificial cwnd inflation. */
+                       tcp_sack_rexmt(tp);
+               } else {
+                       /*
+                        * Dup acks mean that packets have left the
+                        * network (they're now cached at the receiver),
+                        * so bump cwnd by the amount now buffered at
+                        * the receiver to keep a constant number of
+                        * packets in the network.
+                        */
+                       tp->snd_cwnd += tp->t_maxseg;
+                       tcp_output(tp);
+               }
+               return TRUE;
+       } else if (SEQ_LT(th_ack, tp->snd_recover)) {
+               tp->t_dupacks = 0;
+               return FALSE;
+       } else if (tcp_ignore_redun_dsack && TCP_DO_SACK(tp) &&
+           (to->to_flags & (TOF_DSACK | TOF_SACK_REDUNDANT)) ==
+           (TOF_DSACK | TOF_SACK_REDUNDANT)) {
+               /*
+                * If the ACK carries a DSACK and the other SACK blocks
+                * carry information that we already know, don't count
+                * this ACK as a duplicate ACK.  This prevents spurious
+                * early retransmit and fast retransmit.  It also meets
+                * the requirement of RFC3042 that new segments should
+                * not be sent if the SACK blocks do not contain new
+                * information (XXX we actually loosen that requirement;
+                * only the DSACK is checked here).
+                *
+                * ACKs of this kind are usually sent after a spurious
+                * retransmit.
+                */
+               /* Do nothing; don't change t_dupacks */
+               return TRUE;
+       } else if (tp->t_dupacks == 0 && TCP_DO_NCR(tp)) {
+               tcp_ncr_update_rxtthresh(tp);
+       }
+
+       if (++tp->t_dupacks == tp->t_rxtthresh) {
+               tcp_seq old_snd_nxt;
+               u_int win;
+
+fastretransmit:
+               if (tcp_do_eifel_detect && (tp->t_flags & TF_RCVD_TSTMP)) {
+                       tcp_save_congestion_state(tp);
+                       tp->rxt_flags |= TRXT_F_FASTREXMT;
+               }
+               /*
+                * We know we're losing at the current window size,
+                * so do congestion avoidance: set ssthresh to half
+                * the current window and pull our congestion window
+                * back to the new ssthresh.
+                */
+               win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+               if (win < 2)
+                       win = 2;
+               tp->snd_ssthresh = win * tp->t_maxseg;
+               ENTER_FASTRECOVERY(tp);
+               tp->snd_recover = tp->snd_max;
+               tcp_callout_stop(tp, tp->tt_rexmt);
+               tp->t_rtttime = 0;
+               old_snd_nxt = tp->snd_nxt;
+               tp->snd_nxt = th_ack;
+               tp->snd_cwnd = tp->t_maxseg;
+               tcp_output(tp);
+               ++tcpstat.tcps_sndfastrexmit;
+               tp->snd_cwnd = tp->snd_ssthresh;
+               tp->rexmt_high = tp->snd_nxt;
+               tp->sack_flags &= ~TSACK_F_SACKRESCUED;
+               if (SEQ_GT(old_snd_nxt, tp->snd_nxt))
+                       tp->snd_nxt = old_snd_nxt;
+               KASSERT(tp->snd_limited <= 2, ("tp->snd_limited too big"));
+               if (TCP_DO_SACK(tp)) {
+                       if (fast_sack_rexmt)
+                               tcp_sack_rexmt(tp);
+               } else {
+                       tp->snd_cwnd += tp->t_maxseg *
+                           (tp->t_dupacks - tp->snd_limited);
+               }
+       } else if ((tcp_do_rfc3517bis && TCP_DO_SACK(tp)) || TCP_DO_NCR(tp)) {
+               /*
+                * RFC3517bis recommends reducing the byte threshold and
+                * entering fast retransmit if IsLost(snd_una).  However,
+                * if we used IsLost(snd_una) based fast retransmit here,
+                * segment reordering would cause spurious retransmits.
+                * So we defer the IsLost(snd_una) based fast retransmit
+                * until extended limited transmit can't send any segments
+                * and early retransmit can't be done.
+                */
+               if (tcp_rfc3517bis_rxt && tcp_do_rfc3517bis &&
+                   tcp_sack_islost(&tp->scb, tp->snd_una))
+                       goto fastretransmit;
+
+               if (tcp_do_limitedtransmit || TCP_DO_NCR(tp)) {
+                       if (!tcp_sack_limitedxmit(tp)) {
+                               /* outstanding data */
+                               uint32_t ownd = tp->snd_max - tp->snd_una;
+
+                               if (need_early_retransmit(tp, ownd)) {
+                                       ++tcpstat.tcps_sndearlyrexmit;
+                                       tp->rxt_flags |= TRXT_F_EARLYREXMT;
+                                       goto fastretransmit;
+                               } else if (tcp_do_rfc3517bis &&
+                                   tcp_sack_islost(&tp->scb, tp->snd_una)) {
+                                       fast_sack_rexmt = FALSE;
+                                       goto fastretransmit;
+                               }
+                       }
+               }
+       } else if (tcp_do_limitedtransmit) {
+               u_long oldcwnd = tp->snd_cwnd;
+               tcp_seq oldsndmax = tp->snd_max;
+               tcp_seq oldsndnxt = tp->snd_nxt;
+               /* outstanding data */
+               uint32_t ownd = tp->snd_max - tp->snd_una;
+               u_int sent;
+
+               KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2,
+                   ("dupacks not 1 or 2"));
+               if (tp->t_dupacks == 1)
+                       tp->snd_limited = 0;
+               tp->snd_nxt = tp->snd_max;
+               tp->snd_cwnd = ownd +
+                   (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg;
+               tcp_output(tp);
+
+               if (SEQ_LT(oldsndnxt, oldsndmax)) {
+                       KASSERT(SEQ_GEQ(oldsndnxt, tp->snd_una),
+                           ("snd_una moved in other threads"));
+                       tp->snd_nxt = oldsndnxt;
+               }
+               tp->snd_cwnd = oldcwnd;
+               sent = tp->snd_max - oldsndmax;
+               if (sent > tp->t_maxseg) {
+                       KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) ||
+                           (sent == tp->t_maxseg + 1 &&
+                            (tp->t_flags & TF_SENTFIN)),
+                           ("sent too much"));
+                       KASSERT(sent <= tp->t_maxseg * 2,
+                           ("sent too many segments"));
+                       tp->snd_limited = 2;
+                       tcpstat.tcps_sndlimited += 2;
+               } else if (sent > 0) {
+                       ++tp->snd_limited;
+                       ++tcpstat.tcps_sndlimited;
+               } else if (need_early_retransmit(tp, ownd)) {
+                       ++tcpstat.tcps_sndearlyrexmit;
+                       tp->rxt_flags |= TRXT_F_EARLYREXMT;
+                       goto fastretransmit;
+               }
+       }
+       return TRUE;
+}
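
For reference, the ssthresh computation at the fastretransmit label reduces
the effective window to half, expressed in whole segments, with a floor of
two segments.  A standalone sketch with two worked values (illustrative
only; fastrexmt_ssthresh is not a kernel function):

    #include <stdio.h>

    static unsigned long
    fastrexmt_ssthresh(unsigned long snd_wnd, unsigned long snd_cwnd,
        unsigned int maxseg)
    {
            unsigned int win;

            /* Half of min(snd_wnd, snd_cwnd), in whole segments. */
            win = (unsigned int)(((snd_wnd < snd_cwnd) ?
                snd_wnd : snd_cwnd) / 2 / maxseg);
            if (win < 2)
                    win = 2;        /* never below two segments */
            return (unsigned long)win * maxseg;
    }

    int
    main(void)
    {
            /* 64KB window, 20-segment cwnd of 1460 bytes -> 14600 */
            printf("%lu\n", fastrexmt_ssthresh(65535, 29200, 1460));
            /* tiny 2000-byte window: the two-segment floor -> 2920 */
            printf("%lu\n", fastrexmt_ssthresh(2000, 29200, 1460));
            return 0;
    }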