kernel - MPSAFE the protocol drain routines
[dragonfly.git] / sys / netinet / tcp_input.c
index d3f951c..3069521 100644 (file)
  *
  *     @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
  * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.38 2003/05/21 04:46:41 cjc Exp $
- * $DragonFly: src/sys/netinet/tcp_input.c,v 1.67 2007/04/22 01:13:14 dillon Exp $
+ * $DragonFly: src/sys/netinet/tcp_input.c,v 1.68 2008/08/22 09:14:17 sephe Exp $
  */
 
 #include "opt_ipfw.h"          /* for ipfw_fwd         */
+#include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
+#include <netinet/tcp_timer2.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
@@ -134,7 +136,6 @@ struct tcphdr tcp_savetcp;
 
 MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
 
-tcp_cc tcp_ccgen;
 static int log_in_vain = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
     &log_in_vain, 0, "Log all incoming TCP connections");
@@ -211,6 +212,20 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
     &tcp_reass_overflows, 0,
     "Global number of TCP Segment Reassembly Queue Overflows");
 
+int tcp_do_autorcvbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+    &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+
+int tcp_autorcvbuf_inc = 16*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+    &tcp_autorcvbuf_inc, 0,
+    "Incrementor step size of automatic receive buffer");
+
+int tcp_autorcvbuf_max = 2*1024*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+    &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+
+
 static void     tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
 static void     tcp_pulloutofband(struct socket *,
                     struct tcphdr *, struct mbuf *, int);
@@ -241,7 +256,7 @@ do { \
  *       the ack that opens up a 0-sized window.
  */
 #define DELAY_ACK(tp) \
-       (tcp_delack_enabled && !callout_pending(tp->tt_delack) && \
+       (tcp_delack_enabled && !tcp_callout_pending(tp, tp->tt_delack) && \
        !(tp->t_flags & TF_RXWIN0SENT))
 
 #define acceptable_window_update(tp, th, tiwin)                                \
@@ -293,7 +308,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                tp->reportblk.rblk_start = tp->reportblk.rblk_end;
                return (0);
        }
-       tcp_reass_qsize++;
+       atomic_add_int(&tcp_reass_qsize, 1);
 
        /*
         * Find a segment which begins after this one does.
@@ -326,7 +341,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                                tcpstat.tcps_rcvdupbyte += *tlenp;
                                m_freem(m);
                                kfree(te, M_TSEGQ);
-                               tcp_reass_qsize--;
+                               atomic_add_int(&tcp_reass_qsize, -1);
                                /*
                                 * Try to present any queued data
                                 * at the left window edge to the user.
@@ -381,7 +396,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                LIST_REMOVE(q, tqe_q);
                m_freem(q->tqe_m);
                kfree(q, M_TSEGQ);
-               tcp_reass_qsize--;
+               atomic_add_int(&tcp_reass_qsize, -1);
                q = nq;
        }
 
@@ -407,7 +422,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                        tp->reportblk.rblk_end = tend;
                LIST_REMOVE(q, tqe_q);
                kfree(q, M_TSEGQ);
-               tcp_reass_qsize--;
+               atomic_add_int(&tcp_reass_qsize, -1);
        }
 
        if (p == NULL) {
@@ -425,9 +440,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
                        if (!(tp->t_flags & TF_DUPSEG))
                                tp->reportblk.rblk_start = p->tqe_th->th_seq;
                        kfree(te, M_TSEGQ);
-                       tcp_reass_qsize--;
-               } else
+                       atomic_add_int(&tcp_reass_qsize, -1);
+               } else {
                        LIST_INSERT_AFTER(p, te, tqe_q);
+               }
        }
 
 present:
@@ -457,7 +473,7 @@ present:
        else
                ssb_appendstream(&so->so_rcv, q->tqe_m);
        kfree(q, M_TSEGQ);
-       tcp_reass_qsize--;
+       atomic_add_int(&tcp_reass_qsize, -1);
        ND6_HINT(tp);
        sorwakeup(so);
        return (flags);
@@ -506,7 +522,8 @@ tcp_input(struct mbuf *m, ...)
        struct inpcb *inp = NULL;
        u_char *optp = NULL;
        int optlen = 0;
-       int len, tlen, off;
+       int tlen, off;
+       int len = 0;
        int drop_hdrlen;
        struct tcpcb *tp = NULL;
        int thflags;
@@ -516,8 +533,6 @@ tcp_input(struct mbuf *m, ...)
        u_long tiwin;
        int recvwin;
        struct tcpopt to;               /* options in this segment */
-       struct rmxp_tao *taop;          /* pointer to our TAO cache entry */
-       struct rmxp_tao tao_noncached;  /* in case there's no cached entry */
        struct sockaddr_in *next_hop = NULL;
        int rstreason; /* For badport_bandlim accounting purposes */
        int cpu;
@@ -538,11 +553,12 @@ tcp_input(struct mbuf *m, ...)
 
        tcpstat.tcps_rcvtotal++;
 
-       /* Grab info from and strip MT_TAG mbufs prepended to the chain. */
-       while  (m->m_type == MT_TAG) {
-               if (m->_m_tag_id == PACKET_TAG_IPFORWARD)
-                       next_hop = (struct sockaddr_in *)m->m_hdr.mh_data;
-               m = m->m_next;
+       if (m->m_pkthdr.fw_flags & IPFORWARD_MBUF_TAGGED) {
+               struct m_tag *mtag;
+
+               mtag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
+               KKASSERT(mtag != NULL);
+               next_hop = m_tag_data(mtag);
        }
 
 #ifdef INET6
@@ -879,13 +895,21 @@ findpcb:
                                        rstreason = BANDLIM_RST_OPENPORT;
                                        goto dropwithreset;
                                }
+
+                               /*
+                                * Could not complete 3-way handshake,
+                                * connection is being closed down, and
+                                * syncache will free mbuf.
+                                */
                                if (so == NULL)
-                                       /*
-                                        * Could not complete 3-way handshake,
-                                        * connection is being closed down, and
-                                        * syncache will free mbuf.
-                                        */
                                        return;
+
+                               /*
+                                * We must be in the correct protocol thread
+                                * for this connection.
+                                */
+                               KKASSERT(so->so_port == &curthread->td_msgport);
+
                                /*
                                 * Socket is created in state SYN_RECEIVED.
                                 * Continue processing segment.
@@ -1009,15 +1033,20 @@ findpcb:
                        tcp_dooptions(&to, optp, optlen, TRUE);
                        if (!syncache_add(&inc, &to, th, &so, m))
                                goto drop;
+
+                       /*
+                        * Entry added to syncache, mbuf used to
+                        * send SYN,ACK packet.
+                        */
                        if (so == NULL)
-                               /*
-                                * Entry added to syncache, mbuf used to
-                                * send SYN,ACK packet.
-                                */
                                return;
+
                        /*
-                        * Segment passed TAO tests.
+                        * We must be in the correct protocol thread for
+                        * this connection.
                         */
+                       KKASSERT(so->so_port == &curthread->td_msgport);
+
                        inp = so->so_pcb;
                        tp = intotcpcb(inp);
                        tp->snd_wnd = tiwin;
@@ -1036,11 +1065,12 @@ findpcb:
                             (tlen != 0 &&
                              ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
                               (!isipv6 && in_localaddr(inp->inp_faddr)))))) {
-                               callout_reset(tp->tt_delack, tcp_delacktime,
-                                               tcp_timer_delack, tp);
+                               tcp_callout_reset(tp, tp->tt_delack,
+                                   tcp_delacktime, tcp_timer_delack);
                                tp->t_flags |= TF_NEEDSYN;
-                       } else
+                       } else {
                                tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
+                       }
 
                        tcpstat.tcps_connects++;
                        soisconnected(so);
@@ -1048,18 +1078,52 @@ findpcb:
                }
                goto drop;
        }
-after_listen:
 
-       /* should not happen - syncache should pick up these connections */
+after_listen:
+       /*
+        * Should not happen - syncache should pick up these connections.
+        *
+        * Once we are past handling listen sockets we must be in the
+        * correct protocol processing thread.
+        */
        KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN state"));
+       KKASSERT(so->so_port == &curthread->td_msgport);
+
+       /*
+        * This is the second part of the MSS DoS prevention code (after
+        * minmss on the sending side) and it deals with too many too small
+        * tcp packets in a too short timeframe (1 second).
+        *
+        * XXX Removed.  This code was crap.  It does not scale to network
+        *     speed, and default values break NFS.  Gone.
+        */
+       /* REMOVED */
 
        /*
         * Segment received on connection.
-        * Reset idle time and keep-alive timer.
+        *
+        * Reset idle time and keep-alive timer.  Don't waste time if less
+        * than a second has elapsed.  Only update t_rcvtime for non-SYN
+        * packets.
+        *
+        * Handle the case where one side thinks the connection is established
+        * but the other side has, say, rebooted without cleaning out the
+        * connection.   The SYNs could be construed as an attack and wind
+        * up ignored, but in case it isn't an attack we can validate the
+        * connection by forcing a keepalive.
         */
-       tp->t_rcvtime = ticks;
-       if (TCPS_HAVEESTABLISHED(tp->t_state))
-               callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
+       if (TCPS_HAVEESTABLISHED(tp->t_state) && (ticks - tp->t_rcvtime) > hz) {
+               if ((thflags & (TH_SYN | TH_ACK)) == TH_SYN) {
+                       tp->t_flags |= TF_KEEPALIVE;
+                       tcp_callout_reset(tp, tp->tt_keep, hz / 2,
+                                         tcp_timer_keep);
+               } else {
+                       tp->t_rcvtime = ticks;
+                       tp->t_flags &= ~TF_KEEPALIVE;
+                       tcp_callout_reset(tp, tp->tt_keep, tcp_keepidle,
+                                         tcp_timer_keep);
+               }
+       }
 
        /*
         * Process options.
@@ -1076,8 +1140,6 @@ after_listen:
                        tp->ts_recent = to.to_tsval;
                        tp->ts_recent_age = ticks;
                }
-               if (to.to_flags & (TOF_CC | TOF_CCNEW))
-                       tp->t_flags |= TF_RCVD_CC;
                if (to.to_flags & TOF_MSS)
                        tcp_mss(tp, to.to_mss);
                /*
@@ -1111,13 +1173,6 @@ after_listen:
            !(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)) &&
            (!(to.to_flags & TOF_TS) ||
             TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
-           /*
-            * Using the CC option is compulsory if once started:
-            *   the segment is OK if no T/TCP was negotiated or
-            *   if the segment has a CC option equal to CCrecv
-            */
-           ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) ||
-            ((to.to_flags & TOF_CC) && to.to_cc == tp->cc_recv)) &&
            th->th_seq == tp->rcv_nxt &&
            tp->snd_nxt == tp->snd_max) {
 
@@ -1214,12 +1269,13 @@ after_listen:
                                 * are ready to send, let tcp_output
                                 * decide between more output or persist.
                                 */
-                               if (tp->snd_una == tp->snd_max)
-                                       callout_stop(tp->tt_rexmt);
-                               else if (!callout_active(tp->tt_persist))
-                                       callout_reset(tp->tt_rexmt,
-                                                     tp->t_rxtcur,
-                                                     tcp_timer_rexmt, tp);
+                               if (tp->snd_una == tp->snd_max) {
+                                       tcp_callout_stop(tp, tp->tt_rexmt);
+                               } else if (!tcp_callout_active(tp,
+                                           tp->tt_persist)) {
+                                       tcp_callout_reset(tp, tp->tt_rexmt,
+                                           tp->t_rxtcur, tcp_timer_rexmt);
+                               }
                                sowwakeup(so);
                                if (so->so_snd.ssb_cc > 0)
                                        tcp_output(tp);
@@ -1229,6 +1285,7 @@ after_listen:
                    th->th_ack == tp->snd_una &&
                    LIST_EMPTY(&tp->t_segq) &&
                    tlen <= ssb_space(&so->so_rcv)) {
+                       u_long newsize = 0;     /* automatic sockbuf scaling */
                        /*
                         * This is a pure, in-sequence data packet
                         * with nothing on the reassembly queue and
@@ -1239,12 +1296,85 @@ after_listen:
                        tcpstat.tcps_rcvpack++;
                        tcpstat.tcps_rcvbyte += tlen;
                        ND6_HINT(tp);   /* some progress has been done */
+               /*
+                * Automatic sizing of receive socket buffer.  Often the send
+                * buffer size is not optimally adjusted to the actual network
+                * conditions at hand (delay bandwidth product).  Setting the
+                * buffer size too small limits throughput on links with high
+                * bandwidth and high delay (eg. trans-continental/oceanic links).
+                *
+                * On the receive side the socket buffer memory is only rarely
+                * used to any significant extent.  This allows us to be much
+                * more aggressive in scaling the receive socket buffer.  For
+                * the case that the buffer space is actually used to a large
+                * extent and we run out of kernel memory we can simply drop
+                * the new segments; TCP on the sender will just retransmit them
+                * later.  Setting the buffer size too big may only consume too
+                * much kernel memory if the application doesn't read() from
+                * the socket or packet loss or reordering makes use of the
+                * reassembly queue.
+                *
+                * The criteria to step up the receive buffer one notch are:
+                *  1. the number of bytes received during the time it takes
+                *     one timestamp to be reflected back to us (the RTT);
+                *  2. received bytes per RTT is within seven eighths of the
+                *     current socket buffer size;
+                *  3. receive buffer size has not hit maximal automatic size;
+                *
+                * This algorithm does one step per RTT at most and only if
+                * we receive a bulk stream w/o packet losses or reorderings.
+                * Shrinking the buffer during idle times is not necessary as
+                * it doesn't consume any memory when idle.
+                *
+                * TODO: Only step up if the application is actually serving
+                * the buffer to better manage the socket buffer resources.
+                */
+                       if (tcp_do_autorcvbuf &&
+                           to.to_tsecr &&
+                           (so->so_rcv.ssb_flags & SSB_AUTOSIZE)) {
+                               if (to.to_tsecr > tp->rfbuf_ts &&
+                                   to.to_tsecr - tp->rfbuf_ts < hz) {
+                                       if (tp->rfbuf_cnt >
+                                           (so->so_rcv.ssb_hiwat / 8 * 7) &&
+                                           so->so_rcv.ssb_hiwat <
+                                           tcp_autorcvbuf_max) {
+                                               newsize =
+                                                   ulmin(so->so_rcv.ssb_hiwat +
+                                                         tcp_autorcvbuf_inc,
+                                                         tcp_autorcvbuf_max);
+                                       }
+                                       /* Start over with next RTT. */
+                                       tp->rfbuf_ts = 0;
+                                       tp->rfbuf_cnt = 0;
+                               } else
+                                       tp->rfbuf_cnt += tlen;  /* add up */
+                       }
                        /*
                         * Add data to socket buffer.
                         */
                        if (so->so_state & SS_CANTRCVMORE) {
                                m_freem(m);
                        } else {
+                               /*
+                                * Set new socket buffer size, give up when
+                                * limit is reached.
+                                *
+                                * Adjusting the size can mess up ACK
+                                * sequencing when pure window updates are
+                                * being avoided (which is the default),
+                                * so force an ack.
+                                */
+                               if (newsize) {
+                                       tp->t_flags |= TF_RXRESIZED;
+                                       if (!ssb_reserve(&so->so_rcv, newsize,
+                                                        so, NULL)) {
+                                               atomic_clear_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE);
+                                       }
+                                       if (newsize >=
+                                           (TCP_MAXWIN << tp->rcv_scale)) {
+                                               atomic_clear_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE);
+                                       }
+                               }
                                m_adj(m, drop_hdrlen); /* delayed header drop */
                                ssb_appendstream(&so->so_rcv, m);
                        }
@@ -1273,8 +1403,8 @@ after_listen:
                         * to turn the feature off.
                         */
                        if (DELAY_ACK(tp)) {
-                               callout_reset(tp->tt_delack, tcp_delacktime,
-                                   tcp_timer_delack, tp);
+                               tcp_callout_reset(tp, tp->tt_delack,
+                                   tcp_delacktime, tcp_timer_delack);
                        } else if (tcp_aggregate_acks) {
                                tp->t_flags |= TF_ACKNOW;
                                if (!(tp->t_flags & TF_ONOUTPUTQ)) {
@@ -1303,6 +1433,10 @@ after_listen:
                recvwin = 0;
        tp->rcv_wnd = imax(recvwin, (int)(tp->rcv_adv - tp->rcv_nxt));
 
+       /* Reset receive buffer auto scaling when not in bulk receive mode. */
+       tp->rfbuf_ts = 0;
+       tp->rfbuf_cnt = 0;
+
        switch (tp->t_state) {
        /*
         * If the state is SYN_RECEIVED:
@@ -1330,28 +1464,11 @@ after_listen:
         *      continue processing rest of data/controls, beginning with URG
         */
        case TCPS_SYN_SENT:
-               if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
-                       taop = &tao_noncached;
-                       bzero(taop, sizeof *taop);
-               }
-
                if ((thflags & TH_ACK) &&
                    (SEQ_LEQ(th->th_ack, tp->iss) ||
                     SEQ_GT(th->th_ack, tp->snd_max))) {
-                       /*
-                        * If we have a cached CCsent for the remote host,
-                        * hence we haven't just crashed and restarted,
-                        * do not send a RST.  This may be a retransmission
-                        * from the other side after our earlier ACK was lost.
-                        * Our new SYN, when it arrives, will serve as the
-                        * needed ACK.
-                        */
-                       if (taop->tao_ccsent != 0)
-                               goto drop;
-                       else {
-                               rstreason = BANDLIM_UNLIMITED;
-                               goto dropwithreset;
-                       }
+                       rstreason = BANDLIM_UNLIMITED;
+                       goto dropwithreset;
                }
                if (thflags & TH_RST) {
                        if (thflags & TH_ACK)
@@ -1361,30 +1478,11 @@ after_listen:
                if (!(thflags & TH_SYN))
                        goto drop;
                tp->snd_wnd = th->th_win;       /* initial send window */
-               tp->cc_recv = to.to_cc;         /* foreign CC */
 
                tp->irs = th->th_seq;
                tcp_rcvseqinit(tp);
                if (thflags & TH_ACK) {
-                       /*
-                        * Our SYN was acked.  If segment contains CC.ECHO
-                        * option, check it to make sure this segment really
-                        * matches our SYN.  If not, just drop it as old
-                        * duplicate, but send an RST if we're still playing
-                        * by the old rules.  If no CC.ECHO option, make sure
-                        * we don't get fooled into using T/TCP.
-                        */
-                       if (to.to_flags & TOF_CCECHO) {
-                               if (tp->cc_send != to.to_ccecho) {
-                                       if (taop->tao_ccsent != 0)
-                                               goto drop;
-                                       else {
-                                               rstreason = BANDLIM_UNLIMITED;
-                                               goto dropwithreset;
-                                       }
-                               }
-                       } else
-                               tp->t_flags &= ~TF_RCVD_CC;
+                       /* Our SYN was acked. */
                        tcpstat.tcps_connects++;
                        soisconnected(so);
                        /* Do window scaling on this connection? */
@@ -1393,22 +1491,19 @@ after_listen:
                                tp->snd_scale = tp->requested_s_scale;
                                tp->rcv_scale = tp->request_r_scale;
                        }
-                       /* Segment is acceptable, update cache if undefined. */
-                       if (taop->tao_ccsent == 0)
-                               taop->tao_ccsent = to.to_ccecho;
-
                        tp->rcv_adv += tp->rcv_wnd;
                        tp->snd_una++;          /* SYN is acked */
-                       callout_stop(tp->tt_rexmt);
+                       tcp_callout_stop(tp, tp->tt_rexmt);
                        /*
                         * If there's data, delay ACK; if there's also a FIN
                         * ACKNOW will be turned on later.
                         */
-                       if (DELAY_ACK(tp) && tlen != 0)
-                               callout_reset(tp->tt_delack, tcp_delacktime,
-                                   tcp_timer_delack, tp);
-                       else
+                       if (DELAY_ACK(tp) && tlen != 0) {
+                               tcp_callout_reset(tp, tp->tt_delack,
+                                   tcp_delacktime, tcp_timer_delack);
+                       } else {
                                tp->t_flags |= TF_ACKNOW;
+                       }
                        /*
                         * Received <SYN,ACK> in SYN_SENT[*] state.
                         * Transitions:
@@ -1422,50 +1517,20 @@ after_listen:
                                thflags &= ~TH_SYN;
                        } else {
                                tp->t_state = TCPS_ESTABLISHED;
-                               callout_reset(tp->tt_keep, tcp_keepidle,
-                                             tcp_timer_keep, tp);
+                               tcp_callout_reset(tp, tp->tt_keep, tcp_keepidle,
+                                   tcp_timer_keep);
                        }
                } else {
                        /*
                         * Received initial SYN in SYN-SENT[*] state =>
-                        * simultaneous open.  If segment contains CC option
-                        * and there is a cached CC, apply TAO test.
-                        * If it succeeds, connection is * half-synchronized.
-                        * Otherwise, do 3-way handshake:
+                        * simultaneous open.
+                        * Do 3-way handshake:
                         *        SYN-SENT -> SYN-RECEIVED
                         *        SYN-SENT* -> SYN-RECEIVED*
-                        * If there was no CC option, clear cached CC value.
                         */
                        tp->t_flags |= TF_ACKNOW;
-                       callout_stop(tp->tt_rexmt);
-                       if (to.to_flags & TOF_CC) {
-                               if (taop->tao_cc != 0 &&
-                                   CC_GT(to.to_cc, taop->tao_cc)) {
-                                       /*
-                                        * update cache and make transition:
-                                        *        SYN-SENT -> ESTABLISHED*
-                                        *        SYN-SENT* -> FIN-WAIT-1*
-                                        */
-                                       taop->tao_cc = to.to_cc;
-                                       tp->t_starttime = ticks;
-                                       if (tp->t_flags & TF_NEEDFIN) {
-                                               tp->t_state = TCPS_FIN_WAIT_1;
-                                               tp->t_flags &= ~TF_NEEDFIN;
-                                       } else {
-                                               tp->t_state = TCPS_ESTABLISHED;
-                                               callout_reset(tp->tt_keep,
-                                                             tcp_keepidle,
-                                                             tcp_timer_keep,
-                                                             tp);
-                                       }
-                                       tp->t_flags |= TF_NEEDSYN;
-                               } else
-                                       tp->t_state = TCPS_SYN_RECEIVED;
-                       } else {
-                               /* CC.NEW or no option => invalidate cache */
-                               taop->tao_cc = 0;
-                               tp->t_state = TCPS_SYN_RECEIVED;
-                       }
+                       tcp_callout_stop(tp, tp->tt_rexmt);
+                       tp->t_state = TCPS_SYN_RECEIVED;
                }
 
 trimthenstep6:
@@ -1499,35 +1564,11 @@ trimthenstep6:
 
        /*
         * If the state is LAST_ACK or CLOSING or TIME_WAIT:
-        *      if segment contains a SYN and CC [not CC.NEW] option:
-        *              if state == TIME_WAIT and connection duration > MSL,
-        *                  drop packet and send RST;
-        *
-        *              if SEG.CC > CCrecv then is new SYN, and can implicitly
-        *                  ack the FIN (and data) in retransmission queue.
-        *                  Complete close and delete TCPCB.  Then reprocess
-        *                  segment, hoping to find new TCPCB in LISTEN state;
-        *
-        *              else must be old SYN; drop it.
-        *      else do normal processing.
+        *      do normal processing (we no longer bother with T/TCP).
         */
        case TCPS_LAST_ACK:
        case TCPS_CLOSING:
        case TCPS_TIME_WAIT:
-               if ((thflags & TH_SYN) &&
-                   (to.to_flags & TOF_CC) && tp->cc_recv != 0) {
-                       if (tp->t_state == TCPS_TIME_WAIT &&
-                                       (ticks - tp->t_starttime) > tcp_msl) {
-                               rstreason = BANDLIM_UNLIMITED;
-                               goto dropwithreset;
-                       }
-                       if (CC_GT(to.to_cc, tp->cc_recv)) {
-                               tp = tcp_close(tp);
-                               goto findpcb;
-                       }
-                       else
-                               goto drop;
-               }
                break;  /* continue normal processing */
        }
 
@@ -1650,16 +1691,6 @@ trimthenstep6:
                }
        }
 
-       /*
-        * T/TCP mechanism
-        *   If T/TCP was negotiated and the segment doesn't have CC,
-        *   or if its CC is wrong then drop the segment.
-        *   RST segments do not have to comply with this.
-        */
-       if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) &&
-           (!(to.to_flags & TOF_CC) || tp->cc_recv != to.to_cc))
-               goto dropafterack;
-
        /*
         * In the SYN-RECEIVED state, validate that the packet belongs to
         * this connection before trimming the data to fit the receive
@@ -1848,15 +1879,6 @@ trimthenstep6:
                        tp->snd_scale = tp->requested_s_scale;
                        tp->rcv_scale = tp->request_r_scale;
                }
-               /*
-                * Upon successful completion of 3-way handshake,
-                * update cache.CC if it was undefined, pass any queued
-                * data to the user, and advance state appropriately.
-                */
-               if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
-                   taop->tao_cc == 0)
-                       taop->tao_cc = tp->cc_recv;
-
                /*
                 * Make transitions:
                 *      SYN-RECEIVED  -> ESTABLISHED
@@ -1868,8 +1890,8 @@ trimthenstep6:
                        tp->t_flags &= ~TF_NEEDFIN;
                } else {
                        tp->t_state = TCPS_ESTABLISHED;
-                       callout_reset(tp->tt_keep, tcp_keepidle,
-                                     tcp_timer_keep, tp);
+                       tcp_callout_reset(tp, tp->tt_keep, tcp_keepidle,
+                           tcp_timer_keep);
                }
                /*
                 * If segment contains data or ACK, will call tcp_reass()
@@ -1903,7 +1925,7 @@ trimthenstep6:
                                break;
                        }
                        tcpstat.tcps_rcvdupack++;
-                       if (!callout_active(tp->tt_rexmt) ||
+                       if (!tcp_callout_active(tp, tp->tt_rexmt) ||
                            th->th_ack != tp->snd_una) {
                                tp->t_dupacks = 0;
                                break;
@@ -1964,7 +1986,7 @@ fastretransmit:
                                tp->snd_ssthresh = win * tp->t_maxseg;
                                ENTER_FASTRECOVERY(tp);
                                tp->snd_recover = tp->snd_max;
-                               callout_stop(tp->tt_rexmt);
+                               tcp_callout_stop(tp, tp->tt_rexmt);
                                tp->t_rtttime = 0;
                                old_snd_nxt = tp->snd_nxt;
                                tp->snd_nxt = th->th_ack;
@@ -2247,11 +2269,12 @@ process_ACK:
                 * timer, using current (possibly backed-off) value.
                 */
                if (th->th_ack == tp->snd_max) {
-                       callout_stop(tp->tt_rexmt);
+                       tcp_callout_stop(tp, tp->tt_rexmt);
                        needoutput = TRUE;
-               } else if (!callout_active(tp->tt_persist))
-                       callout_reset(tp->tt_rexmt, tp->t_rxtcur,
-                                     tcp_timer_rexmt, tp);
+               } else if (!tcp_callout_active(tp, tp->tt_persist)) {
+                       tcp_callout_reset(tp, tp->tt_rexmt, tp->t_rxtcur,
+                           tcp_timer_rexmt);
+               }
 
                switch (tp->t_state) {
                /*
@@ -2270,8 +2293,8 @@ process_ACK:
                                 */
                                if (so->so_state & SS_CANTRCVMORE) {
                                        soisdisconnected(so);
-                                       callout_reset(tp->tt_2msl, tcp_maxidle,
-                                                     tcp_timer_2msl, tp);
+                                       tcp_callout_reset(tp, tp->tt_2msl,
+                                           tcp_maxidle, tcp_timer_2msl);
                                }
                                tp->t_state = TCPS_FIN_WAIT_2;
                        }
@@ -2287,15 +2310,8 @@ process_ACK:
                        if (ourfinisacked) {
                                tp->t_state = TCPS_TIME_WAIT;
                                tcp_canceltimers(tp);
-                               /* Shorten TIME_WAIT [RFC-1644, p.28] */
-                               if (tp->cc_recv != 0 &&
-                                   (ticks - tp->t_starttime) < tcp_msl)
-                                       callout_reset(tp->tt_2msl,
-                                           tp->t_rxtcur * TCPTV_TWTRUNC,
-                                           tcp_timer_2msl, tp);
-                               else
-                                       callout_reset(tp->tt_2msl, 2 * tcp_msl,
-                                           tcp_timer_2msl, tp);
+                               tcp_callout_reset(tp, tp->tt_2msl,
+                                           2 * tcp_msl, tcp_timer_2msl);
                                soisdisconnected(so);
                        }
                        break;
@@ -2319,8 +2335,8 @@ process_ACK:
                 * it and restart the finack timer.
                 */
                case TCPS_TIME_WAIT:
-                       callout_reset(tp->tt_2msl, 2 * tcp_msl,
-                                     tcp_timer_2msl, tp);
+                       tcp_callout_reset(tp, tp->tt_2msl, 2 * tcp_msl,
+                           tcp_timer_2msl);
                        goto dropafterack;
                }
        }
@@ -2430,11 +2446,12 @@ dodata:                                                 /* XXX */
                if (th->th_seq == tp->rcv_nxt &&
                    LIST_EMPTY(&tp->t_segq) &&
                    TCPS_HAVEESTABLISHED(tp->t_state)) {
-                       if (DELAY_ACK(tp))
-                               callout_reset(tp->tt_delack, tcp_delacktime,
-                                             tcp_timer_delack, tp);
-                       else
+                       if (DELAY_ACK(tp)) {
+                               tcp_callout_reset(tp, tp->tt_delack,
+                                   tcp_delacktime, tcp_timer_delack);
+                       } else {
                                tp->t_flags |= TF_ACKNOW;
+                       }
                        tp->rcv_nxt += tlen;
                        thflags = th->th_flags & TH_FIN;
                        tcpstat.tcps_rcvpack++;
@@ -2481,11 +2498,12 @@ dodata:                                                 /* XXX */
                         * Otherwise, since we received a FIN then no
                         * more input can be expected, send ACK now.
                         */
-                       if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN))
-                               callout_reset(tp->tt_delack, tcp_delacktime,
-                                   tcp_timer_delack, tp);
-                       else
+                       if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) {
+                               tcp_callout_reset(tp, tp->tt_delack,
+                                   tcp_delacktime, tcp_timer_delack);
+                       } else {
                                tp->t_flags |= TF_ACKNOW;
+                       }
                        tp->rcv_nxt++;
                }
 
@@ -2517,18 +2535,8 @@ dodata:                                                  /* XXX */
                case TCPS_FIN_WAIT_2:
                        tp->t_state = TCPS_TIME_WAIT;
                        tcp_canceltimers(tp);
-                       /* Shorten TIME_WAIT [RFC-1644, p.28] */
-                       if (tp->cc_recv != 0 &&
-                           (ticks - tp->t_starttime) < tcp_msl) {
-                               callout_reset(tp->tt_2msl,
-                                             tp->t_rxtcur * TCPTV_TWTRUNC,
-                                             tcp_timer_2msl, tp);
-                               /* For transaction client, force ACK now. */
-                               tp->t_flags |= TF_ACKNOW;
-                       }
-                       else
-                               callout_reset(tp->tt_2msl, 2 * tcp_msl,
-                                             tcp_timer_2msl, tp);
+                       tcp_callout_reset(tp, tp->tt_2msl, 2 * tcp_msl,
+                                   tcp_timer_2msl);
                        soisdisconnected(so);
                        break;
 
@@ -2536,8 +2544,8 @@ dodata:                                                   /* XXX */
                 * In TIME_WAIT state restart the 2 MSL time_wait timer.
                 */
                case TCPS_TIME_WAIT:
-                       callout_reset(tp->tt_2msl, 2 * tcp_msl,
-                                     tcp_timer_2msl, tp);
+                       tcp_callout_reset(tp, tp->tt_2msl, 2 * tcp_msl,
+                           tcp_timer_2msl);
                        break;
                }
        }
@@ -2698,31 +2706,6 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn)
                        if (to->to_tsecr != 0 && TSTMP_GT(to->to_tsecr, ticks))
                                to->to_tsecr = 0;
                        break;
-               case TCPOPT_CC:
-                       if (optlen != TCPOLEN_CC)
-                               continue;
-                       to->to_flags |= TOF_CC;
-                       bcopy(cp + 2, &to->to_cc, sizeof to->to_cc);
-                       to->to_cc = ntohl(to->to_cc);
-                       break;
-               case TCPOPT_CCNEW:
-                       if (optlen != TCPOLEN_CC)
-                               continue;
-                       if (!is_syn)
-                               continue;
-                       to->to_flags |= TOF_CCNEW;
-                       bcopy(cp + 2, &to->to_cc, sizeof to->to_cc);
-                       to->to_cc = ntohl(to->to_cc);
-                       break;
-               case TCPOPT_CCECHO:
-                       if (optlen != TCPOLEN_CC)
-                               continue;
-                       if (!is_syn)
-                               continue;
-                       to->to_flags |= TOF_CCECHO;
-                       bcopy(cp + 2, &to->to_ccecho, sizeof to->to_ccecho);
-                       to->to_ccecho = ntohl(to->to_ccecho);
-                       break;
                case TCPOPT_SACK_PERMITTED:
                        if (optlen != TCPOLEN_SACK_PERMITTED)
                                continue;
@@ -2743,6 +2726,19 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, boolean_t is_syn)
                                r->rblk_end = ntohl(r->rblk_end);
                        }
                        break;
+#ifdef TCP_SIGNATURE
+               /*
+                * XXX In order to reply to a host which has set the
+                * TCP_SIGNATURE option in its initial SYN, we have to
+                * record the fact that the option was observed here
+                * for the syncache code to perform the correct response.
+                */
+               case TCPOPT_SIGNATURE:
+                       if (optlen != TCPOLEN_SIGNATURE)
+                               continue;
+                       to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
+                       break;
+#endif /* TCP_SIGNATURE */
                default:
                        continue;
                }
@@ -2884,10 +2880,6 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
  *
  * NOTE that this routine is only called when we process an incoming
  * segment, for outgoing segments only tcp_mssopt is called.
- *
- * In case of T/TCP, we call this routine during implicit connection
- * setup as well (offer = -1), to initialize maxseg from the cached
- * MSS of our peer.
  */
 void
 tcp_mss(struct tcpcb *tp, int offer)
@@ -2898,8 +2890,6 @@ tcp_mss(struct tcpcb *tp, int offer)
        u_long bufsize;
        struct inpcb *inp = tp->t_inpcb;
        struct socket *so;
-       struct rmxp_tao *taop;
-       int origoffer = offer;
 #ifdef INET6
        boolean_t isipv6 = ((inp->inp_vflag & INP_IPV6) ? TRUE : FALSE);
        size_t min_protoh = isipv6 ?
@@ -2922,28 +2912,41 @@ tcp_mss(struct tcpcb *tp, int offer)
        ifp = rt->rt_ifp;
        so = inp->inp_socket;
 
-       taop = rmx_taop(rt->rt_rmx);
        /*
-        * Offer == -1 means that we didn't receive SYN yet,
-        * use cached value in that case;
+        * Offer == 0 means that there was no MSS on the SYN segment,
+        * in this case we use either the interface mtu or tcp_mssdflt.
+        *
+        * An offer which is too large will be cut down later.
         */
-       if (offer == -1)
-               offer = taop->tao_mssopt;
+       if (offer == 0) {
+               if (isipv6) {
+                       if (in6_localaddr(&inp->in6p_faddr)) {
+                               offer = ND_IFINFO(rt->rt_ifp)->linkmtu -
+                                       min_protoh;
+                       } else {
+                               offer = tcp_v6mssdflt;
+                       }
+               } else {
+                       if (in_localaddr(inp->inp_faddr))
+                               offer = ifp->if_mtu - min_protoh;
+                       else
+                               offer = tcp_mssdflt;
+               }
+       }
+
        /*
-        * Offer == 0 means that there was no MSS on the SYN segment,
-        * in this case we use tcp_mssdflt.
+        * Prevent DoS attack with too small MSS. Round up
+        * to at least minmss.
+        *
+        * Sanity check: make sure that maxopd will be large
+        * enough to allow some data on segments even is the
+        * all the option space is used (40bytes).  Otherwise
+        * funny things may happen in tcp_output.
         */
-       if (offer == 0)
-               offer = (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
-       else
-               /*
-                * Sanity check: make sure that maxopd will be large
-                * enough to allow some data on segments even is the
-                * all the option space is used (40bytes).  Otherwise
-                * funny things may happen in tcp_output.
-                */
-               offer = max(offer, 64);
-       taop->tao_mssopt = offer;
+       offer = max(offer, tcp_minmss);
+       offer = max(offer, 64);
+
+       rt->rt_rmx.rmx_mssopt = offer;
 
        /*
         * While we're here, check if there's an initial rtt
@@ -2973,24 +2976,22 @@ tcp_mss(struct tcpcb *tp, int offer)
                              ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
                              tp->t_rttmin, TCPTV_REXMTMAX);
        }
+
        /*
         * if there's an mtu associated with the route, use it
-        * else, use the link mtu.
+        * else, use the link mtu.  Take the smaller of mss or offer
+        * as our final mss.
         */
-       if (rt->rt_rmx.rmx_mtu)
+       if (rt->rt_rmx.rmx_mtu) {
                mss = rt->rt_rmx.rmx_mtu - min_protoh;
-       else {
-               if (isipv6) {
+       } else {
+               if (isipv6)
                        mss = ND_IFINFO(rt->rt_ifp)->linkmtu - min_protoh;
-                       if (!in6_localaddr(&inp->in6p_faddr))
-                               mss = min(mss, tcp_v6mssdflt);
-               } else {
+               else
                        mss = ifp->if_mtu - min_protoh;
-                       if (!in_localaddr(inp->inp_faddr))
-                               mss = min(mss, tcp_mssdflt);
-               }
        }
        mss = min(mss, offer);
+
        /*
         * maxopd stores the maximum length of data AND options
         * in a segment; maxseg is the amount of data in a normal
@@ -3000,19 +3001,9 @@ tcp_mss(struct tcpcb *tp, int offer)
         */
        tp->t_maxopd = mss;
 
-       /*
-        * In case of T/TCP, origoffer==-1 indicates, that no segments
-        * were received yet.  In this case we just guess, otherwise
-        * we do the same as before T/TCP.
-        */
        if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
-           (origoffer == -1 ||
-            (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
+           ((tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
                mss -= TCPOLEN_TSTAMP_APPA;
-       if ((tp->t_flags & (TF_REQ_CC | TF_NOOPT)) == TF_REQ_CC &&
-           (origoffer == -1 ||
-            (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
-               mss -= TCPOLEN_CC_APPA;
 
 #if    (MCLBYTES & (MCLBYTES - 1)) == 0
                if (mss > MCLBYTES)
@@ -3117,7 +3108,7 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th, int acked)
        tcp_seq old_snd_nxt = tp->snd_nxt;
        u_long ocwnd = tp->snd_cwnd;
 
-       callout_stop(tp->tt_rexmt);
+       tcp_callout_stop(tp, tp->tt_rexmt);
        tp->t_rtttime = 0;
        tp->snd_nxt = th->th_ack;
        /* Set snd_cwnd to one segment beyond acknowledged offset. */
@@ -3164,7 +3155,7 @@ tcp_sack_rexmt(struct tcpcb *tp, struct tcphdr *th)
                tp->snd_cwnd = nextrexmt - tp->snd_una + seglen;
                old_snd_max = tp->snd_max;
                if (nextrexmt == tp->snd_una)
-                       callout_stop(tp->tt_rexmt);
+                       tcp_callout_stop(tp, tp->tt_rexmt);
                error = tcp_output(tp);
                if (error != 0)
                        break;