tcp timer: Don't try stopping timers if timer message is not created.
[dragonfly.git] / sys / netinet / tcp_subr.c
index f409e7f..817c258 100644 (file)
@@ -65,7 +65,7 @@
  *
  *     @(#)tcp_subr.c  8.2 (Berkeley) 5/24/95
  * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.31 2003/01/24 05:11:34 sam Exp $
- * $DragonFly: src/sys/netinet/tcp_subr.c,v 1.57 2007/04/22 01:13:14 dillon Exp $
+ * $DragonFly: src/sys/netinet/tcp_subr.c,v 1.63 2008/11/11 10:46:58 sephe Exp $
  */
 
 #include "opt_compat.h"
@@ -85,6 +85,7 @@
 #include <sys/domain.h>
 #endif
 #include <sys/proc.h>
+#include <sys/priv.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
+#include <netinet/tcp_timer2.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
 #include <sys/msgport2.h>
 #include <machine/smp.h>
 
+#include <net/netmsg2.h>
+
 #if !defined(KTR_TCP)
 #define KTR_TCP                KTR_ALL
 #endif
@@ -155,6 +159,15 @@ KTR_INFO(KTR_TCP, tcp, delayed, 2, "tcp execute delayed ops", 0);
 struct inpcbinfo tcbinfo[MAXCPU];
 struct tcpcbackqhead tcpcbackq[MAXCPU];
 
+int tcp_mpsafe_proto = 0;
+TUNABLE_INT("net.inet.tcp.mpsafe_proto", &tcp_mpsafe_proto);
+
+static int tcp_mpsafe_thread = NETMSG_SERVICE_ADAPTIVE;
+TUNABLE_INT("net.inet.tcp.mpsafe_thread", &tcp_mpsafe_thread);
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, mpsafe_thread, CTLFLAG_RW,
+          &tcp_mpsafe_thread, 0,
+          "0:BGL, 1:Adaptive BGL, 2:No BGL(experimental)");
+
 int tcp_mssdflt = TCP_MSS;
 SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
     &tcp_mssdflt, 0, "Default TCP Maximum Segment Size");
@@ -227,7 +240,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
 static MALLOC_DEFINE(M_TCPTEMP, "tcptemp", "TCP Templates for Keepalives");
 static struct malloc_pipe tcptemp_mpipe;
 
-static void tcp_willblock(void);
+static void tcp_willblock(int);
 static void tcp_cleartaocache (void);
 static void tcp_notify (struct inpcb *, int);
 
@@ -282,8 +295,12 @@ struct     inp_tp {
                char    align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
        } inp_tp_u;
        struct  tcpcb tcb;
-       struct  callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
-       struct  callout inp_tp_delack;
+       struct  tcp_callout inp_tp_rexmt;
+       struct  tcp_callout inp_tp_persist;
+       struct  tcp_callout inp_tp_keep;
+       struct  tcp_callout inp_tp_2msl;
+       struct  tcp_callout inp_tp_delack;
+       struct  netmsg_tcp_timer inp_tp_timermsg;
 };
 #undef ALIGNMENT
 #undef ALIGNM1
@@ -368,7 +385,6 @@ tcp_init(void)
 #endif
 
        syncache_init();
-       tcp_sack_init();
        tcp_thread_init();
 }
 
@@ -376,23 +392,41 @@ void
 tcpmsg_service_loop(void *dummy)
 {
        struct netmsg *msg;
+       int mplocked;
+
+       /*
+        * Thread was started with TDF_MPSAFE
+        */
+       mplocked = 0;
 
-       while ((msg = lwkt_waitport(&curthread->td_msgport, NULL))) {
+       while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) {
                do {
                        logtcp(rxmsg);
-                       msg->nm_lmsg.ms_cmd.cm_func(&msg->nm_lmsg);
+                       mplocked = netmsg_service(msg, tcp_mpsafe_thread,
+                                                 mplocked);
                } while ((msg = lwkt_getport(&curthread->td_msgport)) != NULL);
+
                logtcp(delayed);
-               tcp_willblock();
+               tcp_willblock(mplocked);
                logtcp(wait);
        }
 }
 
 static void
-tcp_willblock(void)
+tcp_willblock(int mplocked)
 {
        struct tcpcb *tp;
        int cpu = mycpu->gd_cpuid;
+       int unlock = 0;
+
+       if (!mplocked && !tcp_mpsafe_proto) {
+               if (TAILQ_EMPTY(&tcpcbackq[cpu]))
+                       return;
+
+               get_mplock();
+               mplocked = 1;
+               unlock = 1;
+       }
 
        while ((tp = TAILQ_FIRST(&tcpcbackq[cpu])) != NULL) {
                KKASSERT(tp->t_flags & TF_ONOUTPUTQ);
@@ -400,6 +434,9 @@ tcp_willblock(void)
                TAILQ_REMOVE(&tcpcbackq[cpu], tp, t_outputq);
                tcp_output(tp);
        }
+
+       if (unlock)
+               rel_mplock();
 }
 
 
@@ -509,6 +546,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
        struct route_in6 *ro6 = NULL;
        struct route_in6 sro6;
        struct ip6_hdr *ip6 = ipgen;
+       boolean_t use_tmpro = TRUE;
 #ifdef INET6
        boolean_t isipv6 = (IP_VHL_V(ip->ip_vhl) == 6);
 #else
@@ -521,11 +559,19 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
                        if (win > (long)TCP_MAXWIN << tp->rcv_scale)
                                win = (long)TCP_MAXWIN << tp->rcv_scale;
                }
-               if (isipv6)
-                       ro6 = &tp->t_inpcb->in6p_route;
-               else
-                       ro = &tp->t_inpcb->inp_route;
-       } else {
+               /*
+                * Don't use the route cache of a listen socket; it is
+                * not MPSAFE.  Use a temporary route cache instead.
+                */
+               if (tp->t_state != TCPS_LISTEN) {
+                       if (isipv6)
+                               ro6 = &tp->t_inpcb->in6p_route;
+                       else
+                               ro = &tp->t_inpcb->inp_route;
+                       use_tmpro = FALSE;
+               }
+       }
+       if (use_tmpro) {
                if (isipv6) {
                        ro6 = &sro6;
                        bzero(ro6, sizeof *ro6);
@@ -590,7 +636,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
        }
        m->m_len = tlen;
        m->m_pkthdr.len = tlen;
-       m->m_pkthdr.rcvif = (struct ifnet *) NULL;
+       m->m_pkthdr.rcvif = NULL;
        nth->th_seq = htonl(seq);
        nth->th_ack = htonl(ack);
        nth->th_x2 = 0;
@@ -627,6 +673,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
                        ro6->ro_rt = NULL;
                }
        } else {
+               ipflags |= IP_DEBUGROUTE;
                ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL);
                if ((ro == &sro) && (ro->ro_rt != NULL)) {
                        RTFREE(ro->ro_rt);
@@ -659,11 +706,20 @@ tcp_newtcpcb(struct inpcb *inp)
        tp->t_maxseg = tp->t_maxopd = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
 
        /* Set up our timeouts. */
-       callout_init(tp->tt_rexmt = &it->inp_tp_rexmt);
-       callout_init(tp->tt_persist = &it->inp_tp_persist);
-       callout_init(tp->tt_keep = &it->inp_tp_keep);
-       callout_init(tp->tt_2msl = &it->inp_tp_2msl);
-       callout_init(tp->tt_delack = &it->inp_tp_delack);
+       tp->tt_rexmt = &it->inp_tp_rexmt;
+       tp->tt_persist = &it->inp_tp_persist;
+       tp->tt_keep = &it->inp_tp_keep;
+       tp->tt_2msl = &it->inp_tp_2msl;
+       tp->tt_delack = &it->inp_tp_delack;
+       tcp_inittimers(tp);
+
+       /*
+        * Zero out the timer message.  We don't create it here,
+        * since the current CPU may not be the owner of this
+        * inpcb.
+        */
+       tp->tt_msg = &it->inp_tp_timermsg;
+       bzero(tp->tt_msg, sizeof(*tp->tt_msg));
 
        if (tcp_do_rfc1323)
                tp->t_flags = (TF_REQ_SCALE | TF_REQ_TSTMP);
@@ -720,7 +776,7 @@ tcp_drop(struct tcpcb *tp, int error)
 #ifdef SMP
 
 struct netmsg_remwildcard {
-       struct lwkt_msg         nm_lmsg;
+       struct netmsg           nm_netmsg;
        struct inpcb            *nm_inp;
        struct inpcbinfo        *nm_pcbinfo;
 #if defined(INET6)
@@ -735,8 +791,8 @@ struct netmsg_remwildcard {
  * inp can be detached.  We do this by cycling through the cpus, ending up
  * on the cpu controlling the inp last and then doing the disconnect.
  */
-static int
-in_pcbremwildcardhash_handler(struct lwkt_msg *msg0)
+static void
+in_pcbremwildcardhash_handler(struct netmsg *msg0)
 {
        struct netmsg_remwildcard *msg = (struct netmsg_remwildcard *)msg0;
        int cpu;
@@ -751,14 +807,13 @@ in_pcbremwildcardhash_handler(struct lwkt_msg *msg0)
                else
 #endif
                        in_pcbdetach(msg->nm_inp);
-               lwkt_replymsg(&msg->nm_lmsg, 0);
+               lwkt_replymsg(&msg->nm_netmsg.nm_lmsg, 0);
        } else {
                in_pcbremwildcardhash_oncpu(msg->nm_inp, msg->nm_pcbinfo);
                cpu = (cpu + 1) % ncpus2;
                msg->nm_pcbinfo = &tcbinfo[cpu];
-               lwkt_forwardmsg(tcp_cport(cpu), &msg->nm_lmsg);
+               lwkt_forwardmsg(tcp_cport(cpu), &msg->nm_netmsg.nm_lmsg);
        }
-       return (EASYNC);
 }
 
 #endif
@@ -803,13 +858,17 @@ tcp_close(struct tcpcb *tp)
 
        /*
         * Make sure that all of our timers are stopped before we
-        * delete the PCB.
+        * delete the PCB.  For a listen TCP socket (tp->tt_msg == NULL),
+        * timers are never used.  If the timer message was never created
+        * (tp->tt_msg->tt_tcb == NULL), timers are never used either.
         */
-       callout_stop(tp->tt_rexmt);
-       callout_stop(tp->tt_persist);
-       callout_stop(tp->tt_keep);
-       callout_stop(tp->tt_2msl);
-       callout_stop(tp->tt_delack);
+       if (tp->tt_msg != NULL && tp->tt_msg->tt_tcb != NULL) {
+               tcp_callout_stop(tp, tp->tt_rexmt);
+               tcp_callout_stop(tp, tp->tt_persist);
+               tcp_callout_stop(tp, tp->tt_keep);
+               tcp_callout_stop(tp, tp->tt_2msl);
+               tcp_callout_stop(tp, tp->tt_delack);
+       }
 
        if (tp->t_flags & TF_ONOUTPUTQ) {
                KKASSERT(tp->tt_cpu == mycpu->gd_cpuid);
@@ -926,6 +985,9 @@ no_valid_rt:
 
        inp->inp_ppcb = NULL;
        soisdisconnected(so);
+
+       tcp_destroy_timermsg(tp);
+
        /*
         * Discard the inp.  In the SMP case a wildcard inp's hash (created
         * by a listen socket or an INADDR_ANY udp socket) is replicated
@@ -942,16 +1004,15 @@ no_valid_rt:
 
                cpu = (inp->inp_pcbinfo->cpu + 1) % ncpus2;
                msg = kmalloc(sizeof(struct netmsg_remwildcard),
-                           M_LWKTMSG, M_INTWAIT);
-               lwkt_initmsg(&msg->nm_lmsg, &netisr_afree_rport, 0,
-                   lwkt_cmd_func(in_pcbremwildcardhash_handler),
-                   lwkt_cmd_op_none);
+                             M_LWKTMSG, M_INTWAIT);
+               netmsg_init(&msg->nm_netmsg, &netisr_afree_rport, 0,
+                           in_pcbremwildcardhash_handler);
 #ifdef INET6
                msg->nm_isinet6 = isafinet6;
 #endif
                msg->nm_inp = inp;
                msg->nm_pcbinfo = &tcbinfo[cpu];
-               lwkt_sendmsg(tcp_cport(cpu), &msg->nm_lmsg);
+               lwkt_sendmsg(tcp_cport(cpu), &msg->nm_netmsg.nm_lmsg);
        } else
 #endif
        {
@@ -990,18 +1051,17 @@ tcp_drain_oncpu(struct inpcbhead *head)
 
 #ifdef SMP
 struct netmsg_tcp_drain {
-       struct lwkt_msg         nm_lmsg;
+       struct netmsg           nm_netmsg;
        struct inpcbhead        *nm_head;
 };
 
-static int
-tcp_drain_handler(lwkt_msg_t lmsg)
+static void
+tcp_drain_handler(netmsg_t netmsg)
 {
-       struct netmsg_tcp_drain *nm = (void *)lmsg;
+       struct netmsg_tcp_drain *nm = (void *)netmsg;
 
        tcp_drain_oncpu(nm->nm_head);
-       lwkt_replymsg(lmsg, 0);
-       return(EASYNC);
+       lwkt_replymsg(&nm->nm_netmsg.nm_lmsg, 0);
 }
 #endif
 
@@ -1034,11 +1094,10 @@ tcp_drain(void)
                                    M_LWKTMSG, M_NOWAIT);
                        if (msg == NULL)
                                continue;
-                       lwkt_initmsg(&msg->nm_lmsg, &netisr_afree_rport, 0,
-                               lwkt_cmd_func(tcp_drain_handler),
-                               lwkt_cmd_op_none);
+                       netmsg_init(&msg->nm_netmsg, &netisr_afree_rport, 0,
+                                   tcp_drain_handler);
                        msg->nm_head = &tcbinfo[cpu].pcblisthead;
-                       lwkt_sendmsg(tcp_cport(cpu), &msg->nm_lmsg);
+                       lwkt_sendmsg(tcp_cport(cpu), &msg->nm_netmsg.nm_lmsg);
                }
        }
 #else
@@ -1202,7 +1261,7 @@ tcp_getcred(SYSCTL_HANDLER_ARGS)
        int cpu;
        int error;
 
-       error = suser(req->td);
+       error = priv_check(req->td, PRIV_ROOT);
        if (error != 0)
                return (error);
        error = SYSCTL_IN(req, addrs, sizeof addrs);
@@ -1235,7 +1294,7 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS)
        int error;
        boolean_t mapped = FALSE;
 
-       error = suser(req->td);
+       error = priv_check(req->td, PRIV_ROOT);
        if (error != 0)
                return (error);
        error = SYSCTL_IN(req, addrs, sizeof addrs);
@@ -1276,6 +1335,29 @@ SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, (CTLTYPE_OPAQUE | CTLFLAG_RW),
            tcp6_getcred, "S,ucred", "Get the ucred of a TCP6 connection");
 #endif
 
+struct netmsg_tcp_notify {
+       struct netmsg   nm_nmsg;
+       void            (*nm_notify)(struct inpcb *, int);
+       struct in_addr  nm_faddr;
+       int             nm_arg;
+};
+
+static void
+tcp_notifyall_oncpu(struct netmsg *netmsg)
+{
+       struct netmsg_tcp_notify *nmsg = (struct netmsg_tcp_notify *)netmsg;
+       int nextcpu;
+
+       in_pcbnotifyall(&tcbinfo[mycpuid].pcblisthead, nmsg->nm_faddr,
+                       nmsg->nm_arg, nmsg->nm_notify);
+
+       nextcpu = mycpuid + 1;
+       if (nextcpu < ncpus2)
+               lwkt_forwardmsg(tcp_cport(nextcpu), &netmsg->nm_lmsg);
+       else
+               lwkt_replymsg(&netmsg->nm_lmsg, 0);
+}
+
 void
 tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
@@ -1346,10 +1428,16 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
                }
                crit_exit();
        } else {
-               for (cpu = 0; cpu < ncpus2; cpu++) {
-                       in_pcbnotifyall(&tcbinfo[cpu].pcblisthead, faddr, arg,
-                                       notify);
-               }
+               struct netmsg_tcp_notify nmsg;
+
+               KKASSERT(&curthread->td_msgport == cpu_portfn(0));
+               netmsg_init(&nmsg.nm_nmsg, &curthread->td_msgport, 0,
+                           tcp_notifyall_oncpu);
+               nmsg.nm_faddr = faddr;
+               nmsg.nm_arg = arg;
+               nmsg.nm_notify = notify;
+
+               lwkt_domsg(tcp_cport(0), &nmsg.nm_nmsg.nm_lmsg, 0);
        }
 }