network - Fix multiple MP races
author Matthew Dillon <dillon@apollo.backplane.com>
Mon, 13 Sep 2010 05:33:08 +0000 (22:33 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Mon, 13 Sep 2010 05:33:08 +0000 (22:33 -0700)
* Fix sonewconn() races.  sonewconn() was attaching prior to changing
  the socket->so_port, relying on the caller to set the socket->so_port.
  This resulted in a race where userland wound up with visibility on the
  socket and could issue commands, like close(), which would end up going
  to the original protocol thread instead of the post-connect protocol thread
  which was handling the sonewconn().

  Thus the close() could message the backend to detach and compete
  against the sonewconn() because the detach message was going to
  a different protocol thread.

* When the socket->so_port is changed the inpcb was not being moved
  from the old pcbinfo->pcblisthead list to the new one, resulting
  in MP races later on during removal.

* Add more debugging kprintf()s.

* Clean up sosetport() use, remove the now-unused *_soport_attach().

Reported-by: Many
14 files changed:
sys/kern/uipc_socket2.c
sys/net/netisr.c
sys/netinet/in_pcb.c
sys/netinet/in_pcb.h
sys/netinet/ip_demux.c
sys/netinet/ip_divert.c
sys/netinet/tcp_subr.c
sys/netinet/tcp_syncache.c
sys/netinet/tcp_usrreq.c
sys/netinet/tcp_var.h
sys/netinet/udp_usrreq.c
sys/netinet/udp_var.h
sys/netinet6/in6_pcb.c
sys/netinet6/udp6_usrreq.c

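diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c
index 3b4fa37..11d95a0 100644
--- a/sys/kern/uipc_socket2.c
+++ b/sys/kern/uipc_socket2.c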
@@ -326,6 +326,15 @@ sonewconn(struct socket *head, int connstatus)
        so = soalloc(1);
        if (so == NULL)
                return (NULL);
+
+       /*
+        * Set the port prior to attaching the inpcb to the current
+        * cpu's protocol thread (which should be the current thread
+        * but might not be in all cases).  This serializes any pcb ops
+        * which occur to our cpu allowing us to complete the attachment
+        * without racing anything.
+        */
+       sosetport(so, cpu_portfn(mycpu->gd_cpuid));
        if ((head->so_options & SO_ACCEPTFILTER) != 0)
                connstatus = 0;
        so->so_head = head;
diff --git a/sys/net/netisr.c b/sys/net/netisr.c
index cfa4866..7c0771a 100644
--- a/sys/net/netisr.c
+++ b/sys/net/netisr.c
@@ -267,7 +267,23 @@ netmsg_service_loop(void *arg)
                        KASSERT(msg->nm_dispatch != NULL,
                                ("netmsg_service isr %d badmsg\n",
                                msg->nm_lmsg.u.ms_result));
-                       msg->nm_dispatch(msg);
+                       if (msg->nm_so &&
+                           msg->nm_so->so_port != &td->td_msgport) {
+                               /*
+                                * Sockets undergoing connect or disconnect
+                                * ops can change ports on us.  Chase the
+                                * port.
+                                */
+                               kprintf("netmsg_service_loop: Warning, "
+                                       "port changed so=%p\n", msg->nm_so);
+                               lwkt_forwardmsg(msg->nm_so->so_port,
+                                               &msg->nm_lmsg);
+                       } else {
+                               /*
+                                * We are on the correct port, dispatch it.
+                                */
+                               msg->nm_dispatch(msg);
+                       }
                        if (--limit == 0)
                                break;
                } while ((msg = lwkt_getport(&td->td_msgport)) != NULL);
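diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index d974704..fe9f0ed 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c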
@@ -189,6 +189,8 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo)
 {
        LIST_INIT(&pcbinfo->pcblisthead);
        pcbinfo->cpu = -1;
+       pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), M_PCB,
+                                   M_WAITOK | M_ZERO);
 }
 
 /*
@@ -226,6 +228,33 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
        return (0);
 }
 
+/*
+ * Unlink a pcb with the intention of moving it to another cpu with a
+ * different pcbinfo.  While unlinked nothing should attempt to dereference
+ * inp_pcbinfo, NULL it out so we assert if it does.
+ */
+void
+in_pcbunlink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+       KKASSERT(inp->inp_pcbinfo == pcbinfo);
+
+       LIST_REMOVE(inp, inp_list);
+       pcbinfo->ipi_count--;
+       inp->inp_pcbinfo = NULL;
+}
+
+/*
+ * Relink a pcb into a new pcbinfo.
+ */
+void
+in_pcblink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+       KKASSERT(inp->inp_pcbinfo == NULL);
+       inp->inp_pcbinfo = pcbinfo;
+       LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list);
+       pcbinfo->ipi_count++;
+}
+
 int
 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
 {
@@ -246,24 +275,39 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
                return (EADDRNOTAVAIL);
        if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
                return (EINVAL);        /* already bound */
+
        if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT)))
                wild = 1;    /* neither SO_REUSEADDR nor SO_REUSEPORT is set */
        if (p)
                cred = p->p_ucred;
+
+       /*
+        * This has to be atomic.  If the porthash is shared across multiple
+        * protocol threads (aka tcp) then the token will be non-NULL.
+        */
+       if (pcbinfo->porttoken)
+               lwkt_gettoken(pcbinfo->porttoken);
+
        if (nam != NULL) {
                sin = (struct sockaddr_in *)nam;
-               if (nam->sa_len != sizeof *sin)
-                       return (EINVAL);
+               if (nam->sa_len != sizeof *sin) {
+                       error = EINVAL;
+                       goto done;
+               }
 #ifdef notdef
                /*
                 * We should check the family, but old programs
                 * incorrectly fail to initialize it.
                 */
-               if (sin->sin_family != AF_INET)
-                       return (EAFNOSUPPORT);
+               if (sin->sin_family != AF_INET) {
+                       error = EAFNOSUPPORT;
+                       goto done;
+               }
 #endif
-               if (!prison_replace_wildcards(td, nam))
-                               return (EINVAL);
+               if (!prison_replace_wildcards(td, nam)) {
+                       error = EINVAL;
+                       goto done;
+               }
                lport = sin->sin_port;
                if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
                        /*
@@ -278,21 +322,28 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
                } else if (sin->sin_addr.s_addr != INADDR_ANY) {
                        sin->sin_port = 0;              /* yech... */
                        bzero(&sin->sin_zero, sizeof sin->sin_zero);
-                       if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL)
-                               return (EADDRNOTAVAIL);
+                       if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL) {
+                               error = EADDRNOTAVAIL;
+                               goto done;
+                       }
                }
                if (lport != 0) {
                        struct inpcb *t;
 
                        /* GROSS */
                        if (ntohs(lport) < IPPORT_RESERVED &&
-                           cred && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))
-                               return (EACCES);
+                           cred &&
+                           priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) {
+                               error = EACCES;
+                               goto done;
+                       }
                        if (so->so_cred->cr_uid != 0 &&
                            !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
-                               t = in_pcblookup_local(inp->inp_pcbinfo,
-                                   sin->sin_addr, lport,
-                                   INPLOOKUP_WILDCARD, cred);
+                               t = in_pcblookup_local(pcbinfo,
+                                                      sin->sin_addr,
+                                                      lport,
+                                                      INPLOOKUP_WILDCARD,
+                                                      cred);
                                if (t &&
                                    (!in_nullhost(sin->sin_addr) ||
                                     !in_nullhost(t->inp_laddr) ||
@@ -306,11 +357,16 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
                                            INP_SOCKAF(so) ==
                                            INP_SOCKAF(t->inp_socket))
 #endif
-                                       return (EADDRINUSE);
+                                       {
+                                               error = EADDRINUSE;
+                                               goto done;
+                                       }
                                }
                        }
-                       if (cred && !prison_replace_wildcards(td, nam))
-                               return (EADDRNOTAVAIL);
+                       if (cred && !prison_replace_wildcards(td, nam)) {
+                               error = EADDRNOTAVAIL;
+                               goto done;
+                       }
                        t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport,
                                               wild, cred);
                        if (t && !(reuseport & t->inp_socket->so_options)) {
@@ -319,7 +375,10 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
                                    !in_nullhost(t->inp_laddr) ||
                                    INP_SOCKAF(so) == INP_SOCKAF(t->inp_socket))
 #endif
-                               return (EADDRINUSE);
+                               {
+                                       error = EADDRINUSE;
+                                       goto done;
+                               }
                        }
                }
                inp->inp_laddr = sin->sin_addr;
@@ -332,7 +391,8 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
                jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
                if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
                        inp->inp_laddr.s_addr = INADDR_ANY;
-                       return (EINVAL);
+                       error = EINVAL;
+                       goto done;
                }
                inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
 
@@ -346,7 +406,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
                        if (cred &&
                            (error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
                                inp->inp_laddr.s_addr = INADDR_ANY;
-                               return (error);
+                               goto done;
                        }
                        first = ipport_lowfirstauto;    /* 1023 */
                        last  = ipport_lowlastauto;     /* 600 */
@@ -372,14 +432,15 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
                        do {
                                if (count-- < 0) {      /* completely used? */
                                        inp->inp_laddr.s_addr = INADDR_ANY;
-                                       return (EADDRNOTAVAIL);
+                                       error = EADDRNOTAVAIL;
+                                       goto done;
                                }
                                --*lastport;
                                if (*lastport > first || *lastport < last)
                                        *lastport = first;
                                lport = htons(*lastport);
                        } while (in_pcblookup_local(pcbinfo, inp->inp_laddr,
-                                lport, wild, cred));
+                                                   lport, wild, cred));
                } else {
                        /*
                         * counting up
@@ -389,14 +450,15 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
                        do {
                                if (count-- < 0) {      /* completely used? */
                                        inp->inp_laddr.s_addr = INADDR_ANY;
-                                       return (EADDRNOTAVAIL);
+                                       error = EADDRNOTAVAIL;
+                                       goto done;
                                }
                                ++*lastport;
                                if (*lastport < first || *lastport > last)
                                        *lastport = first;
                                lport = htons(*lastport);
                        } while (in_pcblookup_local(pcbinfo, inp->inp_laddr,
-                                lport, wild, cred));
+                                                   lport, wild, cred));
                }
        }
        inp->inp_lport = lport;
@@ -406,16 +468,22 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
        if (!prison_replace_wildcards(td, (struct sockaddr*)&jsin)) {
                inp->inp_laddr.s_addr = INADDR_ANY;
                inp->inp_lport = 0;
-               return (EINVAL);
+               error = EINVAL;
+               goto done;
        }
        inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
 
        if (in_pcbinsporthash(inp) != 0) {
                inp->inp_laddr.s_addr = INADDR_ANY;
                inp->inp_lport = 0;
-               return (EAGAIN);
+               error = EAGAIN;
+               goto done;
        }
-       return (0);
+       error = 0;
+done:
+       if (pcbinfo->porttoken)
+               lwkt_reltoken(pcbinfo->porttoken);
+       return error;
 }
 
 /*
@@ -610,8 +678,9 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
                return (error);
 
        if (in_pcblookup_hash(inp->inp_cpcbinfo, sin->sin_addr, sin->sin_port,
-           inp->inp_laddr.s_addr ? inp->inp_laddr : if_sin->sin_addr,
-           inp->inp_lport, FALSE, NULL) != NULL) {
+                             inp->inp_laddr.s_addr ?
+                               inp->inp_laddr : if_sin->sin_addr,
+                             inp->inp_lport, FALSE, NULL) != NULL) {
                return (EADDRINUSE);
        }
        if (inp->inp_laddr.s_addr == INADDR_ANY) {
@@ -649,6 +718,7 @@ in_pcbdetach(struct inpcb *inp)
        ipsec4_delete_pcbpolicy(inp);
 #endif /*IPSEC*/
        inp->inp_gencnt = ++ipi->ipi_gencnt;
+       KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);
        in_pcbremlists(inp);
        so->so_pcb = NULL;
        sofree(so);                     /* remove pcb ref */
@@ -849,19 +919,25 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
        struct inpcb *inp;
        int matchwild = 3, wildcard;
        u_short lport = lport_arg;
-
        struct inpcbporthead *porthash;
        struct inpcbport *phd;
        struct inpcb *match = NULL;
 
        /*
+        * If the porthashbase is shared across several cpus we need
+        * to lock.
+        */
+       if (pcbinfo->porttoken)
+               lwkt_gettoken(pcbinfo->porttoken);
+
+       /*
         * Best fit PCB lookup.
         *
         * First see if this local port is in use by looking on the
         * port hash list.
         */
-       porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport,
-           pcbinfo->porthashmask)];
+       porthash = &pcbinfo->porthashbase[
+                       INP_PCBPORTHASH(lport, pcbinfo->porthashmask)];
        LIST_FOREACH(phd, porthash, phd_hash) {
                if (phd->phd_port == lport)
                        break;
@@ -902,6 +978,8 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
                        }
                }
        }
+       if (pcbinfo->porttoken)
+               lwkt_reltoken(pcbinfo->porttoken);
        return (match);
 }
 
@@ -909,9 +987,9 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
  * Lookup PCB in hash list.
  */
 struct inpcb *
-in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg,
-                 struct in_addr laddr, u_int lport_arg, boolean_t wildcard,
-                 struct ifnet *ifp)
+in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
+                 u_int fport_arg, struct in_addr laddr, u_int lport_arg,
+                 boolean_t wildcard, struct ifnet *ifp)
 {
        struct inpcbhead *head;
        struct inpcb *inp, *jinp=NULL;
@@ -1079,22 +1157,29 @@ in_pcbinsporthash(struct inpcb *inp)
        struct inpcbport *phd;
 
        /*
+        * If the porthashbase is shared across several cpus we need
+        * to lock.
+        */
+       if (pcbinfo->porttoken)
+               lwkt_gettoken(pcbinfo->porttoken);
+
+       /*
         * Insert into the port hash table.
         */
        pcbporthash = &pcbinfo->porthashbase[
            INP_PCBPORTHASH(inp->inp_lport, pcbinfo->porthashmask)];
 
        /* Go through port list and look for a head for this lport. */
-       LIST_FOREACH(phd, pcbporthash, phd_hash)
+       LIST_FOREACH(phd, pcbporthash, phd_hash) {
                if (phd->phd_port == inp->inp_lport)
                        break;
+       }
 
        /* If none exists, malloc one and tack it on. */
        if (phd == NULL) {
-               MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport),
-                   M_PCB, M_INTWAIT | M_NULLOK);
-               if (phd == NULL)
-                       return (ENOBUFS); /* XXX */
+               KKASSERT(pcbinfo->portsave != NULL);
+               phd = pcbinfo->portsave;
+               pcbinfo->portsave = NULL;
                phd->phd_port = inp->inp_lport;
                LIST_INIT(&phd->phd_pcblist);
                LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
@@ -1103,6 +1188,12 @@ in_pcbinsporthash(struct inpcb *inp)
        inp->inp_phd = phd;
        LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 
+       if (pcbinfo->porttoken)
+               lwkt_reltoken(pcbinfo->porttoken);
+       if (pcbinfo->portsave == NULL) {
+               pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave),
+                                           M_PCB, M_INTWAIT | M_ZERO);
+       }
        return (0);
 }
 
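diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 9a4d142..39e5e14 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h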
@@ -276,11 +276,15 @@ struct inpcbport {
        u_short phd_port;
 };
 
+struct lwkt_token;
+
 struct inpcbinfo {             /* XXX documentation, prefixes */
        struct  inpcbhead *hashbase;
        u_long  hashmask;
        struct  inpcbporthead *porthashbase;
        u_long  porthashmask;
+       struct  lwkt_token *porttoken;  /* if porthashbase is shared */
+       struct  inpcbport *portsave;    /* port allocation cache */
        struct  inpcontainerhead *wildcardhashbase;
        u_long  wildcardhashmask;
        struct  inpcbhead pcblisthead;  /* head of queue of active pcb's */
@@ -380,6 +384,8 @@ void        in_losing (struct inpcb *);
 void   in_rtchange (struct inpcb *, int);
 void   in_pcbinfo_init (struct inpcbinfo *);
 int    in_pcballoc (struct socket *, struct inpcbinfo *);
+void   in_pcbunlink (struct inpcb *, struct inpcbinfo *);
+void   in_pcblink (struct inpcb *, struct inpcbinfo *);
 int    in_pcbbind (struct inpcb *, struct sockaddr *, struct thread *);
 int    in_pcbconnect (struct inpcb *, struct sockaddr *, struct thread *);
 void   in_pcbdetach (struct inpcb *);
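diff --git a/sys/netinet/ip_demux.c b/sys/netinet/ip_demux.c
index e3e17d5..3b659f7 100644
--- a/sys/netinet/ip_demux.c
+++ b/sys/netinet/ip_demux.c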
@@ -369,16 +369,6 @@ ip_mport_pktinfo(const struct pktinfo *pi, struct mbuf *m)
 #endif
 
 /*
- * Initital port when creating the socket, generally before
- * binding or connect.
- */
-lwkt_port_t
-tcp_soport_attach(struct socket *so)
-{
-       return(cpu_portfn(0));
-}
-
-/*
  * This is used to map a socket to a message port for sendmsg() and friends.
  * It is not called for any other purpose.  In the case of TCP we just return
  * the port already installed in the socket.
@@ -448,16 +438,6 @@ udp_addrport(in_addr_t faddr, in_port_t fport, in_addr_t laddr, in_port_t lport)
 }
 
 /*
- * Initital port when creating the socket, generally before
- * binding or connect.
- */
-lwkt_port_t
-udp_soport_attach(struct socket *so)
-{
-       return(cpu_portfn(0));
-}
-
-/*
  * This is used to map a socket to a message port for sendmsg() and friends.
  * It is not called for any other purpose.
  *
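diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index c714e71..08e2590 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c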
@@ -480,6 +480,7 @@ div_attach(struct socket *so, int proto, struct pru_attach_info *ai)
        if (error)
                return error;
        lwkt_gettoken(&div_token);
+       so->so_port = cpu0_soport(so, NULL, NULL);
        error = in_pcballoc(so, &divcbinfo);
        if (error) {
                lwkt_reltoken(&div_token);
@@ -493,7 +494,6 @@ div_attach(struct socket *so, int proto, struct pru_attach_info *ai)
         * The socket is always "connected" because
         * we always know "where" to send the packet.
         */
-       so->so_port = cpu0_soport(so, NULL, NULL);
        sosetstate(so, SS_ISCONNECTED);
        lwkt_reltoken(&div_token);
        return 0;
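diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 5aae6e7..fce0bc2 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c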
@@ -162,6 +162,9 @@ KTR_INFO(KTR_TCP, tcp, delayed, 2, "tcp execute delayed ops", 0);
 struct inpcbinfo tcbinfo[MAXCPU];
 struct tcpcbackqhead tcpcbackq[MAXCPU];
 
+static struct lwkt_token tcp_port_token =
+               LWKT_TOKEN_MP_INITIALIZER(tcp_port_token);
+
 int tcp_mssdflt = TCP_MSS;
 SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
     &tcp_mssdflt, 0, "Default TCP Maximum Segment Size");
@@ -317,7 +320,9 @@ struct      inp_tp {
 void
 tcp_init(void)
 {
+       struct inpcbporthead *porthashbase;
        struct inpcbinfo *ticb;
+       u_long porthashmask;
        int hashsize = TCBHASHSIZE;
        int cpu;
 
@@ -343,6 +348,7 @@ tcp_init(void)
                hashsize = 512; /* safe default */
        }
        tcp_tcbhashsize = hashsize;
+       porthashbase = hashinit(hashsize, M_PCB, &porthashmask);
 
        for (cpu = 0; cpu < ncpus2; cpu++) {
                ticb = &tcbinfo[cpu];
@@ -350,8 +356,13 @@ tcp_init(void)
                ticb->cpu = cpu;
                ticb->hashbase = hashinit(hashsize, M_PCB,
                                          &ticb->hashmask);
+               ticb->porthashbase = porthashbase;
+               ticb->porthashmask = porthashmask;
+               ticb->porttoken = &tcp_port_token;
+#if 0
                ticb->porthashbase = hashinit(hashsize, M_PCB,
                                              &ticb->porthashmask);
+#endif
                ticb->wildcardhashbase = hashinit(hashsize, M_PCB,
                                                  &ticb->wildcardhashmask);
                ticb->ipi_size = sizeof(struct inp_tp);
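diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 88bb81b..ebdb2d1 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c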
@@ -698,6 +698,9 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
         * as they would have been set up if we had created the
         * connection when the SYN arrived.  If we can't create
         * the connection, abort it.
+        *
+        * Set the protocol processing port for the socket to the current
+        * port (that the connection came in on).
         */
        so = sonewconn(lso, SS_ISCONNECTED);
        if (so == NULL) {
@@ -710,12 +713,6 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
        }
 
        /*
-        * Set the protocol processing port for the socket to the current
-        * port (that the connection came in on).
-        */
-       sosetport(so, &curthread->td_msgport);
-
-       /*
         * Insert new socket into hash list.
         */
        inp = so->so_pcb;
@@ -743,7 +740,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
                inp->inp_lport = 0;
                goto abort;
        }
-       linp = so->so_pcb;
+       linp = lso->so_pcb;
 #ifdef IPSEC
        /* copy old policy into new socket's */
        if (ipsec_copy_policy(linp->inp_sp, inp->inp_sp))
@@ -820,6 +817,11 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
                port = tcp_addrport(inp->inp_faddr.s_addr, inp->inp_fport,
                                    inp->inp_laddr.s_addr, inp->inp_lport);
        }
+       if (port != &curthread->td_msgport) {
+               print_backtrace(-1);
+               kprintf("TCP PORT MISMATCH %p vs %p\n",
+                       port, &curthread->td_msgport);
+       }
        /*KKASSERT(port == &curthread->td_msgport);*/
 
        tp = intotcpcb(inp);
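diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 366d270..b990389 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c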
@@ -162,7 +162,8 @@ static struct tcpcb *
 
 /*
  * TCP attaches to socket via pru_attach(), reserving space,
- * and an internet control block.
+ * and an internet control block.  This is likely occuring on
+ * cpu0 and may have to move later when we bind/connect.
  */
 static int
 tcp_usr_attach(struct socket *so, int proto, struct pru_attach_info *ai)
@@ -896,7 +897,6 @@ tcp_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m,
         * Create TCP timer message now; we are on the tcpcb's owner
         * CPU/thread.
         */
-       sosetport(so, &curthread->td_msgport);
        tcp_create_timermsg(tp, &curthread->td_msgport);
 
        /*
@@ -947,12 +947,18 @@ struct netmsg_tcp_connect {
        struct mbuf             *nm_m;
 };
 
+/*
+ * This is called in the target protocol processing thread.  We must
+ * re-link our pcb to the new tcpcb
+ */
 static void
 tcp_connect_handler(netmsg_t netmsg)
 {
        struct netmsg_tcp_connect *msg = (void *)netmsg;
+       struct socket *so = netmsg->nm_so;
        int error;
 
+       in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
        error = tcp_connect_oncpu(msg->nm_tp, msg->nm_flags, msg->nm_m,
                                  msg->nm_sin, msg->nm_ifsin);
        lwkt_replymsg(&msg->nm_netmsg.nm_lmsg, error);
@@ -996,6 +1002,7 @@ tcp_connect(struct tcpcb *tp, int flags, struct mbuf *m,
        struct inpcb *inp = tp->t_inpcb;
        struct sockaddr_in *sin = (struct sockaddr_in *)nam;
        struct sockaddr_in *if_sin;
+       struct socket *so;
        int error;
 #ifdef SMP
        lwkt_port_t port;
@@ -1011,10 +1018,13 @@ tcp_connect(struct tcpcb *tp, int flags, struct mbuf *m,
                        return (error);
                }
        }
+       so = inp->inp_socket;
+       KKASSERT(so);
 
        /*
         * Calculate the correct protocol processing thread.  The connect
-        * operation must run there.
+        * operation must run there.  Set the forwarding port before we
+        * forward the message or it will get bounced right back to us.
         */
        error = in_pcbladdr(inp, nam, &if_sin, td);
        if (error) {
@@ -1042,10 +1052,14 @@ tcp_connect(struct tcpcb *tp, int flags, struct mbuf *m,
                bzero(ro, sizeof(*ro));
 
                /*
-                * NOTE: We haven't set so->so_port yet do not pass so
-                *       to netmsg_init() or it will be improperly forwarded.
+                * We are moving the protocol processing port the socket
+                * is on, we have to unlink here and re-link on the
+                * target cpu.
                 */
-               netmsg_init(&msg.nm_netmsg, NULL, &curthread->td_msgport,
+               in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
+               sosetport(so, port);
+
+               netmsg_init(&msg.nm_netmsg, so, &curthread->td_msgport,
                            0, tcp_connect_handler);
                msg.nm_tp = tp;
                msg.nm_sin = sin;
@@ -1057,6 +1071,7 @@ tcp_connect(struct tcpcb *tp, int flags, struct mbuf *m,
                error = tcp_connect_oncpu(tp, flags, m, sin, if_sin);
        }
 #else
+       KKASSERT(so->so_port == &curthread->td_msgport);
        error = tcp_connect_oncpu(tp, flags, m, sin, if_sin);
 #endif
        return (error);
@@ -1165,7 +1180,6 @@ tcp6_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m,
         * Create TCP timer message now; we are on the tcpcb's owner
         * CPU/thread.
         */
-       sosetport(so, &curthread->td_msgport);
        tcp_create_timermsg(tp, &curthread->td_msgport);
 
        /* Compute window scaling to request.  */
@@ -1348,9 +1362,9 @@ SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
     &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
 
 /*
- * Attach TCP protocol to socket, allocating
- * internet protocol control block, tcp control block,
- * bufer space, and entering LISTEN state if to accept connections.
+ * Attach TCP protocol to socket, allocating internet protocol control
+ * block, tcp control block, bufer space, and entering LISTEN state
+ * if to accept connections.
  */
 static int
 tcp_attach(struct socket *so, struct pru_attach_info *ai)
@@ -1374,6 +1388,11 @@ tcp_attach(struct socket *so, struct pru_attach_info *ai)
        atomic_set_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE);
        atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
        cpu = mycpu->gd_cpuid;
+
+       /*
+        * Set the default port for protocol processing. This will likely
+        * change when we connect.
+        */
        error = in_pcballoc(so, &tcbinfo[cpu]);
        if (error)
                return (error);
@@ -1402,7 +1421,6 @@ tcp_attach(struct socket *so, struct pru_attach_info *ai)
                return (ENOBUFS);
        }
        tp->t_state = TCPS_CLOSED;
-       so->so_port = tcp_soport_attach(so);
        return (0);
 }
 
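diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 447e8e4..29b4cbe 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h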
@@ -626,8 +626,6 @@ void         tcp_fillheaders (struct tcpcb *, void *, void *);
 struct lwkt_port *
         tcp_soport(struct socket *, struct sockaddr *, struct mbuf **);
 struct lwkt_port *
-        tcp_soport_attach(struct socket *);
-struct lwkt_port *
         tcp_ctlport(int, struct sockaddr *, void *);
 struct tcpcb *
         tcp_timers (struct tcpcb *, int);
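diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c
index e1a5781..5204e4b 100644
--- a/sys/netinet/udp_usrreq.c
+++ b/sys/netinet/udp_usrreq.c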
@@ -915,7 +915,11 @@ udp_attach(struct socket *so, int proto, struct pru_attach_info *ai)
        error = in_pcballoc(so, &udbinfo);
        if (error)
                return error;
-       so->so_port = udp_soport_attach(so);
+
+       /*
+        * Set default port for protocol processing prior to bind/connect.
+        */
+       sosetport(so, cpu_portfn(0));
 
        inp = (struct inpcb *)so->so_pcb;
        inp->inp_vflag |= INP_IPV4;
@@ -1004,6 +1008,7 @@ udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
        port = udp_addrport(sin->sin_addr.s_addr, sin->sin_port,
                            inp->inp_laddr.s_addr, inp->inp_lport);
 #ifdef SMP
+       sosetport(so, port);
        if (port != &curthread->td_msgport) {
                struct netmsg_udp_connect msg;
                struct route *ro = &inp->inp_route;
@@ -1032,6 +1037,7 @@ udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
                error = udp_connect_oncpu(so, td, sin, if_sin);
        }
 #else
+       KKASSERT(port == &curthread->td_msgport);
        error = udp_connect_oncpu(so, td, sin, if_sin);
 #endif
        return (error);
@@ -1057,7 +1063,6 @@ udp_connect_oncpu(struct socket *so, struct thread *td,
                 * socket.
                 */
                soisconnected(so);
-               sosetport(so, &curthread->td_msgport);
        } else if (error == EAFNOSUPPORT) {     /* connection dissolved */
                /*
                 * Follow traditional BSD behavior and retain
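diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h
index f781f90..e0c8b7f 100644
--- a/sys/netinet/udp_var.h
+++ b/sys/netinet/udp_var.h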
@@ -157,7 +157,6 @@ void                        udp_notify (struct inpcb *inp, int error);
 int                    udp_shutdown (struct socket *so);
 struct lwkt_port       *udp_soport (struct socket *, struct sockaddr *,
                                     struct mbuf **);
-struct lwkt_port       *udp_soport_attach (struct socket *);
 struct lwkt_port       *udp_ctlport (int, struct sockaddr *, void *);
 struct lwkt_port       *udp_cport (int);
 
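diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index a69fee2..c78d717 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c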
@@ -685,6 +685,7 @@ in6_pcbdetach(struct inpcb *inp)
        inp->inp_gencnt = ++ipi->ipi_gencnt;
        in_pcbremlists(inp);
        so->so_pcb = NULL;
+       KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);
        sofree(so);             /* remove pcb ref */
 
        if (inp->in6p_options)
@@ -940,17 +941,25 @@ in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr,
        struct inpcb *match = NULL;
 
        /*
+        * If the porthashbase is shared across several cpus we need
+        * to lock.
+        */
+       if (pcbinfo->porttoken)
+               lwkt_gettoken(pcbinfo->porttoken);
+
+       /*
         * Best fit PCB lookup.
         *
         * First see if this local port is in use by looking on the
         * port hash list.
         */
-       porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport,
-           pcbinfo->porthashmask)];
+       porthash = &pcbinfo->porthashbase[
+                               INP_PCBPORTHASH(lport, pcbinfo->porthashmask)];
        LIST_FOREACH(phd, porthash, phd_hash) {
                if (phd->phd_port == lport)
                        break;
        }
+
        if (phd != NULL) {
                /*
                 * Port is in use by one or more PCBs. Look for best
@@ -987,6 +996,9 @@ in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr,
                        }
                }
        }
+       if (pcbinfo->porttoken)
+               lwkt_reltoken(pcbinfo->porttoken);
+
        return (match);
 }
 
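diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c
index a1d50ea..04ef9e7 100644
--- a/sys/netinet6/udp6_usrreq.c
+++ b/sys/netinet6/udp6_usrreq.c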
@@ -568,7 +568,7 @@ udp6_attach(struct socket *so, int proto, struct pru_attach_info *ai)
        crit_exit();
        if (error)
                return error;
-       so->so_port = udp_soport_attach(so);
+       sosetport(so, cpu_portfn(0));
        inp = (struct inpcb *)so->so_pcb;
        inp->inp_vflag |= INP_IPV6;
        if (!ip6_v6only)
@@ -670,7 +670,6 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
                        inp->inp_vflag &= ~INP_IPV4;
                        inp->inp_vflag |= INP_IPV6;
                }
-               /* sosetport(so, port); here, see udp v4 code */
                soisconnected(so);
        } else if (error == EAFNOSUPPORT) {     /* connection dissolved */
                /*