Virtio_Balloon implementation for DragonFly
[dragonfly.git] / sys / netinet / in_pcb.c
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33
34 /*
35  * Copyright (c) 1982, 1986, 1991, 1993, 1995
36  *      The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
63  * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.27 2004/01/02 04:06:42 ambrisko Exp $
64  */
65
66 #include "opt_inet6.h"
67
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/malloc.h>
71 #include <sys/mbuf.h>
72 #include <sys/domain.h>
73 #include <sys/protosw.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/proc.h>
77 #include <sys/priv.h>
78 #include <sys/jail.h>
79 #include <sys/kernel.h>
80 #include <sys/sysctl.h>
81
82 #include <sys/thread2.h>
83 #include <sys/socketvar2.h>
84 #include <sys/msgport2.h>
85
86 #include <machine/limits.h>
87
88 #include <net/if.h>
89 #include <net/if_types.h>
90 #include <net/route.h>
91 #include <net/netisr2.h>
92 #include <net/toeplitz2.h>
93
94 #include <netinet/in.h>
95 #include <netinet/in_pcb.h>
96 #include <netinet/in_var.h>
97 #include <netinet/ip_var.h>
98 #ifdef INET6
99 #include <netinet/ip6.h>
100 #include <netinet6/ip6_var.h>
101 #endif /* INET6 */
102
103 #define INP_LOCALGROUP_SIZMIN   8
104 #define INP_LOCALGROUP_SIZMAX   256
105
106 static struct inpcb *in_pcblookup_local(struct inpcbporthead *porthash,
107                 struct in_addr laddr, u_int lport_arg, int wild_okay,
108                 struct ucred *cred);
109
110 struct in_addr zeroin_addr;
111
112 /*
113  * These configure the range of local port addresses assigned to
114  * "unspecified" outgoing connections/packets/whatever.
115  */
116 int ipport_lowfirstauto = IPPORT_RESERVED - 1;  /* 1023 */
117 int ipport_lowlastauto = IPPORT_RESERVEDSTART;  /* 600 */
118
119 int ipport_firstauto = IPPORT_RESERVED;         /* 1024 */
120 int ipport_lastauto = IPPORT_USERRESERVED;      /* 5000 */
121
122 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO;    /* 49152 */
123 int ipport_hilastauto = IPPORT_HILASTAUTO;      /* 65535 */
124
/*
 * Clamp the lvalue 'var' into the closed interval [min, max].
 *
 * The body is wrapped in do { } while (0) so that "RANGECHK(...);" is a
 * single statement and stays correct inside an unbraced if/else.
 * NOTE: 'var' is evaluated more than once; pass a plain variable.
 */
#define RANGECHK(var, min, max) \
	do { \
		if ((var) < (min)) \
			(var) = (min); \
		else if ((var) > (max)) \
			(var) = (max); \
	} while (0)
128
/* UDP encapsulation settings. */
int udpencap_enable = 1;	/* enabled by default */
int udpencap_port = 4500;	/* triggers decapsulation */

/*
 * Per-netisr inpcb markers.
 *
 * NOTE: they should only be used in netisrs.
 */
static struct inpcb		*in_pcbmarkers;
static struct inpcontainer	*in_pcbcontainer_markers;
138
139 static int
140 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
141 {
142         int error;
143
144         error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
145         if (!error) {
146                 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
147                 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
148
149                 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
150                 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
151
152                 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
153                 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
154         }
155         return (error);
156 }
157
158 #undef RANGECHK
159
160 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
161
162 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
163            &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
164 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
165            &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
166 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
167            &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
168 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
169            &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
170 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
171            &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
172 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
173            &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
174
175 /* Initialized by ip_init() */
176 int ip_porthash_trycount;
177 SYSCTL_INT(_net_inet_ip, OID_AUTO, porthash_trycount, CTLFLAG_RW,
178     &ip_porthash_trycount, 0,
179     "Number of tries to find local port matching hash of 4-tuple");
180
181 /*
182  * in_pcb.c: manage the Protocol Control Blocks.
183  *
184  * NOTE: It is assumed that most of these functions will be called from
185  * a critical section.  XXX - There are, unfortunately, a few exceptions
186  * to this rule that should be fixed.
187  *
188  * NOTE: The caller should initialize the cpu field to the cpu running the
189  * protocol stack associated with this inpcbinfo.
190  */
191
192 void
193 in_pcbinfo_init(struct inpcbinfo *pcbinfo, int cpu, boolean_t shared)
194 {
195         KASSERT(cpu >= 0 && cpu < netisr_ncpus, ("invalid cpu%d", cpu));
196         pcbinfo->cpu = cpu;
197
198         LIST_INIT(&pcbinfo->pcblisthead);
199         pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), M_PCB,
200                                     M_WAITOK | M_ZERO);
201
202         if (shared) {
203                 pcbinfo->infotoken = kmalloc(sizeof(struct lwkt_token),
204                     M_PCB, M_WAITOK);
205                 lwkt_token_init(pcbinfo->infotoken, "infotoken");
206         } else {
207                 pcbinfo->infotoken = NULL;
208         }
209 }
210
211 void
212 in_pcbportinfo_set(struct inpcbinfo *pcbinfo, struct inpcbportinfo *portinfo,
213     int portinfo_cnt)
214 {
215
216         KASSERT(portinfo_cnt > 0, ("invalid portinfo_cnt %d", portinfo_cnt));
217         pcbinfo->portinfo = portinfo;
218         pcbinfo->portinfo_cnt = portinfo_cnt;
219 }
220
221 struct baddynamicports baddynamicports;
222
223 /*
224  * Check if the specified port is invalid for dynamic allocation.
225  */
226 int
227 in_baddynamic(u_int16_t port, u_int16_t proto)
228 {
229         switch (proto) {
230         case IPPROTO_TCP:
231                 return (DP_ISSET(baddynamicports.tcp, port));
232         case IPPROTO_UDP:
233                 return (DP_ISSET(baddynamicports.udp, port));
234         default:
235                 return (0);
236         }
237 }
238
239 void
240 in_pcbonlist(struct inpcb *inp)
241 {
242         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
243
244         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
245             ("not in the correct netisr"));
246         KASSERT((inp->inp_flags & INP_ONLIST) == 0, ("already on pcblist"));
247         inp->inp_flags |= INP_ONLIST;
248
249         GET_PCBINFO_TOKEN(pcbinfo);
250         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list);
251         pcbinfo->ipi_count++;
252         REL_PCBINFO_TOKEN(pcbinfo);
253 }
254
255 void
256 in_pcbofflist(struct inpcb *inp)
257 {
258         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
259
260         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
261             ("not in the correct netisr"));
262         KASSERT(inp->inp_flags & INP_ONLIST, ("not on pcblist"));
263         inp->inp_flags &= ~INP_ONLIST;
264
265         GET_PCBINFO_TOKEN(pcbinfo);
266         LIST_REMOVE(inp, inp_list);
267         KASSERT(pcbinfo->ipi_count > 0,
268             ("invalid inpcb count %d", pcbinfo->ipi_count));
269         pcbinfo->ipi_count--;
270         REL_PCBINFO_TOKEN(pcbinfo);
271 }
272
273 /*
274  * Allocate a PCB and associate it with the socket.
275  */
276 int
277 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
278 {
279         struct inpcb *inp;
280
281         inp = kmalloc(pcbinfo->ipi_size, M_PCB, M_WAITOK|M_ZERO|M_NULLOK);
282         if (inp == NULL)
283                 return (ENOMEM);
284         inp->inp_lgrpindex = -1;
285         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
286         inp->inp_pcbinfo = pcbinfo;
287         inp->inp_socket = so;
288 #ifdef INET6
289         if (INP_CHECK_SOCKAF(so, AF_INET6)) {
290                 if (ip6_auto_flowlabel)
291                         inp->inp_flags |= IN6P_AUTOFLOWLABEL;
292                 inp->inp_af = AF_INET6;
293         } else
294 #endif
295         inp->inp_af = AF_INET;
296         soreference(so);
297         so->so_pcb = inp;
298
299         in_pcbonlist(inp);
300         return (0);
301 }
302
303 /*
304  * Unlink a pcb with the intention of moving it to another cpu with a
305  * different pcbinfo.  While unlinked nothing should attempt to dereference
306  * inp_pcbinfo, NULL it out so we assert if it does.
307  */
308 void
309 in_pcbunlink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags)
310 {
311         KASSERT(inp->inp_pcbinfo == pcbinfo, ("pcbinfo mismatch"));
312         KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0,
313             ("already linked"));
314
315         in_pcbofflist(inp);
316         inp->inp_pcbinfo = NULL;
317 }
318
319 void
320 in_pcbunlink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
321 {
322         in_pcbunlink_flags(inp, pcbinfo, INP_WILDCARD);
323 }
324
325 /*
326  * Relink a pcb into a new pcbinfo.
327  */
328 void
329 in_pcblink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags)
330 {
331         KASSERT(inp->inp_pcbinfo == NULL, ("has pcbinfo"));
332         KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0,
333             ("already linked"));
334
335         inp->inp_pcbinfo = pcbinfo;
336         in_pcbonlist(inp);
337 }
338
339 void
340 in_pcblink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
341 {
342         return in_pcblink_flags(inp, pcbinfo, INP_WILDCARD);
343 }
344
345 static boolean_t
346 in_pcbporthash_update(struct inpcbportinfo *portinfo,
347     struct inpcb *inp, u_short lport, struct ucred *cred, int wild)
348 {
349         struct inpcbporthead *porthash;
350
351         /*
352          * This has to be atomic.  If the porthash is shared across multiple
353          * protocol threads, e.g. tcp and udp, then the token must be held.
354          */
355         porthash = in_pcbporthash_head(portinfo, lport);
356         GET_PORTHASH_TOKEN(porthash);
357
358         if (in_pcblookup_local(porthash, inp->inp_laddr, lport,
359             wild, cred) != NULL) {
360                 REL_PORTHASH_TOKEN(porthash);
361                 return FALSE;
362         }
363         inp->inp_lport = lport;
364         in_pcbinsporthash(porthash, inp);
365
366         REL_PORTHASH_TOKEN(porthash);
367         return TRUE;
368 }
369
370 static int
371 in_pcbsetlport(struct inpcb *inp, int wild, struct ucred *cred)
372 {
373         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
374         struct inpcbportinfo *portinfo;
375         u_short first, last, lport, step, first0, last0;
376         int count, error;
377         int portinfo_first, portinfo_idx;
378         uint32_t cut;
379
380         inp->inp_flags |= INP_ANONPORT;
381
382         step = pcbinfo->portinfo_cnt;
383         portinfo_first = mycpuid % pcbinfo->portinfo_cnt;
384         portinfo_idx = portinfo_first;
385
386         if (inp->inp_flags & INP_HIGHPORT) {
387                 first0 = ipport_hifirstauto;    /* sysctl */
388                 last0  = ipport_hilastauto;
389         } else if (inp->inp_flags & INP_LOWPORT) {
390                 if (cred &&
391                     (error =
392                      priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
393                         inp->inp_laddr.s_addr = INADDR_ANY;
394                         return error;
395                 }
396                 first0 = ipport_lowfirstauto;   /* 1023 */
397                 last0  = ipport_lowlastauto;    /* 600 */
398         } else {
399                 first0 = ipport_firstauto;      /* sysctl */
400                 last0  = ipport_lastauto;
401         }
402         if (first0 > last0) {
403                 lport = last0;
404                 last0 = first0;
405                 first0 = lport;
406         }
407         KKASSERT(last0 >= first0);
408
409         cut = karc4random();
410 loop:
411         portinfo = &pcbinfo->portinfo[portinfo_idx];
412         first = first0;
413         last = last0;
414
415         /*
416          * Simple check to ensure all ports are not used up causing
417          * a deadlock here.
418          */
419         in_pcbportrange(&last, &first, portinfo->offset, step);
420         lport = last - first;
421         count = lport / step;
422
423         lport = rounddown(cut % lport, step) + first;
424         KKASSERT(lport % step == portinfo->offset);
425
426         for (;;) {
427                 if (count-- < 0) {      /* completely used? */
428                         error = EADDRNOTAVAIL;
429                         break;
430                 }
431
432                 if (__predict_false(lport < first || lport > last)) {
433                         lport = first;
434                         KKASSERT(lport % step == portinfo->offset);
435                 }
436
437                 if (in_pcbporthash_update(portinfo, inp, htons(lport),
438                     cred, wild)) {
439                         error = 0;
440                         break;
441                 }
442
443                 lport += step;
444                 KKASSERT(lport % step == portinfo->offset);
445         }
446
447         if (error) {
448                 /* Try next portinfo */
449                 portinfo_idx++;
450                 portinfo_idx %= pcbinfo->portinfo_cnt;
451                 if (portinfo_idx != portinfo_first)
452                         goto loop;
453                 inp->inp_laddr.s_addr = INADDR_ANY;
454         }
455         return error;
456 }
457
458 int
459 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
460 {
461         struct socket *so = inp->inp_socket;
462         struct sockaddr_in jsin;
463         struct ucred *cred = NULL;
464         int wild = 0;
465
466         if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! */
467                 return (EADDRNOTAVAIL);
468         if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
469                 return (EINVAL);        /* already bound */
470
471         if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT)))
472                 wild = 1;    /* neither SO_REUSEADDR nor SO_REUSEPORT is set */
473         if (td->td_proc)
474                 cred = td->td_proc->p_ucred;
475
476         if (nam != NULL) {
477                 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
478                 struct inpcbinfo *pcbinfo;
479                 struct inpcbportinfo *portinfo;
480                 struct inpcbporthead *porthash;
481                 struct inpcb *t;
482                 u_short lport, lport_ho;
483                 int reuseport = (so->so_options & SO_REUSEPORT);
484                 int error;
485
486                 if (nam->sa_len != sizeof *sin)
487                         return (EINVAL);
488 #ifdef notdef
489                 /*
490                  * We should check the family, but old programs
491                  * incorrectly fail to initialize it.
492                  */
493                 if (sin->sin_family != AF_INET)
494                         return (EAFNOSUPPORT);
495 #endif
496                 if (!prison_replace_wildcards(td, nam))
497                         return (EINVAL);
498
499                 lport = sin->sin_port;
500                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
501                         /*
502                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
503                          * allow complete duplication of binding if
504                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
505                          * and a multicast address is bound on both
506                          * new and duplicated sockets.
507                          */
508                         if (so->so_options & SO_REUSEADDR)
509                                 reuseport = SO_REUSEADDR | SO_REUSEPORT;
510                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
511                         sin->sin_port = 0;              /* yech... */
512                         bzero(&sin->sin_zero, sizeof sin->sin_zero);
513                         if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL)
514                                 return (EADDRNOTAVAIL);
515                 }
516
517                 inp->inp_laddr = sin->sin_addr;
518
519                 jsin.sin_family = AF_INET;
520                 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
521                 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
522                         inp->inp_laddr.s_addr = INADDR_ANY;
523                         return (EINVAL);
524                 }
525                 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
526
527                 if (lport == 0) {
528                         /* Auto-select local port */
529                         return in_pcbsetlport(inp, wild, cred);
530                 }
531                 lport_ho = ntohs(lport);
532
533                 /* GROSS */
534                 if (lport_ho < IPPORT_RESERVED && cred &&
535                     (error =
536                      priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
537                         inp->inp_laddr.s_addr = INADDR_ANY;
538                         return (error);
539                 }
540
541                 /*
542                  * Locate the proper portinfo based on lport
543                  */
544                 pcbinfo = inp->inp_pcbinfo;
545                 portinfo =
546                     &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt];
547                 KKASSERT((lport_ho % pcbinfo->portinfo_cnt) ==
548                     portinfo->offset);
549
550                 /*
551                  * This has to be atomic.  If the porthash is shared across
552                  * multiple protocol threads, e.g. tcp and udp then the token
553                  * must be held.
554                  */
555                 porthash = in_pcbporthash_head(portinfo, lport);
556                 GET_PORTHASH_TOKEN(porthash);
557
558                 if (so->so_cred->cr_uid != 0 &&
559                     !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
560                         t = in_pcblookup_local(porthash, sin->sin_addr, lport,
561                             INPLOOKUP_WILDCARD, cred);
562                         if (t &&
563                             (so->so_cred->cr_uid !=
564                              t->inp_socket->so_cred->cr_uid)) {
565                                 inp->inp_laddr.s_addr = INADDR_ANY;
566                                 error = EADDRINUSE;
567                                 goto done;
568                         }
569                 }
570                 if (cred && !prison_replace_wildcards(td, nam)) {
571                         inp->inp_laddr.s_addr = INADDR_ANY;
572                         error = EADDRNOTAVAIL;
573                         goto done;
574                 }
575                 t = in_pcblookup_local(porthash, sin->sin_addr, lport,
576                     wild, cred);
577                 if (t && !(reuseport & t->inp_socket->so_options)) {
578                         inp->inp_laddr.s_addr = INADDR_ANY;
579                         error = EADDRINUSE;
580                         goto done;
581                 }
582                 inp->inp_lport = lport;
583                 in_pcbinsporthash(porthash, inp);
584                 error = 0;
585 done:
586                 REL_PORTHASH_TOKEN(porthash);
587                 return (error);
588         } else {
589                 jsin.sin_family = AF_INET;
590                 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
591                 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
592                         inp->inp_laddr.s_addr = INADDR_ANY;
593                         return (EINVAL);
594                 }
595                 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
596
597                 return in_pcbsetlport(inp, wild, cred);
598         }
599 }
600
601 static struct inpcb *
602 in_pcblookup_localremote(struct inpcbporthead *porthash, struct in_addr laddr,
603     u_short lport, struct in_addr faddr, u_short fport, struct ucred *cred)
604 {
605         struct inpcb *inp;
606         struct inpcbport *phd;
607         struct inpcb *match = NULL;
608
609         /*
610          * If the porthashbase is shared across several cpus, it must
611          * have been locked.
612          */
613         ASSERT_PORTHASH_TOKEN_HELD(porthash);
614
615         /*
616          * Best fit PCB lookup.
617          *
618          * First see if this local port is in use by looking on the
619          * port hash list.
620          */
621         LIST_FOREACH(phd, porthash, phd_hash) {
622                 if (phd->phd_port == lport)
623                         break;
624         }
625         if (phd != NULL) {
626                 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
627 #ifdef INET6
628                         if (!INP_ISIPV4(inp))
629                                 continue;
630 #endif
631                         if (inp->inp_laddr.s_addr != INADDR_ANY &&
632                             inp->inp_laddr.s_addr != laddr.s_addr)
633                                 continue;
634
635                         if (inp->inp_faddr.s_addr != INADDR_ANY &&
636                             inp->inp_faddr.s_addr != faddr.s_addr)
637                                 continue;
638
639                         if (inp->inp_fport != 0 && inp->inp_fport != fport)
640                                 continue;
641
642                         if (cred == NULL ||
643                             cred->cr_prison ==
644                             inp->inp_socket->so_cred->cr_prison) {
645                                 match = inp;
646                                 break;
647                         }
648                 }
649         }
650         return (match);
651 }
652
653 static boolean_t
654 in_pcbporthash_update4(struct inpcbportinfo *portinfo,
655     struct inpcb *inp, u_short lport, const struct sockaddr_in *sin,
656     struct ucred *cred)
657 {
658         struct inpcbporthead *porthash;
659
660         /*
661          * This has to be atomic.  If the porthash is shared across multiple
662          * protocol threads, e.g. tcp and udp, then the token must be held.
663          */
664         porthash = in_pcbporthash_head(portinfo, lport);
665         GET_PORTHASH_TOKEN(porthash);
666
667         if (in_pcblookup_localremote(porthash, inp->inp_laddr,
668             lport, sin->sin_addr, sin->sin_port, cred) != NULL) {
669                 REL_PORTHASH_TOKEN(porthash);
670                 return FALSE;
671         }
672         inp->inp_lport = lport;
673         in_pcbinsporthash(porthash, inp);
674
675         REL_PORTHASH_TOKEN(porthash);
676         return TRUE;
677 }
678
679 int
680 in_pcbbind_remote(struct inpcb *inp, const struct sockaddr *remote,
681     struct thread *td)
682 {
683         struct proc *p = td->td_proc;
684         const struct sockaddr_in *sin = (const struct sockaddr_in *)remote;
685         struct sockaddr_in jsin;
686         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
687         struct ucred *cred = NULL;
688         u_short first, last, lport;
689         int count, hash_count;
690         int error, selfconn = 0;
691         int cpuid = mycpuid;
692         uint32_t hash_base = 0, hash;
693
694         ASSERT_NETISR_NCPUS(cpuid);
695
696         if (TAILQ_EMPTY(&in_ifaddrheads[cpuid])) /* XXX broken! */
697                 return (EADDRNOTAVAIL);
698
699         KKASSERT(inp->inp_laddr.s_addr != INADDR_ANY);
700         if (inp->inp_lport != 0)
701                 return (EINVAL);        /* already bound */
702
703         KKASSERT(p);
704         cred = p->p_ucred;
705
706         jsin.sin_family = AF_INET;
707         jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
708         if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
709                 inp->inp_laddr.s_addr = INADDR_ANY;
710                 return (EINVAL);
711         }
712         inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
713
714         hash_count = ip_porthash_trycount;
715         if (hash_count > 0) {
716                 hash_base = toeplitz_piecemeal_addr(sin->sin_addr.s_addr) ^
717                     toeplitz_piecemeal_addr(inp->inp_laddr.s_addr) ^
718                     toeplitz_piecemeal_port(sin->sin_port);
719         } else {
720                 hash_count = 0;
721         }
722
723         inp->inp_flags |= INP_ANONPORT;
724
725         if (inp->inp_flags & INP_HIGHPORT) {
726                 first = ipport_hifirstauto;     /* sysctl */
727                 last  = ipport_hilastauto;
728         } else if (inp->inp_flags & INP_LOWPORT) {
729                 if (cred &&
730                     (error =
731                      priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
732                         inp->inp_laddr.s_addr = INADDR_ANY;
733                         return (error);
734                 }
735                 first = ipport_lowfirstauto;    /* 1023 */
736                 last = ipport_lowlastauto;      /* 600 */
737         } else {
738                 first = ipport_firstauto;       /* sysctl */
739                 last  = ipport_lastauto;
740         }
741         if (first > last) {
742                 lport = last;
743                 last = first;
744                 first = lport;
745         }
746         KKASSERT(last >= first);
747
748         count = last - first;
749         lport = (karc4random() % count) + first;
750         count += hash_count;
751
752         /*
753          * Simple check to ensure all ports are not used up causing
754          * a deadlock here.
755          */
756         for (;;) {
757                 u_short lport_no;
758
759                 if (count-- < 0) {      /* completely used? */
760                         error = EADDRNOTAVAIL;
761                         break;
762                 }
763
764                 if (__predict_false(lport < first || lport > last))
765                         lport = first;
766                 lport_no = htons(lport);
767
768                 /* This could happen on loopback interface */
769                 if (__predict_false(sin->sin_port == lport_no &&
770                     sin->sin_addr.s_addr == inp->inp_laddr.s_addr)) {
771                         if (!selfconn) {
772                                 ++count; /* don't count this try */
773                                 selfconn = 1;
774                         }
775                         goto next;
776                 }
777
778                 if (hash_count) {
779                         --hash_count;
780                         hash = hash_base ^
781                             toeplitz_piecemeal_port(lport_no);
782                         if (netisr_hashcpu(hash) != cpuid && hash_count)
783                                 goto next;
784                 }
785
786                 if (in_pcbporthash_update4(
787                     &pcbinfo->portinfo[lport % pcbinfo->portinfo_cnt],
788                     inp, lport_no, sin, cred)) {
789                         error = 0;
790                         break;
791                 }
792 next:
793                 ++lport;
794         }
795
796         if (error)
797                 inp->inp_laddr.s_addr = INADDR_ANY;
798         return (error);
799 }
800
801 /*
802  *   Transform old in_pcbconnect() into an inner subroutine for new
803  *   in_pcbconnect(): Do some validity-checking on the remote
804  *   address (in mbuf 'nam') and then determine local host address
805  *   (i.e., which interface) to use to access that remote host.
806  *
807  *   This preserves definition of in_pcbconnect(), while supporting a
808  *   slightly different version for T/TCP.  (This is more than
809  *   a bit of a kludge, but cleaning up the internal interfaces would
810  *   have forced minor changes in every protocol).
811  */
812 int
813 in_pcbladdr_find(struct inpcb *inp, struct sockaddr *nam,
814     struct sockaddr_in **plocal_sin, struct thread *td, int find)
815 {
816         struct in_ifaddr *ia;
817         struct ucred *cred = NULL;
818         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
819         struct sockaddr *jsin;
820         int jailed = 0, alloc_route = 0;
821
822         if (nam->sa_len != sizeof *sin)
823                 return (EINVAL);
824         if (sin->sin_family != AF_INET)
825                 return (EAFNOSUPPORT);
826         if (sin->sin_port == 0)
827                 return (EADDRNOTAVAIL);
828         if (td && td->td_proc && td->td_proc->p_ucred)
829                 cred = td->td_proc->p_ucred;
830         if (cred && cred->cr_prison)
831                 jailed = 1;
832         if (!TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) {
833                 ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia;
834                 /*
835                  * If the destination address is INADDR_ANY,
836                  * use the primary local address.
837                  * If the supplied address is INADDR_BROADCAST,
838                  * and the primary interface supports broadcast,
839                  * choose the broadcast address for that interface.
840                  */
841                 if (sin->sin_addr.s_addr == INADDR_ANY)
842                         sin->sin_addr = IA_SIN(ia)->sin_addr;
843                 else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST &&
844                     (ia->ia_ifp->if_flags & IFF_BROADCAST))
845                         sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr;
846         }
847         if (find) {
848                 struct route *ro;
849
850                 ia = NULL;
851                 /*
852                  * If route is known or can be allocated now,
853                  * our src addr is taken from the i/f, else punt.
854                  * Note that we should check the address family of the cached
855                  * destination, in case of sharing the cache with IPv6.
856                  */
857                 ro = &inp->inp_route;
858                 if (ro->ro_rt &&
859                     (!(ro->ro_rt->rt_flags & RTF_UP) ||
860                      ro->ro_dst.sa_family != AF_INET ||
861                      satosin(&ro->ro_dst)->sin_addr.s_addr !=
862                                       sin->sin_addr.s_addr ||
863                      inp->inp_socket->so_options & SO_DONTROUTE)) {
864                         RTFREE(ro->ro_rt);
865                         ro->ro_rt = NULL;
866                 }
867                 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/
868                     (ro->ro_rt == NULL ||
869                     ro->ro_rt->rt_ifp == NULL)) {
870                         /* No route yet, so try to acquire one */
871                         bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
872                         ro->ro_dst.sa_family = AF_INET;
873                         ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
874                         ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
875                                 sin->sin_addr;
876                         rtalloc(ro);
877                         alloc_route = 1;
878                 }
879                 /*
880                  * If we found a route, use the address
881                  * corresponding to the outgoing interface
882                  * unless it is the loopback (in case a route
883                  * to our address on another net goes to loopback).
884                  */
885                 if (ro->ro_rt &&
886                     !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
887                         if (jailed) {
888                                 if (jailed_ip(cred->cr_prison, 
889                                     ro->ro_rt->rt_ifa->ifa_addr)) {
890                                         ia = ifatoia(ro->ro_rt->rt_ifa);
891                                 }
892                         } else {
893                                 ia = ifatoia(ro->ro_rt->rt_ifa);
894                         }
895                 }
896                 if (ia == NULL) {
897                         u_short fport = sin->sin_port;
898
899                         sin->sin_port = 0;
900                         ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
901                         if (ia && jailed && !jailed_ip(cred->cr_prison,
902                             sintosa(&ia->ia_addr)))
903                                 ia = NULL;
904                         if (ia == NULL)
905                                 ia = ifatoia(ifa_ifwithnet(sintosa(sin)));
906                         if (ia && jailed && !jailed_ip(cred->cr_prison,
907                             sintosa(&ia->ia_addr)))
908                                 ia = NULL;
909                         sin->sin_port = fport;
910                         if (ia == NULL &&
911                             !TAILQ_EMPTY(&in_ifaddrheads[mycpuid]))
912                                 ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia;
913                         if (ia && jailed && !jailed_ip(cred->cr_prison,
914                             sintosa(&ia->ia_addr)))
915                                 ia = NULL;
916
917                         if (!jailed && ia == NULL)
918                                 goto fail;
919                 }
920                 /*
921                  * If the destination address is multicast and an outgoing
922                  * interface has been set as a multicast option, use the
923                  * address of that interface as our source address.
924                  */
925                 if (!jailed && IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
926                     inp->inp_moptions != NULL) {
927                         struct ip_moptions *imo;
928                         struct ifnet *ifp;
929
930                         imo = inp->inp_moptions;
931                         if ((ifp = imo->imo_multicast_ifp) != NULL) {
932                                 struct in_ifaddr_container *iac;
933
934                                 ia = NULL;
935                                 TAILQ_FOREACH(iac,
936                                 &in_ifaddrheads[mycpuid], ia_link) {
937                                         if (iac->ia->ia_ifp == ifp) {
938                                                 ia = iac->ia;
939                                                 break;
940                                         }
941                                 }
942                                 if (ia == NULL)
943                                         goto fail;
944                         }
945                 }
946                 /*
947                  * Don't do pcblookup call here; return interface in plocal_sin
948                  * and exit to caller, that will do the lookup.
949                  */
950                 if (ia == NULL && jailed) {
951                         if ((jsin = prison_get_nonlocal(
952                                 cred->cr_prison, AF_INET, NULL)) != NULL ||
953                             (jsin = prison_get_local(
954                                 cred->cr_prison, AF_INET, NULL)) != NULL) {
955                                 *plocal_sin = satosin(jsin);
956                         } else {
957                                 /* IPv6 only Jail */
958                                 goto fail;
959                         }
960                 } else {
961                         *plocal_sin = &ia->ia_addr;
962                 }
963         }
964         return (0);
965 fail:
966         if (alloc_route)
967                 in_pcbresetroute(inp);
968         return (EADDRNOTAVAIL);
969 }
970
971 int
972 in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
973     struct sockaddr_in **plocal_sin, struct thread *td)
974 {
975         return in_pcbladdr_find(inp, nam, plocal_sin, td,
976             (inp->inp_laddr.s_addr == INADDR_ANY));
977 }
978
979 /*
980  * Outer subroutine:
981  * Connect from a socket to a specified address.
982  * Both address and port must be specified in argument sin.
983  * If don't have a local address for this socket yet,
984  * then pick one.
985  */
986 int
987 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
988 {
989         struct sockaddr_in *if_sin;
990         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
991         int error;
992
993         if_sin = NULL;  /* avoid gcc warnings */
994
995         /* Call inner routine to assign local interface address. */
996         if ((error = in_pcbladdr(inp, nam, &if_sin, td)) != 0)
997                 return (error);
998
999         if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
1000                               inp->inp_laddr.s_addr ?
1001                                 inp->inp_laddr : if_sin->sin_addr,
1002                               inp->inp_lport, FALSE, NULL) != NULL) {
1003                 return (EADDRINUSE);
1004         }
1005         if (inp->inp_laddr.s_addr == INADDR_ANY) {
1006                 if (inp->inp_lport == 0) {
1007                         error = in_pcbbind(inp, NULL, td);
1008                         if (error)
1009                                 return (error);
1010                 }
1011                 inp->inp_laddr = if_sin->sin_addr;
1012         }
1013         inp->inp_faddr = sin->sin_addr;
1014         inp->inp_fport = sin->sin_port;
1015         in_pcbinsconnhash(inp);
1016         return (0);
1017 }
1018
1019 void
1020 in_pcbdisconnect(struct inpcb *inp)
1021 {
1022
1023         in_pcbremconnhash(inp);
1024         inp->inp_faddr.s_addr = INADDR_ANY;
1025         inp->inp_fport = 0;
1026 }
1027
1028 void
1029 in_pcbdetach(struct inpcb *inp)
1030 {
1031         struct socket *so = inp->inp_socket;
1032         struct inpcbinfo *ipi = inp->inp_pcbinfo;
1033
1034         inp->inp_gencnt = ++ipi->ipi_gencnt;
1035         KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);
1036         in_pcbremlists(inp);
1037         so->so_pcb = NULL;
1038         sofree(so);                     /* remove pcb ref */
1039         if (inp->inp_options)
1040                 m_free(inp->inp_options);
1041         if (inp->inp_route.ro_rt)
1042                 rtfree(inp->inp_route.ro_rt);
1043         ip_freemoptions(inp->inp_moptions);
1044         kfree(inp, M_PCB);
1045 }
1046
1047 /*
1048  * The socket may have an invalid PCB, i.e. NULL.  For example, a TCP
1049  * socket received RST.
1050  */
1051 static int
1052 in_setsockaddr(struct socket *so, struct sockaddr **nam)
1053 {
1054         struct inpcb *inp;
1055         struct sockaddr_in *sin;
1056
1057         KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
1058         inp = so->so_pcb;
1059         if (!inp)
1060                 return (ECONNRESET);
1061
1062         sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO);
1063         sin->sin_family = AF_INET;
1064         sin->sin_len = sizeof *sin;
1065         sin->sin_port = inp->inp_lport;
1066         sin->sin_addr = inp->inp_laddr;
1067
1068         *nam = (struct sockaddr *)sin;
1069         return (0);
1070 }
1071
1072 void
1073 in_setsockaddr_dispatch(netmsg_t msg)
1074 {
1075         int error;
1076
1077         error = in_setsockaddr(msg->base.nm_so, msg->peeraddr.nm_nam);
1078         lwkt_replymsg(&msg->lmsg, error);
1079 }
1080
1081 /*
1082  * The socket may have an invalid PCB, i.e. NULL.  For example, a TCP
1083  * socket received RST.
1084  */
1085 int
1086 in_setpeeraddr(struct socket *so, struct sockaddr **nam)
1087 {
1088         struct inpcb *inp;
1089         struct sockaddr_in *sin;
1090
1091         KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
1092         inp = so->so_pcb;
1093         if (!inp)
1094                 return (ECONNRESET);
1095
1096         sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO);
1097         sin->sin_family = AF_INET;
1098         sin->sin_len = sizeof *sin;
1099         sin->sin_port = inp->inp_fport;
1100         sin->sin_addr = inp->inp_faddr;
1101
1102         *nam = (struct sockaddr *)sin;
1103         return (0);
1104 }
1105
1106 void
1107 in_setpeeraddr_dispatch(netmsg_t msg)
1108 {
1109         int error;
1110
1111         error = in_setpeeraddr(msg->base.nm_so, msg->peeraddr.nm_nam);
1112         lwkt_replymsg(&msg->lmsg, error);
1113 }
1114
1115 void
1116 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int err,
1117     inp_notify_t notify)
1118 {
1119         struct inpcb *inp, *marker;
1120
1121         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
1122             ("not in the correct netisr"));
1123         marker = in_pcbmarker();
1124
1125         /*
1126          * NOTE:
1127          * - If INP_PLACEMARKER is set we must ignore the rest of the
1128          *   structure and skip it.
1129          * - It is safe to nuke inpcbs here, since we are in their own
1130          *   netisr.
1131          */
1132         GET_PCBINFO_TOKEN(pcbinfo);
1133
1134         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list);
1135         while ((inp = LIST_NEXT(marker, inp_list)) != NULL) {
1136                 LIST_REMOVE(marker, inp_list);
1137                 LIST_INSERT_AFTER(inp, marker, inp_list);
1138
1139                 if (inp->inp_flags & INP_PLACEMARKER)
1140                         continue;
1141 #ifdef INET6
1142                 if (!INP_ISIPV4(inp))
1143                         continue;
1144 #endif
1145                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1146                     inp->inp_socket == NULL)
1147                         continue;
1148                 (*notify)(inp, err);            /* can remove inp from list! */
1149         }
1150         LIST_REMOVE(marker, inp_list);
1151
1152         REL_PCBINFO_TOKEN(pcbinfo);
1153 }
1154
1155 void
1156 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1157 {
1158         struct inpcb *inp, *marker;
1159
1160         /*
1161          * We only need to make sure that we are in netisr0, where all
1162          * multicast operation happen.  We could check inpcbinfo which
1163          * does not belong to netisr0 by holding the inpcbinfo's token.
1164          * In this case, the pcbinfo must be able to be shared, i.e.
1165          * pcbinfo->infotoken is not NULL.
1166          */
1167         ASSERT_NETISR0;
1168         KASSERT(pcbinfo->cpu == 0 || pcbinfo->infotoken != NULL,
1169             ("pcbinfo could not be shared"));
1170
1171         /*
1172          * Get a marker for the current netisr (netisr0).
1173          *
1174          * It is possible that the multicast address deletion blocks,
1175          * which could cause temporary token releasing.  So we use
1176          * inpcb marker here to get a coherent view of the inpcb list.
1177          *
1178          * While, on the other hand, moptions are only added and deleted
1179          * in netisr0, so we would not see staled moption or miss moption
1180          * even if the token was released due to the blocking multicast
1181          * address deletion.
1182          */
1183         marker = in_pcbmarker();
1184
1185         GET_PCBINFO_TOKEN(pcbinfo);
1186
1187         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list);
1188         while ((inp = LIST_NEXT(marker, inp_list)) != NULL) {
1189                 struct ip_moptions *imo;
1190
1191                 LIST_REMOVE(marker, inp_list);
1192                 LIST_INSERT_AFTER(inp, marker, inp_list);
1193
1194                 if (inp->inp_flags & INP_PLACEMARKER)
1195                         continue;
1196                 imo = inp->inp_moptions;
1197                 if (INP_ISIPV4(inp) && imo != NULL) {
1198                         int i, gap;
1199
1200                         /*
1201                          * Unselect the outgoing interface if it is being
1202                          * detached.
1203                          */
1204                         if (imo->imo_multicast_ifp == ifp)
1205                                 imo->imo_multicast_ifp = NULL;
1206
1207                         /*
1208                          * Drop multicast group membership if we joined
1209                          * through the interface being detached.
1210                          */
1211                         for (i = 0, gap = 0; i < imo->imo_num_memberships;
1212                             i++) {
1213                                 if (imo->imo_membership[i]->inm_ifp == ifp) {
1214                                         /*
1215                                          * NOTE:
1216                                          * This could block and the pcbinfo
1217                                          * token could be passively released.
1218                                          */
1219                                         in_delmulti(imo->imo_membership[i]);
1220                                         gap++;
1221                                 } else if (gap != 0)
1222                                         imo->imo_membership[i - gap] =
1223                                             imo->imo_membership[i];
1224                         }
1225                         imo->imo_num_memberships -= gap;
1226                 }
1227         }
1228         LIST_REMOVE(marker, inp_list);
1229
1230         REL_PCBINFO_TOKEN(pcbinfo);
1231 }
1232
1233 /*
1234  * Check for alternatives when higher level complains
1235  * about service problems.  For now, invalidate cached
1236  * routing information.  If the route was created dynamically
1237  * (by a redirect), time to try a default gateway again.
1238  */
1239 void
1240 in_losing(struct inpcb *inp)
1241 {
1242         struct rtentry *rt;
1243         struct rt_addrinfo rtinfo;
1244
1245         if ((rt = inp->inp_route.ro_rt)) {
1246                 bzero(&rtinfo, sizeof(struct rt_addrinfo));
1247                 rtinfo.rti_info[RTAX_DST] = rt_key(rt);
1248                 rtinfo.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1249                 rtinfo.rti_info[RTAX_NETMASK] = rt_mask(rt);
1250                 rtinfo.rti_flags = rt->rt_flags;
1251                 rt_missmsg(RTM_LOSING, &rtinfo, rt->rt_flags, 0);
1252                 if (rt->rt_flags & RTF_DYNAMIC) {
1253                         rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1254                             rt_mask(rt), rt->rt_flags, NULL);
1255                 }
1256                 inp->inp_route.ro_rt = NULL;
1257                 rtfree(rt);
1258                 /*
1259                  * A new route can be allocated
1260                  * the next time output is attempted.
1261                  */
1262         }
1263 }
1264
1265 /*
1266  * After a routing change, flush old routing
1267  * and allocate a (hopefully) better one.
1268  */
1269 void
1270 in_rtchange(struct inpcb *inp, int err)
1271 {
1272         if (inp->inp_route.ro_rt) {
1273                 rtfree(inp->inp_route.ro_rt);
1274                 inp->inp_route.ro_rt = NULL;
1275                 /*
1276                  * A new route can be allocated the next time
1277                  * output is attempted.
1278                  */
1279         }
1280 }
1281
1282 /*
1283  * Lookup a PCB based on the local address and port.
1284  */
1285 static struct inpcb *
1286 in_pcblookup_local(struct inpcbporthead *porthash, struct in_addr laddr,
1287                    u_int lport_arg, int wild_okay, struct ucred *cred)
1288 {
1289         struct inpcb *inp;
1290         int matchwild = 3, wildcard;
1291         u_short lport = lport_arg;
1292         struct inpcbport *phd;
1293         struct inpcb *match = NULL;
1294
1295         /*
1296          * If the porthashbase is shared across several cpus, it must
1297          * have been locked.
1298          */
1299         ASSERT_PORTHASH_TOKEN_HELD(porthash);
1300
1301         /*
1302          * Best fit PCB lookup.
1303          *
1304          * First see if this local port is in use by looking on the
1305          * port hash list.
1306          */
1307         LIST_FOREACH(phd, porthash, phd_hash) {
1308                 if (phd->phd_port == lport)
1309                         break;
1310         }
1311         if (phd != NULL) {
1312                 /*
1313                  * Port is in use by one or more PCBs. Look for best
1314                  * fit.
1315                  */
1316                 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1317                         wildcard = 0;
1318 #ifdef INET6
1319                         if (!INP_ISIPV4(inp))
1320                                 continue;
1321 #endif
1322                         if (inp->inp_faddr.s_addr != INADDR_ANY)
1323                                 wildcard++;
1324                         if (inp->inp_laddr.s_addr != INADDR_ANY) {
1325                                 if (laddr.s_addr == INADDR_ANY)
1326                                         wildcard++;
1327                                 else if (inp->inp_laddr.s_addr != laddr.s_addr)
1328                                         continue;
1329                         } else {
1330                                 if (laddr.s_addr != INADDR_ANY)
1331                                         wildcard++;
1332                         }
1333                         if (wildcard && !wild_okay)
1334                                 continue;
1335                         if (wildcard < matchwild &&
1336                             (cred == NULL ||
1337                              cred->cr_prison == 
1338                                         inp->inp_socket->so_cred->cr_prison)) {
1339                                 match = inp;
1340                                 matchwild = wildcard;
1341                                 if (matchwild == 0) {
1342                                         break;
1343                                 }
1344                         }
1345                 }
1346         }
1347         return (match);
1348 }
1349
1350 struct inpcb *
1351 in_pcblocalgroup_last(const struct inpcbinfo *pcbinfo,
1352     const struct inpcb *inp)
1353 {
1354         const struct inp_localgrphead *hdr;
1355         const struct inp_localgroup *grp;
1356         int i;
1357
1358         if (pcbinfo->localgrphashbase == NULL)
1359                 return NULL;
1360
1361         GET_PCBINFO_TOKEN(pcbinfo);
1362
1363         hdr = &pcbinfo->localgrphashbase[
1364             INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];
1365
1366         LIST_FOREACH(grp, hdr, il_list) {
1367                 if (grp->il_af == inp->inp_af &&
1368                     grp->il_lport == inp->inp_lport &&
1369                     memcmp(&grp->il_dependladdr,
1370                         &inp->inp_inc.inc_ie.ie_dependladdr,
1371                         sizeof(grp->il_dependladdr)) == 0) {
1372                         break;
1373                 }
1374         }
1375         if (grp == NULL || grp->il_inpcnt == 1) {
1376                 REL_PCBINFO_TOKEN(pcbinfo);
1377                 return NULL;
1378         }
1379
1380         KASSERT(grp->il_inpcnt >= 2,
1381             ("invalid localgroup inp count %d", grp->il_inpcnt));
1382         for (i = 0; i < grp->il_inpcnt; ++i) {
1383                 if (grp->il_inp[i] == inp) {
1384                         int last = grp->il_inpcnt - 1;
1385
1386                         if (i == last)
1387                                 last = grp->il_inpcnt - 2;
1388                         REL_PCBINFO_TOKEN(pcbinfo);
1389                         return grp->il_inp[last];
1390                 }
1391         }
1392         REL_PCBINFO_TOKEN(pcbinfo);
1393         return NULL;
1394 }
1395
1396 static struct inpcb *
1397 inp_localgroup_lookup(const struct inpcbinfo *pcbinfo,
1398     struct in_addr laddr, uint16_t lport, uint32_t pkt_hash)
1399 {
1400         struct inpcb *local_wild = NULL;
1401         const struct inp_localgrphead *hdr;
1402         const struct inp_localgroup *grp;
1403
1404         ASSERT_PCBINFO_TOKEN_HELD(pcbinfo);
1405
1406         hdr = &pcbinfo->localgrphashbase[
1407             INP_PCBLOCALGRPHASH(lport, pcbinfo->localgrphashmask)];
1408
1409         /*
1410          * Order of socket selection:
1411          * 1. non-wild.
1412          * 2. wild.
1413          *
1414          * NOTE: Local group does not contain jailed sockets
1415          */
1416         LIST_FOREACH(grp, hdr, il_list) {
1417 #ifdef INET6
1418                 if (grp->il_af != AF_INET)
1419                         continue;
1420 #endif
1421                 if (grp->il_lport == lport) {
1422                         int idx;
1423
1424                         /*
1425                          * Modulo-N is used here, which greatly reduces
1426                          * completion queue token contention, thus more
1427                          * cpu time is saved.
1428                          */
1429                         idx = netisr_hashlsb(pkt_hash) % grp->il_inpcnt;
1430                         if (grp->il_laddr.s_addr == laddr.s_addr)
1431                                 return grp->il_inp[idx];
1432                         else if (grp->il_laddr.s_addr == INADDR_ANY)
1433                                 local_wild = grp->il_inp[idx];
1434                 }
1435         }
1436         if (local_wild != NULL)
1437                 return local_wild;
1438         return NULL;
1439 }
1440
1441 /*
1442  * Lookup PCB in hash list.
1443  */
1444 struct inpcb *
1445 in_pcblookup_pkthash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1446     u_int fport_arg, struct in_addr laddr, u_int lport_arg,
1447     boolean_t wildcard, struct ifnet *ifp, const struct mbuf *m)
1448 {
1449         struct inpcbhead *head;
1450         struct inpcb *inp, *jinp=NULL;
1451         u_short fport = fport_arg, lport = lport_arg;
1452
1453         /*
1454          * First look for an exact match.
1455          */
1456         head = &pcbinfo->hashbase[INP_PCBCONNHASH(faddr.s_addr, fport,
1457             laddr.s_addr, lport, pcbinfo->hashmask)];
1458         LIST_FOREACH(inp, head, inp_hash) {
1459 #ifdef INET6
1460                 if (!INP_ISIPV4(inp))
1461                         continue;
1462 #endif
1463                 if (in_hosteq(inp->inp_faddr, faddr) &&
1464                     in_hosteq(inp->inp_laddr, laddr) &&
1465                     inp->inp_fport == fport && inp->inp_lport == lport) {
1466                         /* found */
1467                         if (inp->inp_socket == NULL ||
1468                             inp->inp_socket->so_cred->cr_prison == NULL) {
1469                                 return (inp);
1470                         } else {
1471                                 if  (jinp == NULL)
1472                                         jinp = inp;
1473                         }
1474                 }
1475         }
1476         if (jinp != NULL)
1477                 return (jinp);
1478
1479         if (wildcard) {
1480                 struct inpcb *local_wild = NULL;
1481                 struct inpcb *jinp_wild = NULL;
1482                 struct inpcontainer *ic;
1483                 struct inpcontainerhead *chead;
1484                 struct sockaddr_in jsin;
1485                 struct ucred *cred;
1486
1487                 GET_PCBINFO_TOKEN(pcbinfo);
1488
1489                 /*
1490                  * Check local group first
1491                  */
1492                 if (pcbinfo->localgrphashbase != NULL &&
1493                     m != NULL && (m->m_flags & M_HASH)) {
1494                         inp = inp_localgroup_lookup(pcbinfo,
1495                             laddr, lport, m->m_pkthdr.hash);
1496                         if (inp != NULL) {
1497                                 REL_PCBINFO_TOKEN(pcbinfo);
1498                                 return inp;
1499                         }
1500                 }
1501
1502                 /*
1503                  * Order of socket selection:
1504                  * 1. non-jailed, non-wild.
1505                  * 2. non-jailed, wild.
1506                  * 3. jailed, non-wild.
1507                  * 4. jailed, wild.
1508                  */
1509                 jsin.sin_family = AF_INET;
1510                 chead = &pcbinfo->wildcardhashbase[
1511                     INP_PCBWILDCARDHASH(lport, pcbinfo->wildcardhashmask)];
1512                 LIST_FOREACH(ic, chead, ic_list) {
1513                         inp = ic->ic_inp;
1514                         if (inp->inp_flags & INP_PLACEMARKER)
1515                                 continue;
1516
1517                         jsin.sin_addr.s_addr = laddr.s_addr;
1518 #ifdef INET6
1519                         if (!INP_ISIPV4(inp))
1520                                 continue;
1521 #endif
1522                         if (inp->inp_socket != NULL)
1523                                 cred = inp->inp_socket->so_cred;
1524                         else
1525                                 cred = NULL;
1526                         if (cred != NULL && jailed(cred)) {
1527                                 if (jinp != NULL)
1528                                         continue;
1529                                 else
1530                                         if (!jailed_ip(cred->cr_prison,
1531                                             (struct sockaddr *)&jsin))
1532                                                 continue;
1533                         }
1534                         if (inp->inp_lport == lport) {
1535                                 if (inp->inp_laddr.s_addr == laddr.s_addr) {
1536                                         if (cred != NULL && jailed(cred)) {
1537                                                 jinp = inp;
1538                                         } else {
1539                                                 REL_PCBINFO_TOKEN(pcbinfo);
1540                                                 return (inp);
1541                                         }
1542                                 }
1543                                 if (inp->inp_laddr.s_addr == INADDR_ANY) {
1544                                         if (cred != NULL && jailed(cred))
1545                                                 jinp_wild = inp;
1546                                         else
1547                                                 local_wild = inp;
1548                                 }
1549                         }
1550                 }
1551
1552                 REL_PCBINFO_TOKEN(pcbinfo);
1553
1554                 if (local_wild != NULL)
1555                         return (local_wild);
1556                 if (jinp != NULL)
1557                         return (jinp);
1558                 return (jinp_wild);
1559         }
1560
1561         /*
1562          * Not found.
1563          */
1564         return (NULL);
1565 }
1566
1567 struct inpcb *
1568 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1569     u_int fport_arg, struct in_addr laddr, u_int lport_arg,
1570     boolean_t wildcard, struct ifnet *ifp)
1571 {
1572         return in_pcblookup_pkthash(pcbinfo, faddr, fport_arg,
1573             laddr, lport_arg, wildcard, ifp, NULL);
1574 }
1575
1576 /*
1577  * Insert PCB into connection hash table.
1578  */
1579 void
1580 in_pcbinsconnhash(struct inpcb *inp)
1581 {
1582         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1583         struct inpcbhead *bucket;
1584         u_int32_t hashkey_faddr, hashkey_laddr;
1585
1586 #ifdef INET6
1587         if (INP_ISIPV6(inp)) {
1588                 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX JH */;
1589                 hashkey_laddr = inp->in6p_laddr.s6_addr32[3] /* XXX JH */;
1590         } else {
1591 #endif
1592                 hashkey_faddr = inp->inp_faddr.s_addr;
1593                 hashkey_laddr = inp->inp_laddr.s_addr;
1594 #ifdef INET6
1595         }
1596 #endif
1597
1598         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
1599             ("not in the correct netisr"));
1600         ASSERT_INP_NOTINHASH(inp);
1601         inp->inp_flags |= INP_CONNECTED;
1602
1603         /*
1604          * Insert into the connection hash table.
1605          */
1606         bucket = &pcbinfo->hashbase[INP_PCBCONNHASH(hashkey_faddr,
1607             inp->inp_fport, hashkey_laddr, inp->inp_lport, pcbinfo->hashmask)];
1608         LIST_INSERT_HEAD(bucket, inp, inp_hash);
1609 }
1610
1611 /*
1612  * Remove PCB from connection hash table.
1613  */
1614 void
1615 in_pcbremconnhash(struct inpcb *inp)
1616 {
1617         struct inpcbinfo *pcbinfo __debugvar = inp->inp_pcbinfo;
1618
1619         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
1620             ("not in the correct netisr"));
1621         KASSERT(inp->inp_flags & INP_CONNECTED, ("inp not connected"));
1622
1623         LIST_REMOVE(inp, inp_hash);
1624         inp->inp_flags &= ~INP_CONNECTED;
1625 }
1626
1627 /*
1628  * Insert PCB into port hash table.
1629  */
1630 void
1631 in_pcbinsporthash(struct inpcbporthead *pcbporthash, struct inpcb *inp)
1632 {
1633         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1634         struct inpcbport *phd;
1635
1636         /*
1637          * If the porthashbase is shared across several cpus, it must
1638          * have been locked.
1639          */
1640         ASSERT_PORTHASH_TOKEN_HELD(pcbporthash);
1641
1642         /*
1643          * Insert into the port hash table.
1644          */
1645
1646         /* Go through port list and look for a head for this lport. */
1647         LIST_FOREACH(phd, pcbporthash, phd_hash) {
1648                 if (phd->phd_port == inp->inp_lport)
1649                         break;
1650         }
1651
1652         /* If none exists, use saved one and tack it on. */
1653         if (phd == NULL) {
1654                 KKASSERT(pcbinfo->portsave != NULL);
1655                 phd = pcbinfo->portsave;
1656                 pcbinfo->portsave = NULL;
1657                 phd->phd_port = inp->inp_lport;
1658                 LIST_INIT(&phd->phd_pcblist);
1659                 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1660         }
1661
1662         inp->inp_porthash = pcbporthash;
1663         inp->inp_phd = phd;
1664         LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1665
1666         /*
1667          * Malloc one inpcbport for later use.  It is safe to use
1668          * "wait" malloc here (port token would be released, if
1669          * malloc ever blocked), since all changes to the porthash
1670          * are done.
1671          */
1672         if (pcbinfo->portsave == NULL) {
1673                 pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave),
1674                                             M_PCB, M_INTWAIT | M_ZERO);
1675         }
1676 }
1677
1678 void
1679 in_pcbinsporthash_lport(struct inpcb *inp)
1680 {
1681         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1682         struct inpcbportinfo *portinfo;
1683         struct inpcbporthead *porthash;
1684         u_short lport_ho;
1685
1686         /* Locate the proper portinfo based on lport */
1687         lport_ho = ntohs(inp->inp_lport);
1688         portinfo = &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt];
1689         KKASSERT((lport_ho % pcbinfo->portinfo_cnt) == portinfo->offset);
1690
1691         porthash = in_pcbporthash_head(portinfo, inp->inp_lport);
1692         GET_PORTHASH_TOKEN(porthash);
1693         in_pcbinsporthash(porthash, inp);
1694         REL_PORTHASH_TOKEN(porthash);
1695 }
1696
1697 void
1698 in_pcbremporthash(struct inpcb *inp)
1699 {
1700         struct inpcbporthead *porthash;
1701         struct inpcbport *phd;
1702
1703         if (inp->inp_phd == NULL)
1704                 return;
1705         KASSERT(inp->inp_lport != 0, ("inpcb has no lport"));
1706
1707         porthash = inp->inp_porthash;
1708         KASSERT(porthash != NULL, ("no porthash"));
1709
1710         GET_PORTHASH_TOKEN(porthash);
1711
1712         phd = inp->inp_phd;
1713         LIST_REMOVE(inp, inp_portlist);
1714         if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1715                 LIST_REMOVE(phd, phd_hash);
1716                 kfree(phd, M_PCB);
1717         }
1718
1719         REL_PORTHASH_TOKEN(porthash);
1720
1721         inp->inp_phd = NULL;
1722         /* NOTE: Don't whack inp_lport, which may be used later */
1723 }
1724
1725 static struct inp_localgroup *
1726 inp_localgroup_alloc(u_char af, uint16_t port,
1727     const union in_dependaddr *addr, int size)
1728 {
1729         struct inp_localgroup *grp;
1730
1731         grp = kmalloc(__offsetof(struct inp_localgroup, il_inp[size]),
1732             M_TEMP, M_INTWAIT | M_ZERO);
1733         grp->il_af = af;
1734         grp->il_lport = port;
1735         grp->il_dependladdr = *addr;
1736         grp->il_inpsiz = size;
1737
1738         return grp;
1739 }
1740
1741 static void
1742 inp_localgroup_free(struct inp_localgroup *grp)
1743 {
1744         kfree(grp, M_TEMP);
1745 }
1746
1747 static void
1748 inp_localgroup_destroy(struct inp_localgroup *grp)
1749 {
1750         LIST_REMOVE(grp, il_list);
1751         inp_localgroup_free(grp);
1752 }
1753
1754 static void
1755 inp_localgroup_copy(struct inp_localgroup *grp,
1756     const struct inp_localgroup *old_grp)
1757 {
1758         int i;
1759
1760         KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
1761             ("invalid new local group size %d and old local group count %d",
1762              grp->il_inpsiz, old_grp->il_inpcnt));
1763         for (i = 0; i < old_grp->il_inpcnt; ++i)
1764                 grp->il_inp[i] = old_grp->il_inp[i];
1765         grp->il_inpcnt = old_grp->il_inpcnt;
1766 }
1767
1768 static void
1769 in_pcbinslocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1770 {
1771         struct inp_localgrphead *hdr;
1772         struct inp_localgroup *grp, *grp_alloc = NULL;
1773         struct ucred *cred;
1774         int i, idx;
1775
1776         ASSERT_PCBINFO_TOKEN_HELD(pcbinfo);
1777
1778         if (pcbinfo->localgrphashbase == NULL)
1779                 return;
1780
1781         /*
1782          * XXX don't allow jailed socket to join local group
1783          */
1784         if (inp->inp_socket != NULL)
1785                 cred = inp->inp_socket->so_cred;
1786         else
1787                 cred = NULL;
1788         if (cred != NULL && jailed(cred))
1789                 return;
1790
1791         hdr = &pcbinfo->localgrphashbase[
1792             INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];
1793
1794 again:
1795         LIST_FOREACH(grp, hdr, il_list) {
1796                 if (grp->il_af == inp->inp_af &&
1797                     grp->il_lport == inp->inp_lport &&
1798                     memcmp(&grp->il_dependladdr,
1799                         &inp->inp_inc.inc_ie.ie_dependladdr,
1800                         sizeof(grp->il_dependladdr)) == 0) {
1801                         break;
1802                 }
1803         }
1804         if (grp == NULL) {
1805                 /*
1806                  * Create a new local group
1807                  */
1808                 if (grp_alloc == NULL) {
1809                         grp_alloc = inp_localgroup_alloc(inp->inp_af,
1810                             inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
1811                             INP_LOCALGROUP_SIZMIN);
1812                         /*
1813                          * Local group allocation could block and the
1814                          * local group w/ the same property might have
1815                          * been added by others when we were blocked;
1816                          * check again.
1817                          */
1818                         goto again;
1819                 } else {
1820                         /* Local group has been allocated; link it */
1821                         grp = grp_alloc;
1822                         grp_alloc = NULL;
1823                         LIST_INSERT_HEAD(hdr, grp, il_list);
1824                 }
1825         } else if (grp->il_inpcnt == grp->il_inpsiz) {
1826                 if (grp->il_inpsiz >= INP_LOCALGROUP_SIZMAX) {
1827                         static int limit_logged = 0;
1828
1829                         if (!limit_logged) {
1830                                 limit_logged = 1;
1831                                 kprintf("local group port %d, "
1832                                     "limit reached\n", ntohs(grp->il_lport));
1833                         }
1834                         if (grp_alloc != NULL) {
1835                                 /*
1836                                  * This would happen if the local group
1837                                  * w/ the same property was expanded when
1838                                  * our local group allocation blocked.
1839                                  */
1840                                 inp_localgroup_free(grp_alloc);
1841                         }
1842                         return;
1843                 }
1844
1845                 /*
1846                  * Expand this local group
1847                  */
1848                 if (grp_alloc == NULL ||
1849                     grp->il_inpcnt >= grp_alloc->il_inpsiz) {
1850                         if (grp_alloc != NULL)
1851                                 inp_localgroup_free(grp_alloc);
1852                         grp_alloc = inp_localgroup_alloc(grp->il_af,
1853                             grp->il_lport, &grp->il_dependladdr,
1854                             grp->il_inpsiz * 2);
1855                         /*
1856                          * Local group allocation could block and the
1857                          * local group w/ the same property might have
1858                          * been expanded by others when we were blocked;
1859                          * check again.
1860                          */
1861                         goto again;
1862                 }
1863
1864                 /*
1865                  * Save the old local group, link the new one, and then
1866                  * destroy the old local group
1867                  */
1868                 inp_localgroup_copy(grp_alloc, grp);
1869                 LIST_INSERT_HEAD(hdr, grp_alloc, il_list);
1870                 inp_localgroup_destroy(grp);
1871
1872                 grp = grp_alloc;
1873                 grp_alloc = NULL;
1874         } else {
1875                 /*
1876                  * Found the local group
1877                  */
1878                 if (grp_alloc != NULL) {
1879                         /*
1880                          * This would happen if the local group w/ the
1881                          * same property was added or expanded when our
1882                          * local group allocation blocked.
1883                          */
1884                         inp_localgroup_free(grp_alloc);
1885                         grp_alloc = NULL;
1886                 }
1887         }
1888
1889         KASSERT(grp->il_inpcnt < grp->il_inpsiz,
1890             ("invalid local group size %d and count %d",
1891              grp->il_inpsiz, grp->il_inpcnt));
1892
1893         /*
1894          * Keep the local group sorted by the inpcb local group index
1895          * in ascending order.
1896          *
1897          * This eases the multi-process userland application which uses
1898          * SO_REUSEPORT sockets and binds process to the owner cpu of
1899          * the SO_REUSEPORT socket:
1900          * If we didn't sort the local group by the inpcb local group
1901          * index and one of the process owning an inpcb in this local
1902          * group restarted, e.g. crashed and restarted by watchdog,
1903          * other processes owning a inpcb in this local group would have
1904          * to detect that event, refetch its socket's owner cpu, and
1905          * re-bind.
1906          */
1907         idx = grp->il_inpcnt;
1908         for (i = 0; i < idx; ++i) {
1909                 struct inpcb *oinp = grp->il_inp[i];
1910
1911                 if (oinp->inp_lgrpindex > i) {
1912                         if (inp->inp_lgrpindex < 0) {
1913                                 inp->inp_lgrpindex = i;
1914                         } else if (inp->inp_lgrpindex != i) {
1915                                 if (bootverbose) {
1916                                         kprintf("inp %p: grpidx %d, "
1917                                             "assigned to %d, cpu%d\n",
1918                                             inp, inp->inp_lgrpindex, i,
1919                                             mycpuid);
1920                                 }
1921                         }
1922                         grp->il_inp[i] = inp;
1923
1924                         /* Pull down inpcbs */
1925                         for (; i < grp->il_inpcnt; ++i) {
1926                                 struct inpcb *oinp1 = grp->il_inp[i + 1];
1927
1928                                 grp->il_inp[i + 1] = oinp;
1929                                 oinp = oinp1;
1930                         }
1931                         grp->il_inpcnt++;
1932                         return;
1933                 }
1934         }
1935
1936         if (inp->inp_lgrpindex < 0) {
1937                 inp->inp_lgrpindex = idx;
1938         } else if (inp->inp_lgrpindex != idx) {
1939                 if (bootverbose) {
1940                         kprintf("inp %p: grpidx %d, assigned to %d, cpu%d\n",
1941                             inp, inp->inp_lgrpindex, idx, mycpuid);
1942                 }
1943         }
1944         grp->il_inp[idx] = inp;
1945         grp->il_inpcnt++;
1946 }
1947
1948 void
1949 in_pcbinswildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1950 {
1951         struct inpcontainer *ic;
1952         struct inpcontainerhead *bucket;
1953
1954         GET_PCBINFO_TOKEN(pcbinfo);
1955
1956         in_pcbinslocalgrphash_oncpu(inp, pcbinfo);
1957
1958         bucket = &pcbinfo->wildcardhashbase[
1959             INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
1960
1961         ic = kmalloc(sizeof(struct inpcontainer), M_TEMP, M_INTWAIT);
1962         ic->ic_inp = inp;
1963         LIST_INSERT_HEAD(bucket, ic, ic_list);
1964
1965         REL_PCBINFO_TOKEN(pcbinfo);
1966 }
1967
1968 /*
1969  * Insert PCB into wildcard hash table.
1970  */
1971 void
1972 in_pcbinswildcardhash(struct inpcb *inp)
1973 {
1974         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1975
1976         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
1977             ("not in correct netisr"));
1978         ASSERT_INP_NOTINHASH(inp);
1979         inp->inp_flags |= INP_WILDCARD;
1980
1981         in_pcbinswildcardhash_oncpu(inp, pcbinfo);
1982 }
1983
1984 static void
1985 in_pcbremlocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1986 {
1987         struct inp_localgrphead *hdr;
1988         struct inp_localgroup *grp;
1989
1990         ASSERT_PCBINFO_TOKEN_HELD(pcbinfo);
1991
1992         if (pcbinfo->localgrphashbase == NULL)
1993                 return;
1994
1995         hdr = &pcbinfo->localgrphashbase[
1996             INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];
1997
1998         LIST_FOREACH(grp, hdr, il_list) {
1999                 int i;
2000
2001                 for (i = 0; i < grp->il_inpcnt; ++i) {
2002                         if (grp->il_inp[i] != inp)
2003                                 continue;
2004
2005                         if (grp->il_inpcnt == 1) {
2006                                 /* Destroy this local group */
2007                                 inp_localgroup_destroy(grp);
2008                         } else {
2009                                 /* Pull up inpcbs */
2010                                 for (; i + 1 < grp->il_inpcnt; ++i)
2011                                         grp->il_inp[i] = grp->il_inp[i + 1];
2012                                 grp->il_inpcnt--;
2013                         }
2014                         return;
2015                 }
2016         }
2017 }
2018
2019 void
2020 in_pcbremwildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
2021 {
2022         struct inpcontainer *ic;
2023         struct inpcontainerhead *head;
2024
2025         GET_PCBINFO_TOKEN(pcbinfo);
2026
2027         in_pcbremlocalgrphash_oncpu(inp, pcbinfo);
2028
2029         /* find bucket */
2030         head = &pcbinfo->wildcardhashbase[
2031             INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
2032
2033         LIST_FOREACH(ic, head, ic_list) {
2034                 if (ic->ic_inp == inp)
2035                         goto found;
2036         }
2037         REL_PCBINFO_TOKEN(pcbinfo);
2038         return;                 /* not found! */
2039
2040 found:
2041         LIST_REMOVE(ic, ic_list);       /* remove container from bucket chain */
2042         REL_PCBINFO_TOKEN(pcbinfo);
2043         kfree(ic, M_TEMP);              /* deallocate container */
2044 }
2045
2046 /*
2047  * Remove PCB from wildcard hash table.
2048  */
2049 void
2050 in_pcbremwildcardhash(struct inpcb *inp)
2051 {
2052         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2053
2054         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
2055             ("not in correct netisr"));
2056         KASSERT(inp->inp_flags & INP_WILDCARD, ("inp not wildcard"));
2057
2058         in_pcbremwildcardhash_oncpu(inp, pcbinfo);
2059         inp->inp_lgrpindex = -1;
2060         inp->inp_flags &= ~INP_WILDCARD;
2061 }
2062
2063 /*
2064  * Remove PCB from various lists.
2065  */
2066 void
2067 in_pcbremlists(struct inpcb *inp)
2068 {
2069         in_pcbremporthash(inp);
2070         if (inp->inp_flags & INP_WILDCARD) {
2071                 in_pcbremwildcardhash(inp);
2072         } else if (inp->inp_flags & INP_CONNECTED) {
2073                 in_pcbremconnhash(inp);
2074         }
2075
2076         if (inp->inp_flags & INP_ONLIST)
2077                 in_pcbofflist(inp);
2078 }
2079
2080 int
2081 prison_xinpcb(struct thread *td, struct inpcb *inp)
2082 {
2083         struct ucred *cr;
2084
2085         if (td->td_proc == NULL)
2086                 return (0);
2087         cr = td->td_proc->p_ucred;
2088         if (cr->cr_prison == NULL)
2089                 return (0);
2090         if (inp->inp_socket && inp->inp_socket->so_cred &&
2091             inp->inp_socket->so_cred->cr_prison &&
2092             cr->cr_prison == inp->inp_socket->so_cred->cr_prison)
2093                 return (0);
2094         return (1);
2095 }
2096
2097 int
2098 in_pcblist_range(SYSCTL_HANDLER_ARGS)
2099 {
2100         struct inpcbinfo *pcbinfo_arr = arg1;
2101         int pcbinfo_arrlen = arg2;
2102         struct inpcb *marker;
2103         int cpu, origcpu;
2104         int error, n;
2105
2106         KASSERT(pcbinfo_arrlen <= netisr_ncpus && pcbinfo_arrlen >= 1,
2107             ("invalid pcbinfo count %d", pcbinfo_arrlen));
2108
2109         /*
2110          * The process of preparing the TCB list is too time-consuming and
2111          * resource-intensive to repeat twice on every request.
2112          */
2113         n = 0;
2114         if (req->oldptr == NULL) {
2115                 for (cpu = 0; cpu < pcbinfo_arrlen; ++cpu)
2116                         n += pcbinfo_arr[cpu].ipi_count;
2117                 req->oldidx = (n + n/8 + 10) * sizeof(struct xinpcb);
2118                 return 0;
2119         }
2120
2121         if (req->newptr != NULL)
2122                 return EPERM;
2123
2124         marker = kmalloc(sizeof(struct inpcb), M_TEMP, M_WAITOK|M_ZERO);
2125         marker->inp_flags |= INP_PLACEMARKER;
2126
2127         /*
2128          * OK, now we're committed to doing something.  Re-fetch ipi_count
2129          * after obtaining the generation count.
2130          */
2131         error = 0;
2132         origcpu = mycpuid;
2133         for (cpu = 0; cpu < pcbinfo_arrlen && error == 0; ++cpu) {
2134                 struct inpcbinfo *pcbinfo = &pcbinfo_arr[cpu];
2135                 struct inpcb *inp;
2136                 struct xinpcb xi;
2137                 int i;
2138
2139                 lwkt_migratecpu(cpu);
2140
2141                 GET_PCBINFO_TOKEN(pcbinfo);
2142
2143                 n = pcbinfo->ipi_count;
2144
2145                 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list);
2146                 i = 0;
2147                 while ((inp = LIST_NEXT(marker, inp_list)) != NULL && i < n) {
2148                         LIST_REMOVE(marker, inp_list);
2149                         LIST_INSERT_AFTER(inp, marker, inp_list);
2150
2151                         if (inp->inp_flags & INP_PLACEMARKER)
2152                                 continue;
2153                         if (prison_xinpcb(req->td, inp))
2154                                 continue;
2155
2156                         bzero(&xi, sizeof xi);
2157                         xi.xi_len = sizeof xi;
2158                         bcopy(inp, &xi.xi_inp, sizeof *inp);
2159                         if (inp->inp_socket)
2160                                 sotoxsocket(inp->inp_socket, &xi.xi_socket);
2161                         if ((error = SYSCTL_OUT(req, &xi, sizeof xi)) != 0)
2162                                 break;
2163                         ++i;
2164                 }
2165                 LIST_REMOVE(marker, inp_list);
2166
2167                 REL_PCBINFO_TOKEN(pcbinfo);
2168
2169                 if (error == 0 && i < n) {
2170                         bzero(&xi, sizeof xi);
2171                         xi.xi_len = sizeof xi;
2172                         while (i < n) {
2173                                 error = SYSCTL_OUT(req, &xi, sizeof xi);
2174                                 if (error)
2175                                         break;
2176                                 ++i;
2177                         }
2178                 }
2179         }
2180
2181         lwkt_migratecpu(origcpu);
2182         kfree(marker, M_TEMP);
2183         return error;
2184 }
2185
2186 int
2187 in_pcblist_ncpus(SYSCTL_HANDLER_ARGS)
2188 {
2189
2190         return (in_pcblist_range(oidp, arg1, netisr_ncpus, req));
2191 }
2192
2193 void
2194 in_savefaddr(struct socket *so, const struct sockaddr *faddr)
2195 {
2196         struct sockaddr_in *sin;
2197
2198         KASSERT(faddr->sa_family == AF_INET,
2199             ("not AF_INET faddr %d", faddr->sa_family));
2200
2201         sin = kmalloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
2202         sin->sin_family = AF_INET;
2203         sin->sin_len = sizeof(*sin);
2204         sin->sin_port = ((const struct sockaddr_in *)faddr)->sin_port;
2205         sin->sin_addr = ((const struct sockaddr_in *)faddr)->sin_addr;
2206
2207         so->so_faddr = (struct sockaddr *)sin;
2208 }
2209
2210 void
2211 in_pcbportinfo_init(struct inpcbportinfo *portinfo, int hashsize,
2212     u_short offset)
2213 {
2214         memset(portinfo, 0, sizeof(*portinfo));
2215
2216         portinfo->offset = offset;
2217         portinfo->porthashbase = phashinit(hashsize, M_PCB,
2218             &portinfo->porthashcnt);
2219 }
2220
2221 void
2222 in_pcbportrange(u_short *hi0, u_short *lo0, u_short ofs, u_short step)
2223 {
2224         int hi, lo;
2225
2226         if (step == 1)
2227                 return;
2228
2229         hi = *hi0;
2230         lo = *lo0;
2231
2232         hi = rounddown(hi, step);
2233         hi += ofs;
2234         if (hi > (int)*hi0)
2235                 hi -= step;
2236
2237         lo = roundup(lo, step);
2238         lo -= (step - ofs);
2239         if (lo < (int)*lo0)
2240                 lo += step;
2241
2242         *hi0 = hi;
2243         *lo0 = lo;
2244 }
2245
2246 void
2247 in_pcbglobalinit(void)
2248 {
2249         int cpu;
2250
2251         in_pcbmarkers = kmalloc(netisr_ncpus * sizeof(struct inpcb), M_PCB,
2252             M_WAITOK | M_ZERO);
2253         in_pcbcontainer_markers =
2254             kmalloc(netisr_ncpus * sizeof(struct inpcontainer), M_PCB,
2255             M_WAITOK | M_ZERO);
2256
2257         for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2258                 struct inpcontainer *ic = &in_pcbcontainer_markers[cpu];
2259                 struct inpcb *marker = &in_pcbmarkers[cpu];
2260
2261                 marker->inp_flags |= INP_PLACEMARKER;
2262                 ic->ic_inp = marker;
2263         }
2264 }
2265
2266 struct inpcb *
2267 in_pcbmarker(void)
2268 {
2269
2270         ASSERT_NETISR_NCPUS(mycpuid);
2271         return &in_pcbmarkers[mycpuid];
2272 }
2273
2274 struct inpcontainer *
2275 in_pcbcontainer_marker(void)
2276 {
2277
2278         ASSERT_NETISR_NCPUS(mycpuid);
2279         return &in_pcbcontainer_markers[mycpuid];
2280 }
2281
2282 void
2283 in_pcbresetroute(struct inpcb *inp)
2284 {
2285         struct route *ro = &inp->inp_route;
2286
2287         if (ro->ro_rt != NULL)
2288                 RTFREE(ro->ro_rt);
2289         bzero(ro, sizeof(*ro));
2290 }