Merge remote-tracking branch 'origin/vendor/LDNS'
[dragonfly.git] / sys / netinet / in_pcb.c
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33
34 /*
35  * Copyright (c) 1982, 1986, 1991, 1993, 1995
36  *      The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
63  * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.27 2004/01/02 04:06:42 ambrisko Exp $
64  */
65
66 #include "opt_inet6.h"
67
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/malloc.h>
71 #include <sys/mbuf.h>
72 #include <sys/domain.h>
73 #include <sys/protosw.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/proc.h>
77 #include <sys/priv.h>
78 #include <sys/jail.h>
79 #include <sys/kernel.h>
80 #include <sys/sysctl.h>
81
82 #include <sys/socketvar2.h>
83 #include <sys/msgport2.h>
84
85 #include <machine/limits.h>
86
87 #include <net/if.h>
88 #include <net/if_types.h>
89 #include <net/route.h>
90 #include <net/netisr2.h>
91 #include <net/toeplitz2.h>
92
93 #include <netinet/in.h>
94 #include <netinet/in_pcb.h>
95 #include <netinet/in_var.h>
96 #include <netinet/ip_var.h>
97 #ifdef INET6
98 #include <netinet/ip6.h>
99 #include <netinet6/ip6_var.h>
100 #endif /* INET6 */
101
102 #define INP_LOCALGROUP_SIZMIN   8
103 #define INP_LOCALGROUP_SIZMAX   256
104
105 static struct inpcb *in_pcblookup_local(struct inpcbporthead *porthash,
106                 struct in_addr laddr, u_int lport_arg, int wild_okay,
107                 struct ucred *cred);
108
109 struct in_addr zeroin_addr;
110
111 /*
112  * These configure the range of local port addresses assigned to
113  * "unspecified" outgoing connections/packets/whatever.
114  */
115 int ipport_lowfirstauto = IPPORT_RESERVED - 1;  /* 1023 */
116 int ipport_lowlastauto = IPPORT_RESERVEDSTART;  /* 600 */
117
118 int ipport_firstauto = IPPORT_RESERVED;         /* 1024 */
119 int ipport_lastauto = IPPORT_USERRESERVED;      /* 5000 */
120
121 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO;    /* 49152 */
122 int ipport_hilastauto = IPPORT_HILASTAUTO;      /* 65535 */
123
/*
 * Clamp the integer lvalue `var' into the closed interval [min, max].
 *
 * All three arguments may be evaluated more than once, so they must be
 * free of side effects.  The body is wrapped in do { } while (0) so the
 * macro expands to exactly one statement and can be used safely as the
 * body of an unbraced if/else.
 */
#define RANGECHK(var, min, max)						\
	do {								\
		if ((var) < (min))					\
			(var) = (min);					\
		else if ((var) > (max))					\
			(var) = (max);					\
	} while (0)
127
128 int udpencap_enable = 1;        /* enabled by default */
129 int udpencap_port = 4500;       /* triggers decapsulation */
130
131 /*
132  * Per-netisr inpcb markers.
133  * NOTE: they should only be used in netisrs.
134  */
135 static struct inpcb             *in_pcbmarkers;
136 static struct inpcontainer      *in_pcbcontainer_markers;
137
138 static int
139 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
140 {
141         int error;
142
143         error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
144         if (!error) {
145                 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
146                 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
147
148                 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
149                 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
150
151                 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
152                 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
153         }
154         return (error);
155 }
156
157 #undef RANGECHK
158
159 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
160
161 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
162            &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
163 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
164            &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
165 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
166            &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
167 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
168            &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
169 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
170            &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
171 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
172            &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
173
174 /* Initialized by ip_init() */
175 int ip_porthash_trycount;
176 SYSCTL_INT(_net_inet_ip, OID_AUTO, porthash_trycount, CTLFLAG_RW,
177     &ip_porthash_trycount, 0,
178     "Number of tries to find local port matching hash of 4-tuple");
179
180 /*
181  * in_pcb.c: manage the Protocol Control Blocks.
182  *
183  * NOTE: It is assumed that most of these functions will be called from
184  * a critical section.  XXX - There are, unfortunately, a few exceptions
185  * to this rule that should be fixed.
186  *
187  * NOTE: The caller should initialize the cpu field to the cpu running the
188  * protocol stack associated with this inpcbinfo.
189  */
190
191 void
192 in_pcbinfo_init(struct inpcbinfo *pcbinfo, int cpu, boolean_t shared)
193 {
194         KASSERT(cpu >= 0 && cpu < netisr_ncpus, ("invalid cpu%d", cpu));
195         pcbinfo->cpu = cpu;
196
197         LIST_INIT(&pcbinfo->pcblisthead);
198         pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), M_PCB,
199                                     M_WAITOK | M_ZERO);
200
201         if (shared) {
202                 pcbinfo->infotoken = kmalloc(sizeof(struct lwkt_token),
203                     M_PCB, M_WAITOK);
204                 lwkt_token_init(pcbinfo->infotoken, "infotoken");
205         } else {
206                 pcbinfo->infotoken = NULL;
207         }
208 }
209
210 void
211 in_pcbportinfo_set(struct inpcbinfo *pcbinfo, struct inpcbportinfo *portinfo,
212     int portinfo_cnt)
213 {
214
215         KASSERT(portinfo_cnt > 0, ("invalid portinfo_cnt %d", portinfo_cnt));
216         pcbinfo->portinfo = portinfo;
217         pcbinfo->portinfo_cnt = portinfo_cnt;
218 }
219
220 struct baddynamicports baddynamicports;
221
222 /*
223  * Check if the specified port is invalid for dynamic allocation.
224  */
225 int
226 in_baddynamic(u_int16_t port, u_int16_t proto)
227 {
228         switch (proto) {
229         case IPPROTO_TCP:
230                 return (DP_ISSET(baddynamicports.tcp, port));
231         case IPPROTO_UDP:
232                 return (DP_ISSET(baddynamicports.udp, port));
233         default:
234                 return (0);
235         }
236 }
237
238 void
239 in_pcbonlist(struct inpcb *inp)
240 {
241         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
242
243         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
244             ("not in the correct netisr"));
245         KASSERT((inp->inp_flags & INP_ONLIST) == 0, ("already on pcblist"));
246         inp->inp_flags |= INP_ONLIST;
247
248         GET_PCBINFO_TOKEN(pcbinfo);
249         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list);
250         pcbinfo->ipi_count++;
251         REL_PCBINFO_TOKEN(pcbinfo);
252 }
253
254 void
255 in_pcbofflist(struct inpcb *inp)
256 {
257         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
258
259         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
260             ("not in the correct netisr"));
261         KASSERT(inp->inp_flags & INP_ONLIST, ("not on pcblist"));
262         inp->inp_flags &= ~INP_ONLIST;
263
264         GET_PCBINFO_TOKEN(pcbinfo);
265         LIST_REMOVE(inp, inp_list);
266         KASSERT(pcbinfo->ipi_count > 0,
267             ("invalid inpcb count %d", pcbinfo->ipi_count));
268         pcbinfo->ipi_count--;
269         REL_PCBINFO_TOKEN(pcbinfo);
270 }
271
272 /*
273  * Allocate a PCB and associate it with the socket.
274  */
275 int
276 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
277 {
278         struct inpcb *inp;
279
280         inp = kmalloc(pcbinfo->ipi_size, M_PCB, M_WAITOK|M_ZERO|M_NULLOK);
281         if (inp == NULL)
282                 return (ENOMEM);
283         inp->inp_lgrpindex = -1;
284         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
285         inp->inp_pcbinfo = pcbinfo;
286         inp->inp_socket = so;
287 #ifdef INET6
288         if (INP_CHECK_SOCKAF(so, AF_INET6)) {
289                 if (ip6_auto_flowlabel)
290                         inp->inp_flags |= IN6P_AUTOFLOWLABEL;
291                 inp->inp_af = AF_INET6;
292         } else
293 #endif
294         inp->inp_af = AF_INET;
295         soreference(so);
296         so->so_pcb = inp;
297
298         in_pcbonlist(inp);
299         return (0);
300 }
301
302 /*
303  * Unlink a pcb with the intention of moving it to another cpu with a
304  * different pcbinfo.  While unlinked nothing should attempt to dereference
305  * inp_pcbinfo, NULL it out so we assert if it does.
306  */
307 void
308 in_pcbunlink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags)
309 {
310         KASSERT(inp->inp_pcbinfo == pcbinfo, ("pcbinfo mismatch"));
311         KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0,
312             ("already linked"));
313
314         in_pcbofflist(inp);
315         inp->inp_pcbinfo = NULL;
316 }
317
318 void
319 in_pcbunlink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
320 {
321         in_pcbunlink_flags(inp, pcbinfo, INP_WILDCARD);
322 }
323
324 /*
325  * Relink a pcb into a new pcbinfo.
326  */
327 void
328 in_pcblink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags)
329 {
330         KASSERT(inp->inp_pcbinfo == NULL, ("has pcbinfo"));
331         KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0,
332             ("already linked"));
333
334         inp->inp_pcbinfo = pcbinfo;
335         in_pcbonlist(inp);
336 }
337
338 void
339 in_pcblink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
340 {
341         return in_pcblink_flags(inp, pcbinfo, INP_WILDCARD);
342 }
343
344 static boolean_t
345 in_pcbporthash_update(struct inpcbportinfo *portinfo,
346     struct inpcb *inp, u_short lport, struct ucred *cred, int wild)
347 {
348         struct inpcbporthead *porthash;
349
350         /*
351          * This has to be atomic.  If the porthash is shared across multiple
352          * protocol threads, e.g. tcp and udp, then the token must be held.
353          */
354         porthash = in_pcbporthash_head(portinfo, lport);
355         GET_PORTHASH_TOKEN(porthash);
356
357         if (in_pcblookup_local(porthash, inp->inp_laddr, lport,
358             wild, cred) != NULL) {
359                 REL_PORTHASH_TOKEN(porthash);
360                 return FALSE;
361         }
362         inp->inp_lport = lport;
363         in_pcbinsporthash(porthash, inp);
364
365         REL_PORTHASH_TOKEN(porthash);
366         return TRUE;
367 }
368
369 static int
370 in_pcbsetlport(struct inpcb *inp, int wild, struct ucred *cred)
371 {
372         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
373         struct inpcbportinfo *portinfo;
374         u_short first, last, lport, step, first0, last0;
375         int count, error;
376         int portinfo_first, portinfo_idx;
377         uint32_t cut;
378
379         inp->inp_flags |= INP_ANONPORT;
380
381         step = pcbinfo->portinfo_cnt;
382         portinfo_first = mycpuid % pcbinfo->portinfo_cnt;
383         portinfo_idx = portinfo_first;
384
385         if (inp->inp_flags & INP_HIGHPORT) {
386                 first0 = ipport_hifirstauto;    /* sysctl */
387                 last0  = ipport_hilastauto;
388         } else if (inp->inp_flags & INP_LOWPORT) {
389                 if (cred &&
390                     (error =
391                      priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
392                         inp->inp_laddr.s_addr = INADDR_ANY;
393                         return error;
394                 }
395                 first0 = ipport_lowfirstauto;   /* 1023 */
396                 last0  = ipport_lowlastauto;    /* 600 */
397         } else {
398                 first0 = ipport_firstauto;      /* sysctl */
399                 last0  = ipport_lastauto;
400         }
401         if (first0 > last0) {
402                 lport = last0;
403                 last0 = first0;
404                 first0 = lport;
405         }
406         KKASSERT(last0 >= first0);
407
408         cut = karc4random();
409 loop:
410         portinfo = &pcbinfo->portinfo[portinfo_idx];
411         first = first0;
412         last = last0;
413
414         /*
415          * Simple check to ensure all ports are not used up causing
416          * a deadlock here.
417          */
418         in_pcbportrange(&last, &first, portinfo->offset, step);
419         lport = last - first;
420         count = lport / step;
421
422         lport = rounddown(cut % lport, step) + first;
423         KKASSERT(lport % step == portinfo->offset);
424
425         for (;;) {
426                 if (count-- < 0) {      /* completely used? */
427                         error = EADDRNOTAVAIL;
428                         break;
429                 }
430
431                 if (__predict_false(lport < first || lport > last)) {
432                         lport = first;
433                         KKASSERT(lport % step == portinfo->offset);
434                 }
435
436                 if (in_pcbporthash_update(portinfo, inp, htons(lport),
437                     cred, wild)) {
438                         error = 0;
439                         break;
440                 }
441
442                 lport += step;
443                 KKASSERT(lport % step == portinfo->offset);
444         }
445
446         if (error) {
447                 /* Try next portinfo */
448                 portinfo_idx++;
449                 portinfo_idx %= pcbinfo->portinfo_cnt;
450                 if (portinfo_idx != portinfo_first)
451                         goto loop;
452                 inp->inp_laddr.s_addr = INADDR_ANY;
453         }
454         return error;
455 }
456
457 int
458 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
459 {
460         struct socket *so = inp->inp_socket;
461         struct sockaddr_in jsin;
462         struct ucred *cred = NULL;
463         int wild = 0;
464
465         if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! */
466                 return (EADDRNOTAVAIL);
467         if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
468                 return (EINVAL);        /* already bound */
469
470         if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT)))
471                 wild = 1;    /* neither SO_REUSEADDR nor SO_REUSEPORT is set */
472         if (td->td_proc)
473                 cred = td->td_proc->p_ucred;
474
475         if (nam != NULL) {
476                 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
477                 struct inpcbinfo *pcbinfo;
478                 struct inpcbportinfo *portinfo;
479                 struct inpcbporthead *porthash;
480                 struct inpcb *t;
481                 u_short lport, lport_ho;
482                 int reuseport = (so->so_options & SO_REUSEPORT);
483                 int error;
484
485                 if (nam->sa_len != sizeof *sin)
486                         return (EINVAL);
487 #ifdef notdef
488                 /*
489                  * We should check the family, but old programs
490                  * incorrectly fail to initialize it.
491                  */
492                 if (sin->sin_family != AF_INET)
493                         return (EAFNOSUPPORT);
494 #endif
495                 if (!prison_replace_wildcards(td, nam))
496                         return (EINVAL);
497
498                 lport = sin->sin_port;
499                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
500                         /*
501                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
502                          * allow complete duplication of binding if
503                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
504                          * and a multicast address is bound on both
505                          * new and duplicated sockets.
506                          */
507                         if (so->so_options & SO_REUSEADDR)
508                                 reuseport = SO_REUSEADDR | SO_REUSEPORT;
509                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
510                         sin->sin_port = 0;              /* yech... */
511                         bzero(&sin->sin_zero, sizeof sin->sin_zero);
512                         if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL)
513                                 return (EADDRNOTAVAIL);
514                 }
515
516                 inp->inp_laddr = sin->sin_addr;
517
518                 jsin.sin_family = AF_INET;
519                 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
520                 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
521                         inp->inp_laddr.s_addr = INADDR_ANY;
522                         return (EINVAL);
523                 }
524                 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
525
526                 if (lport == 0) {
527                         /* Auto-select local port */
528                         return in_pcbsetlport(inp, wild, cred);
529                 }
530                 lport_ho = ntohs(lport);
531
532                 /* GROSS */
533                 if (lport_ho < IPPORT_RESERVED && cred &&
534                     (error =
535                      priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
536                         inp->inp_laddr.s_addr = INADDR_ANY;
537                         return (error);
538                 }
539
540                 /*
541                  * Locate the proper portinfo based on lport
542                  */
543                 pcbinfo = inp->inp_pcbinfo;
544                 portinfo =
545                     &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt];
546                 KKASSERT((lport_ho % pcbinfo->portinfo_cnt) ==
547                     portinfo->offset);
548
549                 /*
550                  * This has to be atomic.  If the porthash is shared across
551                  * multiple protocol threads, e.g. tcp and udp then the token
552                  * must be held.
553                  */
554                 porthash = in_pcbporthash_head(portinfo, lport);
555                 GET_PORTHASH_TOKEN(porthash);
556
557                 if (so->so_cred->cr_uid != 0 &&
558                     !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
559                         t = in_pcblookup_local(porthash, sin->sin_addr, lport,
560                             INPLOOKUP_WILDCARD, cred);
561                         if (t &&
562                             (so->so_cred->cr_uid !=
563                              t->inp_socket->so_cred->cr_uid)) {
564                                 inp->inp_laddr.s_addr = INADDR_ANY;
565                                 error = EADDRINUSE;
566                                 goto done;
567                         }
568                 }
569                 if (cred && !prison_replace_wildcards(td, nam)) {
570                         inp->inp_laddr.s_addr = INADDR_ANY;
571                         error = EADDRNOTAVAIL;
572                         goto done;
573                 }
574                 t = in_pcblookup_local(porthash, sin->sin_addr, lport,
575                     wild, cred);
576                 if (t && !(reuseport & t->inp_socket->so_options)) {
577                         inp->inp_laddr.s_addr = INADDR_ANY;
578                         error = EADDRINUSE;
579                         goto done;
580                 }
581                 inp->inp_lport = lport;
582                 in_pcbinsporthash(porthash, inp);
583                 error = 0;
584 done:
585                 REL_PORTHASH_TOKEN(porthash);
586                 return (error);
587         } else {
588                 jsin.sin_family = AF_INET;
589                 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
590                 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
591                         inp->inp_laddr.s_addr = INADDR_ANY;
592                         return (EINVAL);
593                 }
594                 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
595
596                 return in_pcbsetlport(inp, wild, cred);
597         }
598 }
599
600 static struct inpcb *
601 in_pcblookup_localremote(struct inpcbporthead *porthash, struct in_addr laddr,
602     u_short lport, struct in_addr faddr, u_short fport, struct ucred *cred)
603 {
604         struct inpcb *inp;
605         struct inpcbport *phd;
606         struct inpcb *match = NULL;
607
608         /*
609          * If the porthashbase is shared across several cpus, it must
610          * have been locked.
611          */
612         ASSERT_PORTHASH_TOKEN_HELD(porthash);
613
614         /*
615          * Best fit PCB lookup.
616          *
617          * First see if this local port is in use by looking on the
618          * port hash list.
619          */
620         LIST_FOREACH(phd, porthash, phd_hash) {
621                 if (phd->phd_port == lport)
622                         break;
623         }
624         if (phd != NULL) {
625                 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
626 #ifdef INET6
627                         if (!INP_ISIPV4(inp))
628                                 continue;
629 #endif
630                         if (inp->inp_laddr.s_addr != INADDR_ANY &&
631                             inp->inp_laddr.s_addr != laddr.s_addr)
632                                 continue;
633
634                         if (inp->inp_faddr.s_addr != INADDR_ANY &&
635                             inp->inp_faddr.s_addr != faddr.s_addr)
636                                 continue;
637
638                         if (inp->inp_fport != 0 && inp->inp_fport != fport)
639                                 continue;
640
641                         if (cred == NULL ||
642                             cred->cr_prison ==
643                             inp->inp_socket->so_cred->cr_prison) {
644                                 match = inp;
645                                 break;
646                         }
647                 }
648         }
649         return (match);
650 }
651
652 static boolean_t
653 in_pcbporthash_update4(struct inpcbportinfo *portinfo,
654     struct inpcb *inp, u_short lport, const struct sockaddr_in *sin,
655     struct ucred *cred)
656 {
657         struct inpcbporthead *porthash;
658
659         /*
660          * This has to be atomic.  If the porthash is shared across multiple
661          * protocol threads, e.g. tcp and udp, then the token must be held.
662          */
663         porthash = in_pcbporthash_head(portinfo, lport);
664         GET_PORTHASH_TOKEN(porthash);
665
666         if (in_pcblookup_localremote(porthash, inp->inp_laddr,
667             lport, sin->sin_addr, sin->sin_port, cred) != NULL) {
668                 REL_PORTHASH_TOKEN(porthash);
669                 return FALSE;
670         }
671         inp->inp_lport = lport;
672         in_pcbinsporthash(porthash, inp);
673
674         REL_PORTHASH_TOKEN(porthash);
675         return TRUE;
676 }
677
678 int
679 in_pcbbind_remote(struct inpcb *inp, const struct sockaddr *remote,
680     struct thread *td)
681 {
682         struct proc *p = td->td_proc;
683         const struct sockaddr_in *sin = (const struct sockaddr_in *)remote;
684         struct sockaddr_in jsin;
685         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
686         struct ucred *cred = NULL;
687         u_short first, last, lport;
688         int count, hash_count;
689         int error, selfconn = 0;
690         int cpuid = mycpuid;
691         uint32_t hash_base = 0, hash;
692
693         ASSERT_NETISR_NCPUS(cpuid);
694
695         if (TAILQ_EMPTY(&in_ifaddrheads[cpuid])) /* XXX broken! */
696                 return (EADDRNOTAVAIL);
697
698         KKASSERT(inp->inp_laddr.s_addr != INADDR_ANY);
699         if (inp->inp_lport != 0)
700                 return (EINVAL);        /* already bound */
701
702         KKASSERT(p);
703         cred = p->p_ucred;
704
705         jsin.sin_family = AF_INET;
706         jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
707         if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
708                 inp->inp_laddr.s_addr = INADDR_ANY;
709                 return (EINVAL);
710         }
711         inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
712
713         hash_count = ip_porthash_trycount;
714         if (hash_count > 0) {
715                 hash_base = toeplitz_piecemeal_addr(sin->sin_addr.s_addr) ^
716                     toeplitz_piecemeal_addr(inp->inp_laddr.s_addr) ^
717                     toeplitz_piecemeal_port(sin->sin_port);
718         } else {
719                 hash_count = 0;
720         }
721
722         inp->inp_flags |= INP_ANONPORT;
723
724         if (inp->inp_flags & INP_HIGHPORT) {
725                 first = ipport_hifirstauto;     /* sysctl */
726                 last  = ipport_hilastauto;
727         } else if (inp->inp_flags & INP_LOWPORT) {
728                 if (cred &&
729                     (error =
730                      priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
731                         inp->inp_laddr.s_addr = INADDR_ANY;
732                         return (error);
733                 }
734                 first = ipport_lowfirstauto;    /* 1023 */
735                 last = ipport_lowlastauto;      /* 600 */
736         } else {
737                 first = ipport_firstauto;       /* sysctl */
738                 last  = ipport_lastauto;
739         }
740         if (first > last) {
741                 lport = last;
742                 last = first;
743                 first = lport;
744         }
745         KKASSERT(last >= first);
746
747         count = last - first;
748         lport = (karc4random() % count) + first;
749         count += hash_count;
750
751         /*
752          * Simple check to ensure all ports are not used up causing
753          * a deadlock here.
754          */
755         for (;;) {
756                 u_short lport_no;
757
758                 if (count-- < 0) {      /* completely used? */
759                         error = EADDRNOTAVAIL;
760                         break;
761                 }
762
763                 if (__predict_false(lport < first || lport > last))
764                         lport = first;
765                 lport_no = htons(lport);
766
767                 /* This could happen on loopback interface */
768                 if (__predict_false(sin->sin_port == lport_no &&
769                     sin->sin_addr.s_addr == inp->inp_laddr.s_addr)) {
770                         if (!selfconn) {
771                                 ++count; /* don't count this try */
772                                 selfconn = 1;
773                         }
774                         goto next;
775                 }
776
777                 if (hash_count) {
778                         --hash_count;
779                         hash = hash_base ^
780                             toeplitz_piecemeal_port(lport_no);
781                         if (netisr_hashcpu(hash) != cpuid && hash_count)
782                                 goto next;
783                 }
784
785                 if (in_pcbporthash_update4(
786                     &pcbinfo->portinfo[lport % pcbinfo->portinfo_cnt],
787                     inp, lport_no, sin, cred)) {
788                         error = 0;
789                         break;
790                 }
791 next:
792                 ++lport;
793         }
794
795         if (error)
796                 inp->inp_laddr.s_addr = INADDR_ANY;
797         return (error);
798 }
799
800 /*
801  *   Transform old in_pcbconnect() into an inner subroutine for new
802  *   in_pcbconnect(): Do some validity-checking on the remote
803  *   address (in mbuf 'nam') and then determine local host address
804  *   (i.e., which interface) to use to access that remote host.
805  *
806  *   This preserves definition of in_pcbconnect(), while supporting a
807  *   slightly different version for T/TCP.  (This is more than
808  *   a bit of a kludge, but cleaning up the internal interfaces would
809  *   have forced minor changes in every protocol).
810  */
811 int
812 in_pcbladdr_find(struct inpcb *inp, struct sockaddr *nam,
813     struct sockaddr_in **plocal_sin, struct thread *td, int find)
814 {
815         struct in_ifaddr *ia;
816         struct ucred *cred = NULL;
817         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
818         struct sockaddr *jsin;
819         int jailed = 0, alloc_route = 0;
820
821         if (nam->sa_len != sizeof *sin)
822                 return (EINVAL);
823         if (sin->sin_family != AF_INET)
824                 return (EAFNOSUPPORT);
825         if (sin->sin_port == 0)
826                 return (EADDRNOTAVAIL);
827         if (td && td->td_proc && td->td_proc->p_ucred)
828                 cred = td->td_proc->p_ucred;
829         if (cred && cred->cr_prison)
830                 jailed = 1;
831         if (!TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) {
832                 ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia;
833                 /*
834                  * If the destination address is INADDR_ANY,
835                  * use the primary local address.
836                  * If the supplied address is INADDR_BROADCAST,
837                  * and the primary interface supports broadcast,
838                  * choose the broadcast address for that interface.
839                  */
840                 if (sin->sin_addr.s_addr == INADDR_ANY)
841                         sin->sin_addr = IA_SIN(ia)->sin_addr;
842                 else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST &&
843                     (ia->ia_ifp->if_flags & IFF_BROADCAST))
844                         sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr;
845         }
846         if (find) {
847                 struct route *ro;
848
849                 ia = NULL;
850                 /*
851                  * If route is known or can be allocated now,
852                  * our src addr is taken from the i/f, else punt.
853                  * Note that we should check the address family of the cached
854                  * destination, in case of sharing the cache with IPv6.
855                  */
856                 ro = &inp->inp_route;
857                 if (ro->ro_rt &&
858                     (!(ro->ro_rt->rt_flags & RTF_UP) ||
859                      ro->ro_dst.sa_family != AF_INET ||
860                      satosin(&ro->ro_dst)->sin_addr.s_addr !=
861                                       sin->sin_addr.s_addr ||
862                      inp->inp_socket->so_options & SO_DONTROUTE)) {
863                         RTFREE(ro->ro_rt);
864                         ro->ro_rt = NULL;
865                 }
866                 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/
867                     (ro->ro_rt == NULL ||
868                     ro->ro_rt->rt_ifp == NULL)) {
869                         /* No route yet, so try to acquire one */
870                         bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
871                         ro->ro_dst.sa_family = AF_INET;
872                         ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
873                         ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
874                                 sin->sin_addr;
875                         rtalloc(ro);
876                         alloc_route = 1;
877                 }
878                 /*
879                  * If we found a route, use the address
880                  * corresponding to the outgoing interface
881                  * unless it is the loopback (in case a route
882                  * to our address on another net goes to loopback).
883                  */
884                 if (ro->ro_rt &&
885                     !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
886                         if (jailed) {
887                                 if (jailed_ip(cred->cr_prison, 
888                                     ro->ro_rt->rt_ifa->ifa_addr)) {
889                                         ia = ifatoia(ro->ro_rt->rt_ifa);
890                                 }
891                         } else {
892                                 ia = ifatoia(ro->ro_rt->rt_ifa);
893                         }
894                 }
895                 if (ia == NULL) {
896                         u_short fport = sin->sin_port;
897
898                         sin->sin_port = 0;
899                         ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
900                         if (ia && jailed && !jailed_ip(cred->cr_prison,
901                             sintosa(&ia->ia_addr)))
902                                 ia = NULL;
903                         if (ia == NULL)
904                                 ia = ifatoia(ifa_ifwithnet(sintosa(sin)));
905                         if (ia && jailed && !jailed_ip(cred->cr_prison,
906                             sintosa(&ia->ia_addr)))
907                                 ia = NULL;
908                         sin->sin_port = fport;
909                         if (ia == NULL &&
910                             !TAILQ_EMPTY(&in_ifaddrheads[mycpuid]))
911                                 ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia;
912                         if (ia && jailed && !jailed_ip(cred->cr_prison,
913                             sintosa(&ia->ia_addr)))
914                                 ia = NULL;
915
916                         if (!jailed && ia == NULL)
917                                 goto fail;
918                 }
919                 /*
920                  * If the destination address is multicast and an outgoing
921                  * interface has been set as a multicast option, use the
922                  * address of that interface as our source address.
923                  */
924                 if (!jailed && IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
925                     inp->inp_moptions != NULL) {
926                         struct ip_moptions *imo;
927                         struct ifnet *ifp;
928
929                         imo = inp->inp_moptions;
930                         if ((ifp = imo->imo_multicast_ifp) != NULL) {
931                                 struct in_ifaddr_container *iac;
932
933                                 ia = NULL;
934                                 TAILQ_FOREACH(iac,
935                                 &in_ifaddrheads[mycpuid], ia_link) {
936                                         if (iac->ia->ia_ifp == ifp) {
937                                                 ia = iac->ia;
938                                                 break;
939                                         }
940                                 }
941                                 if (ia == NULL)
942                                         goto fail;
943                         }
944                 }
945                 /*
946                  * Don't do pcblookup call here; return interface in plocal_sin
947                  * and exit to caller, that will do the lookup.
948                  */
949                 if (ia == NULL && jailed) {
950                         if ((jsin = prison_get_nonlocal(
951                                 cred->cr_prison, AF_INET, NULL)) != NULL ||
952                             (jsin = prison_get_local(
953                                 cred->cr_prison, AF_INET, NULL)) != NULL) {
954                                 *plocal_sin = satosin(jsin);
955                         } else {
956                                 /* IPv6 only Jail */
957                                 goto fail;
958                         }
959                 } else {
960                         *plocal_sin = &ia->ia_addr;
961                 }
962         }
963         return (0);
964 fail:
965         if (alloc_route)
966                 in_pcbresetroute(inp);
967         return (EADDRNOTAVAIL);
968 }
969
970 int
971 in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
972     struct sockaddr_in **plocal_sin, struct thread *td)
973 {
974         return in_pcbladdr_find(inp, nam, plocal_sin, td,
975             (inp->inp_laddr.s_addr == INADDR_ANY));
976 }
977
978 /*
979  * Outer subroutine:
980  * Connect from a socket to a specified address.
981  * Both address and port must be specified in argument sin.
982  * If don't have a local address for this socket yet,
983  * then pick one.
984  */
985 int
986 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
987 {
988         struct sockaddr_in *if_sin;
989         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
990         int error;
991
992         if_sin = NULL;  /* avoid gcc warnings */
993
994         /* Call inner routine to assign local interface address. */
995         if ((error = in_pcbladdr(inp, nam, &if_sin, td)) != 0)
996                 return (error);
997
998         if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
999                               inp->inp_laddr.s_addr ?
1000                                 inp->inp_laddr : if_sin->sin_addr,
1001                               inp->inp_lport, FALSE, NULL) != NULL) {
1002                 return (EADDRINUSE);
1003         }
1004         if (inp->inp_laddr.s_addr == INADDR_ANY) {
1005                 if (inp->inp_lport == 0) {
1006                         error = in_pcbbind(inp, NULL, td);
1007                         if (error)
1008                                 return (error);
1009                 }
1010                 inp->inp_laddr = if_sin->sin_addr;
1011         }
1012         inp->inp_faddr = sin->sin_addr;
1013         inp->inp_fport = sin->sin_port;
1014         in_pcbinsconnhash(inp);
1015         return (0);
1016 }
1017
1018 void
1019 in_pcbdisconnect(struct inpcb *inp)
1020 {
1021
1022         in_pcbremconnhash(inp);
1023         inp->inp_faddr.s_addr = INADDR_ANY;
1024         inp->inp_fport = 0;
1025 }
1026
1027 void
1028 in_pcbdetach(struct inpcb *inp)
1029 {
1030         struct socket *so = inp->inp_socket;
1031         struct inpcbinfo *ipi = inp->inp_pcbinfo;
1032
1033         inp->inp_gencnt = ++ipi->ipi_gencnt;
1034         KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);
1035         in_pcbremlists(inp);
1036         so->so_pcb = NULL;
1037         sofree(so);                     /* remove pcb ref */
1038         if (inp->inp_options)
1039                 m_free(inp->inp_options);
1040         if (inp->inp_route.ro_rt)
1041                 rtfree(inp->inp_route.ro_rt);
1042         ip_freemoptions(inp->inp_moptions);
1043         kfree(inp, M_PCB);
1044 }
1045
1046 /*
1047  * The socket may have an invalid PCB, i.e. NULL.  For example, a TCP
1048  * socket received RST.
1049  */
1050 static int
1051 in_setsockaddr(struct socket *so, struct sockaddr **nam)
1052 {
1053         struct inpcb *inp;
1054         struct sockaddr_in *sin;
1055
1056         KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
1057         inp = so->so_pcb;
1058         if (!inp)
1059                 return (ECONNRESET);
1060
1061         sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO);
1062         sin->sin_family = AF_INET;
1063         sin->sin_len = sizeof *sin;
1064         sin->sin_port = inp->inp_lport;
1065         sin->sin_addr = inp->inp_laddr;
1066
1067         *nam = (struct sockaddr *)sin;
1068         return (0);
1069 }
1070
1071 void
1072 in_setsockaddr_dispatch(netmsg_t msg)
1073 {
1074         int error;
1075
1076         error = in_setsockaddr(msg->base.nm_so, msg->peeraddr.nm_nam);
1077         lwkt_replymsg(&msg->lmsg, error);
1078 }
1079
1080 /*
1081  * The socket may have an invalid PCB, i.e. NULL.  For example, a TCP
1082  * socket received RST.
1083  */
1084 int
1085 in_setpeeraddr(struct socket *so, struct sockaddr **nam)
1086 {
1087         struct inpcb *inp;
1088         struct sockaddr_in *sin;
1089
1090         KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
1091         inp = so->so_pcb;
1092         if (!inp)
1093                 return (ECONNRESET);
1094
1095         sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO);
1096         sin->sin_family = AF_INET;
1097         sin->sin_len = sizeof *sin;
1098         sin->sin_port = inp->inp_fport;
1099         sin->sin_addr = inp->inp_faddr;
1100
1101         *nam = (struct sockaddr *)sin;
1102         return (0);
1103 }
1104
1105 void
1106 in_setpeeraddr_dispatch(netmsg_t msg)
1107 {
1108         int error;
1109
1110         error = in_setpeeraddr(msg->base.nm_so, msg->peeraddr.nm_nam);
1111         lwkt_replymsg(&msg->lmsg, error);
1112 }
1113
1114 void
1115 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int err,
1116     inp_notify_t notify)
1117 {
1118         struct inpcb *inp, *marker;
1119
1120         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
1121             ("not in the correct netisr"));
1122         marker = in_pcbmarker();
1123
1124         /*
1125          * NOTE:
1126          * - If INP_PLACEMARKER is set we must ignore the rest of the
1127          *   structure and skip it.
1128          * - It is safe to nuke inpcbs here, since we are in their own
1129          *   netisr.
1130          */
1131         GET_PCBINFO_TOKEN(pcbinfo);
1132
1133         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list);
1134         while ((inp = LIST_NEXT(marker, inp_list)) != NULL) {
1135                 LIST_REMOVE(marker, inp_list);
1136                 LIST_INSERT_AFTER(inp, marker, inp_list);
1137
1138                 if (inp->inp_flags & INP_PLACEMARKER)
1139                         continue;
1140 #ifdef INET6
1141                 if (!INP_ISIPV4(inp))
1142                         continue;
1143 #endif
1144                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1145                     inp->inp_socket == NULL)
1146                         continue;
1147                 (*notify)(inp, err);            /* can remove inp from list! */
1148         }
1149         LIST_REMOVE(marker, inp_list);
1150
1151         REL_PCBINFO_TOKEN(pcbinfo);
1152 }
1153
1154 void
1155 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1156 {
1157         struct inpcb *inp, *marker;
1158
1159         /*
1160          * We only need to make sure that we are in netisr0, where all
1161          * multicast operation happen.  We could check inpcbinfo which
1162          * does not belong to netisr0 by holding the inpcbinfo's token.
1163          * In this case, the pcbinfo must be able to be shared, i.e.
1164          * pcbinfo->infotoken is not NULL.
1165          */
1166         ASSERT_NETISR0;
1167         KASSERT(pcbinfo->cpu == 0 || pcbinfo->infotoken != NULL,
1168             ("pcbinfo could not be shared"));
1169
1170         /*
1171          * Get a marker for the current netisr (netisr0).
1172          *
1173          * It is possible that the multicast address deletion blocks,
1174          * which could cause temporary token releasing.  So we use
1175          * inpcb marker here to get a coherent view of the inpcb list.
1176          *
1177          * While, on the other hand, moptions are only added and deleted
1178          * in netisr0, so we would not see staled moption or miss moption
1179          * even if the token was released due to the blocking multicast
1180          * address deletion.
1181          */
1182         marker = in_pcbmarker();
1183
1184         GET_PCBINFO_TOKEN(pcbinfo);
1185
1186         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list);
1187         while ((inp = LIST_NEXT(marker, inp_list)) != NULL) {
1188                 struct ip_moptions *imo;
1189
1190                 LIST_REMOVE(marker, inp_list);
1191                 LIST_INSERT_AFTER(inp, marker, inp_list);
1192
1193                 if (inp->inp_flags & INP_PLACEMARKER)
1194                         continue;
1195                 imo = inp->inp_moptions;
1196                 if (INP_ISIPV4(inp) && imo != NULL) {
1197                         int i, gap;
1198
1199                         /*
1200                          * Unselect the outgoing interface if it is being
1201                          * detached.
1202                          */
1203                         if (imo->imo_multicast_ifp == ifp)
1204                                 imo->imo_multicast_ifp = NULL;
1205
1206                         /*
1207                          * Drop multicast group membership if we joined
1208                          * through the interface being detached.
1209                          */
1210                         for (i = 0, gap = 0; i < imo->imo_num_memberships;
1211                             i++) {
1212                                 if (imo->imo_membership[i]->inm_ifp == ifp) {
1213                                         /*
1214                                          * NOTE:
1215                                          * This could block and the pcbinfo
1216                                          * token could be passively released.
1217                                          */
1218                                         in_delmulti(imo->imo_membership[i]);
1219                                         gap++;
1220                                 } else if (gap != 0)
1221                                         imo->imo_membership[i - gap] =
1222                                             imo->imo_membership[i];
1223                         }
1224                         imo->imo_num_memberships -= gap;
1225                 }
1226         }
1227         LIST_REMOVE(marker, inp_list);
1228
1229         REL_PCBINFO_TOKEN(pcbinfo);
1230 }
1231
1232 /*
1233  * Check for alternatives when higher level complains
1234  * about service problems.  For now, invalidate cached
1235  * routing information.  If the route was created dynamically
1236  * (by a redirect), time to try a default gateway again.
1237  */
1238 void
1239 in_losing(struct inpcb *inp)
1240 {
1241         struct rtentry *rt;
1242         struct rt_addrinfo rtinfo;
1243
1244         if ((rt = inp->inp_route.ro_rt)) {
1245                 bzero(&rtinfo, sizeof(struct rt_addrinfo));
1246                 rtinfo.rti_info[RTAX_DST] = rt_key(rt);
1247                 rtinfo.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1248                 rtinfo.rti_info[RTAX_NETMASK] = rt_mask(rt);
1249                 rtinfo.rti_flags = rt->rt_flags;
1250                 rt_missmsg(RTM_LOSING, &rtinfo, rt->rt_flags, 0);
1251                 if (rt->rt_flags & RTF_DYNAMIC) {
1252                         rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1253                             rt_mask(rt), rt->rt_flags, NULL);
1254                 }
1255                 inp->inp_route.ro_rt = NULL;
1256                 rtfree(rt);
1257                 /*
1258                  * A new route can be allocated
1259                  * the next time output is attempted.
1260                  */
1261         }
1262 }
1263
1264 /*
1265  * After a routing change, flush old routing
1266  * and allocate a (hopefully) better one.
1267  */
1268 void
1269 in_rtchange(struct inpcb *inp, int err)
1270 {
1271         if (inp->inp_route.ro_rt) {
1272                 rtfree(inp->inp_route.ro_rt);
1273                 inp->inp_route.ro_rt = NULL;
1274                 /*
1275                  * A new route can be allocated the next time
1276                  * output is attempted.
1277                  */
1278         }
1279 }
1280
1281 /*
1282  * Lookup a PCB based on the local address and port.
1283  */
1284 static struct inpcb *
1285 in_pcblookup_local(struct inpcbporthead *porthash, struct in_addr laddr,
1286                    u_int lport_arg, int wild_okay, struct ucred *cred)
1287 {
1288         struct inpcb *inp;
1289         int matchwild = 3, wildcard;
1290         u_short lport = lport_arg;
1291         struct inpcbport *phd;
1292         struct inpcb *match = NULL;
1293
1294         /*
1295          * If the porthashbase is shared across several cpus, it must
1296          * have been locked.
1297          */
1298         ASSERT_PORTHASH_TOKEN_HELD(porthash);
1299
1300         /*
1301          * Best fit PCB lookup.
1302          *
1303          * First see if this local port is in use by looking on the
1304          * port hash list.
1305          */
1306         LIST_FOREACH(phd, porthash, phd_hash) {
1307                 if (phd->phd_port == lport)
1308                         break;
1309         }
1310         if (phd != NULL) {
1311                 /*
1312                  * Port is in use by one or more PCBs. Look for best
1313                  * fit.
1314                  */
1315                 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1316                         wildcard = 0;
1317 #ifdef INET6
1318                         if (!INP_ISIPV4(inp))
1319                                 continue;
1320 #endif
1321                         if (inp->inp_faddr.s_addr != INADDR_ANY)
1322                                 wildcard++;
1323                         if (inp->inp_laddr.s_addr != INADDR_ANY) {
1324                                 if (laddr.s_addr == INADDR_ANY)
1325                                         wildcard++;
1326                                 else if (inp->inp_laddr.s_addr != laddr.s_addr)
1327                                         continue;
1328                         } else {
1329                                 if (laddr.s_addr != INADDR_ANY)
1330                                         wildcard++;
1331                         }
1332                         if (wildcard && !wild_okay)
1333                                 continue;
1334                         if (wildcard < matchwild &&
1335                             (cred == NULL ||
1336                              cred->cr_prison == 
1337                                         inp->inp_socket->so_cred->cr_prison)) {
1338                                 match = inp;
1339                                 matchwild = wildcard;
1340                                 if (matchwild == 0) {
1341                                         break;
1342                                 }
1343                         }
1344                 }
1345         }
1346         return (match);
1347 }
1348
1349 struct inpcb *
1350 in_pcblocalgroup_last(const struct inpcbinfo *pcbinfo,
1351     const struct inpcb *inp)
1352 {
1353         const struct inp_localgrphead *hdr;
1354         const struct inp_localgroup *grp;
1355         int i;
1356
1357         if (pcbinfo->localgrphashbase == NULL)
1358                 return NULL;
1359
1360         GET_PCBINFO_TOKEN(pcbinfo);
1361
1362         hdr = &pcbinfo->localgrphashbase[
1363             INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];
1364
1365         LIST_FOREACH(grp, hdr, il_list) {
1366                 if (grp->il_af == inp->inp_af &&
1367                     grp->il_lport == inp->inp_lport &&
1368                     memcmp(&grp->il_dependladdr,
1369                         &inp->inp_inc.inc_ie.ie_dependladdr,
1370                         sizeof(grp->il_dependladdr)) == 0) {
1371                         break;
1372                 }
1373         }
1374         if (grp == NULL || grp->il_inpcnt == 1) {
1375                 REL_PCBINFO_TOKEN(pcbinfo);
1376                 return NULL;
1377         }
1378
1379         KASSERT(grp->il_inpcnt >= 2,
1380             ("invalid localgroup inp count %d", grp->il_inpcnt));
1381         for (i = 0; i < grp->il_inpcnt; ++i) {
1382                 if (grp->il_inp[i] == inp) {
1383                         int last = grp->il_inpcnt - 1;
1384
1385                         if (i == last)
1386                                 last = grp->il_inpcnt - 2;
1387                         REL_PCBINFO_TOKEN(pcbinfo);
1388                         return grp->il_inp[last];
1389                 }
1390         }
1391         REL_PCBINFO_TOKEN(pcbinfo);
1392         return NULL;
1393 }
1394
1395 static struct inpcb *
1396 inp_localgroup_lookup(const struct inpcbinfo *pcbinfo,
1397     struct in_addr laddr, uint16_t lport, uint32_t pkt_hash)
1398 {
1399         struct inpcb *local_wild = NULL;
1400         const struct inp_localgrphead *hdr;
1401         const struct inp_localgroup *grp;
1402
1403         ASSERT_PCBINFO_TOKEN_HELD(pcbinfo);
1404
1405         hdr = &pcbinfo->localgrphashbase[
1406             INP_PCBLOCALGRPHASH(lport, pcbinfo->localgrphashmask)];
1407
1408         /*
1409          * Order of socket selection:
1410          * 1. non-wild.
1411          * 2. wild.
1412          *
1413          * NOTE: Local group does not contain jailed sockets
1414          */
1415         LIST_FOREACH(grp, hdr, il_list) {
1416 #ifdef INET6
1417                 if (grp->il_af != AF_INET)
1418                         continue;
1419 #endif
1420                 if (grp->il_lport == lport) {
1421                         int idx;
1422
1423                         /*
1424                          * Modulo-N is used here, which greatly reduces
1425                          * completion queue token contention, thus more
1426                          * cpu time is saved.
1427                          */
1428                         idx = netisr_hashlsb(pkt_hash) % grp->il_inpcnt;
1429                         if (grp->il_laddr.s_addr == laddr.s_addr)
1430                                 return grp->il_inp[idx];
1431                         else if (grp->il_laddr.s_addr == INADDR_ANY)
1432                                 local_wild = grp->il_inp[idx];
1433                 }
1434         }
1435         if (local_wild != NULL)
1436                 return local_wild;
1437         return NULL;
1438 }
1439
1440 /*
1441  * Lookup PCB in hash list.
1442  */
1443 struct inpcb *
1444 in_pcblookup_pkthash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1445     u_int fport_arg, struct in_addr laddr, u_int lport_arg,
1446     boolean_t wildcard, struct ifnet *ifp, const struct mbuf *m)
1447 {
1448         struct inpcbhead *head;
1449         struct inpcb *inp, *jinp=NULL;
1450         u_short fport = fport_arg, lport = lport_arg;
1451
1452         /*
1453          * First look for an exact match.
1454          */
1455         head = &pcbinfo->hashbase[INP_PCBCONNHASH(faddr.s_addr, fport,
1456             laddr.s_addr, lport, pcbinfo->hashmask)];
1457         LIST_FOREACH(inp, head, inp_hash) {
1458 #ifdef INET6
1459                 if (!INP_ISIPV4(inp))
1460                         continue;
1461 #endif
1462                 if (in_hosteq(inp->inp_faddr, faddr) &&
1463                     in_hosteq(inp->inp_laddr, laddr) &&
1464                     inp->inp_fport == fport && inp->inp_lport == lport) {
1465                         /* found */
1466                         if (inp->inp_socket == NULL ||
1467                             inp->inp_socket->so_cred->cr_prison == NULL) {
1468                                 return (inp);
1469                         } else {
1470                                 if  (jinp == NULL)
1471                                         jinp = inp;
1472                         }
1473                 }
1474         }
1475         if (jinp != NULL)
1476                 return (jinp);
1477
1478         if (wildcard) {
1479                 struct inpcb *local_wild = NULL;
1480                 struct inpcb *jinp_wild = NULL;
1481                 struct inpcontainer *ic;
1482                 struct inpcontainerhead *chead;
1483                 struct sockaddr_in jsin;
1484                 struct ucred *cred;
1485
1486                 GET_PCBINFO_TOKEN(pcbinfo);
1487
1488                 /*
1489                  * Check local group first
1490                  */
1491                 if (pcbinfo->localgrphashbase != NULL &&
1492                     m != NULL && (m->m_flags & M_HASH)) {
1493                         inp = inp_localgroup_lookup(pcbinfo,
1494                             laddr, lport, m->m_pkthdr.hash);
1495                         if (inp != NULL) {
1496                                 REL_PCBINFO_TOKEN(pcbinfo);
1497                                 return inp;
1498                         }
1499                 }
1500
1501                 /*
1502                  * Order of socket selection:
1503                  * 1. non-jailed, non-wild.
1504                  * 2. non-jailed, wild.
1505                  * 3. jailed, non-wild.
1506                  * 4. jailed, wild.
1507                  */
1508                 jsin.sin_family = AF_INET;
1509                 chead = &pcbinfo->wildcardhashbase[
1510                     INP_PCBWILDCARDHASH(lport, pcbinfo->wildcardhashmask)];
1511                 LIST_FOREACH(ic, chead, ic_list) {
1512                         inp = ic->ic_inp;
1513                         if (inp->inp_flags & INP_PLACEMARKER)
1514                                 continue;
1515
1516                         jsin.sin_addr.s_addr = laddr.s_addr;
1517 #ifdef INET6
1518                         if (!INP_ISIPV4(inp))
1519                                 continue;
1520 #endif
1521                         if (inp->inp_socket != NULL)
1522                                 cred = inp->inp_socket->so_cred;
1523                         else
1524                                 cred = NULL;
1525                         if (cred != NULL && jailed(cred)) {
1526                                 if (jinp != NULL)
1527                                         continue;
1528                                 else
1529                                         if (!jailed_ip(cred->cr_prison,
1530                                             (struct sockaddr *)&jsin))
1531                                                 continue;
1532                         }
1533                         if (inp->inp_lport == lport) {
1534                                 if (inp->inp_laddr.s_addr == laddr.s_addr) {
1535                                         if (cred != NULL && jailed(cred)) {
1536                                                 jinp = inp;
1537                                         } else {
1538                                                 REL_PCBINFO_TOKEN(pcbinfo);
1539                                                 return (inp);
1540                                         }
1541                                 }
1542                                 if (inp->inp_laddr.s_addr == INADDR_ANY) {
1543                                         if (cred != NULL && jailed(cred))
1544                                                 jinp_wild = inp;
1545                                         else
1546                                                 local_wild = inp;
1547                                 }
1548                         }
1549                 }
1550
1551                 REL_PCBINFO_TOKEN(pcbinfo);
1552
1553                 if (local_wild != NULL)
1554                         return (local_wild);
1555                 if (jinp != NULL)
1556                         return (jinp);
1557                 return (jinp_wild);
1558         }
1559
1560         /*
1561          * Not found.
1562          */
1563         return (NULL);
1564 }
1565
1566 struct inpcb *
1567 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1568     u_int fport_arg, struct in_addr laddr, u_int lport_arg,
1569     boolean_t wildcard, struct ifnet *ifp)
1570 {
1571         return in_pcblookup_pkthash(pcbinfo, faddr, fport_arg,
1572             laddr, lport_arg, wildcard, ifp, NULL);
1573 }
1574
1575 /*
1576  * Insert PCB into connection hash table.
1577  */
1578 void
1579 in_pcbinsconnhash(struct inpcb *inp)
1580 {
1581         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1582         struct inpcbhead *bucket;
1583         u_int32_t hashkey_faddr, hashkey_laddr;
1584
1585 #ifdef INET6
1586         if (INP_ISIPV6(inp)) {
1587                 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX JH */;
1588                 hashkey_laddr = inp->in6p_laddr.s6_addr32[3] /* XXX JH */;
1589         } else {
1590 #endif
1591                 hashkey_faddr = inp->inp_faddr.s_addr;
1592                 hashkey_laddr = inp->inp_laddr.s_addr;
1593 #ifdef INET6
1594         }
1595 #endif
1596
1597         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
1598             ("not in the correct netisr"));
1599         ASSERT_INP_NOTINHASH(inp);
1600         inp->inp_flags |= INP_CONNECTED;
1601
1602         /*
1603          * Insert into the connection hash table.
1604          */
1605         bucket = &pcbinfo->hashbase[INP_PCBCONNHASH(hashkey_faddr,
1606             inp->inp_fport, hashkey_laddr, inp->inp_lport, pcbinfo->hashmask)];
1607         LIST_INSERT_HEAD(bucket, inp, inp_hash);
1608 }
1609
1610 /*
1611  * Remove PCB from connection hash table.
1612  */
1613 void
1614 in_pcbremconnhash(struct inpcb *inp)
1615 {
1616         struct inpcbinfo *pcbinfo __debugvar = inp->inp_pcbinfo;
1617
1618         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
1619             ("not in the correct netisr"));
1620         KASSERT(inp->inp_flags & INP_CONNECTED, ("inp not connected"));
1621
1622         LIST_REMOVE(inp, inp_hash);
1623         inp->inp_flags &= ~INP_CONNECTED;
1624 }
1625
1626 /*
1627  * Insert PCB into port hash table.
1628  */
1629 void
1630 in_pcbinsporthash(struct inpcbporthead *pcbporthash, struct inpcb *inp)
1631 {
1632         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1633         struct inpcbport *phd;
1634
1635         /*
1636          * If the porthashbase is shared across several cpus, it must
1637          * have been locked.
1638          */
1639         ASSERT_PORTHASH_TOKEN_HELD(pcbporthash);
1640
1641         /*
1642          * Insert into the port hash table.
1643          */
1644
1645         /* Go through port list and look for a head for this lport. */
1646         LIST_FOREACH(phd, pcbporthash, phd_hash) {
1647                 if (phd->phd_port == inp->inp_lport)
1648                         break;
1649         }
1650
1651         /* If none exists, use saved one and tack it on. */
1652         if (phd == NULL) {
1653                 KKASSERT(pcbinfo->portsave != NULL);
1654                 phd = pcbinfo->portsave;
1655                 pcbinfo->portsave = NULL;
1656                 phd->phd_port = inp->inp_lport;
1657                 LIST_INIT(&phd->phd_pcblist);
1658                 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1659         }
1660
1661         inp->inp_porthash = pcbporthash;
1662         inp->inp_phd = phd;
1663         LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1664
1665         /*
1666          * Malloc one inpcbport for later use.  It is safe to use
1667          * "wait" malloc here (port token would be released, if
1668          * malloc ever blocked), since all changes to the porthash
1669          * are done.
1670          */
1671         if (pcbinfo->portsave == NULL) {
1672                 pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave),
1673                                             M_PCB, M_INTWAIT | M_ZERO);
1674         }
1675 }
1676
1677 void
1678 in_pcbinsporthash_lport(struct inpcb *inp)
1679 {
1680         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1681         struct inpcbportinfo *portinfo;
1682         struct inpcbporthead *porthash;
1683         u_short lport_ho;
1684
1685         /* Locate the proper portinfo based on lport */
1686         lport_ho = ntohs(inp->inp_lport);
1687         portinfo = &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt];
1688         KKASSERT((lport_ho % pcbinfo->portinfo_cnt) == portinfo->offset);
1689
1690         porthash = in_pcbporthash_head(portinfo, inp->inp_lport);
1691         GET_PORTHASH_TOKEN(porthash);
1692         in_pcbinsporthash(porthash, inp);
1693         REL_PORTHASH_TOKEN(porthash);
1694 }
1695
1696 void
1697 in_pcbremporthash(struct inpcb *inp)
1698 {
1699         struct inpcbporthead *porthash;
1700         struct inpcbport *phd;
1701
1702         if (inp->inp_phd == NULL)
1703                 return;
1704         KASSERT(inp->inp_lport != 0, ("inpcb has no lport"));
1705
1706         porthash = inp->inp_porthash;
1707         KASSERT(porthash != NULL, ("no porthash"));
1708
1709         GET_PORTHASH_TOKEN(porthash);
1710
1711         phd = inp->inp_phd;
1712         LIST_REMOVE(inp, inp_portlist);
1713         if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1714                 LIST_REMOVE(phd, phd_hash);
1715                 kfree(phd, M_PCB);
1716         }
1717
1718         REL_PORTHASH_TOKEN(porthash);
1719
1720         inp->inp_phd = NULL;
1721         /* NOTE: Don't whack inp_lport, which may be used later */
1722 }
1723
1724 static struct inp_localgroup *
1725 inp_localgroup_alloc(u_char af, uint16_t port,
1726     const union in_dependaddr *addr, int size)
1727 {
1728         struct inp_localgroup *grp;
1729
1730         grp = kmalloc(__offsetof(struct inp_localgroup, il_inp[size]),
1731             M_TEMP, M_INTWAIT | M_ZERO);
1732         grp->il_af = af;
1733         grp->il_lport = port;
1734         grp->il_dependladdr = *addr;
1735         grp->il_inpsiz = size;
1736
1737         return grp;
1738 }
1739
1740 static void
1741 inp_localgroup_free(struct inp_localgroup *grp)
1742 {
1743         kfree(grp, M_TEMP);
1744 }
1745
1746 static void
1747 inp_localgroup_destroy(struct inp_localgroup *grp)
1748 {
1749         LIST_REMOVE(grp, il_list);
1750         inp_localgroup_free(grp);
1751 }
1752
1753 static void
1754 inp_localgroup_copy(struct inp_localgroup *grp,
1755     const struct inp_localgroup *old_grp)
1756 {
1757         int i;
1758
1759         KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
1760             ("invalid new local group size %d and old local group count %d",
1761              grp->il_inpsiz, old_grp->il_inpcnt));
1762         for (i = 0; i < old_grp->il_inpcnt; ++i)
1763                 grp->il_inp[i] = old_grp->il_inp[i];
1764         grp->il_inpcnt = old_grp->il_inpcnt;
1765 }
1766
/*
 * Add inp to the local group in pcbinfo's local group hash that matches
 * inp's address family, local port and local address.  The group is
 * created if it does not exist, and replaced by one of twice the size
 * if it is full.
 *
 * The pcbinfo token must be held (asserted).  Nothing is done when
 * pcbinfo has no local group hash table, when the socket's credential
 * is jailed, or when the matching group is full and its size has
 * already reached INP_LOCALGROUP_SIZMAX.
 *
 * Each allocation is followed by a fresh lookup ("goto again") because
 * the allocation may block and the hash chain may change meanwhile.
 */
static void
in_pcbinslocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
{
        struct inp_localgrphead *hdr;
        struct inp_localgroup *grp, *grp_alloc = NULL;
        struct ucred *cred;
        int i, idx;

        ASSERT_PCBINFO_TOKEN_HELD(pcbinfo);

        if (pcbinfo->localgrphashbase == NULL)
                return;

        /*
         * XXX don't allow jailed socket to join local group
         */
        if (inp->inp_socket != NULL)
                cred = inp->inp_socket->so_cred;
        else
                cred = NULL;
        if (cred != NULL && jailed(cred))
                return;

        /* The hash chain is selected by the local port only. */
        hdr = &pcbinfo->localgrphashbase[
            INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];

again:
        /* Look for a group with the same af, local port and local address. */
        LIST_FOREACH(grp, hdr, il_list) {
                if (grp->il_af == inp->inp_af &&
                    grp->il_lport == inp->inp_lport &&
                    memcmp(&grp->il_dependladdr,
                        &inp->inp_inc.inc_ie.ie_dependladdr,
                        sizeof(grp->il_dependladdr)) == 0) {
                        break;
                }
        }
        if (grp == NULL) {
                /*
                 * Create a new local group
                 */
                if (grp_alloc == NULL) {
                        grp_alloc = inp_localgroup_alloc(inp->inp_af,
                            inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
                            INP_LOCALGROUP_SIZMIN);
                        /*
                         * Local group allocation could block and the
                         * local group w/ the same property might have
                         * been added by others when we were blocked;
                         * check again.
                         */
                        goto again;
                } else {
                        /* Local group has been allocated; link it */
                        grp = grp_alloc;
                        grp_alloc = NULL;
                        LIST_INSERT_HEAD(hdr, grp, il_list);
                }
        } else if (grp->il_inpcnt == grp->il_inpsiz) {
                /* The matching group is full. */
                if (grp->il_inpsiz >= INP_LOCALGROUP_SIZMAX) {
                        /* Log the condition only once. */
                        static int limit_logged = 0;

                        if (!limit_logged) {
                                limit_logged = 1;
                                kprintf("local group port %d, "
                                    "limit reached\n", ntohs(grp->il_lport));
                        }
                        if (grp_alloc != NULL) {
                                /*
                                 * This would happen if the local group
                                 * w/ the same property was expanded when
                                 * our local group allocation blocked.
                                 */
                                inp_localgroup_free(grp_alloc);
                        }
                        return;
                }

                /*
                 * Expand this local group
                 */
                if (grp_alloc == NULL ||
                    grp->il_inpcnt >= grp_alloc->il_inpsiz) {
                        /*
                         * Either nothing has been allocated yet, or the
                         * earlier allocation is too small to take all
                         * current members plus inp; allocate a group of
                         * twice the current size.
                         */
                        if (grp_alloc != NULL)
                                inp_localgroup_free(grp_alloc);
                        grp_alloc = inp_localgroup_alloc(grp->il_af,
                            grp->il_lport, &grp->il_dependladdr,
                            grp->il_inpsiz * 2);
                        /*
                         * Local group allocation could block and the
                         * local group w/ the same property might have
                         * been expanded by others when we were blocked;
                         * check again.
                         */
                        goto again;
                }

                /*
                 * Save the old local group, link the new one, and then
                 * destroy the old local group
                 */
                inp_localgroup_copy(grp_alloc, grp);
                LIST_INSERT_HEAD(hdr, grp_alloc, il_list);
                inp_localgroup_destroy(grp);

                grp = grp_alloc;
                grp_alloc = NULL;
        } else {
                /*
                 * Found the local group
                 */
                if (grp_alloc != NULL) {
                        /*
                         * This would happen if the local group w/ the
                         * same property was added or expanded when our
                         * local group allocation blocked.
                         */
                        inp_localgroup_free(grp_alloc);
                        grp_alloc = NULL;
                }
        }

        /* From here on grp has at least one free slot. */
        KASSERT(grp->il_inpcnt < grp->il_inpsiz,
            ("invalid local group size %d and count %d",
             grp->il_inpsiz, grp->il_inpcnt));

        /*
         * Keep the local group sorted by the inpcb local group index
         * in ascending order.
         *
         * This eases the multi-process userland application which uses
         * SO_REUSEPORT sockets and binds process to the owner cpu of
         * the SO_REUSEPORT socket:
         * If we didn't sort the local group by the inpcb local group
         * index and one of the processes owning an inpcb in this local
         * group restarted, e.g. crashed and restarted by watchdog,
         * other processes owning an inpcb in this local group would have
         * to detect that event, refetch its socket's owner cpu, and
         * re-bind.
         */
        idx = grp->il_inpcnt;
        for (i = 0; i < idx; ++i) {
                struct inpcb *oinp = grp->il_inp[i];

                /*
                 * The member in slot i carries an index larger than i,
                 * i.e. there is a gap at position i; inp takes it.
                 */
                if (oinp->inp_lgrpindex > i) {
                        if (inp->inp_lgrpindex < 0) {
                                /* inp has no index yet; assign slot i. */
                                inp->inp_lgrpindex = i;
                        } else if (inp->inp_lgrpindex != i) {
                                /*
                                 * inp already carries a different index;
                                 * it is left unchanged and the mismatch
                                 * is only reported when bootverbose.
                                 */
                                if (bootverbose) {
                                        kprintf("inp %p: grpidx %d, "
                                            "assigned to %d, cpu%d\n",
                                            inp, inp->inp_lgrpindex, i,
                                            mycpuid);
                                }
                        }
                        grp->il_inp[i] = inp;

                        /* Pull down inpcbs */
                        for (; i < grp->il_inpcnt; ++i) {
                                /*
                                 * Shift each following member one slot
                                 * towards the end; oinp carries the
                                 * entry displaced by the previous step.
                                 */
                                struct inpcb *oinp1 = grp->il_inp[i + 1];

                                grp->il_inp[i + 1] = oinp;
                                oinp = oinp1;
                        }
                        grp->il_inpcnt++;
                        return;
                }
        }

        /* No gap found: append inp after the current last member. */
        if (inp->inp_lgrpindex < 0) {
                inp->inp_lgrpindex = idx;
        } else if (inp->inp_lgrpindex != idx) {
                if (bootverbose) {
                        kprintf("inp %p: grpidx %d, assigned to %d, cpu%d\n",
                            inp, inp->inp_lgrpindex, idx, mycpuid);
                }
        }
        grp->il_inp[idx] = inp;
        grp->il_inpcnt++;
}
1946
1947 void
1948 in_pcbinswildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1949 {
1950         struct inpcontainer *ic;
1951         struct inpcontainerhead *bucket;
1952
1953         GET_PCBINFO_TOKEN(pcbinfo);
1954
1955         in_pcbinslocalgrphash_oncpu(inp, pcbinfo);
1956
1957         bucket = &pcbinfo->wildcardhashbase[
1958             INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
1959
1960         ic = kmalloc(sizeof(struct inpcontainer), M_TEMP, M_INTWAIT);
1961         ic->ic_inp = inp;
1962         LIST_INSERT_HEAD(bucket, ic, ic_list);
1963
1964         REL_PCBINFO_TOKEN(pcbinfo);
1965 }
1966
1967 /*
1968  * Insert PCB into wildcard hash table.
1969  */
1970 void
1971 in_pcbinswildcardhash(struct inpcb *inp)
1972 {
1973         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1974
1975         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
1976             ("not in correct netisr"));
1977         ASSERT_INP_NOTINHASH(inp);
1978         inp->inp_flags |= INP_WILDCARD;
1979
1980         in_pcbinswildcardhash_oncpu(inp, pcbinfo);
1981 }
1982
1983 static void
1984 in_pcbremlocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1985 {
1986         struct inp_localgrphead *hdr;
1987         struct inp_localgroup *grp;
1988
1989         ASSERT_PCBINFO_TOKEN_HELD(pcbinfo);
1990
1991         if (pcbinfo->localgrphashbase == NULL)
1992                 return;
1993
1994         hdr = &pcbinfo->localgrphashbase[
1995             INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];
1996
1997         LIST_FOREACH(grp, hdr, il_list) {
1998                 int i;
1999
2000                 for (i = 0; i < grp->il_inpcnt; ++i) {
2001                         if (grp->il_inp[i] != inp)
2002                                 continue;
2003
2004                         if (grp->il_inpcnt == 1) {
2005                                 /* Destroy this local group */
2006                                 inp_localgroup_destroy(grp);
2007                         } else {
2008                                 /* Pull up inpcbs */
2009                                 for (; i + 1 < grp->il_inpcnt; ++i)
2010                                         grp->il_inp[i] = grp->il_inp[i + 1];
2011                                 grp->il_inpcnt--;
2012                         }
2013                         return;
2014                 }
2015         }
2016 }
2017
2018 void
2019 in_pcbremwildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
2020 {
2021         struct inpcontainer *ic;
2022         struct inpcontainerhead *head;
2023
2024         GET_PCBINFO_TOKEN(pcbinfo);
2025
2026         in_pcbremlocalgrphash_oncpu(inp, pcbinfo);
2027
2028         /* find bucket */
2029         head = &pcbinfo->wildcardhashbase[
2030             INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
2031
2032         LIST_FOREACH(ic, head, ic_list) {
2033                 if (ic->ic_inp == inp)
2034                         goto found;
2035         }
2036         REL_PCBINFO_TOKEN(pcbinfo);
2037         return;                 /* not found! */
2038
2039 found:
2040         LIST_REMOVE(ic, ic_list);       /* remove container from bucket chain */
2041         REL_PCBINFO_TOKEN(pcbinfo);
2042         kfree(ic, M_TEMP);              /* deallocate container */
2043 }
2044
2045 /*
2046  * Remove PCB from wildcard hash table.
2047  */
2048 void
2049 in_pcbremwildcardhash(struct inpcb *inp)
2050 {
2051         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2052
2053         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
2054             ("not in correct netisr"));
2055         KASSERT(inp->inp_flags & INP_WILDCARD, ("inp not wildcard"));
2056
2057         in_pcbremwildcardhash_oncpu(inp, pcbinfo);
2058         inp->inp_lgrpindex = -1;
2059         inp->inp_flags &= ~INP_WILDCARD;
2060 }
2061
2062 /*
2063  * Remove PCB from various lists.
2064  */
2065 void
2066 in_pcbremlists(struct inpcb *inp)
2067 {
2068         in_pcbremporthash(inp);
2069         if (inp->inp_flags & INP_WILDCARD) {
2070                 in_pcbremwildcardhash(inp);
2071         } else if (inp->inp_flags & INP_CONNECTED) {
2072                 in_pcbremconnhash(inp);
2073         }
2074
2075         if (inp->inp_flags & INP_ONLIST)
2076                 in_pcbofflist(inp);
2077 }
2078
2079 int
2080 prison_xinpcb(struct thread *td, struct inpcb *inp)
2081 {
2082         struct ucred *cr;
2083
2084         if (td->td_proc == NULL)
2085                 return (0);
2086         cr = td->td_proc->p_ucred;
2087         if (cr->cr_prison == NULL)
2088                 return (0);
2089         if (inp->inp_socket && inp->inp_socket->so_cred &&
2090             inp->inp_socket->so_cred->cr_prison &&
2091             cr->cr_prison == inp->inp_socket->so_cred->cr_prison)
2092                 return (0);
2093         return (1);
2094 }
2095
/*
 * Sysctl handler that copies out one struct xinpcb for each PCB found
 * on the pcblisthead of every inpcbinfo in an array.
 *
 * arg1 is the inpcbinfo array and arg2 its length, which must be within
 * 1..netisr_ncpus (asserted).
 *
 * With no output buffer (req->oldptr == NULL) only a size estimate is
 * stored in req->oldidx and 0 is returned.  Supplying new data
 * (req->newptr != NULL) yields EPERM.  Otherwise the return value is 0,
 * or the first error reported by SYSCTL_OUT().
 *
 * Place markers and PCBs rejected by prison_xinpcb() are not reported.
 * For each inpcbinfo, if fewer than ipi_count records were written, the
 * remainder is filled with zeroed records that only carry xi_len.
 */
int
in_pcblist_range(SYSCTL_HANDLER_ARGS)
{
        struct inpcbinfo *pcbinfo_arr = arg1;
        int pcbinfo_arrlen = arg2;
        struct inpcb *marker;
        int cpu, origcpu;
        int error, n;

        KASSERT(pcbinfo_arrlen <= netisr_ncpus && pcbinfo_arrlen >= 1,
            ("invalid pcbinfo count %d", pcbinfo_arrlen));

        /*
         * The process of preparing the TCB list is too time-consuming and
         * resource-intensive to repeat twice on every request.
         */
        n = 0;
        if (req->oldptr == NULL) {
                /* Size probe: sum the counts and add slack for growth. */
                for (cpu = 0; cpu < pcbinfo_arrlen; ++cpu)
                        n += pcbinfo_arr[cpu].ipi_count;
                req->oldidx = (n + n/8 + 10) * sizeof(struct xinpcb);
                return 0;
        }

        if (req->newptr != NULL)
                return EPERM;

        /* A zeroed inpcb flagged as a place marker tracks our position. */
        marker = kmalloc(sizeof(struct inpcb), M_TEMP, M_WAITOK|M_ZERO);
        marker->inp_flags |= INP_PLACEMARKER;

        /*
         * OK, now we're committed to doing something.  Re-fetch ipi_count
         * after obtaining the generation count.
         */
        error = 0;
        origcpu = mycpuid;
        for (cpu = 0; cpu < pcbinfo_arrlen && error == 0; ++cpu) {
                struct inpcbinfo *pcbinfo = &pcbinfo_arr[cpu];
                struct inpcb *inp;
                struct xinpcb xi;
                int i;

                /* Visit each inpcbinfo while running on the matching cpu. */
                lwkt_migratecpu(cpu);

                GET_PCBINFO_TOKEN(pcbinfo);

                /* At most n records are written for this inpcbinfo. */
                n = pcbinfo->ipi_count;

                LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list);
                i = 0;
                while ((inp = LIST_NEXT(marker, inp_list)) != NULL && i < n) {
                        /*
                         * Move the marker behind inp before looking at
                         * inp; the next iteration resumes from the
                         * marker.  NOTE(review): presumably this keeps
                         * the position valid if SYSCTL_OUT() blocks and
                         * the list changes -- confirm.
                         */
                        LIST_REMOVE(marker, inp_list);
                        LIST_INSERT_AFTER(inp, marker, inp_list);

                        if (inp->inp_flags & INP_PLACEMARKER)
                                continue;
                        if (prison_xinpcb(req->td, inp))
                                continue;

                        /* Build the exported record from a zeroed xinpcb. */
                        bzero(&xi, sizeof xi);
                        xi.xi_len = sizeof xi;
                        bcopy(inp, &xi.xi_inp, sizeof *inp);
                        if (inp->inp_socket)
                                sotoxsocket(inp->inp_socket, &xi.xi_socket);
                        if ((error = SYSCTL_OUT(req, &xi, sizeof xi)) != 0)
                                break;
                        ++i;
                }
                LIST_REMOVE(marker, inp_list);

                REL_PCBINFO_TOKEN(pcbinfo);

                /* Pad with empty records until n have been written. */
                if (error == 0 && i < n) {
                        bzero(&xi, sizeof xi);
                        xi.xi_len = sizeof xi;
                        while (i < n) {
                                error = SYSCTL_OUT(req, &xi, sizeof xi);
                                if (error)
                                        break;
                                ++i;
                        }
                }
        }

        /* Return to the cpu we started on before cleaning up. */
        lwkt_migratecpu(origcpu);
        kfree(marker, M_TEMP);
        return error;
}
2184
2185 int
2186 in_pcblist_ncpus(SYSCTL_HANDLER_ARGS)
2187 {
2188
2189         return (in_pcblist_range(oidp, arg1, netisr_ncpus, req));
2190 }
2191
2192 void
2193 in_savefaddr(struct socket *so, const struct sockaddr *faddr)
2194 {
2195         struct sockaddr_in *sin;
2196
2197         KASSERT(faddr->sa_family == AF_INET,
2198             ("not AF_INET faddr %d", faddr->sa_family));
2199
2200         sin = kmalloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
2201         sin->sin_family = AF_INET;
2202         sin->sin_len = sizeof(*sin);
2203         sin->sin_port = ((const struct sockaddr_in *)faddr)->sin_port;
2204         sin->sin_addr = ((const struct sockaddr_in *)faddr)->sin_addr;
2205
2206         so->so_faddr = (struct sockaddr *)sin;
2207 }
2208
2209 void
2210 in_pcbportinfo_init(struct inpcbportinfo *portinfo, int hashsize,
2211     u_short offset)
2212 {
2213         memset(portinfo, 0, sizeof(*portinfo));
2214
2215         portinfo->offset = offset;
2216         portinfo->porthashbase = phashinit(hashsize, M_PCB,
2217             &portinfo->porthashcnt);
2218 }
2219
/*
 * Shrink the port range [*lo0, *hi0] so that both ends have the form
 * k * step + ofs.
 *
 * Assuming ofs < step, *hi0 becomes the largest such value not above
 * the original *hi0 and *lo0 the smallest such value not below the
 * original *lo0.  The range is left untouched when step is 1.  The
 * results are computed as int and stored back into the u_short
 * locations.
 */
void
in_pcbportrange(u_short *hi0, u_short *lo0, u_short ofs, u_short step)
{
        int hi_limit, lo_limit;
        int top, bottom;

        /* With a stride of one every port already qualifies. */
        if (step == 1)
                return;

        hi_limit = *hi0;
        lo_limit = *lo0;

        /* Upper end: align down, add the offset, step back if too high. */
        top = rounddown(hi_limit, step) + ofs;
        if (top > hi_limit)
                top -= step;

        /* Lower end: align up, go back to the offset, advance if too low. */
        bottom = roundup(lo_limit, step) - (step - ofs);
        if (bottom < lo_limit)
                bottom += step;

        *hi0 = top;
        *lo0 = bottom;
}
2244
2245 void
2246 in_pcbglobalinit(void)
2247 {
2248         int cpu;
2249
2250         in_pcbmarkers = kmalloc(netisr_ncpus * sizeof(struct inpcb), M_PCB,
2251             M_WAITOK | M_ZERO);
2252         in_pcbcontainer_markers =
2253             kmalloc(netisr_ncpus * sizeof(struct inpcontainer), M_PCB,
2254             M_WAITOK | M_ZERO);
2255
2256         for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2257                 struct inpcontainer *ic = &in_pcbcontainer_markers[cpu];
2258                 struct inpcb *marker = &in_pcbmarkers[cpu];
2259
2260                 marker->inp_flags |= INP_PLACEMARKER;
2261                 ic->ic_inp = marker;
2262         }
2263 }
2264
2265 struct inpcb *
2266 in_pcbmarker(void)
2267 {
2268
2269         ASSERT_NETISR_NCPUS(mycpuid);
2270         return &in_pcbmarkers[mycpuid];
2271 }
2272
2273 struct inpcontainer *
2274 in_pcbcontainer_marker(void)
2275 {
2276
2277         ASSERT_NETISR_NCPUS(mycpuid);
2278         return &in_pcbcontainer_markers[mycpuid];
2279 }
2280
2281 void
2282 in_pcbresetroute(struct inpcb *inp)
2283 {
2284         struct route *ro = &inp->inp_route;
2285
2286         if (ro->ro_rt != NULL)
2287                 RTFREE(ro->ro_rt);
2288         bzero(ro, sizeof(*ro));
2289 }