Merge branch 'vendor/TCPDUMP' (version 4.3.0 -> 4.9.3)
[dragonfly.git] / sys / netinet / in_pcb.c
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33
34 /*
35  * Copyright (c) 1982, 1986, 1991, 1993, 1995
36  *      The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
63  * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.27 2004/01/02 04:06:42 ambrisko Exp $
64  */
65
66 #include "opt_inet6.h"
67
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/malloc.h>
71 #include <sys/mbuf.h>
72 #include <sys/domain.h>
73 #include <sys/protosw.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/proc.h>
77 #include <sys/priv.h>
78 #include <sys/jail.h>
79 #include <sys/kernel.h>
80 #include <sys/sysctl.h>
81
82 #include <sys/socketvar2.h>
83 #include <sys/msgport2.h>
84
85 #include <machine/limits.h>
86
87 #include <net/if.h>
88 #include <net/if_types.h>
89 #include <net/route.h>
90 #include <net/netisr2.h>
91 #include <net/toeplitz2.h>
92
93 #include <netinet/in.h>
94 #include <netinet/in_pcb.h>
95 #include <netinet/in_var.h>
96 #include <netinet/ip_var.h>
97 #ifdef INET6
98 #include <netinet/ip6.h>
99 #include <netinet6/ip6_var.h>
100 #endif /* INET6 */
101
102 #define INP_LOCALGROUP_SIZMIN   8
103 #define INP_LOCALGROUP_SIZMAX   256
104
/*
 * Forward declaration: the bind helpers further down call this static
 * lookup routine before the point where it is defined.
 */
105 static struct inpcb *in_pcblookup_local(struct inpcbporthead *porthash,
106                 struct in_addr laddr, u_int lport_arg, int wild_okay,
107                 struct ucred *cred);
108
/*
 * Defined without an initializer at file scope, so it is all zeroes.
 * NOTE(review): presumably exported as a ready-made wildcard IPv4
 * address; no user is visible in this part of the file.
 */
109 struct in_addr zeroin_addr;
110
111 /*
112  * These configure the range of local port addresses assigned to
113  * "unspecified" outgoing connections/packets/whatever.
114  */
/*
 * Three (first, last) pairs, all in host byte order:
 *   low*   - used when the inpcb has INP_LOWPORT set
 *   plain  - used when neither INP_LOWPORT nor INP_HIGHPORT is set
 *   hi*    - used when the inpcb has INP_HIGHPORT set
 * The consumers (in_pcbsetlport(), in_pcbbind_remote()) swap the two
 * bounds when first > last, so a pair may be stored in either order.
 */
115 int ipport_lowfirstauto = IPPORT_RESERVED - 1;  /* 1023 */
116 int ipport_lowlastauto = IPPORT_RESERVEDSTART;  /* 600 */
117
118 int ipport_firstauto = IPPORT_RESERVED;         /* 1024 */
119 int ipport_lastauto = IPPORT_USERRESERVED;      /* 5000 */
120
121 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO;    /* 49152 */
122 int ipport_hilastauto = IPPORT_HILASTAUTO;      /* 65535 */
123
/*
 * Clamp the integer lvalue `var' into the inclusive range [min, max].
 *
 * The expansion is wrapped in do { } while (0) so that it always forms
 * exactly one statement.  The bare if / else-if form does not: used as
 * the body of an unbraced if/else it either fails to compile or captures
 * the caller's `else'.
 *
 * `var' is evaluated more than once; pass a plain lvalue without side
 * effects.
 */
#define RANGECHK(var, min, max) \
        do { \
                if ((var) < (min)) { (var) = (min); } \
                else if ((var) > (max)) { (var) = (max); } \
        } while (0)
127
/*
 * UDP encapsulation knobs.  Neither variable is referenced in the part
 * of the file visible here; they are non-static, so other translation
 * units can reach them.
 */
128 int udpencap_enable = 1;        /* enabled by default */
129 int udpencap_port = 4500;       /* triggers decapsulation */
130
131 /*
132  * Per-netisr inpcb markers.
133  * NOTE: they should only be used in netisrs.
134  */
/*
 * Both pointers have static storage and therefore start out NULL.
 * NOTE(review): the code that allocates and indexes them is not visible
 * in this excerpt - presumably one entry per netisr cpu; confirm.
 */
135 static struct inpcb             *in_pcbmarkers;
136 static struct inpcontainer      *in_pcbcontainer_markers;
137
138 static int
139 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
140 {
141         int error;
142
143         error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
144         if (!error) {
145                 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
146                 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
147
148                 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
149                 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
150
151                 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
152                 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
153         }
154         return (error);
155 }
156
/* RANGECHK is private to sysctl_net_ipport_check() above. */
157 #undef RANGECHK
158
/*
 * Sysctl node "portrange" under _net_inet_ip, followed by six read-write
 * integer knobs, one per ipport_* variable.  Every knob is routed through
 * sysctl_net_ipport_check(), so a successful access to any one of them
 * re-clamps all six bounds.
 */
159 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
160
161 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
162            &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
163 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
164            &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
165 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
166            &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
167 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
168            &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
169 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
170            &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
171 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
172            &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
173
174 /* Initialized by ip_init() */
/*
 * Read by in_pcbbind_remote(): the number of candidate local ports that
 * are only accepted when their Toeplitz hash maps to the current cpu.
 * Values <= 0 disable that filtering there.
 */
175 int ip_porthash_trycount;
176 SYSCTL_INT(_net_inet_ip, OID_AUTO, porthash_trycount, CTLFLAG_RW,
177     &ip_porthash_trycount, 0,
178     "Number of tries to find local port matching hash of 4-tuple");
179
180 /*
181  * in_pcb.c: manage the Protocol Control Blocks.
182  *
183  * NOTE: It is assumed that most of these functions will be called from
184  * a critical section.  XXX - There are, unfortunately, a few exceptions
185  * to this rule that should be fixed.
186  *
187  * NOTE: The caller should initialize the cpu field to the cpu running the
188  * protocol stack associated with this inpcbinfo.
189  */
190
191 void
192 in_pcbinfo_init(struct inpcbinfo *pcbinfo, int cpu, boolean_t shared)
193 {
194         KASSERT(cpu >= 0 && cpu < netisr_ncpus, ("invalid cpu%d", cpu));
195         pcbinfo->cpu = cpu;
196
197         LIST_INIT(&pcbinfo->pcblisthead);
198         pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), M_PCB,
199                                     M_WAITOK | M_ZERO);
200
201         if (shared) {
202                 pcbinfo->infotoken = kmalloc(sizeof(struct lwkt_token),
203                     M_PCB, M_WAITOK);
204                 lwkt_token_init(pcbinfo->infotoken, "infotoken");
205         } else {
206                 pcbinfo->infotoken = NULL;
207         }
208 }
209
210 void
211 in_pcbportinfo_set(struct inpcbinfo *pcbinfo, struct inpcbportinfo *portinfo,
212     int portinfo_cnt)
213 {
214
215         KASSERT(portinfo_cnt > 0, ("invalid portinfo_cnt %d", portinfo_cnt));
216         pcbinfo->portinfo = portinfo;
217         pcbinfo->portinfo_cnt = portinfo_cnt;
218 }
219
/*
 * Per-protocol port sets (members .tcp and .udp) consulted by
 * in_baddynamic() through DP_ISSET().  Zero-initialized at file scope.
 */
220 struct baddynamicports baddynamicports;
221
222 /*
223  * Check if the specified port is invalid for dynamic allocation.
224  */
225 int
226 in_baddynamic(u_int16_t port, u_int16_t proto)
227 {
228         switch (proto) {
229         case IPPROTO_TCP:
230                 return (DP_ISSET(baddynamicports.tcp, port));
231         case IPPROTO_UDP:
232                 return (DP_ISSET(baddynamicports.udp, port));
233         default:
234                 return (0);
235         }
236 }
237
238 void
239 in_pcbonlist(struct inpcb *inp)
240 {
241         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
242
243         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
244             ("not in the correct netisr"));
245         KASSERT((inp->inp_flags & INP_ONLIST) == 0, ("already on pcblist"));
246         inp->inp_flags |= INP_ONLIST;
247
248         GET_PCBINFO_TOKEN(pcbinfo);
249         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list);
250         pcbinfo->ipi_count++;
251         REL_PCBINFO_TOKEN(pcbinfo);
252 }
253
254 void
255 in_pcbofflist(struct inpcb *inp)
256 {
257         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
258
259         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
260             ("not in the correct netisr"));
261         KASSERT(inp->inp_flags & INP_ONLIST, ("not on pcblist"));
262         inp->inp_flags &= ~INP_ONLIST;
263
264         GET_PCBINFO_TOKEN(pcbinfo);
265         LIST_REMOVE(inp, inp_list);
266         KASSERT(pcbinfo->ipi_count > 0,
267             ("invalid inpcb count %d", pcbinfo->ipi_count));
268         pcbinfo->ipi_count--;
269         REL_PCBINFO_TOKEN(pcbinfo);
270 }
271
272 /*
273  * Allocate a PCB and associate it with the socket.
274  */
275 int
276 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
277 {
278         struct inpcb *inp;
279
280         inp = kmalloc(pcbinfo->ipi_size, M_PCB, M_WAITOK|M_ZERO|M_NULLOK);
281         if (inp == NULL)
282                 return (ENOMEM);
283         inp->inp_lgrpindex = -1;
284         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
285         inp->inp_pcbinfo = pcbinfo;
286         inp->inp_socket = so;
287 #ifdef INET6
288         if (INP_CHECK_SOCKAF(so, AF_INET6)) {
289                 if (ip6_auto_flowlabel)
290                         inp->inp_flags |= IN6P_AUTOFLOWLABEL;
291                 inp->inp_af = AF_INET6;
292         } else
293 #endif
294         inp->inp_af = AF_INET;
295         soreference(so);
296         so->so_pcb = inp;
297
298         in_pcbonlist(inp);
299         return (0);
300 }
301
302 /*
303  * Unlink a pcb with the intention of moving it to another cpu with a
304  * different pcbinfo.  While unlinked nothing should attempt to dereference
305  * inp_pcbinfo, NULL it out so we assert if it does.
306  */
307 void
308 in_pcbunlink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags)
309 {
310         KASSERT(inp->inp_pcbinfo == pcbinfo, ("pcbinfo mismatch"));
311         KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0,
312             ("already linked"));
313
314         in_pcbofflist(inp);
315         inp->inp_pcbinfo = NULL;
316 }
317
318 void
319 in_pcbunlink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
320 {
321         in_pcbunlink_flags(inp, pcbinfo, INP_WILDCARD);
322 }
323
324 /*
325  * Relink a pcb into a new pcbinfo.
326  */
327 void
328 in_pcblink_flags(struct inpcb *inp, struct inpcbinfo *pcbinfo, int flags)
329 {
330         KASSERT(inp->inp_pcbinfo == NULL, ("has pcbinfo"));
331         KASSERT((inp->inp_flags & (flags | INP_CONNECTED)) == 0,
332             ("already linked"));
333
334         inp->inp_pcbinfo = pcbinfo;
335         in_pcbonlist(inp);
336 }
337
338 void
339 in_pcblink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
340 {
341         return in_pcblink_flags(inp, pcbinfo, INP_WILDCARD);
342 }
343
344 static boolean_t
345 in_pcbporthash_update(struct inpcbportinfo *portinfo,
346     struct inpcb *inp, u_short lport, struct ucred *cred, int wild)
347 {
348         struct inpcbporthead *porthash;
349
350         /*
351          * This has to be atomic.  If the porthash is shared across multiple
352          * protocol threads, e.g. tcp and udp, then the token must be held.
353          */
354         porthash = in_pcbporthash_head(portinfo, lport);
355         GET_PORTHASH_TOKEN(porthash);
356
357         if (in_pcblookup_local(porthash, inp->inp_laddr, lport, wild, cred)) {
358                 REL_PORTHASH_TOKEN(porthash);
359                 return FALSE;
360         }
361         inp->inp_lport = lport;
362         in_pcbinsporthash(porthash, inp);
363
364         REL_PORTHASH_TOKEN(porthash);
365         return TRUE;
366 }
367
368 static int
369 in_pcbsetlport(struct inpcb *inp, int wild, struct ucred *cred)
370 {
371         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
372         struct inpcbportinfo *portinfo;
373         u_short first, last, lport, step, first0, last0;
374         int count, error;
375         int portinfo_first, portinfo_idx;
376         uint32_t cut;
377
378         inp->inp_flags |= INP_ANONPORT;
379
380         step = pcbinfo->portinfo_cnt;
381         portinfo_first = mycpuid % pcbinfo->portinfo_cnt;
382         portinfo_idx = portinfo_first;
383
384         if (inp->inp_flags & INP_HIGHPORT) {
385                 first0 = ipport_hifirstauto;    /* sysctl */
386                 last0  = ipport_hilastauto;
387         } else if (inp->inp_flags & INP_LOWPORT) {
388                 if (cred &&
389                     (error =
390                      priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
391                         inp->inp_laddr.s_addr = INADDR_ANY;
392                         return error;
393                 }
394                 first0 = ipport_lowfirstauto;   /* 1023 */
395                 last0  = ipport_lowlastauto;    /* 600 */
396         } else {
397                 first0 = ipport_firstauto;      /* sysctl */
398                 last0  = ipport_lastauto;
399         }
400         if (first0 > last0) {
401                 lport = last0;
402                 last0 = first0;
403                 first0 = lport;
404         }
405         KKASSERT(last0 >= first0);
406
407         cut = karc4random();
408 loop:
409         portinfo = &pcbinfo->portinfo[portinfo_idx];
410         first = first0;
411         last = last0;
412
413         /*
414          * Simple check to ensure all ports are not used up causing
415          * a deadlock here.
416          */
417         in_pcbportrange(&last, &first, portinfo->offset, step);
418         lport = last - first;
419         count = lport / step;
420
421         lport = rounddown(cut % lport, step) + first;
422         KKASSERT(lport % step == portinfo->offset);
423
424         for (;;) {
425                 if (count-- < 0) {      /* completely used? */
426                         error = EADDRNOTAVAIL;
427                         break;
428                 }
429
430                 if (__predict_false(lport < first || lport > last)) {
431                         lport = first;
432                         KKASSERT(lport % step == portinfo->offset);
433                 }
434
435                 if (in_pcbporthash_update(portinfo, inp, htons(lport),
436                                           cred, wild)) {
437                         error = 0;
438                         break;
439                 }
440
441                 lport += step;
442                 KKASSERT(lport % step == portinfo->offset);
443         }
444
445         if (error) {
446                 /* Try next portinfo */
447                 portinfo_idx++;
448                 portinfo_idx %= pcbinfo->portinfo_cnt;
449                 if (portinfo_idx != portinfo_first)
450                         goto loop;
451                 inp->inp_laddr.s_addr = INADDR_ANY;
452         }
453         return error;
454 }
455
/*
 * Bind `inp' to a local address and port.
 *
 * nam  - NULL, or a struct sockaddr_in naming the requested local
 *        address/port (sa_len must equal sizeof(struct sockaddr_in)).
 *        A port of 0, or a NULL nam, requests automatic port selection
 *        through in_pcbsetlport().  Note that for a specific
 *        (non-multicast, non-INADDR_ANY) address sin_port and sin_zero
 *        of the caller's sockaddr are zeroed in place.
 * td   - calling thread; td->td_proc, when set, supplies the credential
 *        used for the privilege and lookup checks.
 *
 * Returns 0 on success, otherwise:
 *   EADDRNOTAVAIL - no interface addresses on this cpu's list, the
 *                   address is not configured on any interface, or the
 *                   late prison_replace_wildcards() check failed;
 *   EINVAL        - already bound, bad sa_len, or
 *                   prison_replace_wildcards() rejected the address;
 *   EADDRINUSE    - a conflicting inpcb exists;
 *   the error of priv_check_cred() for ports below IPPORT_RESERVED.
 * Every failure after inp_laddr has been assigned resets it to
 * INADDR_ANY.
 */
456 int
457 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
458 {
459         struct socket *so = inp->inp_socket;
460         struct sockaddr_in jsin;
461         struct ucred *cred = NULL;
462         int wild = 0;
463
464         if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! */
465                 return (EADDRNOTAVAIL);
466         if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
467                 return (EINVAL);        /* already bound */
468
            /* `wild' is later passed as wild_okay to in_pcblookup_local(). */
469         if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT)))
470                 wild = 1;    /* neither SO_REUSEADDR nor SO_REUSEPORT is set */
471         if (td->td_proc)
472                 cred = td->td_proc->p_ucred;
473
474         if (nam != NULL) {
475                 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
476                 struct inpcbinfo *pcbinfo;
477                 struct inpcbportinfo *portinfo;
478                 struct inpcbporthead *porthash;
479                 struct inpcb *t;
480                 u_short lport, lport_ho;
481                 int reuseport = (so->so_options & SO_REUSEPORT);
482                 int error;
483
484                 if (nam->sa_len != sizeof *sin)
485                         return (EINVAL);
486 #ifdef notdef
487                 /*
488                  * We should check the family, but old programs
489                  * incorrectly fail to initialize it.
490                  */
491                 if (sin->sin_family != AF_INET)
492                         return (EAFNOSUPPORT);
493 #endif
494                 if (!prison_replace_wildcards(td, nam))
495                         return (EINVAL);
496
                    /* Save the port now: sin_port may be zeroed just below. */
497                 lport = sin->sin_port;
498                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
499                         /*
500                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
501                          * allow complete duplication of binding if
502                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
503                          * and a multicast address is bound on both
504                          * new and duplicated sockets.
505                          */
506                         if (so->so_options & SO_REUSEADDR)
507                                 reuseport = SO_REUSEADDR | SO_REUSEPORT;
508                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
                            /*
                             * Specific unicast address: it must be configured
                             * on some interface.  Port and padding are cleared
                             * before the sockaddr is handed to ifa_ifwithaddr().
                             */
509                         sin->sin_port = 0;              /* yech... */
510                         bzero(&sin->sin_zero, sizeof sin->sin_zero);
511                         if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL)
512                                 return (EADDRNOTAVAIL);
513                 }
514
515                 inp->inp_laddr = sin->sin_addr;
516
                    /*
                     * Run the chosen address through prison_replace_wildcards()
                     * once more via a scratch sockaddr and adopt whatever
                     * address it leaves there.  Only sin_family and sin_addr
                     * of jsin are initialized.
                     */
517                 jsin.sin_family = AF_INET;
518                 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
519                 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
520                         inp->inp_laddr.s_addr = INADDR_ANY;
521                         return (EINVAL);
522                 }
523                 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
524
525                 if (lport == 0) {
526                         /* Auto-select local port */
527                         return in_pcbsetlport(inp, wild, cred);
528                 }
                    /*
                     * lport stays in network byte order; lport_ho (host order)
                     * is used for the reserved-port test and to pick the
                     * portinfo slot.
                     */
529                 lport_ho = ntohs(lport);
530
531                 /* GROSS */
532                 if (lport_ho < IPPORT_RESERVED && cred &&
533                     (error =
534                      priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
535                         inp->inp_laddr.s_addr = INADDR_ANY;
536                         return (error);
537                 }
538
539                 /*
540                  * Locate the proper portinfo based on lport
541                  */
542                 pcbinfo = inp->inp_pcbinfo;
543                 portinfo =
544                     &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt];
545                 KKASSERT((lport_ho % pcbinfo->portinfo_cnt) ==
546                     portinfo->offset);
547
548                 /*
549                  * This has to be atomic.  If the porthash is shared across
550                  * multiple protocol threads, e.g. tcp and udp then the token
551                  * must be held.
552                  */
553                 porthash = in_pcbporthash_head(portinfo, lport);
554                 GET_PORTHASH_TOKEN(porthash);
555
                    /*
                     * Socket not owned by uid 0 binding a non-multicast
                     * address: refuse when the lookup (wildcards allowed)
                     * finds an inpcb whose socket belongs to another uid.
                     */
556                 if (so->so_cred->cr_uid != 0 &&
557                     !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
558                         t = in_pcblookup_local(porthash, sin->sin_addr, lport,
559                                                INPLOOKUP_WILDCARD, cred);
560                         if (t &&
561                             (so->so_cred->cr_uid !=
562                              t->inp_socket->so_cred->cr_uid)) {
563                                 inp->inp_laddr.s_addr = INADDR_ANY;
564                                 error = EADDRINUSE;
565                                 goto done;
566                         }
567                 }
568                 if (cred && !prison_replace_wildcards(td, nam)) {
569                         inp->inp_laddr.s_addr = INADDR_ANY;
570                         error = EADDRNOTAVAIL;
571                         goto done;
572                 }
573
574                 /*
575                  * When binding to a local port if the best match is against
576                  * an accepted socket we generally want to allow the binding.
577                  * This means that there is no longer any specific socket
578                  * bound or bound for listening.
579                  */
580                 t = in_pcblookup_local(porthash, sin->sin_addr, lport,
581                                        wild, cred);
582                 if (t &&
583                     (reuseport & t->inp_socket->so_options) == 0 &&
584                     (t->inp_socket->so_state & SS_ACCEPTMECH) == 0) {
585                         inp->inp_laddr.s_addr = INADDR_ANY;
586                         error = EADDRINUSE;
587                         goto done;
588                 }
589                 inp->inp_lport = lport;
590                 in_pcbinsporthash(porthash, inp);
591                 error = 0;
                    /* Common exit for everything done under the porthash token. */
592 done:
593                 REL_PORTHASH_TOKEN(porthash);
594                 return (error);
595         } else {
                    /*
                     * No address supplied: inp_laddr is INADDR_ANY here (checked
                     * at the top); let prison_replace_wildcards() adjust it, then
                     * auto-select the port.
                     */
596                 jsin.sin_family = AF_INET;
597                 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
598                 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
599                         inp->inp_laddr.s_addr = INADDR_ANY;
600                         return (EINVAL);
601                 }
602                 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
603
604                 return in_pcbsetlport(inp, wild, cred);
605         }
606 }
607
608 /*
609  * Lookup a PCB based on the local and remote address and port.
610  *
611  * This function is only used when scanning for a free port.
612  */
613 static struct inpcb *
614 in_pcblookup_localremote(struct inpcbporthead *porthash, struct in_addr laddr,
615                          u_short lport, struct in_addr faddr, u_short fport,
616                          struct ucred *cred)
617 {
618         struct inpcb *inp;
619         struct inpcbport *phd;
620         struct inpcb *match = NULL;
621         struct prison *pscan;
622         struct prison *pr;
623
624         /*
625          * If the porthashbase is shared across several cpus, it must
626          * have been locked.
627          */
628         ASSERT_PORTHASH_TOKEN_HELD(porthash);
629
630         /*
631          * Best fit PCB lookup.
632          *
633          * First see if this local port is in use by looking on the
634          * port hash list.
635          */
636         LIST_FOREACH(phd, porthash, phd_hash) {
637                 if (phd->phd_port == lport)
638                         break;
639         }
640         if (phd != NULL) {
641                 pr = cred ? cred->cr_prison : NULL;
642
643                 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
644 #ifdef INET6
645                         if (!INP_ISIPV4(inp))
646                                 continue;
647 #endif
648                         if (inp->inp_laddr.s_addr == INADDR_ANY) {
649                                 if (inp->inp_socket && inp->inp_socket->so_cred)
650                                         pscan = inp->inp_socket->so_cred->cr_prison;
651                                 else
652                                         pscan = NULL;
653                                 if (pr != pscan)
654                                         continue;
655                         } else {
656                                 if (inp->inp_laddr.s_addr != laddr.s_addr)
657                                         continue;
658                         }
659
660                         if (inp->inp_faddr.s_addr != INADDR_ANY &&
661                             inp->inp_faddr.s_addr != faddr.s_addr)
662                                 continue;
663
664                         if (inp->inp_fport != 0 && inp->inp_fport != fport)
665                                 continue;
666
667                         match = inp;
668                         break;
669                 }
670         }
671         return (match);
672 }
673
674 static boolean_t
675 in_pcbporthash_update4(struct inpcbportinfo *portinfo, struct inpcb *inp,
676                        u_short lport, const struct sockaddr_in *sin,
677                        struct ucred *cred)
678 {
679         struct inpcbporthead *porthash;
680
681         /*
682          * This has to be atomic.  If the porthash is shared across multiple
683          * protocol threads, e.g. tcp and udp, then the token must be held.
684          */
685         porthash = in_pcbporthash_head(portinfo, lport);
686         GET_PORTHASH_TOKEN(porthash);
687
688         if (in_pcblookup_localremote(porthash, inp->inp_laddr, lport,
689                                      sin->sin_addr, sin->sin_port, cred)) {
690                 REL_PORTHASH_TOKEN(porthash);
691                 return FALSE;
692         }
693         inp->inp_lport = lport;
694         in_pcbinsporthash(porthash, inp);
695
696         REL_PORTHASH_TOKEN(porthash);
697         return TRUE;
698 }
699
700 int
701 in_pcbbind_remote(struct inpcb *inp, const struct sockaddr *remote,
702     struct thread *td)
703 {
704         struct proc *p = td->td_proc;
705         const struct sockaddr_in *sin = (const struct sockaddr_in *)remote;
706         struct sockaddr_in jsin;
707         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
708         struct ucred *cred = NULL;
709         u_short first, last, lport;
710         int count, hash_count;
711         int error, selfconn = 0;
712         int cpuid = mycpuid;
713         uint32_t hash_base = 0, hash;
714
715         ASSERT_NETISR_NCPUS(cpuid);
716
717         if (TAILQ_EMPTY(&in_ifaddrheads[cpuid])) /* XXX broken! */
718                 return (EADDRNOTAVAIL);
719
720         KKASSERT(inp->inp_laddr.s_addr != INADDR_ANY);
721         if (inp->inp_lport != 0)
722                 return (EINVAL);        /* already bound */
723
724         KKASSERT(p);
725         cred = p->p_ucred;
726
727         jsin.sin_family = AF_INET;
728         jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
729         if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
730                 inp->inp_laddr.s_addr = INADDR_ANY;
731                 return (EINVAL);
732         }
733         inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
734
735         hash_count = ip_porthash_trycount;
736         if (hash_count > 0) {
737                 hash_base = toeplitz_piecemeal_addr(sin->sin_addr.s_addr) ^
738                     toeplitz_piecemeal_addr(inp->inp_laddr.s_addr) ^
739                     toeplitz_piecemeal_port(sin->sin_port);
740         } else {
741                 hash_count = 0;
742         }
743
744         inp->inp_flags |= INP_ANONPORT;
745
746         if (inp->inp_flags & INP_HIGHPORT) {
747                 first = ipport_hifirstauto;     /* sysctl */
748                 last  = ipport_hilastauto;
749         } else if (inp->inp_flags & INP_LOWPORT) {
750                 if (cred &&
751                     (error =
752                      priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
753                         inp->inp_laddr.s_addr = INADDR_ANY;
754                         return (error);
755                 }
756                 first = ipport_lowfirstauto;    /* 1023 */
757                 last = ipport_lowlastauto;      /* 600 */
758         } else {
759                 first = ipport_firstauto;       /* sysctl */
760                 last  = ipport_lastauto;
761         }
762         if (first > last) {
763                 lport = last;
764                 last = first;
765                 first = lport;
766         }
767         KKASSERT(last >= first);
768
769         count = last - first;
770         lport = (karc4random() % count) + first;
771         count += hash_count;
772
773         /*
774          * Simple check to ensure all ports are not used up causing
775          * a deadlock here.
776          */
777         for (;;) {
778                 u_short lport_no;
779
780                 if (count-- < 0) {      /* completely used? */
781                         error = EADDRNOTAVAIL;
782                         break;
783                 }
784
785                 if (__predict_false(lport < first || lport > last))
786                         lport = first;
787                 lport_no = htons(lport);
788
789                 /* This could happen on loopback interface */
790                 if (__predict_false(sin->sin_port == lport_no &&
791                     sin->sin_addr.s_addr == inp->inp_laddr.s_addr)) {
792                         if (!selfconn) {
793                                 ++count; /* don't count this try */
794                                 selfconn = 1;
795                         }
796                         goto next;
797                 }
798
799                 if (hash_count) {
800                         --hash_count;
801                         hash = hash_base ^
802                             toeplitz_piecemeal_port(lport_no);
803                         if (netisr_hashcpu(hash) != cpuid && hash_count)
804                                 goto next;
805                 }
806
807                 if (in_pcbporthash_update4(
808                             &pcbinfo->portinfo[lport % pcbinfo->portinfo_cnt],
809                             inp, lport_no, sin, cred)) {
810                         error = 0;
811                         break;
812                 }
813 next:
814                 ++lport;
815         }
816
817         if (error)
818                 inp->inp_laddr.s_addr = INADDR_ANY;
819         return (error);
820 }
821
822 /*
823  * Figure out the local interface address to pair against the requested
824  * target address, as well as validate the target address.
825  */
826 int
827 in_pcbladdr_find(struct inpcb *inp, struct sockaddr *nam,
828                  struct sockaddr_in **plocal_sin, struct thread *td, int find)
829 {
830         struct in_ifaddr_container *iac;
831         struct in_ifaddr *ia;
832         struct ucred *cred = NULL;
833         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
834         struct sockaddr *jsin;
835         struct prison *pr;
836         struct route *ro;
837         int alloc_route = 0;
838
839         if (nam->sa_len != sizeof *sin)
840                 return (EINVAL);
841         if (sin->sin_family != AF_INET)
842                 return (EAFNOSUPPORT);
843         if (sin->sin_port == 0)
844                 return (EADDRNOTAVAIL);
845
846         /*
847          * Are we in a jail?
848          */
849         pr = NULL;
850         if (td && td->td_proc && td->td_proc->p_ucred)
851                 cred = td->td_proc->p_ucred;
852         if (cred)
853                 pr = cred->cr_prison;
854
855         /*
856          * If the destination address is INADDR_ANY then use the primary
857          * local address.
858          *
859          * If the supplied address is INADDR_BROADCAST, and the primary
860          * interface supports broadcast, choose the broadcast address for
861          * that interface.
862          *
863          * If jailed, locate an interface address acceptable to the jail.
864          */
865         if (sin->sin_addr.s_addr == INADDR_ANY) {
866                 TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) {
867                         ia = iac->ia;
868                         if (pr == NULL ||
869                             jailed_ip(pr, sintosa(&ia->ia_addr))) {
870                                 sin->sin_addr = IA_SIN(ia)->sin_addr;
871                                 break;
872                         }
873                 }
874         } else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST) {
875                 TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) {
876                         ia = iac->ia;
877                         if ((pr == NULL ||
878                              jailed_ip(pr, sintosa(&ia->ia_addr))) &&
879                             (iac->ia->ia_ifp->if_flags & IFF_BROADCAST)) {
880                                 sin->sin_addr =
881                                     satosin(&ia->ia_broadaddr)->sin_addr;
882                                 break;
883                         }
884                 }
885         }
886
887         /*
888          * If asked to do a search, use the cached route or do a route table
889          * lookup to try to find an acceptable local interface IP.
890          */
891         if (find == 0)
892                 return 0;
893
894         ia = NULL;
895
896         /*
897          * If we have a cached route, check to see if it is acceptable.
898          * If not, free it.
899          */
900         ro = &inp->inp_route;
901         if (ro->ro_rt &&
902             (!(ro->ro_rt->rt_flags & RTF_UP) ||
903              ro->ro_dst.sa_family != AF_INET ||
904              satosin(&ro->ro_dst)->sin_addr.s_addr !=
905                               sin->sin_addr.s_addr ||
906              inp->inp_socket->so_options & SO_DONTROUTE)) {
907                 RTFREE(ro->ro_rt);
908                 ro->ro_rt = NULL;
909         }
910
911         /*
912          * If we do not have a route, construct one and do a lookup,
913          * unless we are forbidden to do so.
914          *
915          * Note that we should check the address family of the cached
916          * destination, in case of sharing the cache with IPv6.
917          */
918         if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/
919             (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL)) {
920                 bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
921                 ro->ro_dst.sa_family = AF_INET;
922                 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
923                 ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sin->sin_addr;
924                 rtalloc(ro);
925                 alloc_route = 1;
926         }
927
928         /*
929          * If we found a route, use the address corresponding to the
930          * outgoing interface.
931          *
932          * If jailed, try to find a compatible address on the outgoing
933          * interface.
934          */
935         if (ro->ro_rt) {
936                 ia = ifatoia(ro->ro_rt->rt_ifa);
937                 if (pr == NULL)
938                         goto skip;
939                 if (jailed_ip(pr, sintosa(&ia->ia_addr)))
940                         goto skip;
941                 TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) {
942                         if (iac->ia->ia_ifp != ia->ia_ifp)
943                                 continue;
944                         ia = iac->ia;
945                         if (jailed_ip(pr, sintosa(&ia->ia_addr)))
946                                 goto skip;
947                 }
948                 ia = NULL;
949         }
950 skip:
951
952         /*
953          * If the route didn't work or there was no route,
954          * fall-back to the first address in in_ifaddrheads[].
955          *
956          * If jailed and this address is not available for
957          * the jail, leave ia set to NULL.
958          */
959         if (ia == NULL) {
960                 u_short fport = sin->sin_port;
961
962                 sin->sin_port = 0;
963                 ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
964                 if (ia && pr && !jailed_ip(pr, sintosa(&ia->ia_addr)))
965                         ia = NULL;
966
967                 if (ia == NULL)
968                         ia = ifatoia(ifa_ifwithnet(sintosa(sin)));
969                 if (ia && pr && !jailed_ip(pr, sintosa(&ia->ia_addr)))
970                         ia = NULL;
971
972                 sin->sin_port = fport;
973                 if (ia == NULL && !TAILQ_EMPTY(&in_ifaddrheads[mycpuid]))
974                         ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia;
975
976                 if (ia && pr && !jailed_ip(pr, sintosa(&ia->ia_addr)))
977                         ia = NULL;
978
979                 if (pr == NULL && ia == NULL)
980                         goto fail;
981         }
982
983         /*
984          * If the destination address is multicast and an outgoing
985          * interface has been set as a multicast option, use the
986          * address of that interface as our source address.
987          */
988         if (pr == NULL && IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
989             inp->inp_moptions != NULL) {
990                 struct ip_moptions *imo;
991                 struct ifnet *ifp;
992
993                 imo = inp->inp_moptions;
994                 if ((ifp = imo->imo_multicast_ifp) != NULL) {
995                         struct in_ifaddr_container *iac;
996
997                         ia = NULL;
998                         TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) {
999                                 if (iac->ia->ia_ifp == ifp) {
1000                                         ia = iac->ia;
1001                                         break;
1002                                 }
1003                         }
1004                         if (ia == NULL)
1005                                 goto fail;
1006                 }
1007         }
1008
1009         /*
1010          * If we still don't have a local address, and are jailed,
1011          * use the jail's first non-localhost IP.  If there isn't
1012          * one, use the jail's first localhost IP.
1013          *
1014          * Don't do pcblookup call here; return interface in plocal_sin
1015          * and exit to caller, that will do the lookup.
1016          */
1017         if (ia == NULL && pr) {
1018                 jsin = prison_get_nonlocal(cred->cr_prison, AF_INET, NULL);
1019                 if (jsin == NULL)
1020                         jsin = prison_get_local(cred->cr_prison, AF_INET, NULL);
1021                 if (jsin)
1022                         *plocal_sin = satosin(jsin);
1023                 else
1024                         goto fail;
1025         } else if (ia) {
1026                 *plocal_sin = &ia->ia_addr;
1027         } else {
1028                 goto fail;
1029         }
1030         return (0);
1031 fail:
1032         if (alloc_route)
1033                 in_pcbresetroute(inp);
1034         return (EADDRNOTAVAIL);
1035 }
1036
1037 int
1038 in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
1039             struct sockaddr_in **plocal_sin, struct thread *td)
1040 {
1041         return in_pcbladdr_find(inp, nam, plocal_sin, td,
1042                                 (inp->inp_laddr.s_addr == INADDR_ANY));
1043 }
1044
1045 /*
1046  * Outer subroutine:
1047  * Connect from a socket to a specified address.
1048  * Both address and port must be specified in argument sin.
1049  * If don't have a local address for this socket yet,
1050  * then pick one.
1051  */
1052 int
1053 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
1054 {
1055         struct sockaddr_in *if_sin;
1056         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1057         int error;
1058
1059         if_sin = NULL;  /* avoid gcc warnings */
1060
1061         /* Call inner routine to assign local interface address. */
1062         if ((error = in_pcbladdr(inp, nam, &if_sin, td)) != 0)
1063                 return (error);
1064
1065         if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
1066                               inp->inp_laddr.s_addr ?
1067                                 inp->inp_laddr : if_sin->sin_addr,
1068                               inp->inp_lport, FALSE, NULL) != NULL) {
1069                 return (EADDRINUSE);
1070         }
1071         if (inp->inp_laddr.s_addr == INADDR_ANY) {
1072                 if (inp->inp_lport == 0) {
1073                         error = in_pcbbind(inp, NULL, td);
1074                         if (error)
1075                                 return (error);
1076                 }
1077                 inp->inp_laddr = if_sin->sin_addr;
1078         }
1079         inp->inp_faddr = sin->sin_addr;
1080         inp->inp_fport = sin->sin_port;
1081         in_pcbinsconnhash(inp);
1082         return (0);
1083 }
1084
1085 void
1086 in_pcbdisconnect(struct inpcb *inp)
1087 {
1088
1089         in_pcbremconnhash(inp);
1090         inp->inp_faddr.s_addr = INADDR_ANY;
1091         inp->inp_fport = 0;
1092 }
1093
1094 void
1095 in_pcbdetach(struct inpcb *inp)
1096 {
1097         struct socket *so = inp->inp_socket;
1098         struct inpcbinfo *ipi = inp->inp_pcbinfo;
1099
1100         inp->inp_gencnt = ++ipi->ipi_gencnt;
1101         KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);
1102         in_pcbremlists(inp);
1103         so->so_pcb = NULL;
1104         sofree(so);                     /* remove pcb ref */
1105         if (inp->inp_options)
1106                 m_free(inp->inp_options);
1107         if (inp->inp_route.ro_rt)
1108                 rtfree(inp->inp_route.ro_rt);
1109         ip_freemoptions(inp->inp_moptions);
1110         kfree(inp, M_PCB);
1111 }
1112
1113 /*
1114  * The socket may have an invalid PCB, i.e. NULL.  For example, a TCP
1115  * socket received RST.
1116  */
1117 static int
1118 in_setsockaddr(struct socket *so, struct sockaddr **nam)
1119 {
1120         struct inpcb *inp;
1121         struct sockaddr_in *sin;
1122
1123         KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
1124         inp = so->so_pcb;
1125         if (!inp)
1126                 return (ECONNRESET);
1127
1128         sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO);
1129         sin->sin_family = AF_INET;
1130         sin->sin_len = sizeof *sin;
1131         sin->sin_port = inp->inp_lport;
1132         sin->sin_addr = inp->inp_laddr;
1133
1134         *nam = (struct sockaddr *)sin;
1135         return (0);
1136 }
1137
1138 void
1139 in_setsockaddr_dispatch(netmsg_t msg)
1140 {
1141         int error;
1142
1143         error = in_setsockaddr(msg->base.nm_so, msg->peeraddr.nm_nam);
1144         lwkt_replymsg(&msg->lmsg, error);
1145 }
1146
1147 /*
1148  * The socket may have an invalid PCB, i.e. NULL.  For example, a TCP
1149  * socket received RST.
1150  */
1151 int
1152 in_setpeeraddr(struct socket *so, struct sockaddr **nam)
1153 {
1154         struct inpcb *inp;
1155         struct sockaddr_in *sin;
1156
1157         KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
1158         inp = so->so_pcb;
1159         if (!inp)
1160                 return (ECONNRESET);
1161
1162         sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO);
1163         sin->sin_family = AF_INET;
1164         sin->sin_len = sizeof *sin;
1165         sin->sin_port = inp->inp_fport;
1166         sin->sin_addr = inp->inp_faddr;
1167
1168         *nam = (struct sockaddr *)sin;
1169         return (0);
1170 }
1171
1172 void
1173 in_setpeeraddr_dispatch(netmsg_t msg)
1174 {
1175         int error;
1176
1177         error = in_setpeeraddr(msg->base.nm_so, msg->peeraddr.nm_nam);
1178         lwkt_replymsg(&msg->lmsg, error);
1179 }
1180
1181 void
1182 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int err,
1183     inp_notify_t notify)
1184 {
1185         struct inpcb *inp, *marker;
1186
1187         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
1188             ("not in the correct netisr"));
1189         marker = in_pcbmarker();
1190
1191         /*
1192          * NOTE:
1193          * - If INP_PLACEMARKER is set we must ignore the rest of the
1194          *   structure and skip it.
1195          * - It is safe to nuke inpcbs here, since we are in their own
1196          *   netisr.
1197          */
1198         GET_PCBINFO_TOKEN(pcbinfo);
1199
1200         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list);
1201         while ((inp = LIST_NEXT(marker, inp_list)) != NULL) {
1202                 LIST_REMOVE(marker, inp_list);
1203                 LIST_INSERT_AFTER(inp, marker, inp_list);
1204
1205                 if (inp->inp_flags & INP_PLACEMARKER)
1206                         continue;
1207 #ifdef INET6
1208                 if (!INP_ISIPV4(inp))
1209                         continue;
1210 #endif
1211                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1212                     inp->inp_socket == NULL)
1213                         continue;
1214                 (*notify)(inp, err);            /* can remove inp from list! */
1215         }
1216         LIST_REMOVE(marker, inp_list);
1217
1218         REL_PCBINFO_TOKEN(pcbinfo);
1219 }
1220
1221 void
1222 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1223 {
1224         struct inpcb *inp, *marker;
1225
1226         /*
1227          * We only need to make sure that we are in netisr0, where all
1228          * multicast operation happen.  We could check inpcbinfo which
1229          * does not belong to netisr0 by holding the inpcbinfo's token.
1230          * In this case, the pcbinfo must be able to be shared, i.e.
1231          * pcbinfo->infotoken is not NULL.
1232          */
1233         ASSERT_NETISR0;
1234         KASSERT(pcbinfo->cpu == 0 || pcbinfo->infotoken != NULL,
1235             ("pcbinfo could not be shared"));
1236
1237         /*
1238          * Get a marker for the current netisr (netisr0).
1239          *
1240          * It is possible that the multicast address deletion blocks,
1241          * which could cause temporary token releasing.  So we use
1242          * inpcb marker here to get a coherent view of the inpcb list.
1243          *
1244          * While, on the other hand, moptions are only added and deleted
1245          * in netisr0, so we would not see staled moption or miss moption
1246          * even if the token was released due to the blocking multicast
1247          * address deletion.
1248          */
1249         marker = in_pcbmarker();
1250
1251         GET_PCBINFO_TOKEN(pcbinfo);
1252
1253         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list);
1254         while ((inp = LIST_NEXT(marker, inp_list)) != NULL) {
1255                 struct ip_moptions *imo;
1256
1257                 LIST_REMOVE(marker, inp_list);
1258                 LIST_INSERT_AFTER(inp, marker, inp_list);
1259
1260                 if (inp->inp_flags & INP_PLACEMARKER)
1261                         continue;
1262                 imo = inp->inp_moptions;
1263                 if (INP_ISIPV4(inp) && imo != NULL) {
1264                         int i, gap;
1265
1266                         /*
1267                          * Unselect the outgoing interface if it is being
1268                          * detached.
1269                          */
1270                         if (imo->imo_multicast_ifp == ifp)
1271                                 imo->imo_multicast_ifp = NULL;
1272
1273                         /*
1274                          * Drop multicast group membership if we joined
1275                          * through the interface being detached.
1276                          */
1277                         for (i = 0, gap = 0; i < imo->imo_num_memberships;
1278                             i++) {
1279                                 if (imo->imo_membership[i]->inm_ifp == ifp) {
1280                                         /*
1281                                          * NOTE:
1282                                          * This could block and the pcbinfo
1283                                          * token could be passively released.
1284                                          */
1285                                         in_delmulti(imo->imo_membership[i]);
1286                                         gap++;
1287                                 } else if (gap != 0)
1288                                         imo->imo_membership[i - gap] =
1289                                             imo->imo_membership[i];
1290                         }
1291                         imo->imo_num_memberships -= gap;
1292                 }
1293         }
1294         LIST_REMOVE(marker, inp_list);
1295
1296         REL_PCBINFO_TOKEN(pcbinfo);
1297 }
1298
1299 /*
1300  * Check for alternatives when higher level complains
1301  * about service problems.  For now, invalidate cached
1302  * routing information.  If the route was created dynamically
1303  * (by a redirect), time to try a default gateway again.
1304  */
1305 void
1306 in_losing(struct inpcb *inp)
1307 {
1308         struct rtentry *rt;
1309         struct rt_addrinfo rtinfo;
1310
1311         if ((rt = inp->inp_route.ro_rt)) {
1312                 bzero(&rtinfo, sizeof(struct rt_addrinfo));
1313                 rtinfo.rti_info[RTAX_DST] = rt_key(rt);
1314                 rtinfo.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1315                 rtinfo.rti_info[RTAX_NETMASK] = rt_mask(rt);
1316                 rtinfo.rti_flags = rt->rt_flags;
1317                 rt_missmsg(RTM_LOSING, &rtinfo, rt->rt_flags, 0);
1318                 if (rt->rt_flags & RTF_DYNAMIC) {
1319                         rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1320                             rt_mask(rt), rt->rt_flags, NULL);
1321                 }
1322                 inp->inp_route.ro_rt = NULL;
1323                 rtfree(rt);
1324                 /*
1325                  * A new route can be allocated
1326                  * the next time output is attempted.
1327                  */
1328         }
1329 }
1330
1331 /*
1332  * After a routing change, flush old routing
1333  * and allocate a (hopefully) better one.
1334  */
1335 void
1336 in_rtchange(struct inpcb *inp, int err)
1337 {
1338         if (inp->inp_route.ro_rt) {
1339                 rtfree(inp->inp_route.ro_rt);
1340                 inp->inp_route.ro_rt = NULL;
1341                 /*
1342                  * A new route can be allocated the next time
1343                  * output is attempted.
1344                  */
1345         }
1346 }
1347
1348 /*
1349  * Lookup a PCB based on the local address and port.
1350  *
1351  * This function is only used when scanning for a free port.
1352  */
1353 static struct inpcb *
1354 in_pcblookup_local(struct inpcbporthead *porthash, struct in_addr laddr,
1355                    u_int lport_arg, int wild_okay, struct ucred *cred)
1356 {
1357         struct prison *pscan;
1358         struct prison *pr;
1359         struct inpcb *inp;
1360         int matchwild = 3, wildcard;
1361         u_short lport = lport_arg;
1362         struct inpcbport *phd;
1363         struct inpcb *match = NULL;
1364
1365         /*
1366          * If the porthashbase is shared across several cpus, it must
1367          * have been locked.
1368          */
1369         ASSERT_PORTHASH_TOKEN_HELD(porthash);
1370
1371         /*
1372          * Best fit PCB lookup.
1373          *
1374          * First see if this local port is in use by looking on the
1375          * port hash list.
1376          */
1377         LIST_FOREACH(phd, porthash, phd_hash) {
1378                 if (phd->phd_port == lport)
1379                         break;
1380         }
1381         if (phd != NULL) {
1382                 pr = cred ? cred->cr_prison : NULL;
1383
1384                 /*
1385                  * Port is in use by one or more PCBs. Look for best
1386                  * fit.
1387                  *
1388                  * If in a prison we may wish to allow the jail to override
1389                  * a wildcard listen on the host.  Since the jail forces its
1390                  * own wildcard listens to a specific set of jail IPs, this
1391                  * override allows most services on the host to remain as
1392                  * they were and still be 'jail friendly'.
1393                  */
1394                 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1395                         wildcard = 0;
1396 #ifdef INET6
1397                         if (!INP_ISIPV4(inp))
1398                                 continue;
1399 #endif
1400                         if (inp->inp_faddr.s_addr != INADDR_ANY)
1401                                 wildcard++;
1402
1403                         /*
1404                          * Prison are independent of each other in terms
1405                          * of allowing bindings.  This can result in multiple
1406                          * overloaded bindings which in_pcblookup_pkthash()
1407                          * will have to sort out.
1408                          *
1409                          * Allow wildcarded entries to co-exist with specific
1410                          * entries.  Specific entries override wildcarded
1411                          * entries.
1412                          */
1413                         if (inp->inp_socket && inp->inp_socket->so_cred)
1414                                 pscan = inp->inp_socket->so_cred->cr_prison;
1415                         else
1416                                 pscan = NULL;
1417                         if (pr != pscan)
1418                                 continue;
1419                         if (inp->inp_laddr.s_addr == INADDR_ANY) {
1420                                 if (laddr.s_addr != INADDR_ANY)
1421                                         wildcard++;
1422                         } else {
1423                                 if (laddr.s_addr == INADDR_ANY)
1424                                         wildcard++;
1425                                 else if (inp->inp_laddr.s_addr != laddr.s_addr)
1426                                         continue;
1427                         }
1428                         if (wildcard && !wild_okay)
1429                                 continue;
1430                         if (wildcard < matchwild) {
1431                                 match = inp;
1432                                 matchwild = wildcard;
1433                                 if (matchwild == 0)
1434                                         break;
1435                         }
1436                 }
1437         }
1438         return (match);
1439 }
1440
1441 struct inpcb *
1442 in_pcblocalgroup_last(const struct inpcbinfo *pcbinfo,
1443     const struct inpcb *inp)
1444 {
1445         const struct inp_localgrphead *hdr;
1446         const struct inp_localgroup *grp;
1447         int i;
1448
1449         if (pcbinfo->localgrphashbase == NULL)
1450                 return NULL;
1451
1452         GET_PCBINFO_TOKEN(pcbinfo);
1453
1454         hdr = &pcbinfo->localgrphashbase[
1455             INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];
1456
1457         LIST_FOREACH(grp, hdr, il_list) {
1458                 if (grp->il_af == inp->inp_af &&
1459                     grp->il_lport == inp->inp_lport &&
1460                     memcmp(&grp->il_dependladdr,
1461                         &inp->inp_inc.inc_ie.ie_dependladdr,
1462                         sizeof(grp->il_dependladdr)) == 0) {
1463                         break;
1464                 }
1465         }
1466         if (grp == NULL || grp->il_inpcnt == 1) {
1467                 REL_PCBINFO_TOKEN(pcbinfo);
1468                 return NULL;
1469         }
1470
1471         KASSERT(grp->il_inpcnt >= 2,
1472             ("invalid localgroup inp count %d", grp->il_inpcnt));
1473         for (i = 0; i < grp->il_inpcnt; ++i) {
1474                 if (grp->il_inp[i] == inp) {
1475                         int last = grp->il_inpcnt - 1;
1476
1477                         if (i == last)
1478                                 last = grp->il_inpcnt - 2;
1479                         REL_PCBINFO_TOKEN(pcbinfo);
1480                         return grp->il_inp[last];
1481                 }
1482         }
1483         REL_PCBINFO_TOKEN(pcbinfo);
1484         return NULL;
1485 }
1486
1487 static struct inpcb *
1488 inp_localgroup_lookup(const struct inpcbinfo *pcbinfo,
1489     struct in_addr laddr, uint16_t lport, uint32_t pkt_hash)
1490 {
1491         struct inpcb *local_wild;
1492         struct inpcb *jinp;
1493         struct inpcb *jinp_wild;
1494         struct inpcb *inp;
1495         const struct inp_localgrphead *hdr;
1496         const struct inp_localgroup *grp;
1497         struct sockaddr_in jsin;
1498         struct prison *pr;
1499         struct ucred *cred;
1500         int idx;
1501         int net_listen_ov_local;
1502         int net_listen_ov_wild;
1503
1504         ASSERT_PCBINFO_TOKEN_HELD(pcbinfo);
1505
1506         hdr = &pcbinfo->localgrphashbase[
1507             INP_PCBLOCALGRPHASH(lport, pcbinfo->localgrphashmask)];
1508
1509         /*
1510          * Order of socket selection:
1511          * 1. non-wild.
1512          * 2. wild.
1513          *
1514          * NOTE: Local group does not contain jailed sockets
1515          */
1516         jsin.sin_family = AF_INET;
1517         jsin.sin_addr.s_addr = laddr.s_addr;
1518
1519         jinp = NULL;
1520         jinp_wild = NULL;
1521         local_wild = NULL;
1522         net_listen_ov_local = 0;
1523         net_listen_ov_wild = 0;
1524
1525         LIST_FOREACH(grp, hdr, il_list) {
1526 #ifdef INET6
1527                 if (grp->il_af != AF_INET)
1528                         continue;
1529 #endif
1530                 if (grp->il_lport != lport)
1531                         continue;
1532
1533                 /*
1534                  * look for a match
1535                  */
1536                 idx = netisr_hashlsb(pkt_hash) % grp->il_inpcnt;
1537                 inp = grp->il_inp[idx];
1538
1539                 /*
1540                  * Modulo-N is used here, which greatly reduces
1541                  * completion queue token contention, thus more
1542                  * cpu time is saved.
1543                  */
1544                 if (grp->il_jailed) {
1545                         if (inp->inp_socket == NULL)
1546                                 continue;
1547                         cred = inp->inp_socket->so_cred;
1548                         if (cred == NULL)
1549                                 continue;
1550                         pr = cred->cr_prison;
1551                         if (pr == NULL)
1552                                 continue;
1553                         if (!jailed_ip(pr, (struct sockaddr *)&jsin))
1554                                 continue;
1555                         if (grp->il_laddr.s_addr == laddr.s_addr) {
1556                                 jinp = inp;
1557                                 if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_NET_LISTEN_OVERRIDE))
1558                                         net_listen_ov_local = 1;
1559
1560                         } else if (grp->il_laddr.s_addr == INADDR_ANY &&
1561                                    jinp_wild == NULL) {
1562                                 jinp_wild = inp;
1563                                 if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_NET_LISTEN_OVERRIDE))
1564                                         net_listen_ov_wild = 1;
1565                         }
1566                 } else {
1567                         if (grp->il_laddr.s_addr == laddr.s_addr) {
1568                                 return inp;
1569                         } else if (grp->il_laddr.s_addr == INADDR_ANY) {
1570                                 local_wild = inp;
1571                         }
1572                 }
1573         }
1574
1575         if (net_listen_ov_local)
1576                 return jinp;
1577         if (net_listen_ov_wild)
1578                 return jinp_wild;
1579         if (local_wild)
1580                 return (local_wild);
1581         if (jinp)
1582                 return (jinp);
1583         return (jinp_wild);
1584 }
1585
/*
 * Lookup PCB in hash list.
 *
 * This is used to match incoming packets to a pcb.
 *
 * The search proceeds in two stages:
 *
 * 1. The connection hash is scanned for an IPv4 pcb whose foreign
 *    address/port and local address/port all equal the arguments.  A
 *    match whose socket is absent or has no prison is returned at once;
 *    otherwise the first jailed match is remembered and returned after
 *    the scan.
 *
 * 2. Only when 'wildcard' is TRUE: with the pcbinfo token held, the
 *    local group hash is consulted (if it exists and the mbuf carries
 *    M_HASH), then the wildcard hash chain for lport is scanned.
 *
 * pcbinfo	- per-cpu pcb tables to search
 * faddr	- foreign address of the packet
 * fport_arg	- foreign port, truncated to u_short before use
 * laddr	- local address of the packet
 * lport_arg	- local port, truncated to u_short before use
 * wildcard	- if FALSE only stage 1 is performed
 * ifp		- not referenced by this function
 * m		- packet, may be NULL; only m_flags and m_pkthdr.hash
 *		  are read
 *
 * Returns the selected inpcb, or NULL if nothing matched.
 */
struct inpcb *
in_pcblookup_pkthash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg,
    boolean_t wildcard, struct ifnet *ifp, const struct mbuf *m)
{
        struct inpcbhead *head;
        struct inpcb *inp, *jinp=NULL;
        u_short fport = fport_arg, lport = lport_arg;

        /*
         * First look for an exact match.
         */
        head = &pcbinfo->hashbase[INP_PCBCONNHASH(faddr.s_addr, fport,
                                                  laddr.s_addr, lport,
                                                  pcbinfo->hashmask)];
        LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
                if (!INP_ISIPV4(inp))
                        continue;
#endif
                if (in_hosteq(inp->inp_faddr, faddr) &&
                    in_hosteq(inp->inp_laddr, laddr) &&
                    inp->inp_fport == fport && inp->inp_lport == lport) {
                        /*
                         * Found specific address, host overrides jailed
                         * inpcb.
                         *
                         * NOTE(review): so_cred is dereferenced here
                         * without a NULL test, whereas the wildcard
                         * scan below tests it first -- confirm that
                         * so_cred can never be NULL on this path.
                         */
                        if (inp->inp_socket == NULL ||
                            inp->inp_socket->so_cred->cr_prison == NULL) {
                                return (inp);
                        }
                        /* Jailed match: keep only the first one seen. */
                        if (jinp == NULL)
                                jinp = inp;
                }
        }
        if (jinp != NULL)
                return (jinp);

        /*
         * We generally get here for connections to wildcarded listeners.
         * Any wildcarded listeners in jails must be restricted to the
         * jailed IPs only.
         */
        if (wildcard) {
                struct inpcb *local_wild = NULL;
                struct inpcb *jinp_wild = NULL;
                struct inpcontainer *ic;
                struct inpcontainerhead *chead;
                struct sockaddr_in jsin;
                struct ucred *cred;
                struct prison *pr;
                int net_listen_ov_local = 0;
                int net_listen_ov_wild = 0;

                /*
                 * Every return path inside this block releases the
                 * token before returning.
                 */
                GET_PCBINFO_TOKEN(pcbinfo);

                /*
                 * Check local group first.  When present, the localgroup
                 * hash utilizes the same non-jailed vs. jailed
                 * prioritization that the normal wildcardhash does.
                 */
                if (pcbinfo->localgrphashbase != NULL &&
                    m != NULL && (m->m_flags & M_HASH)) {
                        inp = inp_localgroup_lookup(pcbinfo, laddr, lport,
                                                    m->m_pkthdr.hash);
                        if (inp != NULL) {
                                REL_PCBINFO_TOKEN(pcbinfo);
                                return inp;
                        }
                }

                /*
                 * Order of socket selection:
                 *
                 * 1. non-jailed, non-wild.
                 * 2. non-jailed, wild.         (allow_listen_override on)
                 * 3. jailed, non-wild.
                 * 4. jailed, wild.
                 * 5. non-jailed, wild.         (allow_listen_override off)
                 *
                 * NOTE: jailed wildcards are still restricted to the jail
                 *       IPs.
                 *
                 * NOTE: (1) and (3) already handled above.
                 */
                jsin.sin_family = AF_INET;
                chead = &pcbinfo->wildcardhashbase[
                    INP_PCBWILDCARDHASH(lport, pcbinfo->wildcardhashmask)];

                LIST_FOREACH(ic, chead, ic_list) {
                        inp = ic->ic_inp;
                        /* Skip list-walk markers, they are not real pcbs. */
                        if (inp->inp_flags & INP_PLACEMARKER)
                                continue;

                        /*
                         * Basic validation
                         */
#ifdef INET6
                        if (!INP_ISIPV4(inp))
                                continue;
#endif
                        if (inp->inp_lport != lport)
                                continue;

                        /*
                         * Calculate prison, setup jsin for jailed_ip()
                         * check.
                         */
                        jsin.sin_addr.s_addr = laddr.s_addr;
                        pr = NULL;
                        cred = NULL;
                        if (inp->inp_socket) {
                                cred = inp->inp_socket->so_cred;
                                if (cred)
                                        pr = cred->cr_prison;
                        }

                        /*
                         * Assign jinp, jinp_wild, and local_wild as
                         * appropriate, track whether the jail supports
                         * listen overrides.
                         */
                        if (pr) {
                                /* Jailed pcb: laddr must belong to the jail. */
                                if (!jailed_ip(pr, (struct sockaddr *)&jsin))
                                        continue;
                                if (inp->inp_laddr.s_addr == laddr.s_addr &&
                                    jinp == NULL) {
                                        jinp = inp;
                                        if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_NET_LISTEN_OVERRIDE))
                                                net_listen_ov_local = 1;
                                }
                                if (inp->inp_laddr.s_addr == INADDR_ANY &&
                                    jinp_wild == NULL) {
                                        jinp_wild = inp;
                                        if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_NET_LISTEN_OVERRIDE))
                                                net_listen_ov_wild = 1;
                                }
                        } else {
                                /*
                                 * Non-jailed pcb bound to exactly laddr
                                 * wins immediately; a non-jailed
                                 * INADDR_ANY pcb is only a candidate
                                 * (the last one seen is kept).
                                 */
                                if (inp->inp_laddr.s_addr == laddr.s_addr) {
                                        REL_PCBINFO_TOKEN(pcbinfo);
                                        return (inp);
                                }
                                if (inp->inp_laddr.s_addr == INADDR_ANY)
                                        local_wild = inp;
                        }
                }

                REL_PCBINFO_TOKEN(pcbinfo);

                /*
                 * Pick among the recorded candidates: a jailed pcb whose
                 * prison has PRISON_CAP_NET_LISTEN_OVERRIDE set beats the
                 * non-jailed wildcard; otherwise the non-jailed wildcard
                 * beats the jailed candidates.
                 */
                if (net_listen_ov_local)
                        return jinp;
                if (net_listen_ov_wild)
                        return jinp_wild;
                if (local_wild)
                        return (local_wild);
                if (jinp)
                        return (jinp);
                return (jinp_wild);
        }

        /*
         * Not found.
         */
        return (NULL);
}
1756
1757 struct inpcb *
1758 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1759     u_int fport_arg, struct in_addr laddr, u_int lport_arg,
1760     boolean_t wildcard, struct ifnet *ifp)
1761 {
1762         return in_pcblookup_pkthash(pcbinfo, faddr, fport_arg,
1763             laddr, lport_arg, wildcard, ifp, NULL);
1764 }
1765
1766 /*
1767  * Insert PCB into connection hash table.
1768  */
1769 void
1770 in_pcbinsconnhash(struct inpcb *inp)
1771 {
1772         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1773         struct inpcbhead *bucket;
1774         u_int32_t hashkey_faddr, hashkey_laddr;
1775
1776 #ifdef INET6
1777         if (INP_ISIPV6(inp)) {
1778                 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX JH */;
1779                 hashkey_laddr = inp->in6p_laddr.s6_addr32[3] /* XXX JH */;
1780         } else {
1781 #endif
1782                 hashkey_faddr = inp->inp_faddr.s_addr;
1783                 hashkey_laddr = inp->inp_laddr.s_addr;
1784 #ifdef INET6
1785         }
1786 #endif
1787
1788         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
1789             ("not in the correct netisr"));
1790         ASSERT_INP_NOTINHASH(inp);
1791         inp->inp_flags |= INP_CONNECTED;
1792
1793         /*
1794          * Insert into the connection hash table.
1795          */
1796         bucket = &pcbinfo->hashbase[INP_PCBCONNHASH(hashkey_faddr,
1797             inp->inp_fport, hashkey_laddr, inp->inp_lport, pcbinfo->hashmask)];
1798         LIST_INSERT_HEAD(bucket, inp, inp_hash);
1799 }
1800
1801 /*
1802  * Remove PCB from connection hash table.
1803  */
1804 void
1805 in_pcbremconnhash(struct inpcb *inp)
1806 {
1807         struct inpcbinfo *pcbinfo __debugvar = inp->inp_pcbinfo;
1808
1809         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
1810             ("not in the correct netisr"));
1811         KASSERT(inp->inp_flags & INP_CONNECTED, ("inp not connected"));
1812
1813         LIST_REMOVE(inp, inp_hash);
1814         inp->inp_flags &= ~INP_CONNECTED;
1815 }
1816
1817 /*
1818  * Insert PCB into port hash table.
1819  */
1820 void
1821 in_pcbinsporthash(struct inpcbporthead *pcbporthash, struct inpcb *inp)
1822 {
1823         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1824         struct inpcbport *phd;
1825
1826         /*
1827          * If the porthashbase is shared across several cpus, it must
1828          * have been locked.
1829          */
1830         ASSERT_PORTHASH_TOKEN_HELD(pcbporthash);
1831
1832         /*
1833          * Insert into the port hash table.
1834          */
1835
1836         /* Go through port list and look for a head for this lport. */
1837         LIST_FOREACH(phd, pcbporthash, phd_hash) {
1838                 if (phd->phd_port == inp->inp_lport)
1839                         break;
1840         }
1841
1842         /* If none exists, use saved one and tack it on. */
1843         if (phd == NULL) {
1844                 KKASSERT(pcbinfo->portsave != NULL);
1845                 phd = pcbinfo->portsave;
1846                 pcbinfo->portsave = NULL;
1847                 phd->phd_port = inp->inp_lport;
1848                 LIST_INIT(&phd->phd_pcblist);
1849                 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1850         }
1851
1852         inp->inp_porthash = pcbporthash;
1853         inp->inp_phd = phd;
1854         LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1855
1856         /*
1857          * Malloc one inpcbport for later use.  It is safe to use
1858          * "wait" malloc here (port token would be released, if
1859          * malloc ever blocked), since all changes to the porthash
1860          * are done.
1861          */
1862         if (pcbinfo->portsave == NULL) {
1863                 pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave),
1864                                             M_PCB, M_INTWAIT | M_ZERO);
1865         }
1866 }
1867
1868 void
1869 in_pcbinsporthash_lport(struct inpcb *inp)
1870 {
1871         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1872         struct inpcbportinfo *portinfo;
1873         struct inpcbporthead *porthash;
1874         u_short lport_ho;
1875
1876         /* Locate the proper portinfo based on lport */
1877         lport_ho = ntohs(inp->inp_lport);
1878         portinfo = &pcbinfo->portinfo[lport_ho % pcbinfo->portinfo_cnt];
1879         KKASSERT((lport_ho % pcbinfo->portinfo_cnt) == portinfo->offset);
1880
1881         porthash = in_pcbporthash_head(portinfo, inp->inp_lport);
1882         GET_PORTHASH_TOKEN(porthash);
1883         in_pcbinsporthash(porthash, inp);
1884         REL_PORTHASH_TOKEN(porthash);
1885 }
1886
1887 void
1888 in_pcbremporthash(struct inpcb *inp)
1889 {
1890         struct inpcbporthead *porthash;
1891         struct inpcbport *phd;
1892
1893         if (inp->inp_phd == NULL)
1894                 return;
1895         KASSERT(inp->inp_lport != 0, ("inpcb has no lport"));
1896
1897         porthash = inp->inp_porthash;
1898         KASSERT(porthash != NULL, ("no porthash"));
1899
1900         GET_PORTHASH_TOKEN(porthash);
1901
1902         phd = inp->inp_phd;
1903         LIST_REMOVE(inp, inp_portlist);
1904         if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1905                 LIST_REMOVE(phd, phd_hash);
1906                 kfree(phd, M_PCB);
1907         }
1908
1909         REL_PORTHASH_TOKEN(porthash);
1910
1911         inp->inp_phd = NULL;
1912         /* NOTE: Don't whack inp_lport, which may be used later */
1913 }
1914
1915 static struct inp_localgroup *
1916 inp_localgroup_alloc(u_char af, uint16_t port,
1917     const union in_dependaddr *addr, int size)
1918 {
1919         struct inp_localgroup *grp;
1920
1921         grp = kmalloc(__offsetof(struct inp_localgroup, il_inp[size]),
1922                       M_TEMP, M_INTWAIT | M_ZERO);
1923         grp->il_af = af;
1924         grp->il_lport = port;
1925         grp->il_dependladdr = *addr;
1926         grp->il_inpsiz = size;
1927
1928         return grp;
1929 }
1930
1931 static void
1932 inp_localgroup_free(struct inp_localgroup *grp)
1933 {
1934         kfree(grp, M_TEMP);
1935 }
1936
1937 static void
1938 inp_localgroup_destroy(struct inp_localgroup *grp)
1939 {
1940         LIST_REMOVE(grp, il_list);
1941         inp_localgroup_free(grp);
1942 }
1943
1944 static void
1945 inp_localgroup_copy(struct inp_localgroup *grp,
1946                     const struct inp_localgroup *old_grp)
1947 {
1948         int i;
1949
1950         KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
1951             ("invalid new local group size %d and old local group count %d",
1952              grp->il_inpsiz, old_grp->il_inpcnt));
1953         for (i = 0; i < old_grp->il_inpcnt; ++i)
1954                 grp->il_inp[i] = old_grp->il_inp[i];
1955         grp->il_inpcnt = old_grp->il_inpcnt;
1956 }
1957
/*
 * Add inp to the local group of pcbinfo that matches its address
 * family, local port, jailed state and local address, creating or
 * doubling the group as needed.
 *
 * The pcbinfo token must be held.  Nothing is done when the pcbinfo
 * has no local group hash.  Within a group the inpcbs are kept sorted
 * by inp_lgrpindex; an inp whose index is still negative is assigned
 * the index of the slot it lands in.
 */
static void
in_pcbinslocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
{
        struct inp_localgrphead *hdr;
        struct inp_localgroup *grp, *grp_alloc = NULL;
        u_char isjailed;
        int i, idx;

        ASSERT_PCBINFO_TOKEN_HELD(pcbinfo);

        if (pcbinfo->localgrphashbase == NULL)
                return;

        /*
         * Further separate groups by whether the inp is jailed or not.
         * This allows the inp_localgroup_lookup() code to manage port
         * overloading between jails and non-jails.
         *
         * XXX all jails are collected into one group, which works fine
         *     as we expect the jails to be listening on different addresses.
         *     If this changes in the future we may have to break the groups
         *     up by prison pointer as well.
         */
        if (inp->inp_socket && inp->inp_socket->so_cred)
                isjailed = jailed(inp->inp_socket->so_cred);
        else
                isjailed = 0;

        hdr = &pcbinfo->localgrphashbase[
            INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];

        /*
         * The chain is rescanned from here after every allocation,
         * since the allocation may have blocked.
         */
again:
        LIST_FOREACH(grp, hdr, il_list) {
                if (grp->il_af == inp->inp_af &&
                    grp->il_lport == inp->inp_lport &&
                    grp->il_jailed == isjailed &&
                    memcmp(&grp->il_dependladdr,
                           &inp->inp_inc.inc_ie.ie_dependladdr,
                           sizeof(grp->il_dependladdr)) == 0) {
                        break;
                }
        }
        if (grp == NULL) {
                /*
                 * Create a new local group
                 */
                if (grp_alloc == NULL) {
                        grp_alloc = inp_localgroup_alloc(inp->inp_af,
                            inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
                            INP_LOCALGROUP_SIZMIN);
                        /*
                         * Local group allocation could block and the
                         * local group w/ the same property might have
                         * been added by others when we were blocked;
                         * check again.
                         */
                        goto again;
                } else {
                        /* Local group has been allocated; link it */
                        grp = grp_alloc;
                        grp->il_jailed = isjailed;
                        grp_alloc = NULL;
                        LIST_INSERT_HEAD(hdr, grp, il_list);
                }
        } else if (grp->il_inpcnt == grp->il_inpsiz) {
#if 0
                /*
                 * REMOVED - Ensure that all entries are placed in the
                 *           localgroup so jail operations can be
                 *           deterministic on a il_lport basis.
                 */
                if (grp->il_inpsiz >= INP_LOCALGROUP_SIZMAX) {
                        static int limit_logged = 0;

                        if (!limit_logged) {
                                limit_logged = 1;
                                kprintf("local group port %d, "
                                    "limit reached\n", ntohs(grp->il_lport));
                        }
                        if (grp_alloc != NULL) {
                                /*
                                 * This would happen if the local group
                                 * w/ the same property was expanded when
                                 * our local group allocation blocked.
                                 */
                                inp_localgroup_free(grp_alloc);
                        }
                        return;
                }
#endif

                /*
                 * Expand this local group
                 *
                 * A previously allocated grp_alloc is reused only if it
                 * is large enough for the current entry count;
                 * otherwise it is freed and a group of twice the
                 * current capacity is allocated.
                 */
                if (grp_alloc == NULL ||
                    grp->il_inpcnt >= grp_alloc->il_inpsiz) {
                        if (grp_alloc != NULL)
                                inp_localgroup_free(grp_alloc);
                        grp_alloc = inp_localgroup_alloc(grp->il_af,
                            grp->il_lport, &grp->il_dependladdr,
                            grp->il_inpsiz * 2);
                        /*
                         * Local group allocation could block and the
                         * local group w/ the same property might have
                         * been expanded by others when we were blocked;
                         * check again.
                         */
                        goto again;
                }

                /*
                 * Save the old local group, link the new one, and then
                 * destroy the old local group
                 */
                inp_localgroup_copy(grp_alloc, grp);
                LIST_INSERT_HEAD(hdr, grp_alloc, il_list);
                inp_localgroup_destroy(grp);

                grp = grp_alloc;
                /* inp_localgroup_copy() does not carry il_jailed over. */
                grp->il_jailed = isjailed;
                grp_alloc = NULL;
        } else {
                /*
                 * Found the local group
                 */
                if (grp_alloc != NULL) {
                        /*
                         * This would happen if the local group w/ the
                         * same property was added or expanded when our
                         * local group allocation blocked.
                         */
                        inp_localgroup_free(grp_alloc);
                        grp_alloc = NULL;
                }
        }

        /* At this point grp has at least one free slot. */
        KASSERT(grp->il_inpcnt < grp->il_inpsiz,
            ("invalid local group size %d and count %d",
             grp->il_inpsiz, grp->il_inpcnt));

        /*
         * Keep the local group sorted by the inpcb local group index
         * in ascending order.
         *
         * This eases the multi-process userland application which uses
         * SO_REUSEPORT sockets and binds process to the owner cpu of
         * the SO_REUSEPORT socket:
         * If we didn't sort the local group by the inpcb local group
         * index and one of the process owning an inpcb in this local
         * group restarted, e.g. crashed and restarted by watchdog,
         * other processes owning a inpcb in this local group would have
         * to detect that event, refetch its socket's owner cpu, and
         * re-bind.
         */
        idx = grp->il_inpcnt;
        for (i = 0; i < idx; ++i) {
                struct inpcb *oinp = grp->il_inp[i];

                /*
                 * The first slot whose occupant has an index greater
                 * than the slot number is where inp is inserted.
                 */
                if (oinp->inp_lgrpindex > i) {
                        if (inp->inp_lgrpindex < 0) {
                                inp->inp_lgrpindex = i;
                        } else if (inp->inp_lgrpindex != i) {
                                if (bootverbose) {
                                        kprintf("inp %p: grpidx %d, "
                                            "assigned to %d, cpu%d\n",
                                            inp, inp->inp_lgrpindex, i,
                                            mycpuid);
                                }
                        }
                        grp->il_inp[i] = inp;

                        /*
                         * Pull down inpcbs: shift the displaced entry
                         * and everything after it one slot up in the
                         * array.
                         */
                        for (; i < grp->il_inpcnt; ++i) {
                                struct inpcb *oinp1 = grp->il_inp[i + 1];

                                grp->il_inp[i + 1] = oinp;
                                oinp = oinp1;
                        }
                        grp->il_inpcnt++;
                        return;
                }
        }

        /* No insertion point was found above: append at the end. */
        if (inp->inp_lgrpindex < 0) {
                inp->inp_lgrpindex = idx;
        } else if (inp->inp_lgrpindex != idx) {
                if (bootverbose) {
                        kprintf("inp %p: grpidx %d, assigned to %d, cpu%d\n",
                            inp, inp->inp_lgrpindex, idx, mycpuid);
                }
        }
        grp->il_inp[idx] = inp;
        grp->il_inpcnt++;
}
2152
2153 void
2154 in_pcbinswildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
2155 {
2156         struct inpcontainer *ic;
2157         struct inpcontainerhead *bucket;
2158
2159         GET_PCBINFO_TOKEN(pcbinfo);
2160
2161         in_pcbinslocalgrphash_oncpu(inp, pcbinfo);
2162
2163         bucket = &pcbinfo->wildcardhashbase[
2164             INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
2165
2166         ic = kmalloc(sizeof(struct inpcontainer), M_TEMP, M_INTWAIT);
2167         ic->ic_inp = inp;
2168         LIST_INSERT_HEAD(bucket, ic, ic_list);
2169
2170         REL_PCBINFO_TOKEN(pcbinfo);
2171 }
2172
2173 /*
2174  * Insert PCB into wildcard hash table.
2175  */
2176 void
2177 in_pcbinswildcardhash(struct inpcb *inp)
2178 {
2179         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2180
2181         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
2182             ("not in correct netisr"));
2183         ASSERT_INP_NOTINHASH(inp);
2184         inp->inp_flags |= INP_WILDCARD;
2185
2186         in_pcbinswildcardhash_oncpu(inp, pcbinfo);
2187 }
2188
2189 static void
2190 in_pcbremlocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
2191 {
2192         struct inp_localgrphead *hdr;
2193         struct inp_localgroup *grp;
2194
2195         ASSERT_PCBINFO_TOKEN_HELD(pcbinfo);
2196
2197         if (pcbinfo->localgrphashbase == NULL)
2198                 return;
2199
2200         hdr = &pcbinfo->localgrphashbase[
2201             INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];
2202
2203         LIST_FOREACH(grp, hdr, il_list) {
2204                 int i;
2205
2206                 for (i = 0; i < grp->il_inpcnt; ++i) {
2207                         if (grp->il_inp[i] != inp)
2208                                 continue;
2209
2210                         if (grp->il_inpcnt == 1) {
2211                                 /* Destroy this local group */
2212                                 inp_localgroup_destroy(grp);
2213                         } else {
2214                                 /* Pull up inpcbs */
2215                                 for (; i + 1 < grp->il_inpcnt; ++i)
2216                                         grp->il_inp[i] = grp->il_inp[i + 1];
2217                                 grp->il_inpcnt--;
2218                         }
2219                         return;
2220                 }
2221         }
2222 }
2223
2224 void
2225 in_pcbremwildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
2226 {
2227         struct inpcontainer *ic;
2228         struct inpcontainerhead *head;
2229
2230         GET_PCBINFO_TOKEN(pcbinfo);
2231
2232         in_pcbremlocalgrphash_oncpu(inp, pcbinfo);
2233
2234         /* find bucket */
2235         head = &pcbinfo->wildcardhashbase[
2236             INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
2237
2238         LIST_FOREACH(ic, head, ic_list) {
2239                 if (ic->ic_inp == inp)
2240                         goto found;
2241         }
2242         REL_PCBINFO_TOKEN(pcbinfo);
2243         return;                 /* not found! */
2244
2245 found:
2246         LIST_REMOVE(ic, ic_list);       /* remove container from bucket chain */
2247         REL_PCBINFO_TOKEN(pcbinfo);
2248         kfree(ic, M_TEMP);              /* deallocate container */
2249 }
2250
2251 /*
2252  * Remove PCB from wildcard hash table.
2253  */
2254 void
2255 in_pcbremwildcardhash(struct inpcb *inp)
2256 {
2257         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2258
2259         KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu),
2260             ("not in correct netisr"));
2261         KASSERT(inp->inp_flags & INP_WILDCARD, ("inp not wildcard"));
2262
2263         in_pcbremwildcardhash_oncpu(inp, pcbinfo);
2264         inp->inp_lgrpindex = -1;
2265         inp->inp_flags &= ~INP_WILDCARD;
2266 }
2267
2268 /*
2269  * Remove PCB from various lists.
2270  */
2271 void
2272 in_pcbremlists(struct inpcb *inp)
2273 {
2274         in_pcbremporthash(inp);
2275         if (inp->inp_flags & INP_WILDCARD) {
2276                 in_pcbremwildcardhash(inp);
2277         } else if (inp->inp_flags & INP_CONNECTED) {
2278                 in_pcbremconnhash(inp);
2279         }
2280
2281         if (inp->inp_flags & INP_ONLIST)
2282                 in_pcbofflist(inp);
2283 }
2284
2285 int
2286 prison_xinpcb(struct thread *td, struct inpcb *inp)
2287 {
2288         struct ucred *cr;
2289
2290         if (td->td_proc == NULL)
2291                 return (0);
2292         cr = td->td_proc->p_ucred;
2293         if (cr->cr_prison == NULL)
2294                 return (0);
2295         if (inp->inp_socket && inp->inp_socket->so_cred &&
2296             inp->inp_socket->so_cred->cr_prison &&
2297             cr->cr_prison == inp->inp_socket->so_cred->cr_prison)
2298                 return (0);
2299         return (1);
2300 }
2301
/*
 * Sysctl handler that exports the inpcbs of an array of pcbinfos as a
 * sequence of struct xinpcb records.
 *
 * arg1 is the pcbinfo array and arg2 its length, which must be between
 * 1 and netisr_ncpus (asserted).
 *
 * When req->oldptr is NULL only a size estimate is stored in
 * req->oldidx.  A request that supplies new data is rejected with
 * EPERM.  Otherwise the thread migrates to each cpu in turn, walks
 * that cpu's pcb list with the pcbinfo token held, and migrates back
 * to the original cpu before returning.
 *
 * Returns 0 or the first error reported by SYSCTL_OUT.
 */
int
in_pcblist_range(SYSCTL_HANDLER_ARGS)
{
        struct inpcbinfo *pcbinfo_arr = arg1;
        int pcbinfo_arrlen = arg2;
        struct inpcb *marker;
        int cpu, origcpu;
        int error, n;

        KASSERT(pcbinfo_arrlen <= netisr_ncpus && pcbinfo_arrlen >= 1,
            ("invalid pcbinfo count %d", pcbinfo_arrlen));

        /*
         * The process of preparing the TCB list is too time-consuming and
         * resource-intensive to repeat twice on every request.
         */
        n = 0;
        if (req->oldptr == NULL) {
                /* Size probe: current total count plus n/8 + 10 slack. */
                for (cpu = 0; cpu < pcbinfo_arrlen; ++cpu)
                        n += pcbinfo_arr[cpu].ipi_count;
                req->oldidx = (n + n/8 + 10) * sizeof(struct xinpcb);
                return 0;
        }

        if (req->newptr != NULL)
                return EPERM;

        /*
         * The marker is a dummy inpcb flagged INP_PLACEMARKER.  It is
         * linked into each pcb list in turn to record the walk position.
         */
        marker = kmalloc(sizeof(struct inpcb), M_TEMP, M_WAITOK|M_ZERO);
        marker->inp_flags |= INP_PLACEMARKER;

        /*
         * OK, now we're committed to doing something.  Re-fetch ipi_count
         * after obtaining the generation count.
         */
        error = 0;
        origcpu = mycpuid;
        for (cpu = 0; cpu < pcbinfo_arrlen && error == 0; ++cpu) {
                struct inpcbinfo *pcbinfo = &pcbinfo_arr[cpu];
                struct inpcb *inp;
                struct xinpcb xi;
                int i;

                lwkt_migratecpu(cpu);

                GET_PCBINFO_TOKEN(pcbinfo);

                n = pcbinfo->ipi_count;

                LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list);
                i = 0;
                while ((inp = LIST_NEXT(marker, inp_list)) != NULL && i < n) {
                        /*
                         * Step the marker past inp before inp is
                         * examined, so the next iteration resumes from
                         * the marker rather than from inp.
                         */
                        LIST_REMOVE(marker, inp_list);
                        LIST_INSERT_AFTER(inp, marker, inp_list);

                        /* Skip other markers and pcbs hidden from req->td. */
                        if (inp->inp_flags & INP_PLACEMARKER)
                                continue;
                        if (prison_xinpcb(req->td, inp))
                                continue;

                        bzero(&xi, sizeof xi);
                        xi.xi_len = sizeof xi;
                        bcopy(inp, &xi.xi_inp, sizeof *inp);
                        if (inp->inp_socket)
                                sotoxsocket(inp->inp_socket, &xi.xi_socket);
                        if ((error = SYSCTL_OUT(req, &xi, sizeof xi)) != 0)
                                break;
                        ++i;
                }
                LIST_REMOVE(marker, inp_list);

                REL_PCBINFO_TOKEN(pcbinfo);

                /*
                 * Fewer than n records were emitted for this cpu: pad
                 * the output with records that are zero except for
                 * xi_len until n have been written.
                 */
                if (error == 0 && i < n) {
                        bzero(&xi, sizeof xi);
                        xi.xi_len = sizeof xi;
                        while (i < n) {
                                error = SYSCTL_OUT(req, &xi, sizeof xi);
                                if (error)
                                        break;
                                ++i;
                        }
                }
        }

        lwkt_migratecpu(origcpu);
        kfree(marker, M_TEMP);
        return error;
}
2390
2391 int
2392 in_pcblist_ncpus(SYSCTL_HANDLER_ARGS)
2393 {
2394
2395         return (in_pcblist_range(oidp, arg1, netisr_ncpus, req));
2396 }
2397
2398 void
2399 in_savefaddr(struct socket *so, const struct sockaddr *faddr)
2400 {
2401         struct sockaddr_in *sin;
2402
2403         KASSERT(faddr->sa_family == AF_INET,
2404             ("not AF_INET faddr %d", faddr->sa_family));
2405
2406         sin = kmalloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
2407         sin->sin_family = AF_INET;
2408         sin->sin_len = sizeof(*sin);
2409         sin->sin_port = ((const struct sockaddr_in *)faddr)->sin_port;
2410         sin->sin_addr = ((const struct sockaddr_in *)faddr)->sin_addr;
2411
2412         so->so_faddr = (struct sockaddr *)sin;
2413 }
2414
2415 void
2416 in_pcbportinfo_init(struct inpcbportinfo *portinfo, int hashsize,
2417     u_short offset)
2418 {
2419         memset(portinfo, 0, sizeof(*portinfo));
2420
2421         portinfo->offset = offset;
2422         portinfo->porthashbase = phashinit(hashsize, M_PCB,
2423             &portinfo->porthashcnt);
2424 }
2425
/*
 * Shrink the port range [*lo0, *hi0] so that both ends are congruent
 * to 'ofs' modulo 'step'.
 *
 * On return *hi0 is the largest such value not above the original
 * *hi0, and *lo0 the smallest such value not below the original *lo0
 * (for ofs < step).  A step of 1 leaves the range untouched.  The
 * results are stored back as u_short without range checking, and step
 * is assumed to be nonzero.
 */
void
in_pcbportrange(u_short *hi0, u_short *lo0, u_short ofs, u_short step)
{
        int top, bottom;

        if (step == 1)
                return;

        /* Round the upper end down to a multiple of step, add offset. */
        top = ((int)*hi0 / step) * step + ofs;
        if (top > (int)*hi0)
                top -= step;

        /* Round the lower end up, then step back to the offset slot. */
        bottom = (((int)*lo0 + (step - 1)) / step) * step - (step - ofs);
        if (bottom < (int)*lo0)
                bottom += step;

        *hi0 = top;
        *lo0 = bottom;
}
2450
2451 void
2452 in_pcbglobalinit(void)
2453 {
2454         int cpu;
2455
2456         in_pcbmarkers = kmalloc(netisr_ncpus * sizeof(struct inpcb), M_PCB,
2457             M_WAITOK | M_ZERO);
2458         in_pcbcontainer_markers =
2459             kmalloc(netisr_ncpus * sizeof(struct inpcontainer), M_PCB,
2460             M_WAITOK | M_ZERO);
2461
2462         for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2463                 struct inpcontainer *ic = &in_pcbcontainer_markers[cpu];
2464                 struct inpcb *marker = &in_pcbmarkers[cpu];
2465
2466                 marker->inp_flags |= INP_PLACEMARKER;
2467                 ic->ic_inp = marker;
2468         }
2469 }
2470
2471 struct inpcb *
2472 in_pcbmarker(void)
2473 {
2474
2475         ASSERT_NETISR_NCPUS(mycpuid);
2476         return &in_pcbmarkers[mycpuid];
2477 }
2478
2479 struct inpcontainer *
2480 in_pcbcontainer_marker(void)
2481 {
2482
2483         ASSERT_NETISR_NCPUS(mycpuid);
2484         return &in_pcbcontainer_markers[mycpuid];
2485 }
2486
2487 void
2488 in_pcbresetroute(struct inpcb *inp)
2489 {
2490         struct route *ro = &inp->inp_route;
2491
2492         if (ro->ro_rt != NULL)
2493                 RTFREE(ro->ro_rt);
2494         bzero(ro, sizeof(*ro));
2495 }