socket: Extend SO_REUSEPORT to distribute workload to available sockets
[dragonfly.git] / sys / netinet / in_pcb.c
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33
34 /*
35  * Copyright (c) 1982, 1986, 1991, 1993, 1995
36  *      The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 4. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
63  * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.27 2004/01/02 04:06:42 ambrisko Exp $
64  */
65
66 #include "opt_ipsec.h"
67 #include "opt_inet6.h"
68
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/malloc.h>
72 #include <sys/mbuf.h>
73 #include <sys/domain.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/proc.h>
78 #include <sys/priv.h>
79 #include <sys/jail.h>
80 #include <sys/kernel.h>
81 #include <sys/sysctl.h>
82
83 #include <sys/thread2.h>
84 #include <sys/socketvar2.h>
85 #include <sys/msgport2.h>
86
87 #include <machine/limits.h>
88
89 #include <net/if.h>
90 #include <net/if_types.h>
91 #include <net/route.h>
92
93 #include <netinet/in.h>
94 #include <netinet/in_pcb.h>
95 #include <netinet/in_var.h>
96 #include <netinet/ip_var.h>
97 #ifdef INET6
98 #include <netinet/ip6.h>
99 #include <netinet6/ip6_var.h>
100 #endif /* INET6 */
101
102 #ifdef IPSEC
103 #include <netinet6/ipsec.h>
104 #include <netproto/key/key.h>
105 #include <netproto/ipsec/esp_var.h>
106 #endif
107
108 #ifdef FAST_IPSEC
109 #if defined(IPSEC) || defined(IPSEC_ESP)
110 #error "Bad idea: don't compile with both IPSEC and FAST_IPSEC!"
111 #endif
112
113 #include <netproto/ipsec/ipsec.h>
114 #include <netproto/ipsec/key.h>
115 #define IPSEC
116 #endif /* FAST_IPSEC */
117
118 #define INP_LOCALGROUP_SIZMIN   8
119 #define INP_LOCALGROUP_SIZMAX   256
120
121 struct in_addr zeroin_addr;
122
123 /*
124  * These configure the range of local port addresses assigned to
125  * "unspecified" outgoing connections/packets/whatever.
126  */
127 int ipport_lowfirstauto = IPPORT_RESERVED - 1;  /* 1023 */
128 int ipport_lowlastauto = IPPORT_RESERVEDSTART;  /* 600 */
129
130 int ipport_firstauto = IPPORT_RESERVED;         /* 1024 */
131 int ipport_lastauto = IPPORT_USERRESERVED;      /* 5000 */
132
133 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO;    /* 49152 */
134 int ipport_hilastauto = IPPORT_HILASTAUTO;      /* 65535 */
135
/*
 * Clamp the lvalue 'var' into the inclusive range [min, max].
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement: it can be used as the unbraced body of an if/else
 * without swallowing the caller's 'else' or producing a stray ';'.
 * Note that 'var' is evaluated more than once.
 */
#define RANGECHK(var, min, max)					\
	do {							\
		if ((var) < (min)) {				\
			(var) = (min);				\
		} else if ((var) > (max)) {			\
			(var) = (max);				\
		}						\
	} while (0)

int udpencap_enable = 1;	/* enabled by default */
int udpencap_port = 4500;	/* triggers decapsulation */
142
143 static int
144 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
145 {
146         int error;
147
148         error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
149         if (!error) {
150                 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
151                 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
152
153                 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
154                 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
155
156                 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
157                 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
158         }
159         return (error);
160 }
161
162 #undef RANGECHK
163
164 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
165
166 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
167            &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
168 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
169            &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
170 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
171            &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
172 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
173            &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
174 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
175            &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
176 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
177            &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
178
179 /*
180  * in_pcb.c: manage the Protocol Control Blocks.
181  *
182  * NOTE: It is assumed that most of these functions will be called from
183  * a critical section.  XXX - There are, unfortunately, a few exceptions
184  * to this rule that should be fixed.
185  *
186  * NOTE: The caller should initialize the cpu field to the cpu running the
187  * protocol stack associated with this inpcbinfo.
188  */
189
190 void
191 in_pcbinfo_init(struct inpcbinfo *pcbinfo)
192 {
193         LIST_INIT(&pcbinfo->pcblisthead);
194         pcbinfo->cpu = -1;
195         pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), M_PCB,
196                                     M_WAITOK | M_ZERO);
197 }
198
199 struct baddynamicports baddynamicports;
200
201 /*
202  * Check if the specified port is invalid for dynamic allocation.
203  */
204 int
205 in_baddynamic(u_int16_t port, u_int16_t proto)
206 {
207         switch (proto) {
208         case IPPROTO_TCP:
209                 return (DP_ISSET(baddynamicports.tcp, port));
210         case IPPROTO_UDP:
211 #ifdef IPSEC
212                 /* Cannot preset this as it is a sysctl */
213                 if (port == udpencap_port)
214                         return (1);
215 #endif
216                 return (DP_ISSET(baddynamicports.udp, port));
217         default:
218                 return (0);
219         }
220 }
221
222
223 /*
224  * Allocate a PCB and associate it with the socket.
225  */
226 int
227 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
228 {
229         struct inpcb *inp;
230 #ifdef IPSEC
231         int error;
232 #endif
233
234         inp = kmalloc(pcbinfo->ipi_size, M_PCB, M_WAITOK|M_ZERO);
235         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
236         inp->inp_pcbinfo = inp->inp_cpcbinfo = pcbinfo;
237         inp->inp_socket = so;
238 #ifdef IPSEC
239         error = ipsec_init_policy(so, &inp->inp_sp);
240         if (error != 0) {
241                 kfree(inp, M_PCB);
242                 return (error);
243         }
244 #endif
245 #ifdef INET6
246         if (INP_SOCKAF(so) == AF_INET6 && ip6_v6only)
247                 inp->inp_flags |= IN6P_IPV6_V6ONLY;
248         if (ip6_auto_flowlabel)
249                 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
250 #endif
251         soreference(so);
252         so->so_pcb = inp;
253         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list);
254         pcbinfo->ipi_count++;
255         return (0);
256 }
257
258 /*
259  * Unlink a pcb with the intention of moving it to another cpu with a
260  * different pcbinfo.  While unlinked nothing should attempt to dereference
261  * inp_pcbinfo, NULL it out so we assert if it does.
262  */
263 void
264 in_pcbunlink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
265 {
266         KKASSERT(inp->inp_pcbinfo == pcbinfo);
267
268         LIST_REMOVE(inp, inp_list);
269         pcbinfo->ipi_count--;
270         inp->inp_pcbinfo = NULL;
271 }
272
273 /*
274  * Relink a pcb into a new pcbinfo.
275  */
276 void
277 in_pcblink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
278 {
279         KKASSERT(inp->inp_pcbinfo == NULL);
280         inp->inp_pcbinfo = pcbinfo;
281         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list);
282         pcbinfo->ipi_count++;
283 }
284
285 int
286 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
287 {
288         struct socket *so = inp->inp_socket;
289         unsigned short *lastport;
290         struct sockaddr_in *sin;
291         struct sockaddr_in jsin;
292         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
293         struct ucred *cred = NULL;
294         u_short lport = 0;
295         int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
296         int error;
297
298         if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! */
299                 return (EADDRNOTAVAIL);
300         if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
301                 return (EINVAL);        /* already bound */
302
303         if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT)))
304                 wild = 1;    /* neither SO_REUSEADDR nor SO_REUSEPORT is set */
305         if (td->td_proc)
306                 cred = td->td_proc->p_ucred;
307
308         /*
309          * This has to be atomic.  If the porthash is shared across multiple
310          * protocol threads (aka tcp) then the token will be non-NULL.
311          */
312         if (pcbinfo->porttoken)
313                 lwkt_gettoken(pcbinfo->porttoken);
314
315         if (nam != NULL) {
316                 sin = (struct sockaddr_in *)nam;
317                 if (nam->sa_len != sizeof *sin) {
318                         error = EINVAL;
319                         goto done;
320                 }
321 #ifdef notdef
322                 /*
323                  * We should check the family, but old programs
324                  * incorrectly fail to initialize it.
325                  */
326                 if (sin->sin_family != AF_INET) {
327                         error = EAFNOSUPPORT;
328                         goto done;
329                 }
330 #endif
331                 if (!prison_replace_wildcards(td, nam)) {
332                         error = EINVAL;
333                         goto done;
334                 }
335                 lport = sin->sin_port;
336                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
337                         /*
338                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
339                          * allow complete duplication of binding if
340                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
341                          * and a multicast address is bound on both
342                          * new and duplicated sockets.
343                          */
344                         if (so->so_options & SO_REUSEADDR)
345                                 reuseport = SO_REUSEADDR | SO_REUSEPORT;
346                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
347                         sin->sin_port = 0;              /* yech... */
348                         bzero(&sin->sin_zero, sizeof sin->sin_zero);
349                         if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL) {
350                                 error = EADDRNOTAVAIL;
351                                 goto done;
352                         }
353                 }
354                 if (lport != 0) {
355                         struct inpcb *t;
356
357                         /* GROSS */
358                         if (ntohs(lport) < IPPORT_RESERVED &&
359                             cred &&
360                             priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) {
361                                 error = EACCES;
362                                 goto done;
363                         }
364                         if (so->so_cred->cr_uid != 0 &&
365                             !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
366                                 t = in_pcblookup_local(pcbinfo,
367                                                        sin->sin_addr,
368                                                        lport,
369                                                        INPLOOKUP_WILDCARD,
370                                                        cred);
371                                 if (t &&
372                                     (!in_nullhost(sin->sin_addr) ||
373                                      !in_nullhost(t->inp_laddr) ||
374                                      (t->inp_socket->so_options &
375                                          SO_REUSEPORT) == 0) &&
376                                     (so->so_cred->cr_uid !=
377                                      t->inp_socket->so_cred->cr_uid)) {
378 #ifdef INET6
379                                         if (!in_nullhost(sin->sin_addr) ||
380                                             !in_nullhost(t->inp_laddr) ||
381                                             INP_SOCKAF(so) ==
382                                             INP_SOCKAF(t->inp_socket))
383 #endif
384                                         {
385                                                 error = EADDRINUSE;
386                                                 goto done;
387                                         }
388                                 }
389                         }
390                         if (cred && !prison_replace_wildcards(td, nam)) {
391                                 error = EADDRNOTAVAIL;
392                                 goto done;
393                         }
394                         t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport,
395                                                wild, cred);
396                         if (t && !(reuseport & t->inp_socket->so_options)) {
397 #ifdef INET6
398                                 if (!in_nullhost(sin->sin_addr) ||
399                                     !in_nullhost(t->inp_laddr) ||
400                                     INP_SOCKAF(so) == INP_SOCKAF(t->inp_socket))
401 #endif
402                                 {
403                                         error = EADDRINUSE;
404                                         goto done;
405                                 }
406                         }
407                 }
408                 inp->inp_laddr = sin->sin_addr;
409         }
410         if (lport == 0) {
411                 ushort first, last;
412                 int count;
413
414                 jsin.sin_family = AF_INET;
415                 jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
416                 if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
417                         inp->inp_laddr.s_addr = INADDR_ANY;
418                         error = EINVAL;
419                         goto done;
420                 }
421                 inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
422
423                 inp->inp_flags |= INP_ANONPORT;
424
425                 if (inp->inp_flags & INP_HIGHPORT) {
426                         first = ipport_hifirstauto;     /* sysctl */
427                         last  = ipport_hilastauto;
428                         lastport = &pcbinfo->lasthi;
429                 } else if (inp->inp_flags & INP_LOWPORT) {
430                         if (cred &&
431                             (error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
432                                 inp->inp_laddr.s_addr = INADDR_ANY;
433                                 goto done;
434                         }
435                         first = ipport_lowfirstauto;    /* 1023 */
436                         last  = ipport_lowlastauto;     /* 600 */
437                         lastport = &pcbinfo->lastlow;
438                 } else {
439                         first = ipport_firstauto;       /* sysctl */
440                         last  = ipport_lastauto;
441                         lastport = &pcbinfo->lastport;
442                 }
443                 /*
444                  * Simple check to ensure all ports are not used up causing
445                  * a deadlock here.
446                  *
447                  * We split the two cases (up and down) so that the direction
448                  * is not being tested on each round of the loop.
449                  */
450                 if (first > last) {
451                         /*
452                          * counting down
453                          */
454                         count = first - last;
455
456                         do {
457                                 if (count-- < 0) {      /* completely used? */
458                                         inp->inp_laddr.s_addr = INADDR_ANY;
459                                         error = EADDRNOTAVAIL;
460                                         goto done;
461                                 }
462                                 --*lastport;
463                                 if (*lastport > first || *lastport < last)
464                                         *lastport = first;
465                                 lport = htons(*lastport);
466                         } while (in_pcblookup_local(pcbinfo, inp->inp_laddr,
467                                                     lport, wild, cred));
468                 } else {
469                         /*
470                          * counting up
471                          */
472                         count = last - first;
473
474                         do {
475                                 if (count-- < 0) {      /* completely used? */
476                                         inp->inp_laddr.s_addr = INADDR_ANY;
477                                         error = EADDRNOTAVAIL;
478                                         goto done;
479                                 }
480                                 ++*lastport;
481                                 if (*lastport < first || *lastport > last)
482                                         *lastport = first;
483                                 lport = htons(*lastport);
484                         } while (in_pcblookup_local(pcbinfo, inp->inp_laddr,
485                                                     lport, wild, cred));
486                 }
487         }
488         inp->inp_lport = lport;
489
490         jsin.sin_family = AF_INET;
491         jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
492         if (!prison_replace_wildcards(td, (struct sockaddr*)&jsin)) {
493                 inp->inp_laddr.s_addr = INADDR_ANY;
494                 inp->inp_lport = 0;
495                 error = EINVAL;
496                 goto done;
497         }
498         inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
499
500         if (in_pcbinsporthash(inp) != 0) {
501                 inp->inp_laddr.s_addr = INADDR_ANY;
502                 inp->inp_lport = 0;
503                 error = EAGAIN;
504                 goto done;
505         }
506         error = 0;
507 done:
508         if (pcbinfo->porttoken)
509                 lwkt_reltoken(pcbinfo->porttoken);
510         return error;
511 }
512
513 static struct inpcb *
514 in_pcblookup_addrport(struct inpcbinfo *pcbinfo, struct in_addr laddr,
515     u_short lport, struct in_addr faddr, u_short fport, struct ucred *cred)
516 {
517         struct inpcb *inp;
518         struct inpcbporthead *porthash;
519         struct inpcbport *phd;
520         struct inpcb *match = NULL;
521
522         /*
523          * If the porthashbase is shared across several cpus we need
524          * to lock.
525          */
526         if (pcbinfo->porttoken)
527                 lwkt_gettoken(pcbinfo->porttoken);
528
529         /*
530          * Best fit PCB lookup.
531          *
532          * First see if this local port is in use by looking on the
533          * port hash list.
534          */
535         porthash = &pcbinfo->porthashbase[
536                         INP_PCBPORTHASH(lport, pcbinfo->porthashmask)];
537         LIST_FOREACH(phd, porthash, phd_hash) {
538                 if (phd->phd_port == lport)
539                         break;
540         }
541         if (phd != NULL) {
542                 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
543 #ifdef INET6
544                         if ((inp->inp_vflag & INP_IPV4) == 0)
545                                 continue;
546 #endif
547                         if (inp->inp_laddr.s_addr != INADDR_ANY &&
548                             inp->inp_laddr.s_addr != laddr.s_addr)
549                                 continue;
550
551                         if (inp->inp_faddr.s_addr != INADDR_ANY &&
552                             inp->inp_faddr.s_addr != faddr.s_addr)
553                                 continue;
554
555                         if (inp->inp_fport != 0 && inp->inp_fport != fport)
556                                 continue;
557
558                         if (cred == NULL ||
559                             cred->cr_prison ==
560                             inp->inp_socket->so_cred->cr_prison) {
561                                 match = inp;
562                                 break;
563                         }
564                 }
565         }
566         if (pcbinfo->porttoken)
567                 lwkt_reltoken(pcbinfo->porttoken);
568         return (match);
569 }
570
571 int
572 in_pcbconn_bind(struct inpcb *inp, const struct sockaddr *nam,
573     struct thread *td)
574 {
575         struct proc *p = td->td_proc;
576         unsigned short *lastport;
577         const struct sockaddr_in *sin = (const struct sockaddr_in *)nam;
578         struct sockaddr_in jsin;
579         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
580         struct ucred *cred = NULL;
581         u_short lport = 0;
582         ushort first, last;
583         int count, error, dup = 0;
584
585         if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! */
586                 return (EADDRNOTAVAIL);
587
588         KKASSERT(inp->inp_laddr.s_addr != INADDR_ANY);
589         if (inp->inp_lport != 0)
590                 return (EINVAL);        /* already bound */
591
592         KKASSERT(p);
593         cred = p->p_ucred;
594
595         /*
596          * This has to be atomic.  If the porthash is shared across multiple
597          * protocol threads (aka tcp) then the token will be non-NULL.
598          */
599         if (pcbinfo->porttoken)
600                 lwkt_gettoken(pcbinfo->porttoken);
601
602         jsin.sin_family = AF_INET;
603         jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
604         if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
605                 inp->inp_laddr.s_addr = INADDR_ANY;
606                 error = EINVAL;
607                 goto done;
608         }
609         inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
610
611         inp->inp_flags |= INP_ANONPORT;
612
613         if (inp->inp_flags & INP_HIGHPORT) {
614                 first = ipport_hifirstauto;     /* sysctl */
615                 last  = ipport_hilastauto;
616                 lastport = &pcbinfo->lasthi;
617         } else if (inp->inp_flags & INP_LOWPORT) {
618                 if (cred &&
619                     (error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
620                         inp->inp_laddr.s_addr = INADDR_ANY;
621                         goto done;
622                 }
623                 first = ipport_lowfirstauto;    /* 1023 */
624                 last  = ipport_lowlastauto;     /* 600 */
625                 lastport = &pcbinfo->lastlow;
626         } else {
627                 first = ipport_firstauto;       /* sysctl */
628                 last  = ipport_lastauto;
629                 lastport = &pcbinfo->lastport;
630         }
631
632 again:
633         /*
634          * Simple check to ensure all ports are not used up causing
635          * a deadlock here.
636          *
637          * We split the two cases (up and down) so that the direction
638          * is not being tested on each round of the loop.
639          */
640         if (first > last) {
641                 /*
642                  * counting down
643                  */
644                 count = first - last;
645
646                 do {
647                         if (count-- < 0) {      /* completely used? */
648                                 inp->inp_laddr.s_addr = INADDR_ANY;
649                                 error = EADDRNOTAVAIL;
650                                 goto done;
651                         }
652                         --*lastport;
653                         if (*lastport > first || *lastport < last)
654                                 *lastport = first;
655                         lport = htons(*lastport);
656                 } while (in_pcblookup_addrport(pcbinfo, inp->inp_laddr, lport,
657                                 sin->sin_addr, sin->sin_port, cred));
658         } else {
659                 /*
660                  * counting up
661                  */
662                 count = last - first;
663
664                 do {
665                         if (count-- < 0) {      /* completely used? */
666                                 inp->inp_laddr.s_addr = INADDR_ANY;
667                                 error = EADDRNOTAVAIL;
668                                 goto done;
669                         }
670                         ++*lastport;
671                         if (*lastport < first || *lastport > last)
672                                 *lastport = first;
673                         lport = htons(*lastport);
674                 } while (in_pcblookup_addrport(pcbinfo, inp->inp_laddr, lport,
675                                 sin->sin_addr, sin->sin_port, cred));
676         }
677
678         /* This could happen on loopback interface */
679         if (sin->sin_port == lport &&
680             sin->sin_addr.s_addr == inp->inp_laddr.s_addr) {
681                 if (dup) {
682                         /*
683                          * Duplicate again; give up
684                          */
685                         inp->inp_laddr.s_addr = INADDR_ANY;
686                         error = EADDRNOTAVAIL;
687                         goto done;
688                 }
689                 dup = 1;
690                 goto again;
691         }
692         inp->inp_lport = lport;
693
694         jsin.sin_family = AF_INET;
695         jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
696         if (!prison_replace_wildcards(td, (struct sockaddr*)&jsin)) {
697                 inp->inp_laddr.s_addr = INADDR_ANY;
698                 inp->inp_lport = 0;
699                 error = EINVAL;
700                 goto done;
701         }
702         inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;
703
704         if (in_pcbinsporthash(inp) != 0) {
705                 inp->inp_laddr.s_addr = INADDR_ANY;
706                 inp->inp_lport = 0;
707                 error = EAGAIN;
708                 goto done;
709         }
710         error = 0;
711 done:
712         if (pcbinfo->porttoken)
713                 lwkt_reltoken(pcbinfo->porttoken);
714         return error;
715 }
716
717 /*
718  *   Transform old in_pcbconnect() into an inner subroutine for new
719  *   in_pcbconnect(): Do some validity-checking on the remote
720  *   address (in mbuf 'nam') and then determine local host address
721  *   (i.e., which interface) to use to access that remote host.
722  *
723  *   This preserves definition of in_pcbconnect(), while supporting a
724  *   slightly different version for T/TCP.  (This is more than
725  *   a bit of a kludge, but cleaning up the internal interfaces would
726  *   have forced minor changes in every protocol).
727  */
728 int
729 in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
730         struct sockaddr_in **plocal_sin, struct thread *td)
731 {
732         struct in_ifaddr *ia;
733         struct ucred *cred = NULL;
734         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
735         struct sockaddr *jsin;
736         int jailed = 0, alloc_route = 0;
737
738         if (nam->sa_len != sizeof *sin)
739                 return (EINVAL);
740         if (sin->sin_family != AF_INET)
741                 return (EAFNOSUPPORT);
742         if (sin->sin_port == 0)
743                 return (EADDRNOTAVAIL);
744         if (td && td->td_proc && td->td_proc->p_ucred)
745                 cred = td->td_proc->p_ucred;
746         if (cred && cred->cr_prison)
747                 jailed = 1;
748         if (!TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) {
749                 ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia;
750                 /*
751                  * If the destination address is INADDR_ANY,
752                  * use the primary local address.
753                  * If the supplied address is INADDR_BROADCAST,
754                  * and the primary interface supports broadcast,
755                  * choose the broadcast address for that interface.
756                  */
757                 if (sin->sin_addr.s_addr == INADDR_ANY)
758                         sin->sin_addr = IA_SIN(ia)->sin_addr;
759                 else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST &&
760                     (ia->ia_ifp->if_flags & IFF_BROADCAST))
761                         sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr;
762         }
763         if (inp->inp_laddr.s_addr == INADDR_ANY) {
764                 struct route *ro;
765
766                 ia = NULL;
767                 /*
768                  * If route is known or can be allocated now,
769                  * our src addr is taken from the i/f, else punt.
770                  * Note that we should check the address family of the cached
771                  * destination, in case of sharing the cache with IPv6.
772                  */
773                 ro = &inp->inp_route;
774                 if (ro->ro_rt &&
775                     (!(ro->ro_rt->rt_flags & RTF_UP) ||
776                      ro->ro_dst.sa_family != AF_INET ||
777                      satosin(&ro->ro_dst)->sin_addr.s_addr !=
778                                       sin->sin_addr.s_addr ||
779                      inp->inp_socket->so_options & SO_DONTROUTE)) {
780                         RTFREE(ro->ro_rt);
781                         ro->ro_rt = NULL;
782                 }
783                 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/
784                     (ro->ro_rt == NULL ||
785                     ro->ro_rt->rt_ifp == NULL)) {
786                         /* No route yet, so try to acquire one */
787                         bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
788                         ro->ro_dst.sa_family = AF_INET;
789                         ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
790                         ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
791                                 sin->sin_addr;
792                         rtalloc(ro);
793                         alloc_route = 1;
794                 }
795                 /*
796                  * If we found a route, use the address
797                  * corresponding to the outgoing interface
798                  * unless it is the loopback (in case a route
799                  * to our address on another net goes to loopback).
800                  */
801                 if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
802                         if (jailed) {
803                                 if (jailed_ip(cred->cr_prison, 
804                                     ro->ro_rt->rt_ifa->ifa_addr)) {
805                                         ia = ifatoia(ro->ro_rt->rt_ifa);
806                                 }
807                         } else {
808                                 ia = ifatoia(ro->ro_rt->rt_ifa);
809                         }
810                 }
811                 if (ia == NULL) {
812                         u_short fport = sin->sin_port;
813
814                         sin->sin_port = 0;
815                         ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
816                         if (ia && jailed && !jailed_ip(cred->cr_prison,
817                             sintosa(&ia->ia_addr)))
818                                 ia = NULL;
819                         if (ia == NULL)
820                                 ia = ifatoia(ifa_ifwithnet(sintosa(sin)));
821                         if (ia && jailed && !jailed_ip(cred->cr_prison,
822                             sintosa(&ia->ia_addr)))
823                                 ia = NULL;
824                         sin->sin_port = fport;
825                         if (ia == NULL &&
826                             !TAILQ_EMPTY(&in_ifaddrheads[mycpuid]))
827                                 ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia;
828                         if (ia && jailed && !jailed_ip(cred->cr_prison,
829                             sintosa(&ia->ia_addr)))
830                                 ia = NULL;
831
832                         if (!jailed && ia == NULL)
833                                 goto fail;
834                 }
835                 /*
836                  * If the destination address is multicast and an outgoing
837                  * interface has been set as a multicast option, use the
838                  * address of that interface as our source address.
839                  */
840                 if (!jailed && IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
841                     inp->inp_moptions != NULL) {
842                         struct ip_moptions *imo;
843                         struct ifnet *ifp;
844
845                         imo = inp->inp_moptions;
846                         if (imo->imo_multicast_ifp != NULL) {
847                                 struct in_ifaddr_container *iac;
848
849                                 ifp = imo->imo_multicast_ifp;
850                                 ia = NULL;
851                                 TAILQ_FOREACH(iac,
852                                 &in_ifaddrheads[mycpuid], ia_link) {
853                                         if (iac->ia->ia_ifp == ifp) {
854                                                 ia = iac->ia;
855                                                 break;
856                                         }
857                                 }
858                                 if (ia == NULL)
859                                         goto fail;
860                         }
861                 }
862                 /*
863                  * Don't do pcblookup call here; return interface in plocal_sin
864                  * and exit to caller, that will do the lookup.
865                  */
866                 if (ia == NULL && jailed) {
867                         if ((jsin = prison_get_nonlocal(cred->cr_prison, AF_INET, NULL)) != NULL ||
868                             (jsin = prison_get_local(cred->cr_prison, AF_INET, NULL)) != NULL) {
869                                 *plocal_sin = satosin(jsin);
870                         } else {
871                                 /* IPv6 only Jail */
872                                 goto fail;
873                         }
874                 } else {
875                         *plocal_sin = &ia->ia_addr;
876                 }
877         }
878         return (0);
879 fail:
880         if (alloc_route) {
881                 struct route *ro = &inp->inp_route;
882
883                 if (ro->ro_rt != NULL)
884                         RTFREE(ro->ro_rt);
885                 bzero(ro, sizeof(*ro));
886         }
887         return (EADDRNOTAVAIL);
888 }
889
890 /*
891  * Outer subroutine:
892  * Connect from a socket to a specified address.
893  * Both address and port must be specified in argument sin.
894  * If don't have a local address for this socket yet,
895  * then pick one.
896  */
897 int
898 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
899 {
900         struct sockaddr_in *if_sin;
901         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
902         int error;
903
904         /* Call inner routine to assign local interface address. */
905         if ((error = in_pcbladdr(inp, nam, &if_sin, td)) != 0)
906                 return (error);
907
908         if (in_pcblookup_hash(inp->inp_cpcbinfo, sin->sin_addr, sin->sin_port,
909                               inp->inp_laddr.s_addr ?
910                                 inp->inp_laddr : if_sin->sin_addr,
911                               inp->inp_lport, FALSE, NULL) != NULL) {
912                 return (EADDRINUSE);
913         }
914         if (inp->inp_laddr.s_addr == INADDR_ANY) {
915                 if (inp->inp_lport == 0) {
916                         error = in_pcbbind(inp, NULL, td);
917                         if (error)
918                                 return (error);
919                 }
920                 inp->inp_laddr = if_sin->sin_addr;
921         }
922         inp->inp_faddr = sin->sin_addr;
923         inp->inp_fport = sin->sin_port;
924         in_pcbinsconnhash(inp);
925         return (0);
926 }
927
928 void
929 in_pcbdisconnect(struct inpcb *inp)
930 {
931
932         inp->inp_faddr.s_addr = INADDR_ANY;
933         inp->inp_fport = 0;
934         in_pcbremconnhash(inp);
935         if (inp->inp_socket->so_state & SS_NOFDREF)
936                 in_pcbdetach(inp);
937 }
938
939 void
940 in_pcbdetach(struct inpcb *inp)
941 {
942         struct socket *so = inp->inp_socket;
943         struct inpcbinfo *ipi = inp->inp_pcbinfo;
944
945 #ifdef IPSEC
946         ipsec4_delete_pcbpolicy(inp);
947 #endif /*IPSEC*/
948         inp->inp_gencnt = ++ipi->ipi_gencnt;
949         KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);
950         in_pcbremlists(inp);
951         so->so_pcb = NULL;
952         sofree(so);                     /* remove pcb ref */
953         if (inp->inp_options)
954                 m_free(inp->inp_options);
955         if (inp->inp_route.ro_rt)
956                 rtfree(inp->inp_route.ro_rt);
957         ip_freemoptions(inp->inp_moptions);
958         inp->inp_vflag = 0;
959         kfree(inp, M_PCB);
960 }
961
962 /*
963  * The calling convention of in_setsockaddr() and in_setpeeraddr() was
964  * modified to match the pru_sockaddr() and pru_peeraddr() entry points
965  * in struct pr_usrreqs, so that protocols can just reference then directly
966  * without the need for a wrapper function.  The socket must have a valid
967  * (i.e., non-nil) PCB, but it should be impossible to get an invalid one
968  * except through a kernel programming error, so it is acceptable to panic
969  * (or in this case trap) if the PCB is invalid.  (Actually, we don't trap
970  * because there actually /is/ a programming error somewhere... XXX)
971  */
972 int
973 in_setsockaddr(struct socket *so, struct sockaddr **nam)
974 {
975         struct inpcb *inp;
976         struct sockaddr_in *sin;
977
978         /*
979          * Do the malloc first in case it blocks.
980          */
981         sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO);
982         sin->sin_family = AF_INET;
983         sin->sin_len = sizeof *sin;
984
985         crit_enter();
986         inp = so->so_pcb;
987         if (!inp) {
988                 crit_exit();
989                 kfree(sin, M_SONAME);
990                 return (ECONNRESET);
991         }
992         sin->sin_port = inp->inp_lport;
993         sin->sin_addr = inp->inp_laddr;
994         crit_exit();
995
996         *nam = (struct sockaddr *)sin;
997         return (0);
998 }
999
1000 void
1001 in_setsockaddr_dispatch(netmsg_t msg)
1002 {
1003         int error;
1004
1005         error = in_setsockaddr(msg->base.nm_so, msg->peeraddr.nm_nam);
1006         lwkt_replymsg(&msg->lmsg, error);
1007 }
1008
1009 int
1010 in_setpeeraddr(struct socket *so, struct sockaddr **nam)
1011 {
1012         struct inpcb *inp;
1013         struct sockaddr_in *sin;
1014
1015         /*
1016          * Do the malloc first in case it blocks.
1017          */
1018         sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO);
1019         sin->sin_family = AF_INET;
1020         sin->sin_len = sizeof *sin;
1021
1022         crit_enter();
1023         inp = so->so_pcb;
1024         if (!inp) {
1025                 crit_exit();
1026                 kfree(sin, M_SONAME);
1027                 return (ECONNRESET);
1028         }
1029         sin->sin_port = inp->inp_fport;
1030         sin->sin_addr = inp->inp_faddr;
1031         crit_exit();
1032
1033         *nam = (struct sockaddr *)sin;
1034         return (0);
1035 }
1036
1037 void
1038 in_setpeeraddr_dispatch(netmsg_t msg)
1039 {
1040         int error;
1041
1042         error = in_setpeeraddr(msg->base.nm_so, msg->peeraddr.nm_nam);
1043         lwkt_replymsg(&msg->lmsg, error);
1044 }
1045
1046 void
1047 in_pcbnotifyall(struct inpcbhead *head, struct in_addr faddr, int err,
1048                 void (*notify)(struct inpcb *, int))
1049 {
1050         struct inpcb *inp, *ninp;
1051
1052         /*
1053          * note: if INP_PLACEMARKER is set we must ignore the rest of
1054          * the structure and skip it.
1055          */
1056         crit_enter();
1057         LIST_FOREACH_MUTABLE(inp, head, inp_list, ninp) {
1058                 if (inp->inp_flags & INP_PLACEMARKER)
1059                         continue;
1060 #ifdef INET6
1061                 if (!(inp->inp_vflag & INP_IPV4))
1062                         continue;
1063 #endif
1064                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1065                     inp->inp_socket == NULL)
1066                         continue;
1067                 (*notify)(inp, err);            /* can remove inp from list! */
1068         }
1069         crit_exit();
1070 }
1071
1072 void
1073 in_pcbpurgeif0(struct inpcb *head, struct ifnet *ifp)
1074 {
1075         struct inpcb *inp;
1076         struct ip_moptions *imo;
1077         int i, gap;
1078
1079         for (inp = head; inp != NULL; inp = LIST_NEXT(inp, inp_list)) {
1080                 if (inp->inp_flags & INP_PLACEMARKER)
1081                         continue;
1082                 imo = inp->inp_moptions;
1083                 if ((inp->inp_vflag & INP_IPV4) && imo != NULL) {
1084                         /*
1085                          * Unselect the outgoing interface if it is being
1086                          * detached.
1087                          */
1088                         if (imo->imo_multicast_ifp == ifp)
1089                                 imo->imo_multicast_ifp = NULL;
1090
1091                         /*
1092                          * Drop multicast group membership if we joined
1093                          * through the interface being detached.
1094                          */
1095                         for (i = 0, gap = 0; i < imo->imo_num_memberships;
1096                             i++) {
1097                                 if (imo->imo_membership[i]->inm_ifp == ifp) {
1098                                         in_delmulti(imo->imo_membership[i]);
1099                                         gap++;
1100                                 } else if (gap != 0)
1101                                         imo->imo_membership[i - gap] =
1102                                             imo->imo_membership[i];
1103                         }
1104                         imo->imo_num_memberships -= gap;
1105                 }
1106         }
1107 }
1108
1109 /*
1110  * Check for alternatives when higher level complains
1111  * about service problems.  For now, invalidate cached
1112  * routing information.  If the route was created dynamically
1113  * (by a redirect), time to try a default gateway again.
1114  */
1115 void
1116 in_losing(struct inpcb *inp)
1117 {
1118         struct rtentry *rt;
1119         struct rt_addrinfo rtinfo;
1120
1121         if ((rt = inp->inp_route.ro_rt)) {
1122                 bzero(&rtinfo, sizeof(struct rt_addrinfo));
1123                 rtinfo.rti_info[RTAX_DST] = rt_key(rt);
1124                 rtinfo.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1125                 rtinfo.rti_info[RTAX_NETMASK] = rt_mask(rt);
1126                 rtinfo.rti_flags = rt->rt_flags;
1127                 rt_missmsg(RTM_LOSING, &rtinfo, rt->rt_flags, 0);
1128                 if (rt->rt_flags & RTF_DYNAMIC)
1129                         rtrequest1_global(RTM_DELETE, &rtinfo, NULL, NULL);
1130                 inp->inp_route.ro_rt = NULL;
1131                 rtfree(rt);
1132                 /*
1133                  * A new route can be allocated
1134                  * the next time output is attempted.
1135                  */
1136         }
1137 }
1138
1139 /*
1140  * After a routing change, flush old routing
1141  * and allocate a (hopefully) better one.
1142  */
1143 void
1144 in_rtchange(struct inpcb *inp, int err)
1145 {
1146         if (inp->inp_route.ro_rt) {
1147                 rtfree(inp->inp_route.ro_rt);
1148                 inp->inp_route.ro_rt = NULL;
1149                 /*
1150                  * A new route can be allocated the next time
1151                  * output is attempted.
1152                  */
1153         }
1154 }
1155
1156 /*
1157  * Lookup a PCB based on the local address and port.
1158  */
1159 struct inpcb *
1160 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1161                    u_int lport_arg, int wild_okay, struct ucred *cred)
1162 {
1163         struct inpcb *inp;
1164         int matchwild = 3, wildcard;
1165         u_short lport = lport_arg;
1166         struct inpcbporthead *porthash;
1167         struct inpcbport *phd;
1168         struct inpcb *match = NULL;
1169
1170         /*
1171          * If the porthashbase is shared across several cpus we need
1172          * to lock.
1173          */
1174         if (pcbinfo->porttoken)
1175                 lwkt_gettoken(pcbinfo->porttoken);
1176
1177         /*
1178          * Best fit PCB lookup.
1179          *
1180          * First see if this local port is in use by looking on the
1181          * port hash list.
1182          */
1183         porthash = &pcbinfo->porthashbase[
1184                         INP_PCBPORTHASH(lport, pcbinfo->porthashmask)];
1185         LIST_FOREACH(phd, porthash, phd_hash) {
1186                 if (phd->phd_port == lport)
1187                         break;
1188         }
1189         if (phd != NULL) {
1190                 /*
1191                  * Port is in use by one or more PCBs. Look for best
1192                  * fit.
1193                  */
1194                 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1195                         wildcard = 0;
1196 #ifdef INET6
1197                         if ((inp->inp_vflag & INP_IPV4) == 0)
1198                                 continue;
1199 #endif
1200                         if (inp->inp_faddr.s_addr != INADDR_ANY)
1201                                 wildcard++;
1202                         if (inp->inp_laddr.s_addr != INADDR_ANY) {
1203                                 if (laddr.s_addr == INADDR_ANY)
1204                                         wildcard++;
1205                                 else if (inp->inp_laddr.s_addr != laddr.s_addr)
1206                                         continue;
1207                         } else {
1208                                 if (laddr.s_addr != INADDR_ANY)
1209                                         wildcard++;
1210                         }
1211                         if (wildcard && !wild_okay)
1212                                 continue;
1213                         if (wildcard < matchwild &&
1214                             (cred == NULL ||
1215                              cred->cr_prison == 
1216                                         inp->inp_socket->so_cred->cr_prison)) {
1217                                 match = inp;
1218                                 matchwild = wildcard;
1219                                 if (matchwild == 0) {
1220                                         break;
1221                                 }
1222                         }
1223                 }
1224         }
1225         if (pcbinfo->porttoken)
1226                 lwkt_reltoken(pcbinfo->porttoken);
1227         return (match);
1228 }
1229
1230 static struct inpcb *
1231 inp_localgroup_lookup(const struct inpcbinfo *pcbinfo,
1232     struct in_addr laddr, uint16_t lport, uint32_t pkt_hash)
1233 {
1234         struct inpcb *local_wild = NULL;
1235         const struct inp_localgrphead *hdr;
1236         const struct inp_localgroup *grp;
1237
1238         hdr = &pcbinfo->localgrphashbase[
1239             INP_PCBLOCALGRPHASH(lport, pcbinfo->localgrphashmask)];
1240         pkt_hash >>= ncpus2_shift;
1241
1242         /*
1243          * Order of socket selection:
1244          * 1. non-wild.
1245          * 2. wild.
1246          *
1247          * NOTE:
1248          * - Local group does not contain jailed sockets
1249          * - Local group does not contain IPv4 mapped INET6 wild sockets
1250          */
1251         LIST_FOREACH(grp, hdr, il_list) {
1252 #ifdef INET6
1253                 if (!(grp->il_vflag & INP_IPV4))
1254                         continue;
1255 #endif
1256                 if (grp->il_lport == lport) {
1257                         int idx;
1258
1259                         idx = pkt_hash / grp->il_factor;
1260                         KASSERT(idx < grp->il_inpcnt && idx >= 0,
1261                             ("invalid hash %04x, cnt %d or fact %d",
1262                              pkt_hash, grp->il_inpcnt, grp->il_factor));
1263
1264                         if (grp->il_laddr.s_addr == laddr.s_addr)
1265                                 return grp->il_inp[idx];
1266                         else if (grp->il_laddr.s_addr == INADDR_ANY)
1267                                 local_wild = grp->il_inp[idx];
1268                 }
1269         }
1270         if (local_wild != NULL)
1271                 return local_wild;
1272         return NULL;
1273 }
1274
1275 /*
1276  * Lookup PCB in hash list.
1277  */
1278 struct inpcb *
1279 in_pcblookup_pkthash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1280     u_int fport_arg, struct in_addr laddr, u_int lport_arg,
1281     boolean_t wildcard, struct ifnet *ifp, const struct mbuf *m)
1282 {
1283         struct inpcbhead *head;
1284         struct inpcb *inp, *jinp=NULL;
1285         u_short fport = fport_arg, lport = lport_arg;
1286
1287         /*
1288          * First look for an exact match.
1289          */
1290         head = &pcbinfo->hashbase[INP_PCBCONNHASH(faddr.s_addr, fport,
1291             laddr.s_addr, lport, pcbinfo->hashmask)];
1292         LIST_FOREACH(inp, head, inp_hash) {
1293 #ifdef INET6
1294                 if (!(inp->inp_vflag & INP_IPV4))
1295                         continue;
1296 #endif
1297                 if (in_hosteq(inp->inp_faddr, faddr) &&
1298                     in_hosteq(inp->inp_laddr, laddr) &&
1299                     inp->inp_fport == fport && inp->inp_lport == lport) {
1300                         /* found */
1301                         if (inp->inp_socket == NULL ||
1302                             inp->inp_socket->so_cred->cr_prison == NULL) {
1303                                 return (inp);
1304                         } else {
1305                                 if  (jinp == NULL)
1306                                         jinp = inp;
1307                         }
1308                 }
1309         }
1310         if (jinp != NULL)
1311                 return (jinp);
1312         if (wildcard) {
1313                 struct inpcb *local_wild = NULL;
1314                 struct inpcb *jinp_wild = NULL;
1315 #ifdef INET6
1316                 struct inpcb *local_wild_mapped = NULL;
1317 #endif
1318                 struct inpcontainer *ic;
1319                 struct inpcontainerhead *chead;
1320                 struct sockaddr_in jsin;
1321                 struct ucred *cred;
1322
1323                 /*
1324                  * Check local group first
1325                  */
1326                 if (pcbinfo->localgrphashbase != NULL &&
1327                     m != NULL && (m->m_flags & M_HASH) &&
1328                     !(ifp && ifp->if_type == IFT_FAITH)) {
1329                         inp = inp_localgroup_lookup(pcbinfo,
1330                             laddr, lport, m->m_pkthdr.hash);
1331                         if (inp != NULL)
1332                                 return inp;
1333                 }
1334
1335                 /*
1336                  * Order of socket selection:
1337                  * 1. non-jailed, non-wild.
1338                  * 2. non-jailed, wild.
1339                  * 3. jailed, non-wild.
1340                  * 4. jailed, wild.
1341                  */
1342                 jsin.sin_family = AF_INET;
1343                 chead = &pcbinfo->wildcardhashbase[
1344                     INP_PCBWILDCARDHASH(lport, pcbinfo->wildcardhashmask)];
1345                 LIST_FOREACH(ic, chead, ic_list) {
1346                         inp = ic->ic_inp;
1347                         jsin.sin_addr.s_addr = laddr.s_addr;
1348 #ifdef INET6
1349                         if (!(inp->inp_vflag & INP_IPV4))
1350                                 continue;
1351 #endif
1352                         if (inp->inp_socket != NULL)
1353                                 cred = inp->inp_socket->so_cred;
1354                         else
1355                                 cred = NULL;
1356                         if (cred != NULL && jailed(cred)) {
1357                                 if (jinp != NULL)
1358                                         continue;
1359                                 else
1360                                         if (!jailed_ip(cred->cr_prison,
1361                                             (struct sockaddr *)&jsin))
1362                                                 continue;
1363                         }
1364                         if (inp->inp_lport == lport) {
1365                                 if (ifp && ifp->if_type == IFT_FAITH &&
1366                                     !(inp->inp_flags & INP_FAITH))
1367                                         continue;
1368                                 if (inp->inp_laddr.s_addr == laddr.s_addr) {
1369                                         if (cred != NULL && jailed(cred))
1370                                                 jinp = inp;
1371                                         else
1372                                                 return (inp);
1373                                 }
1374                                 if (inp->inp_laddr.s_addr == INADDR_ANY) {
1375 #ifdef INET6
1376                                         if (INP_CHECK_SOCKAF(inp->inp_socket,
1377                                                              AF_INET6))
1378                                                 local_wild_mapped = inp;
1379                                         else
1380 #endif
1381                                                 if (cred != NULL &&
1382                                                     jailed(cred))
1383                                                         jinp_wild = inp;
1384                                                 else
1385                                                         local_wild = inp;
1386                                 }
1387                         }
1388                 }
1389                 if (local_wild != NULL)
1390                         return (local_wild);
1391 #ifdef INET6
1392                 if (local_wild_mapped != NULL)
1393                         return (local_wild_mapped);
1394 #endif
1395                 if (jinp != NULL)
1396                         return (jinp);
1397                 return (jinp_wild);
1398         }
1399
1400         /*
1401          * Not found.
1402          */
1403         return (NULL);
1404 }
1405
1406 struct inpcb *
1407 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1408     u_int fport_arg, struct in_addr laddr, u_int lport_arg,
1409     boolean_t wildcard, struct ifnet *ifp)
1410 {
1411         return in_pcblookup_pkthash(pcbinfo, faddr, fport_arg,
1412             laddr, lport_arg, wildcard, ifp, NULL);
1413 }
1414
1415 /*
1416  * Insert PCB into connection hash table.
1417  */
1418 void
1419 in_pcbinsconnhash(struct inpcb *inp)
1420 {
1421         struct inpcbinfo *pcbinfo = inp->inp_cpcbinfo;
1422         struct inpcbhead *bucket;
1423         u_int32_t hashkey_faddr, hashkey_laddr;
1424
1425 #ifdef INET6
1426         if (inp->inp_vflag & INP_IPV6) {
1427                 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX JH */;
1428                 hashkey_laddr = inp->in6p_laddr.s6_addr32[3] /* XXX JH */;
1429         } else {
1430 #endif
1431                 hashkey_faddr = inp->inp_faddr.s_addr;
1432                 hashkey_laddr = inp->inp_laddr.s_addr;
1433 #ifdef INET6
1434         }
1435 #endif
1436
1437         KASSERT(!(inp->inp_flags & INP_WILDCARD),
1438                 ("already on wildcardhash"));
1439         KASSERT(!(inp->inp_flags & INP_CONNECTED),
1440                 ("already on connhash"));
1441         inp->inp_flags |= INP_CONNECTED;
1442
1443         /*
1444          * Insert into the connection hash table.
1445          */
1446         bucket = &pcbinfo->hashbase[INP_PCBCONNHASH(hashkey_faddr,
1447             inp->inp_fport, hashkey_laddr, inp->inp_lport, pcbinfo->hashmask)];
1448         LIST_INSERT_HEAD(bucket, inp, inp_hash);
1449 }
1450
1451 /*
1452  * Remove PCB from connection hash table.
1453  */
1454 void
1455 in_pcbremconnhash(struct inpcb *inp)
1456 {
1457         KASSERT(inp->inp_flags & INP_CONNECTED, ("inp not connected"));
1458         LIST_REMOVE(inp, inp_hash);
1459         inp->inp_flags &= ~INP_CONNECTED;
1460 }
1461
1462 /*
1463  * Insert PCB into port hash table.
1464  */
1465 int
1466 in_pcbinsporthash(struct inpcb *inp)
1467 {
1468         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1469         struct inpcbporthead *pcbporthash;
1470         struct inpcbport *phd;
1471
1472         /*
1473          * If the porthashbase is shared across several cpus we need
1474          * to lock.
1475          */
1476         if (pcbinfo->porttoken)
1477                 lwkt_gettoken(pcbinfo->porttoken);
1478
1479         /*
1480          * Insert into the port hash table.
1481          */
1482         pcbporthash = &pcbinfo->porthashbase[
1483             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->porthashmask)];
1484
1485         /* Go through port list and look for a head for this lport. */
1486         LIST_FOREACH(phd, pcbporthash, phd_hash) {
1487                 if (phd->phd_port == inp->inp_lport)
1488                         break;
1489         }
1490
1491         /* If none exists, malloc one and tack it on. */
1492         if (phd == NULL) {
1493                 KKASSERT(pcbinfo->portsave != NULL);
1494                 phd = pcbinfo->portsave;
1495                 pcbinfo->portsave = NULL;
1496                 phd->phd_port = inp->inp_lport;
1497                 LIST_INIT(&phd->phd_pcblist);
1498                 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1499         }
1500
1501         inp->inp_phd = phd;
1502         LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1503
1504         if (pcbinfo->porttoken)
1505                 lwkt_reltoken(pcbinfo->porttoken);
1506         if (pcbinfo->portsave == NULL) {
1507                 pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave),
1508                                             M_PCB, M_INTWAIT | M_ZERO);
1509         }
1510         return (0);
1511 }
1512
1513 static struct inp_localgroup *
1514 inp_localgroup_alloc(struct inp_localgrphead *hdr, u_char vflag,
1515     uint16_t port, const union in_dependaddr *addr, int size)
1516 {
1517         struct inp_localgroup *grp;
1518
1519         grp = kmalloc(__offsetof(struct inp_localgroup, il_inp[size]),
1520             M_TEMP, M_INTWAIT | M_ZERO);
1521         grp->il_vflag = vflag;
1522         grp->il_lport = port;
1523         grp->il_dependladdr = *addr;
1524         grp->il_inpsiz = size;
1525
1526         LIST_INSERT_HEAD(hdr, grp, il_list);
1527
1528         return grp;
1529 }
1530
1531 static void
1532 inp_localgroup_free(struct inp_localgroup *grp)
1533 {
1534         LIST_REMOVE(grp, il_list);
1535         kfree(grp, M_TEMP);
1536 }
1537
1538 static struct inp_localgroup *
1539 inp_localgroup_resize(struct inp_localgrphead *hdr,
1540     struct inp_localgroup *old_grp, int size)
1541 {
1542         struct inp_localgroup *grp;
1543         int i;
1544
1545         grp = inp_localgroup_alloc(hdr, old_grp->il_vflag,
1546             old_grp->il_lport, &old_grp->il_dependladdr, size);
1547
1548         KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
1549             ("invalid new local group size %d and old local group count %d",
1550              grp->il_inpsiz, old_grp->il_inpcnt));
1551         for (i = 0; i < old_grp->il_inpcnt; ++i)
1552                 grp->il_inp[i] = old_grp->il_inp[i];
1553         grp->il_inpcnt = old_grp->il_inpcnt;
1554         grp->il_factor = old_grp->il_factor;
1555
1556         inp_localgroup_free(old_grp);
1557
1558         return grp;
1559 }
1560
1561 static void
1562 inp_localgroup_factor(struct inp_localgroup *grp)
1563 {
1564         grp->il_factor =
1565             ((uint32_t)(0xffff >> ncpus2_shift) / grp->il_inpcnt) + 1;
1566         KASSERT(grp->il_factor != 0, ("invalid local group factor, "
1567             "ncpus2_shift %d, inpcnt %d", ncpus2_shift, grp->il_inpcnt));
1568 }
1569
1570 static void
1571 in_pcbinslocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1572 {
1573         struct inp_localgrphead *hdr;
1574         struct inp_localgroup *grp;
1575         struct ucred *cred;
1576
1577         if (pcbinfo->localgrphashbase == NULL)
1578                 return;
1579
1580         /*
1581          * XXX don't allow jailed socket to join local group
1582          */
1583         if (inp->inp_socket != NULL)
1584                 cred = inp->inp_socket->so_cred;
1585         else
1586                 cred = NULL;
1587         if (cred != NULL && jailed(cred))
1588                 return;
1589
1590 #ifdef INET6
1591         /*
1592          * XXX don't allow IPv4 mapped INET6 wild socket
1593          */
1594         if ((inp->inp_vflag & INP_IPV4) &&
1595             inp->inp_laddr.s_addr == INADDR_ANY &&
1596             INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6))
1597                 return;
1598 #endif
1599
1600         hdr = &pcbinfo->localgrphashbase[
1601             INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];
1602
1603         LIST_FOREACH(grp, hdr, il_list) {
1604                 if (grp->il_vflag == inp->inp_vflag &&
1605                     grp->il_lport == inp->inp_lport &&
1606                     memcmp(&grp->il_dependladdr,
1607                         &inp->inp_inc.inc_ie.ie_dependladdr,
1608                         sizeof(grp->il_dependladdr)) == 0) {
1609                         break;
1610                 }
1611         }
1612         if (grp == NULL) {
1613                 /* Create new local group */
1614                 grp = inp_localgroup_alloc(hdr, inp->inp_vflag,
1615                     inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
1616                     INP_LOCALGROUP_SIZMIN);
1617         } else if (grp->il_inpcnt == grp->il_inpsiz) {
1618                 if (grp->il_inpsiz >= INP_LOCALGROUP_SIZMAX) {
1619                         static int limit_logged = 0;
1620
1621                         if (!limit_logged) {
1622                                 limit_logged = 1;
1623                                 kprintf("local group port %d, "
1624                                     "limit reached\n", ntohs(grp->il_lport));
1625                         }
1626                         return;
1627                 }
1628
1629                 /* Expand this local group */
1630                 grp = inp_localgroup_resize(hdr, grp, grp->il_inpsiz * 2);
1631         }
1632
1633         KASSERT(grp->il_inpcnt < grp->il_inpsiz,
1634             ("invalid local group size %d and count %d",
1635              grp->il_inpsiz, grp->il_inpcnt));
1636         grp->il_inp[grp->il_inpcnt] = inp;
1637         grp->il_inpcnt++;
1638         inp_localgroup_factor(grp);
1639 }
1640
1641 void
1642 in_pcbinswildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1643 {
1644         struct inpcontainer *ic;
1645         struct inpcontainerhead *bucket;
1646
1647         in_pcbinslocalgrphash_oncpu(inp, pcbinfo);
1648
1649         bucket = &pcbinfo->wildcardhashbase[
1650             INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
1651
1652         ic = kmalloc(sizeof(struct inpcontainer), M_TEMP, M_INTWAIT);
1653         ic->ic_inp = inp;
1654         LIST_INSERT_HEAD(bucket, ic, ic_list);
1655 }
1656
1657 /*
1658  * Insert PCB into wildcard hash table.
1659  */
1660 void
1661 in_pcbinswildcardhash(struct inpcb *inp)
1662 {
1663         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1664
1665         KASSERT(!(inp->inp_flags & INP_CONNECTED),
1666                 ("already on connhash"));
1667         KASSERT(!(inp->inp_flags & INP_WILDCARD),
1668                 ("already on wildcardhash"));
1669         inp->inp_flags |= INP_WILDCARD;
1670
1671         in_pcbinswildcardhash_oncpu(inp, pcbinfo);
1672 }
1673
1674 static void
1675 in_pcbremlocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1676 {
1677         struct inp_localgrphead *hdr;
1678         struct inp_localgroup *grp;
1679
1680         if (pcbinfo->localgrphashbase == NULL)
1681                 return;
1682
1683         hdr = &pcbinfo->localgrphashbase[
1684             INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)];
1685
1686         LIST_FOREACH(grp, hdr, il_list) {
1687                 int i;
1688
1689                 for (i = 0; i < grp->il_inpcnt; ++i) {
1690                         if (grp->il_inp[i] != inp)
1691                                 continue;
1692
1693                         if (grp->il_inpcnt == 1) {
1694                                 /* Free this local group */
1695                                 inp_localgroup_free(grp);
1696                         } else {
1697                                 /* Pull up inpcbs */
1698                                 for (; i + 1 < grp->il_inpcnt; ++i)
1699                                         grp->il_inp[i] = grp->il_inp[i + 1];
1700                                 grp->il_inpcnt--;
1701                                 inp_localgroup_factor(grp);
1702
1703                                 if (grp->il_inpsiz > INP_LOCALGROUP_SIZMIN &&
1704                                     grp->il_inpcnt <= (grp->il_inpsiz / 4)) {
1705                                         /* Shrink this local group */
1706                                         grp = inp_localgroup_resize(hdr, grp,
1707                                             grp->il_inpsiz / 2);
1708                                 }
1709                         }
1710                         return;
1711                 }
1712         }
1713 }
1714
1715 void
1716 in_pcbremwildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1717 {
1718         struct inpcontainer *ic;
1719         struct inpcontainerhead *head;
1720
1721         in_pcbremlocalgrphash_oncpu(inp, pcbinfo);
1722
1723         /* find bucket */
1724         head = &pcbinfo->wildcardhashbase[
1725             INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
1726
1727         LIST_FOREACH(ic, head, ic_list) {
1728                 if (ic->ic_inp == inp)
1729                         goto found;
1730         }
1731         return;                 /* not found! */
1732
1733 found:
1734         LIST_REMOVE(ic, ic_list);       /* remove container from bucket chain */
1735         kfree(ic, M_TEMP);              /* deallocate container */
1736 }
1737
1738 /*
1739  * Remove PCB from wildcard hash table.
1740  */
1741 void
1742 in_pcbremwildcardhash(struct inpcb *inp)
1743 {
1744         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1745
1746         KASSERT(inp->inp_flags & INP_WILDCARD, ("inp not wildcard"));
1747         in_pcbremwildcardhash_oncpu(inp, pcbinfo);
1748         inp->inp_flags &= ~INP_WILDCARD;
1749 }
1750
1751 /*
1752  * Remove PCB from various lists.
1753  */
1754 void
1755 in_pcbremlists(struct inpcb *inp)
1756 {
1757         struct inpcbinfo *pcbinfo;
1758
1759         if (inp->inp_lport) {
1760                 struct inpcbport *phd;
1761
1762                 pcbinfo = inp->inp_pcbinfo;
1763                 if (pcbinfo->porttoken)
1764                         lwkt_gettoken(pcbinfo->porttoken);
1765
1766                 phd = inp->inp_phd;
1767                 LIST_REMOVE(inp, inp_portlist);
1768                 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1769                         LIST_REMOVE(phd, phd_hash);
1770                         kfree(phd, M_PCB);
1771                 }
1772                 if (pcbinfo->porttoken)
1773                         lwkt_reltoken(pcbinfo->porttoken);
1774         }
1775         if (inp->inp_flags & INP_WILDCARD) {
1776                 in_pcbremwildcardhash(inp);
1777         } else if (inp->inp_flags & INP_CONNECTED) {
1778                 in_pcbremconnhash(inp);
1779         }
1780         LIST_REMOVE(inp, inp_list);
1781         inp->inp_pcbinfo->ipi_count--;
1782 }
1783
1784 int
1785 prison_xinpcb(struct thread *td, struct inpcb *inp)
1786 {
1787         struct ucred *cr;
1788
1789         if (td->td_proc == NULL)
1790                 return (0);
1791         cr = td->td_proc->p_ucred;
1792         if (cr->cr_prison == NULL)
1793                 return (0);
1794         if (inp->inp_socket && inp->inp_socket->so_cred &&
1795             inp->inp_socket->so_cred->cr_prison &&
1796             cr->cr_prison == inp->inp_socket->so_cred->cr_prison)
1797                 return (0);
1798         return (1);
1799 }
1800
1801 int
1802 in_pcblist_global(SYSCTL_HANDLER_ARGS)
1803 {
1804         struct inpcbinfo *pcbinfo = arg1;
1805         struct inpcb *inp, *marker;
1806         struct xinpcb xi;
1807         int error, i, n;
1808
1809         /*
1810          * The process of preparing the TCB list is too time-consuming and
1811          * resource-intensive to repeat twice on every request.
1812          */
1813         if (req->oldptr == NULL) {
1814                 n = pcbinfo->ipi_count;
1815                 req->oldidx = (n + n/8 + 10) * sizeof(struct xinpcb);
1816                 return 0;
1817         }
1818
1819         if (req->newptr != NULL)
1820                 return EPERM;
1821
1822         /*
1823          * OK, now we're committed to doing something.  Re-fetch ipi_count
1824          * after obtaining the generation count.
1825          */
1826         n = pcbinfo->ipi_count;
1827
1828         marker = kmalloc(sizeof(struct inpcb), M_TEMP, M_WAITOK|M_ZERO);
1829         marker->inp_flags |= INP_PLACEMARKER;
1830         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list);
1831
1832         i = 0;
1833         error = 0;
1834
1835         while ((inp = LIST_NEXT(marker, inp_list)) != NULL && i < n) {
1836                 LIST_REMOVE(marker, inp_list);
1837                 LIST_INSERT_AFTER(inp, marker, inp_list);
1838
1839                 if (inp->inp_flags & INP_PLACEMARKER)
1840                         continue;
1841                 if (prison_xinpcb(req->td, inp))
1842                         continue;
1843                 bzero(&xi, sizeof xi);
1844                 xi.xi_len = sizeof xi;
1845                 bcopy(inp, &xi.xi_inp, sizeof *inp);
1846                 if (inp->inp_socket)
1847                         sotoxsocket(inp->inp_socket, &xi.xi_socket);
1848                 if ((error = SYSCTL_OUT(req, &xi, sizeof xi)) != 0)
1849                         break;
1850                 ++i;
1851         }
1852         LIST_REMOVE(marker, inp_list);
1853         if (error == 0 && i < n) {
1854                 bzero(&xi, sizeof xi);
1855                 xi.xi_len = sizeof xi;
1856                 while (i < n) {
1857                         error = SYSCTL_OUT(req, &xi, sizeof xi);
1858                         ++i;
1859                 }
1860         }
1861         kfree(marker, M_TEMP);
1862         return(error);
1863 }
1864
1865 int
1866 in_pcblist_global_nomarker(SYSCTL_HANDLER_ARGS, struct xinpcb **xi0, int *nxi0)
1867 {
1868         struct inpcbinfo *pcbinfo = arg1;
1869         struct inpcb *inp;
1870         struct xinpcb *xi;
1871         int nxi;
1872
1873         *nxi0 = 0;
1874         *xi0 = NULL;
1875
1876         /*
1877          * The process of preparing the PCB list is too time-consuming and
1878          * resource-intensive to repeat twice on every request.
1879          */
1880         if (req->oldptr == NULL) {
1881                 int n = pcbinfo->ipi_count;
1882
1883                 req->oldidx = (n + n/8 + 10) * sizeof(struct xinpcb);
1884                 return 0;
1885         }
1886
1887         if (req->newptr != NULL)
1888                 return EPERM;
1889
1890         if (pcbinfo->ipi_count == 0)
1891                 return 0;
1892
1893         nxi = 0;
1894         xi = kmalloc(pcbinfo->ipi_count * sizeof(*xi), M_TEMP,
1895                      M_WAITOK | M_ZERO | M_NULLOK);
1896         if (xi == NULL)
1897                 return ENOMEM;
1898
1899         LIST_FOREACH(inp, &pcbinfo->pcblisthead, inp_list) {
1900                 struct xinpcb *xi_ptr = &xi[nxi];
1901
1902                 if (prison_xinpcb(req->td, inp))
1903                         continue;
1904
1905                 xi_ptr->xi_len = sizeof(*xi_ptr);
1906                 bcopy(inp, &xi_ptr->xi_inp, sizeof(*inp));
1907                 if (inp->inp_socket)
1908                         sotoxsocket(inp->inp_socket, &xi_ptr->xi_socket);
1909                 ++nxi;
1910         }
1911
1912         if (nxi == 0) {
1913                 kfree(xi, M_TEMP);
1914                 return 0;
1915         }
1916
1917         *nxi0 = nxi;
1918         *xi0 = xi;
1919
1920         return 0;
1921 }
1922
1923 void
1924 in_savefaddr(struct socket *so, const struct sockaddr *faddr)
1925 {
1926         struct sockaddr_in *sin;
1927
1928         KASSERT(faddr->sa_family == AF_INET,
1929             ("not AF_INET faddr %d", faddr->sa_family));
1930
1931         sin = kmalloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
1932         sin->sin_family = AF_INET;
1933         sin->sin_len = sizeof(*sin);
1934         sin->sin_port = ((const struct sockaddr_in *)faddr)->sin_port;
1935         sin->sin_addr = ((const struct sockaddr_in *)faddr)->sin_addr;
1936
1937         so->so_faddr = (struct sockaddr *)sin;
1938 }