Kernel part of PF
[dragonfly.git] / sys / netinet / in_pcb.c
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  * 
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  * 
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  * 
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33
34 /*
35  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
36  *
37  * License terms: all terms for the DragonFly license above plus the following:
38  *
39  * 4. All advertising materials mentioning features or use of this software
40  *    must display the following acknowledgement:
41  *
42  *      This product includes software developed by Jeffrey M. Hsu
43  *      for the DragonFly Project.
44  *
45  *    This requirement may be waived with permission from Jeffrey Hsu.
46  *    This requirement will sunset and may be removed on July 8 2005,
47  *    after which the standard DragonFly license (as shown above) will
48  *    apply.
49  */
50
51 /*
52  * Copyright (c) 1982, 1986, 1991, 1993, 1995
53  *      The Regents of the University of California.  All rights reserved.
54  *
55  * Redistribution and use in source and binary forms, with or without
56  * modification, are permitted provided that the following conditions
57  * are met:
58  * 1. Redistributions of source code must retain the above copyright
59  *    notice, this list of conditions and the following disclaimer.
60  * 2. Redistributions in binary form must reproduce the above copyright
61  *    notice, this list of conditions and the following disclaimer in the
62  *    documentation and/or other materials provided with the distribution.
63  * 3. All advertising materials mentioning features or use of this software
64  *    must display the following acknowledgement:
65  *      This product includes software developed by the University of
66  *      California, Berkeley and its contributors.
67  * 4. Neither the name of the University nor the names of its contributors
68  *    may be used to endorse or promote products derived from this software
69  *    without specific prior written permission.
70  *
71  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
72  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
75  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81  * SUCH DAMAGE.
82  *
83  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
84  * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.27 2004/01/02 04:06:42 ambrisko Exp $
85  * $DragonFly: src/sys/netinet/in_pcb.c,v 1.25 2004/08/11 02:36:22 dillon Exp $
86  */
87
88 #include "opt_ipsec.h"
89 #include "opt_inet6.h"
90
91 #include <sys/param.h>
92 #include <sys/systm.h>
93 #include <sys/malloc.h>
94 #include <sys/mbuf.h>
95 #include <sys/domain.h>
96 #include <sys/protosw.h>
97 #include <sys/socket.h>
98 #include <sys/socketvar.h>
99 #include <sys/proc.h>
100 #include <sys/jail.h>
101 #include <sys/kernel.h>
102 #include <sys/sysctl.h>
103
104 #include <machine/limits.h>
105
106 #include <vm/vm_zone.h>
107
108 #include <net/if.h>
109 #include <net/if_types.h>
110 #include <net/route.h>
111
112 #include <netinet/in.h>
113 #include <netinet/in_pcb.h>
114 #include <netinet/in_var.h>
115 #include <netinet/ip_var.h>
116 #ifdef INET6
117 #include <netinet/ip6.h>
118 #include <netinet6/ip6_var.h>
119 #endif /* INET6 */
120
121 #ifdef IPSEC
122 #include <netinet6/ipsec.h>
123 #include <netproto/key/key.h>
124 #endif
125
126 #ifdef FAST_IPSEC
127 #if defined(IPSEC) || defined(IPSEC_ESP)
128 #error "Bad idea: don't compile with both IPSEC and FAST_IPSEC!"
129 #endif
130
131 #include <netipsec/ipsec.h>
132 #include <netipsec/key.h>
133 #define IPSEC
134 #endif /* FAST_IPSEC */
135
136 struct in_addr zeroin_addr;
137
138 /*
139  * These configure the range of local port addresses assigned to
140  * "unspecified" outgoing connections/packets/whatever.
141  */
142 int ipport_lowfirstauto = IPPORT_RESERVED - 1;  /* 1023 */
143 int ipport_lowlastauto = IPPORT_RESERVEDSTART;  /* 600 */
144
145 int ipport_firstauto = IPPORT_RESERVED;         /* 1024 */
146 int ipport_lastauto = IPPORT_USERRESERVED;      /* 5000 */
147
148 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO;    /* 49152 */
149 int ipport_hilastauto = IPPORT_HILASTAUTO;      /* 65535 */
150
151 /* Allocate ephermal source ports in random order. */
152 int ipport_randomized = 1;
153
/*
 * RANGECHK(var, min, max): clamp the integer lvalue `var' into the
 * closed interval [min, max].
 *
 * This must be a macro rather than a function taking `var' by value:
 * a by-value parameter would only clamp a private copy and leave the
 * caller's variable untouched.  `var' is evaluated more than once, so
 * pass a plain variable without side effects.
 */
#define RANGECHK(var, min, max)				\
do {							\
	if ((var) < (min))				\
		(var) = (min);				\
	else if ((var) > (max))				\
		(var) = (max);				\
} while (0)
162
163 static int
164 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
165 {
166         int error;
167
168         error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
169         if (!error) {
170                 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
171                 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
172
173                 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
174                 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
175
176                 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
177                 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
178         }
179         return (error);
180 }
181
182 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
183
184 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
185            &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
186 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
187            &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
188 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
189            &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
190 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
191            &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
192 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
193            &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
194 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
195            &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
196 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
197            &ipport_randomized, 0, "");
198
199 /*
200  * in_pcb.c: manage the Protocol Control Blocks.
201  *
202  * NOTE: It is assumed that most of these functions will be called at
203  * splnet(). XXX - There are, unfortunately, a few exceptions to this
204  * rule that should be fixed.
205  *
206  * NOTE: The caller should initialize the cpu field to the cpu running the
207  * protocol stack associated with this inpcbinfo.
208  */
209
210 void
211 in_pcbinfo_init(struct inpcbinfo *pcbinfo)
212 {
213         LIST_INIT(&pcbinfo->pcblisthead);
214         pcbinfo->cpu = -1;
215 }
216
217 /*
218  * Allocate a PCB and associate it with the socket.
219  */
220 int
221 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
222 {
223         struct inpcb *inp;
224 #ifdef IPSEC
225         int error;
226 #endif
227
228         inp = zalloc(pcbinfo->ipi_zone);
229         if (inp == NULL)
230                 return (ENOBUFS);
231         bzero((caddr_t)inp, sizeof *inp);
232         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
233         inp->inp_pcbinfo = inp->inp_cpcbinfo = pcbinfo;
234         inp->inp_socket = so;
235 #ifdef IPSEC
236         error = ipsec_init_policy(so, &inp->inp_sp);
237         if (error != 0) {
238                 zfree(pcbinfo->ipi_zone, inp);
239                 return (error);
240         }
241 #endif
242 #ifdef INET6
243         if (INP_SOCKAF(so) == AF_INET6 && ip6_v6only)
244                 inp->inp_flags |= IN6P_IPV6_V6ONLY;
245         if (ip6_auto_flowlabel)
246                 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
247 #endif
248         so->so_pcb = (caddr_t)inp;
249         LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list);
250         pcbinfo->ipi_count++;
251         return (0);
252 }
253
254 int
255 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
256 {
257         struct socket *so = inp->inp_socket;
258         struct proc *p = td->td_proc;
259         unsigned short *lastport;
260         struct sockaddr_in *sin;
261         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
262         u_short lport = 0;
263         int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
264         int error, prison = 0;
265
266         KKASSERT(p);
267
268         if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */
269                 return (EADDRNOTAVAIL);
270         if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
271                 return (EINVAL);        /* already bound */
272         if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT)))
273                 wild = 1;    /* neither SO_REUSEADDR nor SO_REUSEPORT is set */
274         if (nam != NULL) {
275                 sin = (struct sockaddr_in *)nam;
276                 if (nam->sa_len != sizeof *sin)
277                         return (EINVAL);
278 #ifdef notdef
279                 /*
280                  * We should check the family, but old programs
281                  * incorrectly fail to initialize it.
282                  */
283                 if (sin->sin_family != AF_INET)
284                         return (EAFNOSUPPORT);
285 #endif
286                 if (sin->sin_addr.s_addr != INADDR_ANY &&
287                     prison_ip(td, 0, &sin->sin_addr.s_addr))
288                                 return (EINVAL);
289                 lport = sin->sin_port;
290                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
291                         /*
292                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
293                          * allow complete duplication of binding if
294                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
295                          * and a multicast address is bound on both
296                          * new and duplicated sockets.
297                          */
298                         if (so->so_options & SO_REUSEADDR)
299                                 reuseport = SO_REUSEADDR | SO_REUSEPORT;
300                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
301                         sin->sin_port = 0;              /* yech... */
302                         bzero(&sin->sin_zero, sizeof sin->sin_zero);
303                         if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL)
304                                 return (EADDRNOTAVAIL);
305                 }
306                 if (lport != 0) {
307                         struct inpcb *t;
308
309                         /* GROSS */
310                         if (ntohs(lport) < IPPORT_RESERVED &&
311                             p && suser_cred(p->p_ucred, PRISON_ROOT))
312                                 return (EACCES);
313                         if (p && p->p_ucred->cr_prison)
314                                 prison = 1;
315                         if (so->so_cred->cr_uid != 0 &&
316                             !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
317                                 t = in_pcblookup_local(inp->inp_pcbinfo,
318                                     sin->sin_addr, lport,
319                                     prison ? 0 : INPLOOKUP_WILDCARD);
320                                 if (t &&
321                                     (!in_nullhost(sin->sin_addr) ||
322                                      !in_nullhost(t->inp_laddr) ||
323                                      (t->inp_socket->so_options &
324                                          SO_REUSEPORT) == 0) &&
325                                     (so->so_cred->cr_uid !=
326                                      t->inp_socket->so_cred->cr_uid)) {
327 #ifdef INET6
328                                         if (!in_nullhost(sin->sin_addr) ||
329                                             !in_nullhost(t->inp_laddr) ||
330                                             INP_SOCKAF(so) ==
331                                             INP_SOCKAF(t->inp_socket))
332 #endif
333                                         return (EADDRINUSE);
334                                 }
335                         }
336                         if (prison && prison_ip(td, 0, &sin->sin_addr.s_addr))
337                                 return (EADDRNOTAVAIL);
338                         t = in_pcblookup_local(pcbinfo, sin->sin_addr,
339                             lport, prison ? 0 : wild);
340                         if (t && !(reuseport & t->inp_socket->so_options)) {
341 #ifdef INET6
342                                 if (!in_nullhost(sin->sin_addr) ||
343                                     !in_nullhost(t->inp_laddr) ||
344                                     INP_SOCKAF(so) == INP_SOCKAF(t->inp_socket))
345 #endif
346                                 return (EADDRINUSE);
347                         }
348                 }
349                 inp->inp_laddr = sin->sin_addr;
350         }
351         if (lport == 0) {
352                 ushort first, last;
353                 int count;
354
355                 if (inp->inp_laddr.s_addr != INADDR_ANY &&
356                     prison_ip(td, 0, &inp->inp_laddr.s_addr )) {
357                         inp->inp_laddr.s_addr = INADDR_ANY;
358                         return (EINVAL);
359                 }
360                 inp->inp_flags |= INP_ANONPORT;
361
362                 if (inp->inp_flags & INP_HIGHPORT) {
363                         first = ipport_hifirstauto;     /* sysctl */
364                         last  = ipport_hilastauto;
365                         lastport = &pcbinfo->lasthi;
366                 } else if (inp->inp_flags & INP_LOWPORT) {
367                         if (p &&
368                             (error = suser_cred(p->p_ucred, PRISON_ROOT))) {
369                                 inp->inp_laddr.s_addr = INADDR_ANY;
370                                 return (error);
371                         }
372                         first = ipport_lowfirstauto;    /* 1023 */
373                         last  = ipport_lowlastauto;     /* 600 */
374                         lastport = &pcbinfo->lastlow;
375                 } else {
376                         first = ipport_firstauto;       /* sysctl */
377                         last  = ipport_lastauto;
378                         lastport = &pcbinfo->lastport;
379                 }
380                 /*
381                  * Simple check to ensure all ports are not used up causing
382                  * a deadlock here.
383                  *
384                  * We split the two cases (up and down) so that the direction
385                  * is not being tested on each round of the loop.
386                  */
387                 if (first > last) {
388                         /*
389                          * counting down
390                          */
391                         if (ipport_randomized)
392                                 *lastport = first -
393                                     (arc4random() % (first - last));
394                         count = first - last;
395
396                         do {
397                                 if (count-- < 0) {      /* completely used? */
398                                         inp->inp_laddr.s_addr = INADDR_ANY;
399                                         return (EADDRNOTAVAIL);
400                                 }
401                                 --*lastport;
402                                 if (*lastport > first || *lastport < last)
403                                         *lastport = first;
404                                 lport = htons(*lastport);
405                         } while (in_pcblookup_local(pcbinfo,
406                                  inp->inp_laddr, lport, wild));
407                 } else {
408                         /*
409                          * counting up
410                          */
411                         if (ipport_randomized)
412                                 *lastport = first +
413                                     (arc4random() % (last - first));
414                         count = last - first;
415
416                         do {
417                                 if (count-- < 0) {      /* completely used? */
418                                         inp->inp_laddr.s_addr = INADDR_ANY;
419                                         return (EADDRNOTAVAIL);
420                                 }
421                                 ++*lastport;
422                                 if (*lastport < first || *lastport > last)
423                                         *lastport = first;
424                                 lport = htons(*lastport);
425                         } while (in_pcblookup_local(pcbinfo,
426                                  inp->inp_laddr, lport, wild));
427                 }
428         }
429         inp->inp_lport = lport;
430         if (prison_ip(td, 0, &inp->inp_laddr.s_addr)) {
431                 inp->inp_laddr.s_addr = INADDR_ANY;
432                 inp->inp_lport = 0;
433                 return (EINVAL);
434         }
435         if (in_pcbinsporthash(inp) != 0) {
436                 inp->inp_laddr.s_addr = INADDR_ANY;
437                 inp->inp_lport = 0;
438                 return (EAGAIN);
439         }
440         return (0);
441 }
442
443 /*
444  *   Transform old in_pcbconnect() into an inner subroutine for new
445  *   in_pcbconnect(): Do some validity-checking on the remote
446  *   address (in mbuf 'nam') and then determine local host address
447  *   (i.e., which interface) to use to access that remote host.
448  *
449  *   This preserves definition of in_pcbconnect(), while supporting a
450  *   slightly different version for T/TCP.  (This is more than
451  *   a bit of a kludge, but cleaning up the internal interfaces would
452  *   have forced minor changes in every protocol).
453  */
454 int
455 in_pcbladdr(inp, nam, plocal_sin)
456         struct inpcb *inp;
457         struct sockaddr *nam;
458         struct sockaddr_in **plocal_sin;
459 {
460         struct in_ifaddr *ia;
461         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
462
463         if (nam->sa_len != sizeof *sin)
464                 return (EINVAL);
465         if (sin->sin_family != AF_INET)
466                 return (EAFNOSUPPORT);
467         if (sin->sin_port == 0)
468                 return (EADDRNOTAVAIL);
469         if (!TAILQ_EMPTY(&in_ifaddrhead)) {
470                 ia = TAILQ_FIRST(&in_ifaddrhead);
471                 /*
472                  * If the destination address is INADDR_ANY,
473                  * use the primary local address.
474                  * If the supplied address is INADDR_BROADCAST,
475                  * and the primary interface supports broadcast,
476                  * choose the broadcast address for that interface.
477                  */
478                 if (sin->sin_addr.s_addr == INADDR_ANY)
479                         sin->sin_addr = IA_SIN(ia)->sin_addr;
480                 else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST &&
481                     (ia->ia_ifp->if_flags & IFF_BROADCAST))
482                         sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr;
483         }
484         if (inp->inp_laddr.s_addr == INADDR_ANY) {
485                 struct route *ro;
486
487                 ia = (struct in_ifaddr *)NULL;
488                 /*
489                  * If route is known or can be allocated now,
490                  * our src addr is taken from the i/f, else punt.
491                  * Note that we should check the address family of the cached
492                  * destination, in case of sharing the cache with IPv6.
493                  */
494                 ro = &inp->inp_route;
495                 if (ro->ro_rt &&
496                     (!(ro->ro_rt->rt_flags & RTF_UP) ||
497                      ro->ro_dst.sa_family != AF_INET ||
498                      satosin(&ro->ro_dst)->sin_addr.s_addr !=
499                          sin->sin_addr.s_addr ||
500                      inp->inp_socket->so_options & SO_DONTROUTE)) {
501                         RTFREE(ro->ro_rt);
502                         ro->ro_rt = (struct rtentry *)NULL;
503                 }
504                 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/
505                     (ro->ro_rt == (struct rtentry *)NULL ||
506                     ro->ro_rt->rt_ifp == (struct ifnet *)NULL)) {
507                         /* No route yet, so try to acquire one */
508                         bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
509                         ro->ro_dst.sa_family = AF_INET;
510                         ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
511                         ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
512                                 sin->sin_addr;
513                         rtalloc(ro);
514                 }
515                 /*
516                  * If we found a route, use the address
517                  * corresponding to the outgoing interface
518                  * unless it is the loopback (in case a route
519                  * to our address on another net goes to loopback).
520                  */
521                 if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
522                         ia = ifatoia(ro->ro_rt->rt_ifa);
523                 if (ia == NULL) {
524                         u_short fport = sin->sin_port;
525
526                         sin->sin_port = 0;
527                         ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
528                         if (ia == NULL)
529                                 ia = ifatoia(ifa_ifwithnet(sintosa(sin)));
530                         sin->sin_port = fport;
531                         if (ia == NULL)
532                                 ia = TAILQ_FIRST(&in_ifaddrhead);
533                         if (ia == NULL)
534                                 return (EADDRNOTAVAIL);
535                 }
536                 /*
537                  * If the destination address is multicast and an outgoing
538                  * interface has been set as a multicast option, use the
539                  * address of that interface as our source address.
540                  */
541                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
542                     inp->inp_moptions != NULL) {
543                         struct ip_moptions *imo;
544                         struct ifnet *ifp;
545
546                         imo = inp->inp_moptions;
547                         if (imo->imo_multicast_ifp != NULL) {
548                                 ifp = imo->imo_multicast_ifp;
549                                 TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link)
550                                         if (ia->ia_ifp == ifp)
551                                                 break;
552                                 if (ia == NULL)
553                                         return (EADDRNOTAVAIL);
554                         }
555                 }
556                 /*
557                  * Don't do pcblookup call here; return interface in plocal_sin
558                  * and exit to caller, that will do the lookup.
559                  */
560                 *plocal_sin = &ia->ia_addr;
561
562         }
563         return (0);
564 }
565
566 /*
567  * Outer subroutine:
568  * Connect from a socket to a specified address.
569  * Both address and port must be specified in argument sin.
570  * If don't have a local address for this socket yet,
571  * then pick one.
572  */
573 int
574 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
575 {
576         struct sockaddr_in *if_sin;
577         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
578         struct sockaddr_in sa;
579         struct ucred *cr = td->td_proc ? td->td_proc->p_ucred : NULL;
580         int error;
581
582         if (cr && cr->cr_prison != NULL && in_nullhost(inp->inp_laddr)) {
583                 bzero(&sa, sizeof sa);
584                 sa.sin_addr.s_addr = htonl(cr->cr_prison->pr_ip);
585                 sa.sin_len = sizeof sa;
586                 sa.sin_family = AF_INET;
587                 error = in_pcbbind(inp, (struct sockaddr *)&sa, td);
588                 if (error)
589                         return (error);
590         }
591
592         /* Call inner routine to assign local interface address. */
593         if ((error = in_pcbladdr(inp, nam, &if_sin)) != 0)
594                 return (error);
595
596         if (in_pcblookup_hash(inp->inp_cpcbinfo, sin->sin_addr, sin->sin_port,
597             inp->inp_laddr.s_addr ? inp->inp_laddr : if_sin->sin_addr,
598             inp->inp_lport, FALSE, NULL) != NULL) {
599                 return (EADDRINUSE);
600         }
601         if (inp->inp_laddr.s_addr == INADDR_ANY) {
602                 if (inp->inp_lport == 0) {
603                         error = in_pcbbind(inp, (struct sockaddr *)NULL, td);
604                         if (error)
605                                 return (error);
606                 }
607                 inp->inp_laddr = if_sin->sin_addr;
608         }
609         inp->inp_faddr = sin->sin_addr;
610         inp->inp_fport = sin->sin_port;
611         in_pcbinsconnhash(inp);
612         return (0);
613 }
614
615 void
616 in_pcbdisconnect(inp)
617         struct inpcb *inp;
618 {
619
620         inp->inp_faddr.s_addr = INADDR_ANY;
621         inp->inp_fport = 0;
622         in_pcbremconnhash(inp);
623         if (inp->inp_socket->so_state & SS_NOFDREF)
624                 in_pcbdetach(inp);
625 }
626
627 void
628 in_pcbdetach(inp)
629         struct inpcb *inp;
630 {
631         struct socket *so = inp->inp_socket;
632         struct inpcbinfo *ipi = inp->inp_pcbinfo;
633
634 #ifdef IPSEC
635         ipsec4_delete_pcbpolicy(inp);
636 #endif /*IPSEC*/
637         inp->inp_gencnt = ++ipi->ipi_gencnt;
638         in_pcbremlists(inp);
639         so->so_pcb = 0;
640         sofree(so);
641         if (inp->inp_options)
642                 (void)m_free(inp->inp_options);
643         if (inp->inp_route.ro_rt)
644                 rtfree(inp->inp_route.ro_rt);
645         ip_freemoptions(inp->inp_moptions);
646         inp->inp_vflag = 0;
647         zfree(ipi->ipi_zone, inp);
648 }
649
650 /*
651  * The calling convention of in_setsockaddr() and in_setpeeraddr() was
652  * modified to match the pru_sockaddr() and pru_peeraddr() entry points
653  * in struct pr_usrreqs, so that protocols can just reference then directly
654  * without the need for a wrapper function.  The socket must have a valid
655  * (i.e., non-nil) PCB, but it should be impossible to get an invalid one
656  * except through a kernel programming error, so it is acceptable to panic
657  * (or in this case trap) if the PCB is invalid.  (Actually, we don't trap
658  * because there actually /is/ a programming error somewhere... XXX)
659  */
660 int
661 in_setsockaddr(so, nam)
662         struct socket *so;
663         struct sockaddr **nam;
664 {
665         int s;
666         struct inpcb *inp;
667         struct sockaddr_in *sin;
668
669         /*
670          * Do the malloc first in case it blocks.
671          */
672         MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
673                 M_WAITOK | M_ZERO);
674         sin->sin_family = AF_INET;
675         sin->sin_len = sizeof *sin;
676
677         s = splnet();
678         inp = sotoinpcb(so);
679         if (!inp) {
680                 splx(s);
681                 free(sin, M_SONAME);
682                 return (ECONNRESET);
683         }
684         sin->sin_port = inp->inp_lport;
685         sin->sin_addr = inp->inp_laddr;
686         splx(s);
687
688         *nam = (struct sockaddr *)sin;
689         return (0);
690 }
691
692 int
693 in_setpeeraddr(so, nam)
694         struct socket *so;
695         struct sockaddr **nam;
696 {
697         int s;
698         struct inpcb *inp;
699         struct sockaddr_in *sin;
700
701         /*
702          * Do the malloc first in case it blocks.
703          */
704         MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
705                 M_WAITOK | M_ZERO);
706         sin->sin_family = AF_INET;
707         sin->sin_len = sizeof *sin;
708
709         s = splnet();
710         inp = sotoinpcb(so);
711         if (!inp) {
712                 splx(s);
713                 free(sin, M_SONAME);
714                 return (ECONNRESET);
715         }
716         sin->sin_port = inp->inp_fport;
717         sin->sin_addr = inp->inp_faddr;
718         splx(s);
719
720         *nam = (struct sockaddr *)sin;
721         return (0);
722 }
723
724 void
725 in_pcbnotifyall(head, faddr, errno, notify)
726         struct inpcbhead *head;
727         struct in_addr faddr;
728         void (*notify) (struct inpcb *, int);
729 {
730         struct inpcb *inp, *ninp;
731         int s;
732
733         /*
734          * note: if INP_PLACEMARKER is set we must ignore the rest of
735          * the structure and skip it.
736          */
737         s = splnet();
738         for (inp = LIST_FIRST(head); inp != NULL; inp = ninp) {
739                 ninp = LIST_NEXT(inp, inp_list);
740                 if (inp->inp_flags & INP_PLACEMARKER)
741                         continue;
742 #ifdef INET6
743                 if (!(inp->inp_vflag & INP_IPV4))
744                         continue;
745 #endif
746                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
747                     inp->inp_socket == NULL)
748                         continue;
749                 (*notify)(inp, errno);
750         }
751         splx(s);
752 }
753
754 void
755 in_pcbpurgeif0(head, ifp)
756         struct inpcb *head;
757         struct ifnet *ifp;
758 {
759         struct inpcb *inp;
760         struct ip_moptions *imo;
761         int i, gap;
762
763         for (inp = head; inp != NULL; inp = LIST_NEXT(inp, inp_list)) {
764                 if (inp->inp_flags & INP_PLACEMARKER)
765                         continue;
766                 imo = inp->inp_moptions;
767                 if ((inp->inp_vflag & INP_IPV4) && imo != NULL) {
768                         /*
769                          * Unselect the outgoing interface if it is being
770                          * detached.
771                          */
772                         if (imo->imo_multicast_ifp == ifp)
773                                 imo->imo_multicast_ifp = NULL;
774
775                         /*
776                          * Drop multicast group membership if we joined
777                          * through the interface being detached.
778                          */
779                         for (i = 0, gap = 0; i < imo->imo_num_memberships;
780                             i++) {
781                                 if (imo->imo_membership[i]->inm_ifp == ifp) {
782                                         in_delmulti(imo->imo_membership[i]);
783                                         gap++;
784                                 } else if (gap != 0)
785                                         imo->imo_membership[i - gap] =
786                                             imo->imo_membership[i];
787                         }
788                         imo->imo_num_memberships -= gap;
789                 }
790         }
791 }
792
793 /*
794  * Check for alternatives when higher level complains
795  * about service problems.  For now, invalidate cached
796  * routing information.  If the route was created dynamically
797  * (by a redirect), time to try a default gateway again.
798  */
799 void
800 in_losing(inp)
801         struct inpcb *inp;
802 {
803         struct rtentry *rt;
804         struct rt_addrinfo info;
805
806         if ((rt = inp->inp_route.ro_rt)) {
807                 bzero((caddr_t)&info, sizeof info);
808                 info.rti_flags = rt->rt_flags;
809                 info.rti_info[RTAX_DST] = rt_key(rt);
810                 info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
811                 info.rti_info[RTAX_NETMASK] = rt_mask(rt);
812                 rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
813                 if (rt->rt_flags & RTF_DYNAMIC)
814                         (void) rtrequest1(RTM_DELETE, &info, NULL);
815                 inp->inp_route.ro_rt = NULL;
816                 rtfree(rt);
817                 /*
818                  * A new route can be allocated
819                  * the next time output is attempted.
820                  */
821         }
822 }
823
824 /*
825  * After a routing change, flush old routing
826  * and allocate a (hopefully) better one.
827  */
828 void
829 in_rtchange(inp, errno)
830         struct inpcb *inp;
831         int errno;
832 {
833         if (inp->inp_route.ro_rt) {
834                 rtfree(inp->inp_route.ro_rt);
835                 inp->inp_route.ro_rt = 0;
836                 /*
837                  * A new route can be allocated the next time
838                  * output is attempted.
839                  */
840         }
841 }
842
843 /*
844  * Lookup a PCB based on the local address and port.
845  */
846 struct inpcb *
847 in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay)
848         struct inpcbinfo *pcbinfo;
849         struct in_addr laddr;
850         u_int lport_arg;
851         int wild_okay;
852 {
853         struct inpcb *inp;
854         int matchwild = 3, wildcard;
855         u_short lport = lport_arg;
856
857         struct inpcbporthead *porthash;
858         struct inpcbport *phd;
859         struct inpcb *match = NULL;
860
861         /*
862          * Best fit PCB lookup.
863          *
864          * First see if this local port is in use by looking on the
865          * port hash list.
866          */
867         porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport,
868             pcbinfo->porthashmask)];
869         LIST_FOREACH(phd, porthash, phd_hash) {
870                 if (phd->phd_port == lport)
871                         break;
872         }
873         if (phd != NULL) {
874                 /*
875                  * Port is in use by one or more PCBs. Look for best
876                  * fit.
877                  */
878                 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
879                         wildcard = 0;
880 #ifdef INET6
881                         if ((inp->inp_vflag & INP_IPV4) == 0)
882                                 continue;
883 #endif
884                         if (inp->inp_faddr.s_addr != INADDR_ANY)
885                                 wildcard++;
886                         if (inp->inp_laddr.s_addr != INADDR_ANY) {
887                                 if (laddr.s_addr == INADDR_ANY)
888                                         wildcard++;
889                                 else if (inp->inp_laddr.s_addr != laddr.s_addr)
890                                         continue;
891                         } else {
892                                 if (laddr.s_addr != INADDR_ANY)
893                                         wildcard++;
894                         }
895                         if (wildcard && !wild_okay)
896                                 continue;
897                         if (wildcard < matchwild) {
898                                 match = inp;
899                                 matchwild = wildcard;
900                                 if (matchwild == 0) {
901                                         break;
902                                 }
903                         }
904                 }
905         }
906         return (match);
907 }
908
909 /*
910  * Lookup PCB in hash list.
911  */
912 struct inpcb *
913 in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, ifp)
914         struct inpcbinfo *pcbinfo;
915         struct in_addr faddr, laddr;
916         u_int fport_arg, lport_arg;
917         boolean_t wildcard;
918         struct ifnet *ifp;
919 {
920         struct inpcbhead *head;
921         struct inpcb *inp;
922         u_short fport = fport_arg, lport = lport_arg;
923
924         /*
925          * First look for an exact match.
926          */
927         head = &pcbinfo->hashbase[INP_PCBCONNHASH(faddr.s_addr, fport,
928             laddr.s_addr, lport, pcbinfo->hashmask)];
929         LIST_FOREACH(inp, head, inp_hash) {
930 #ifdef INET6
931                 if (!(inp->inp_vflag & INP_IPV4))
932                         continue;
933 #endif
934                 if (in_hosteq(inp->inp_faddr, faddr) &&
935                     in_hosteq(inp->inp_laddr, laddr) &&
936                     inp->inp_fport == fport && inp->inp_lport == lport) {
937                         /* found */
938                         return (inp);
939                 }
940         }
941
942         if (wildcard) {
943                 struct inpcb *local_wild = NULL;
944 #ifdef INET6
945                 struct inpcb *local_wild_mapped = NULL;
946 #endif
947                 struct inpcontainer *ic;
948                 struct inpcontainerhead *chead;
949
950                 chead = &pcbinfo->wildcardhashbase[
951                     INP_PCBWILDCARDHASH(lport, pcbinfo->wildcardhashmask)];
952                 LIST_FOREACH(ic, chead, ic_list) {
953                         inp = ic->ic_inp;
954 #ifdef INET6
955                         if (!(inp->inp_vflag & INP_IPV4))
956                                 continue;
957 #endif
958                         if (inp->inp_lport == lport) {
959                                 if (ifp && ifp->if_type == IFT_FAITH &&
960                                     !(inp->inp_flags & INP_FAITH))
961                                         continue;
962                                 if (inp->inp_laddr.s_addr == laddr.s_addr)
963                                         return (inp);
964                                 if (inp->inp_laddr.s_addr == INADDR_ANY) {
965 #ifdef INET6
966                                         if (INP_CHECK_SOCKAF(inp->inp_socket,
967                                                              AF_INET6))
968                                                 local_wild_mapped = inp;
969                                         else
970 #endif
971                                                 local_wild = inp;
972                                 }
973                         }
974                 }
975 #ifdef INET6
976                 if (local_wild == NULL)
977                         return (local_wild_mapped);
978 #endif
979                 return (local_wild);
980         }
981
982         /*
983          * Not found.
984          */
985         return (NULL);
986 }
987
988 /*
989  * Insert PCB into connection hash table.
990  */
991 void
992 in_pcbinsconnhash(struct inpcb *inp)
993 {
994         struct inpcbinfo *pcbinfo = inp->inp_cpcbinfo;
995         struct inpcbhead *bucket;
996         u_int32_t hashkey_faddr, hashkey_laddr;
997
998 #ifdef INET6
999         if (inp->inp_vflag & INP_IPV6) {
1000                 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX JH */;
1001                 hashkey_laddr = inp->in6p_laddr.s6_addr32[3] /* XXX JH */;
1002         } else {
1003 #endif
1004                 hashkey_faddr = inp->inp_faddr.s_addr;
1005                 hashkey_laddr = inp->inp_laddr.s_addr;
1006 #ifdef INET6
1007         }
1008 #endif
1009
1010         KASSERT(!(inp->inp_flags & INP_CONNECTED), ("already on hash list"));
1011         inp->inp_flags |= INP_CONNECTED;
1012
1013         /*
1014          * Insert into the connection hash table.
1015          */
1016         bucket = &pcbinfo->hashbase[INP_PCBCONNHASH(hashkey_faddr,
1017             inp->inp_fport, hashkey_laddr, inp->inp_lport, pcbinfo->hashmask)];
1018         LIST_INSERT_HEAD(bucket, inp, inp_hash);
1019 }
1020
1021 /*
1022  * Remove PCB from connection hash table.
1023  */
1024 void
1025 in_pcbremconnhash(struct inpcb *inp)
1026 {
1027         KASSERT(inp->inp_flags & INP_CONNECTED, ("inp not connected"));
1028         LIST_REMOVE(inp, inp_hash);
1029         inp->inp_flags &= ~INP_CONNECTED;
1030 }
1031
1032 /*
1033  * Insert PCB into port hash table.
1034  */
1035 int
1036 in_pcbinsporthash(struct inpcb *inp)
1037 {
1038         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1039         struct inpcbporthead *pcbporthash;
1040         struct inpcbport *phd;
1041
1042         /*
1043          * Insert into the port hash table.
1044          */
1045         pcbporthash = &pcbinfo->porthashbase[
1046             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->porthashmask)];
1047
1048         /* Go through port list and look for a head for this lport. */
1049         LIST_FOREACH(phd, pcbporthash, phd_hash)
1050                 if (phd->phd_port == inp->inp_lport)
1051                         break;
1052
1053         /* If none exists, malloc one and tack it on. */
1054         if (phd == NULL) {
1055                 MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport),
1056                     M_PCB, M_INTWAIT | M_NULLOK);
1057                 if (phd == NULL)
1058                         return (ENOBUFS); /* XXX */
1059                 phd->phd_port = inp->inp_lport;
1060                 LIST_INIT(&phd->phd_pcblist);
1061                 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1062         }
1063
1064         inp->inp_phd = phd;
1065         LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1066
1067         return (0);
1068 }
1069
1070 void
1071 in_pcbinswildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1072 {
1073         struct inpcontainer *ic;
1074         struct inpcontainerhead *bucket;
1075
1076         bucket = &pcbinfo->wildcardhashbase[
1077             INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
1078
1079         ic = malloc(sizeof(struct inpcontainer), M_TEMP, M_INTWAIT);
1080         ic->ic_inp = inp;
1081         LIST_INSERT_HEAD(bucket, ic, ic_list);
1082 }
1083
1084 /*
1085  * Insert PCB into wildcard hash table.
1086  */
1087 void
1088 in_pcbinswildcardhash(struct inpcb *inp)
1089 {
1090         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1091         
1092         KKASSERT(pcbinfo != NULL);
1093
1094         in_pcbinswildcardhash_oncpu(inp, pcbinfo);
1095         inp->inp_flags |= INP_WILDCARD;
1096 }
1097
1098 void
1099 in_pcbremwildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo)
1100 {
1101         struct inpcontainer *ic;
1102         struct inpcontainerhead *head;
1103
1104         /* find bucket */
1105         head = &pcbinfo->wildcardhashbase[
1106             INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)];
1107
1108         LIST_FOREACH(ic, head, ic_list) {
1109                 if (ic->ic_inp == inp)
1110                         goto found;
1111         }
1112         return;                 /* not found! */
1113
1114 found:
1115         LIST_REMOVE(ic, ic_list);       /* remove container from bucket chain */
1116         free(ic, M_TEMP);               /* deallocate container */
1117 }
1118
1119 /*
1120  * Remove PCB from wildcard hash table.
1121  */
1122 void
1123 in_pcbremwildcardhash(struct inpcb *inp)
1124 {
1125         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1126
1127         KASSERT(inp->inp_flags & INP_WILDCARD, ("inp not wildcard"));
1128         in_pcbremwildcardhash_oncpu(inp, pcbinfo);
1129         inp->inp_flags &= ~INP_WILDCARD;
1130 }
1131
1132 /*
1133  * Remove PCB from various lists.
1134  */
1135 void
1136 in_pcbremlists(inp)
1137         struct inpcb *inp;
1138 {
1139         if (inp->inp_lport) {
1140                 struct inpcbport *phd = inp->inp_phd;
1141
1142                 LIST_REMOVE(inp, inp_portlist);
1143                 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1144                         LIST_REMOVE(phd, phd_hash);
1145                         free(phd, M_PCB);
1146                 }
1147         }
1148         if (inp->inp_flags & INP_WILDCARD) {
1149                 in_pcbremwildcardhash(inp);
1150         } else if (inp->inp_flags & INP_CONNECTED) {
1151                 in_pcbremconnhash(inp);
1152         }
1153         LIST_REMOVE(inp, inp_list);
1154         inp->inp_pcbinfo->ipi_count--;
1155 }
1156
1157 int
1158 prison_xinpcb(struct thread *td, struct inpcb *inp)
1159 {
1160         struct ucred *cr;
1161
1162         if (td->td_proc == NULL)
1163                 return (0);
1164         cr = td->td_proc->p_ucred;
1165         if (cr->cr_prison == NULL)
1166                 return (0);
1167         if (ntohl(inp->inp_laddr.s_addr) == cr->cr_prison->pr_ip)
1168                 return (0);
1169         return (1);
1170 }