Use M_INTWAIT instead of M_NOWAIT in the ip messaging redispatch case to
[dragonfly.git] / sys / netinet / ip_input.c
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the University of
16  *      California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *      @(#)ip_input.c  8.2 (Berkeley) 1/4/94
34  * $FreeBSD: src/sys/netinet/ip_input.c,v 1.130.2.52 2003/03/07 07:01:28 silby Exp $
35  * $DragonFly: src/sys/netinet/ip_input.c,v 1.21 2004/04/22 04:26:28 dillon Exp $
36  */
37
38 #define _IP_VHL
39
40 #include "opt_bootp.h"
41 #include "opt_ipfw.h"
42 #include "opt_ipdn.h"
43 #include "opt_ipdivert.h"
44 #include "opt_ipfilter.h"
45 #include "opt_ipstealth.h"
46 #include "opt_ipsec.h"
47 #include "opt_pfil_hooks.h"
48 #include "opt_random_ip_id.h"
49
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/mbuf.h>
53 #include <sys/malloc.h>
54 #include <sys/domain.h>
55 #include <sys/protosw.h>
56 #include <sys/socket.h>
57 #include <sys/time.h>
58 #include <sys/kernel.h>
59 #include <sys/syslog.h>
60 #include <sys/sysctl.h>
61 #include <sys/in_cksum.h>
62
63 #include <sys/thread2.h>
64 #include <sys/msgport2.h>
65
66 #include <net/if.h>
67 #include <net/if_types.h>
68 #include <net/if_var.h>
69 #include <net/if_dl.h>
70 #ifdef PFIL_HOOKS
71 #include <net/pfil.h>
72 #endif
73 #include <net/route.h>
74 #include <net/netisr.h>
75 #include <net/intrq.h>
76
77 #include <netinet/in.h>
78 #include <netinet/in_systm.h>
79 #include <netinet/in_var.h>
80 #include <netinet/ip.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/ip_var.h>
83 #include <netinet/ip_icmp.h>
84
85 #include <netinet/ipprotosw.h>
86
87 #include <sys/socketvar.h>
88
89 #include <net/ipfw/ip_fw.h>
90 #include <net/dummynet/ip_dummynet.h>
91
92 #ifdef IPSEC
93 #include <netinet6/ipsec.h>
94 #include <netproto/key/key.h>
95 #endif
96
97 #ifdef FAST_IPSEC
98 #include <netipsec/ipsec.h>
99 #include <netipsec/key.h>
100 #endif
101
/* When nonzero, ip_input() accepts every IPPROTO_RSVP packet as local. */
102 int rsvp_on = 0;
/*
 * NOTE(review): the next two variables are not referenced in the part of
 * the file visible here; presumably RSVP daemon state -- confirm.
 */
103 static int ip_rsvp_on;
104 struct socket *ip_rsvpd;
105
/*
 * When zero, ip_input() frees packets that are not addressed to this host
 * (counting them in ips_cantforward) instead of calling ip_forward().
 */
106 int ipforwarding = 0;
107 SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW,
108     &ipforwarding, 0, "Enable IP forwarding between interfaces");
109
110 static int ipsendredirects = 1; /* XXX */
111 SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
112     &ipsendredirects, 0, "Enable sending IP redirects");
113
114 int ip_defttl = IPDEFTTL;
115 SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW,
116     &ip_defttl, 0, "Maximum TTL on IP packets");
117
118 static int ip_dosourceroute = 0;
119 SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW,
120     &ip_dosourceroute, 0, "Enable forwarding source routed IP packets");
121
122 static int ip_acceptsourceroute = 0;
123 SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
124     CTLFLAG_RW, &ip_acceptsourceroute, 0,
125     "Enable accepting source routed IP packets");
126
/*
 * When nonzero, ip_input() keeps TCP and ICMP packets that arrive on an
 * IFT_FAITH interface; all other packets from such an interface are freed.
 */
127 static int ip_keepfaith = 0;
128 SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW,
129     &ip_keepfaith, 0,
130     "Enable packet capture for FAITH IPv4->IPv6 translater daemon");
131
132 static int nipq = 0;    /* total # of reass queues */
/*
 * Limit on reassembly queues; ip_init() defaults it to nmbclusters / 32.
 * ip_input() drops every fragment when it is 0 and applies no limit when
 * it is negative.
 */
133 static int maxnipq;
134 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW,
135     &maxnipq, 0,
136     "Maximum number of IPv4 fragment reassembly queue entries");
137
/* ip_init() defaults this to 16. */
138 static int maxfragsperpacket;
139 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW,
140     &maxfragsperpacket, 0,
141     "Maximum number of IPv4 fragments allowed per packet");
142
143 static int ip_sendsourcequench = 0;
144 SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW,
145     &ip_sendsourcequench, 0,
146     "Enable the transmission of source quench packets");
147
148 /*
149  * XXX - Setting ip_checkinterface mostly implements the receive side of
150  * the Strong ES model described in RFC 1122, but since the routing table
151  * and transmit implementation do not implement the Strong ES model,
152  * setting this to 1 results in an odd hybrid.
153  *
154  * XXX - ip_checkinterface currently must be disabled if you use ipnat
155  * to translate the destination address to another local interface.
156  *
157  * XXX - ip_checkinterface must be disabled if you add IP aliases
158  * to the loopback interface instead of the interface where the
159  * packets for those addresses are received.
160  */
161 static int ip_checkinterface = 0;
162 SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW,
163     &ip_checkinterface, 0, "Verify packet arrives on correct interface");
164
165 #ifdef DIAGNOSTIC
166 static int ipprintfs = 0;
167 #endif
168
/* IP input queue; ip_init() sets its ifq_maxlen from ipqmaxlen. */
169 static struct ifqueue ipintrq;
170 static int ipqmaxlen = IFQ_MAXLEN;
171
172 extern  struct domain inetdomain;
173 extern  struct ipprotosw inetsw[];
/*
 * Maps an IP protocol number to an index into inetsw[].  ip_init() points
 * every slot at the raw IP entry and then overrides the slots of the
 * protocols found in the inet domain's switch table.
 */
174 u_char  ip_protox[IPPROTO_MAX];
175 struct  in_ifaddrhead in_ifaddrhead;            /* first inet address */
176 struct  in_ifaddrhashhead *in_ifaddrhashtbl;    /* inet addr hash table */
177 u_long  in_ifaddrhmask;                         /* mask for hash table */
178
179 SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW,
180     &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue");
181 SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD,
182     &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue");
183
184 struct ipstat ipstat;
185 SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW,
186     &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)");
187
188 /* Packet reassembly stuff */
/*
 * IPREASS_HASH(x, y) yields a bucket index below IPREASS_NHASH: bits 0-3
 * and 8-11 of x are packed into one byte, XORed with y and masked with
 * IPREASS_HMASK.  ip_input() calls it with the source address and the
 * IP id of a fragment.
 */
189 #define IPREASS_NHASH_LOG2      6
190 #define IPREASS_NHASH           (1 << IPREASS_NHASH_LOG2)
191 #define IPREASS_HMASK           (IPREASS_NHASH - 1)
192 #define IPREASS_HASH(x,y)                                               \
193     (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
194
/*
 * Reassembly hash buckets.  ip_init() links each entry to itself, so an
 * entry whose next/prev point back at it is an empty bucket.
 */
195 static struct ipq ipq[IPREASS_NHASH];
196 const  int    ipintrq_present = 1;
197
198 #ifdef IPCTL_DEFMTU
199 SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
200     &ip_mtu, 0, "Default MTU");
201 #endif
202
/*
 * With IPSTEALTH configured the flag is a run-time sysctl; otherwise it is
 * a constant 0.  When nonzero, ip_input() runs ip_dooptions(m, 1, ...) on
 * packets with IP options that it has accepted as local.
 */
203 #ifdef IPSTEALTH
204 static int ipstealth = 0;
205 SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, &ipstealth, 0, "");
206 #else
207 static const int ipstealth = 0;
208 #endif
209
210
211 /* Firewall hooks */
/*
 * ip_input() calls ip_fw_chk_ptr only when fw_enable is nonzero and
 * IPFW_LOADED is true.
 * NOTE(review): fw_one_pass is not used in the visible part of the file.
 */
212 ip_fw_chk_t *ip_fw_chk_ptr;
213 int fw_enable = 1;
214 int fw_one_pass = 1;
215
216 /* Dummynet hooks */
/*
 * Called by ip_input() when DUMMYNET_LOADED is true and the firewall
 * verdict carries IP_FW_PORT_DYNT_FLAG.
 */
217 ip_dn_io_t *ip_dn_io_ptr;
218
219 #ifdef PFIL_HOOKS
/* Hook head registered by ip_init() and run by ip_input() with PFIL_IN. */
220 struct pfil_head inet_pfil_hook;
221 #endif
222
223 /*
224  * XXX this is ugly -- the following two global variables are
225  * used to store packet state while it travels through the stack.
226  * Note that the code even makes assumptions on the size and
227  * alignment of fields inside struct ip_srcrt so e.g. adding some
228  * fields will break the code. This needs to be fixed.
229  *
230  * We need to save the IP options in case a protocol wants to respond
231  * to an incoming packet over the same route if the packet got here
232  * using IP source routing.  This allows connection establishment and
233  * maintenance when the remote end is on a network that is not known
234  * to us.
235  */
/* Reset to 0 by ip_input() before it processes the options of a packet. */
236 static int ip_nhops = 0;
237
238 static  struct ip_srcrt {
239         struct  in_addr dst;                    /* final destination */
240         char    nop;                            /* one NOP to align */
241         char    srcopt[IPOPT_OFFSET + 1];       /* OPTVAL, OLEN and OFFSET */
/* Room for as many addresses as fit in MAX_IPOPTLEN bytes. */
242         struct  in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)];
243 } ip_srcrt;
244
/* Forward declarations of the file-local helpers. */
245 static void             save_rte (u_char *, struct in_addr);
246 static int              ip_dooptions (struct mbuf *m, int,
247                                         struct sockaddr_in *next_hop);
248 static void             ip_forward (struct mbuf *m, int srcrt,
249                                         struct sockaddr_in *next_hop);
250 static void             ip_freef (struct ipq *);
251 static int              ip_input_handler (struct netmsg *);
252 static struct mbuf      *ip_reass (struct mbuf *, struct ipq *,
253                                         struct ipq *, u_int32_t *, u_int16_t *);
254
255 /*
256  * IP initialization: fill in IP protocol switch table.
257  * All protocols not implemented in kernel go to raw IP protocol handler.
258  */
259 void
260 ip_init()
261 {
262         struct ipprotosw *pr;
263         int i;
264
265         TAILQ_INIT(&in_ifaddrhead);
266         in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &in_ifaddrhmask);
267         pr = (struct ipprotosw *)pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
268         if (pr == NULL)
269                 panic("ip_init");
270         for (i = 0; i < IPPROTO_MAX; i++)
271                 ip_protox[i] = pr - inetsw;
272         for (pr = (struct ipprotosw *)inetdomain.dom_protosw;
273              pr < (struct ipprotosw *)inetdomain.dom_protoswNPROTOSW; pr++)
274                 if (pr->pr_domain->dom_family == PF_INET &&
275                     pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
276                         ip_protox[pr->pr_protocol] = pr - inetsw;
277
278 #ifdef PFIL_HOOKS
279         inet_pfil_hook.ph_type = PFIL_TYPE_AF;
280         inet_pfil_hook.ph_af = AF_INET;
281         if ((i = pfil_head_register(&inet_pfil_hook)) != 0)
282                 printf("%s: WARNING: unable to register pfil hook, "
283                         "error %d\n", __func__, i);
284 #endif
285
286         for (i = 0; i < IPREASS_NHASH; i++)
287             ipq[i].next = ipq[i].prev = &ipq[i];
288
289         maxnipq = nmbclusters / 32;
290         maxfragsperpacket = 16;
291
292 #ifndef RANDOM_IP_ID
293         ip_id = time_second & 0xffff;
294 #endif
295         ipintrq.ifq_maxlen = ipqmaxlen;
296
297         netisr_register(NETISR_IP, ip_mport, ip_input_handler);
298 }
299
300 /*
301  * XXX Be careful with this one.  It is perhaps used as a cache for
302  * the most recently used route?  It is cleared in in_addroute()
303  * when a new route is successfully created.
304  */
305 struct route ipforward_rt;
/* sockaddr_in whose first two members start out as its own size and AF_INET. */
306 static struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
307
308 /* Do transport protocol processing. */
309 static void
310 transport_processing_oncpu(struct mbuf *m, int hlen, struct ip *ip,
311                            struct sockaddr_in *nexthop)
312 {
313         /*
314          * Switch out to protocol's input routine.
315          */
316         if (nexthop && ip->ip_p == IPPROTO_TCP) {
317                 /* TCP needs IPFORWARD info if available */
318                 struct m_hdr tag;
319
320                 tag.mh_type = MT_TAG;
321                 tag.mh_flags = PACKET_TAG_IPFORWARD;
322                 tag.mh_data = (caddr_t)nexthop;
323                 tag.mh_next = m;
324
325                 (*inetsw[ip_protox[ip->ip_p]].pr_input)
326                     ((struct mbuf *)&tag, hlen, ip->ip_p);
327         } else {
328                 (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen, ip->ip_p);
329         }
330 }
331
/*
 * Message consumed by transport_processing_handler().  The handler casts
 * the lwkt_msg pointer it is given to this structure, so nm_lmsg has to
 * stay the first member.
 */
332 struct netmsg_transport_packet {
333         struct lwkt_msg         nm_lmsg;
/* Packet; its data is read as the IP header by the handler. */
334         struct mbuf             *nm_mbuf;
/* Passed on as the hlen argument of the protocol input routine. */
335         int                     nm_hlen;
/* When false the handler passes a NULL next hop and ignores nm_nexthop. */
336         boolean_t               nm_hasnexthop;
337         struct sockaddr_in      nm_nexthop;
338 };
339
340 static int
341 transport_processing_handler(lwkt_msg_t lmsg)
342 {
343         struct netmsg_transport_packet *msg = (void *)lmsg;
344         struct sockaddr_in *nexthop;
345         struct ip *ip;
346
347         ip = mtod(msg->nm_mbuf, struct ip *);
348         nexthop = msg->nm_hasnexthop ? &msg->nm_nexthop : NULL;
349         transport_processing_oncpu(msg->nm_mbuf, msg->nm_hlen, ip, nexthop);
350         lwkt_replymsg(lmsg, 0);
351         return(EASYNC);
352 }
353
354 static int
355 ip_input_handler(struct netmsg *msg0)
356 {
357         struct mbuf *m = ((struct netmsg_packet *)msg0)->nm_packet;
358
359         ip_input(m);
360         lwkt_replymsg(&msg0->nm_lmsg, 0);
361         return(EASYNC);
362 }
363
364 /*
365  * Ip input routine.  Checksum and byte swap header.  If fragmented
366  * try to reassemble.  Process options.  Pass to next level.
367  */
368 void
369 ip_input(struct mbuf *m)
370 {
371         struct ip *ip;
372         struct ipq *fp;
373         struct in_ifaddr *ia = NULL;
374         struct ifaddr *ifa;
375         int i, hlen, checkif;
376         u_short sum;
377         struct in_addr pkt_dst;
378         u_int32_t divert_info = 0;              /* packet divert/tee info */
379         struct ip_fw_args args;
380         boolean_t using_srcrt = FALSE;          /* forward (by PFIL_HOOKS) */
381         boolean_t needredispatch = FALSE;
382 #ifdef PFIL_HOOKS
383         struct in_addr odst;                    /* original dst address(NAT) */
384 #endif
385 #ifdef FAST_IPSEC
386         struct m_tag *mtag;
387         struct tdb_ident *tdbi;
388         struct secpolicy *sp;
389         int s, error;
390 #endif
391
392         args.eh = NULL;
393         args.oif = NULL;
394         args.rule = NULL;
395         args.divert_rule = 0;                   /* divert cookie */
396         args.next_hop = NULL;
397
398         /* Grab info from MT_TAG mbufs prepended to the chain. */
399         for (; m && m->m_type == MT_TAG; m = m->m_next) {
400                 switch(m->_m_tag_id) {
401                 default:
402                         printf("ip_input: unrecognised MT_TAG tag %d\n",
403                             m->_m_tag_id);
404                         break;
405
406                 case PACKET_TAG_DUMMYNET:
407                         args.rule = ((struct dn_pkt *)m)->rule;
408                         break;
409
410                 case PACKET_TAG_DIVERT:
411                         args.divert_rule = (int)m->m_hdr.mh_data & 0xffff;
412                         break;
413
414                 case PACKET_TAG_IPFORWARD:
415                         args.next_hop = (struct sockaddr_in *)m->m_hdr.mh_data;
416                         break;
417                 }
418         }
419
420         KASSERT(m != NULL && (m->m_flags & M_PKTHDR) != 0,
421             ("ip_input: no HDR"));
422
423         if (args.rule) {        /* dummynet already filtered us */
424                 ip = mtod(m, struct ip *);
425                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
426                 goto iphack;
427         }
428
429         ipstat.ips_total++;
430
431         /* length checks already done in ip_demux() */
432         KASSERT(m->m_len >= sizeof(ip), ("IP header not in one mbuf"));
433
434         ip = mtod(m, struct ip *);
435
436         if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
437                 ipstat.ips_badvers++;
438                 goto bad;
439         }
440
441         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
442         /* length checks already done in ip_demux() */
443         KASSERT(hlen >= sizeof(struct ip), ("IP header len too small"));
444         KASSERT(m->m_len >= hlen, ("packet shorter than IP header length"));
445
446         /* 127/8 must not appear on wire - RFC1122 */
447         if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
448             (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
449                 if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK)) {
450                         ipstat.ips_badaddr++;
451                         goto bad;
452                 }
453         }
454
455         if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
456                 sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
457         } else {
458                 if (hlen == sizeof(struct ip)) {
459                         sum = in_cksum_hdr(ip);
460                 } else {
461                         sum = in_cksum(m, hlen);
462                 }
463         }
464         if (sum) {
465                 ipstat.ips_badsum++;
466                 goto bad;
467         }
468
469         /*
470          * Convert fields to host representation.
471          */
472         ip->ip_len = ntohs(ip->ip_len);
473         if (ip->ip_len < hlen) {
474                 ipstat.ips_badlen++;
475                 goto bad;
476         }
477         ip->ip_off = ntohs(ip->ip_off);
478
479         /*
480          * Check that the amount of data in the buffers
481          * is as at least much as the IP header would have us expect.
482          * Trim mbufs if longer than we expect.
483          * Drop packet if shorter than we expect.
484          */
485         if (m->m_pkthdr.len < ip->ip_len) {
486                 ipstat.ips_tooshort++;
487                 goto bad;
488         }
489         if (m->m_pkthdr.len > ip->ip_len) {
490                 if (m->m_len == m->m_pkthdr.len) {
491                         m->m_len = ip->ip_len;
492                         m->m_pkthdr.len = ip->ip_len;
493                 } else
494                         m_adj(m, ip->ip_len - m->m_pkthdr.len);
495         }
496 #if defined(IPSEC) && !defined(IPSEC_FILTERGIF)
497         /*
498          * Bypass packet filtering for packets from a tunnel (gif).
499          */
500         if (ipsec_gethist(m, NULL))
501                 goto pass;
502 #endif
503
504         /*
505          * IpHack's section.
506          * Right now when no processing on packet has done
507          * and it is still fresh out of network we do our black
508          * deals with it.
509          * - Firewall: deny/allow/divert
510          * - Xlate: translate packet's addr/port (NAT).
511          * - Pipe: pass pkt through dummynet.
512          * - Wrap: fake packet's addr/port <unimpl.>
513          * - Encapsulate: put it in another IP and send out. <unimp.>
514          */
515
516 iphack:
517
518 #ifdef PFIL_HOOKS
519         /*
520          * Run through list of hooks for input packets.
521          *
522          * NB: Beware of the destination address changing (e.g.
523          *     by NAT rewriting). When this happens, tell
524          *     ip_forward to do the right thing.
525          */
526         odst = ip->ip_dst;
527         if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN))
528                 return;
529         if (m == NULL)                  /* consumed by filter */
530                 return;
531         ip = mtod(m, struct ip *);
532         using_srcrt = (odst.s_addr != ip->ip_dst.s_addr);
533 #endif
534
535         if (fw_enable && IPFW_LOADED) {
536                 /*
537                  * If we've been forwarded from the output side, then
538                  * skip the firewall a second time
539                  */
540                 if (args.next_hop)
541                         goto ours;
542
543                 args.m = m;
544                 i = ip_fw_chk_ptr(&args);
545                 m = args.m;
546
547                 if ( (i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */
548                         if (m)
549                                 m_freem(m);
550                         return;
551                 }
552                 ip = mtod(m, struct ip *); /* just in case m changed */
553                 if (i == 0 && args.next_hop == NULL)    /* common case */
554                         goto pass;
555                 if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG)) {
556                         /* Send packet to the appropriate pipe */
557                         ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args);
558                         return;
559                 }
560 #ifdef IPDIVERT
561                 if (i != 0 && !(i & IP_FW_PORT_DYNT_FLAG)) {
562                         /* Divert or tee packet */
563                         divert_info = i;
564                         goto ours;
565                 }
566 #endif
567                 if (i == 0 && args.next_hop != NULL)
568                         goto pass;
569                 /*
570                  * if we get here, the packet must be dropped
571                  */
572                 m_freem(m);
573                 return;
574         }
575 pass:
576
577         /*
578          * Process options and, if not destined for us,
579          * ship it on.  ip_dooptions returns 1 when an
580          * error was detected (causing an icmp message
581          * to be sent and the original packet to be freed).
582          */
583         ip_nhops = 0;           /* for source routed packets */
584         if (hlen > sizeof(struct ip) && ip_dooptions(m, 0, args.next_hop))
585                 return;
586
587         /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no
588          * matter if it is destined to another node, or whether it is
589          * a multicast one, RSVP wants it! and prevents it from being forwarded
590          * anywhere else. Also checks if the rsvp daemon is running before
591          * grabbing the packet.
592          */
593         if (rsvp_on && ip->ip_p == IPPROTO_RSVP)
594                 goto ours;
595
596         /*
597          * Check our list of addresses, to see if the packet is for us.
598          * If we don't have any addresses, assume any unicast packet
599          * we receive might be for us (and let the upper layers deal
600          * with it).
601          */
602         if (TAILQ_EMPTY(&in_ifaddrhead) && !(m->m_flags & (M_MCAST | M_BCAST)))
603                 goto ours;
604
605         /*
606          * Cache the destination address of the packet; this may be
607          * changed by use of 'ipfw fwd'.
608          */
609         pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
610
611         /*
612          * Enable a consistency check between the destination address
613          * and the arrival interface for a unicast packet (the RFC 1122
614          * strong ES model) if IP forwarding is disabled and the packet
615          * is not locally generated and the packet is not subject to
616          * 'ipfw fwd'.
617          *
618          * XXX - Checking also should be disabled if the destination
619          * address is ipnat'ed to a different interface.
620          *
621          * XXX - Checking is incompatible with IP aliases added
622          * to the loopback interface instead of the interface where
623          * the packets are received.
624          */
625         checkif = ip_checkinterface &&
626                   !ipforwarding &&
627                   m->m_pkthdr.rcvif != NULL &&
628                   !(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) &&
629                   (args.next_hop == NULL);
630
631         /*
632          * Check for exact addresses in the hash bucket.
633          */
634         LIST_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) {
635                 /*
636                  * If the address matches, verify that the packet
637                  * arrived via the correct interface if checking is
638                  * enabled.
639                  */
640                 if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr &&
641                     (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif))
642                         goto ours;
643         }
644         /*
645          * Check for broadcast addresses.
646          *
647          * Only accept broadcast packets that arrive via the matching
648          * interface.  Reception of forwarded directed broadcasts would
649          * be handled via ip_forward() and ether_output() with the loopback
650          * into the stack for SIMPLEX interfaces handled by ether_output().
651          */
652         if (m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) {
653                 TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) {
654                         if (ifa->ifa_addr->sa_family != AF_INET)
655                                 continue;
656                         ia = ifatoia(ifa);
657                         if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
658                                                                 pkt_dst.s_addr)
659                                 goto ours;
660                         if (ia->ia_netbroadcast.s_addr == pkt_dst.s_addr)
661                                 goto ours;
662 #ifdef BOOTP_COMPAT
663                         if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY)
664                                 goto ours;
665 #endif
666                 }
667         }
668         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
669                 struct in_multi *inm;
670                 if (ip_mrouter) {
671                         /*
672                          * If we are acting as a multicast router, all
673                          * incoming multicast packets are passed to the
674                          * kernel-level multicast forwarding function.
675                          * The packet is returned (relatively) intact; if
676                          * ip_mforward() returns a non-zero value, the packet
677                          * must be discarded, else it may be accepted below.
678                          */
679                         if (ip_mforward &&
680                             ip_mforward(ip, m->m_pkthdr.rcvif, m, NULL) != 0) {
681                                 ipstat.ips_cantforward++;
682                                 m_freem(m);
683                                 return;
684                         }
685
686                         /*
687                          * The process-level routing daemon needs to receive
688                          * all multicast IGMP packets, whether or not this
689                          * host belongs to their destination groups.
690                          */
691                         if (ip->ip_p == IPPROTO_IGMP)
692                                 goto ours;
693                         ipstat.ips_forward++;
694                 }
695                 /*
696                  * See if we belong to the destination multicast group on the
697                  * arrival interface.
698                  */
699                 IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm);
700                 if (inm == NULL) {
701                         ipstat.ips_notmember++;
702                         m_freem(m);
703                         return;
704                 }
705                 goto ours;
706         }
707         if (ip->ip_dst.s_addr == INADDR_BROADCAST)
708                 goto ours;
709         if (ip->ip_dst.s_addr == INADDR_ANY)
710                 goto ours;
711
712         /*
713          * FAITH(Firewall Aided Internet Translator)
714          */
715         if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
716                 if (ip_keepfaith) {
717                         if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP)
718                                 goto ours;
719                 }
720                 m_freem(m);
721                 return;
722         }
723
724         /*
725          * Not for us; forward if possible and desirable.
726          */
727         if (!ipforwarding) {
728                 ipstat.ips_cantforward++;
729                 m_freem(m);
730         } else {
731 #ifdef IPSEC
732                 /*
733                  * Enforce inbound IPsec SPD.
734                  */
735                 if (ipsec4_in_reject(m, NULL)) {
736                         ipsecstat.in_polvio++;
737                         goto bad;
738                 }
739 #endif
740 #ifdef FAST_IPSEC
741                 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
742                 s = splnet();
743                 if (mtag != NULL) {
744                         tdbi = (struct tdb_ident *)(mtag + 1);
745                         sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
746                 } else {
747                         sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
748                                                    IP_FORWARDING, &error);
749                 }
750                 if (sp == NULL) {       /* NB: can happen if error */
751                         splx(s);
752                         /*XXX error stat???*/
753                         DPRINTF(("ip_input: no SP for forwarding\n"));  /*XXX*/
754                         goto bad;
755                 }
756
757                 /*
758                  * Check security policy against packet attributes.
759                  */
760                 error = ipsec_in_reject(sp, m);
761                 KEY_FREESP(&sp);
762                 splx(s);
763                 if (error) {
764                         ipstat.ips_cantforward++;
765                         goto bad;
766                 }
767 #endif
768                 ip_forward(m, using_srcrt, args.next_hop);
769         }
770         return;
771
772 ours:
773
774         /*
775          * IPSTEALTH: Process non-routing options only
776          * if the packet is destined for us.
777          */
778         if (ipstealth &&
779             hlen > sizeof(struct ip) &&
780             ip_dooptions(m, 1, args.next_hop))
781                 return;
782
783         /* Count the packet in the ip address stats */
784         if (ia != NULL) {
785                 ia->ia_ifa.if_ipackets++;
786                 ia->ia_ifa.if_ibytes += m->m_pkthdr.len;
787         }
788
789         /*
790          * If offset or IP_MF are set, must reassemble.
791          * Otherwise, nothing need be done.
792          * (We could look in the reassembly queue to see
793          * if the packet was previously fragmented,
794          * but it's not worth the time; just let them time out.)
795          */
796         if (ip->ip_off & (IP_MF | IP_OFFMASK)) {
797
798                 /* If maxnipq is 0, never accept fragments. */
799                 if (maxnipq == 0) {
800                         ipstat.ips_fragments++;
801                         ipstat.ips_fragdropped++;
802                         goto bad;
803                 }
804
805                 sum = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
806                 /*
807                  * Look for queue of fragments
808                  * of this datagram.
809                  */
810                 for (fp = ipq[sum].next; fp != &ipq[sum]; fp = fp->next)
811                         if (ip->ip_id == fp->ipq_id &&
812                             ip->ip_src.s_addr == fp->ipq_src.s_addr &&
813                             ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
814                             ip->ip_p == fp->ipq_p)
815                                 goto found;
816
817                 fp = NULL;
818
819                 /*
820                  * Enforce upper bound on number of fragmented packets
821                  * for which we attempt reassembly;
822                  * If maxnipq is -1, accept all fragments without limitation.
823                  */
824                 if ((nipq > maxnipq) && (maxnipq > 0)) {
825                         /*
826                          * drop something from the tail of the current queue
827                          * before proceeding further
828                          */
829                         if (ipq[sum].prev == &ipq[sum]) {   /* gak */
830                                 for (i = 0; i < IPREASS_NHASH; i++) {
831                                         if (ipq[i].prev != &ipq[i]) {
832                                                 ipstat.ips_fragtimeout +=
833                                                     ipq[i].prev->ipq_nfrags;
834                                                 ip_freef(ipq[i].prev);
835                                                 break;
836                                         }
837                                 }
838                         } else {
839                                 ipstat.ips_fragtimeout +=
840                                     ipq[sum].prev->ipq_nfrags;
841                                 ip_freef(ipq[sum].prev);
842                         }
843                 }
844 found:
845                 /*
846                  * Adjust ip_len to not reflect header,
847                  * convert offset of this to bytes.
848                  */
849                 ip->ip_len -= hlen;
850                 if (ip->ip_off & IP_MF) {
851                         /*
852                          * Make sure that fragments have a data length
853                          * that's a non-zero multiple of 8 bytes.
854                          */
855                         if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
856                                 ipstat.ips_toosmall++; /* XXX */
857                                 goto bad;
858                         }
859                         m->m_flags |= M_FRAG;
860                 } else
861                         m->m_flags &= ~M_FRAG;
862                 ip->ip_off <<= 3;
863
864                 /*
865                  * Attempt reassembly; if it succeeds, proceed.
866                  * ip_reass() will return a different mbuf, and update
867                  * the divert info in divert_info and args.divert_rule.
868                  */
869                 ipstat.ips_fragments++;
870                 m->m_pkthdr.header = ip;
871                 m = ip_reass(m, fp, &ipq[sum], &divert_info, &args.divert_rule);
872                 if (m == NULL)
873                         return;
874                 ipstat.ips_reassembled++;
875                 needredispatch = TRUE;
876                 ip = mtod(m, struct ip *);
877                 /* Get the header length of the reassembled packet */
878                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
879 #ifdef IPDIVERT
880                 /* Restore original checksum before diverting packet */
881                 if (divert_info != 0) {
882                         ip->ip_len += hlen;
883                         ip->ip_len = htons(ip->ip_len);
884                         ip->ip_off = htons(ip->ip_off);
885                         ip->ip_sum = 0;
886                         if (hlen == sizeof(struct ip))
887                                 ip->ip_sum = in_cksum_hdr(ip);
888                         else
889                                 ip->ip_sum = in_cksum(m, hlen);
890                         ip->ip_off = ntohs(ip->ip_off);
891                         ip->ip_len = ntohs(ip->ip_len);
892                         ip->ip_len -= hlen;
893                 }
894 #endif
895         } else {
896                 ip->ip_len -= hlen;
897         }
898
899 #ifdef IPDIVERT
900         /*
901          * Divert or tee packet to the divert protocol if required.
902          */
903         if (divert_info != 0) {
904                 struct mbuf *clone = NULL;
905
906                 /* Clone packet if we're doing a 'tee' */
907                 if ((divert_info & IP_FW_PORT_TEE_FLAG) != 0)
908                         clone = m_dup(m, M_DONTWAIT);
909
910                 /* Restore packet header fields to original values */
911                 ip->ip_len += hlen;
912                 ip->ip_len = htons(ip->ip_len);
913                 ip->ip_off = htons(ip->ip_off);
914
915                 /* Deliver packet to divert input routine */
916                 divert_packet(m, 1, divert_info & 0xffff, args.divert_rule);
917                 ipstat.ips_delivered++;
918
919                 /* If 'tee', continue with original packet */
920                 if (clone == NULL)
921                         return;
922                 m = clone;
923                 ip = mtod(m, struct ip *);
924                 ip->ip_len += hlen;
925                 /*
926                  * Jump backwards to complete processing of the
927                  * packet. But first clear divert_info to avoid
928                  * entering this block again.
929                  * We do not need to clear args.divert_rule
930                  * or args.next_hop as they will not be used.
931                  */
932                 divert_info = 0;
933                 goto pass;
934         }
935 #endif
936
937 #ifdef IPSEC
938         /*
939          * enforce IPsec policy checking if we are seeing last header.
940          * note that we do not visit this with protocols with pcb layer
941          * code - like udp/tcp/raw ip.
942          */
943         if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) &&
944             ipsec4_in_reject(m, NULL)) {
945                 ipsecstat.in_polvio++;
946                 goto bad;
947         }
948 #endif
949 #if FAST_IPSEC
950         /*
951          * enforce IPsec policy checking if we are seeing last header.
952          * note that we do not visit this with protocols with pcb layer
953          * code - like udp/tcp/raw ip.
954          */
955         if (inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) {
956                 /*
957                  * Check if the packet has already had IPsec processing
958                  * done.  If so, then just pass it along.  This tag gets
959                  * set during AH, ESP, etc. input handling, before the
960                  * packet is returned to the ip input queue for delivery.
961                  */
962                 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
963                 s = splnet();
964                 if (mtag != NULL) {
965                         tdbi = (struct tdb_ident *)(mtag + 1);
966                         sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
967                 } else {
968                         sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
969                                                    IP_FORWARDING, &error);
970                 }
971                 if (sp != NULL) {
972                         /*
973                          * Check security policy against packet attributes.
974                          */
975                         error = ipsec_in_reject(sp, m);
976                         KEY_FREESP(&sp);
977                 } else {
978                         /* XXX error stat??? */
979                         error = EINVAL;
980 DPRINTF(("ip_input: no SP, packet discarded\n"));/*XXX*/
981                         goto bad;
982                 }
983                 splx(s);
984                 if (error)
985                         goto bad;
986         }
987 #endif /* FAST_IPSEC */
988
989         ipstat.ips_delivered++;
990         if (needredispatch) {
991                 struct netmsg_transport_packet *msg;
992                 lwkt_port_t port;
993
994                 msg = malloc(sizeof(struct netmsg_transport_packet),
995                                 M_LWKTMSG, M_INTWAIT | M_NULLOK);
996                 if (msg == NULL)
997                         goto bad;
998
999                 lwkt_initmsg(&msg->nm_lmsg, &netisr_afree_rport, 0,
1000                         lwkt_cmd_func(transport_processing_handler),
1001                         lwkt_cmd_op_none);
1002                 msg->nm_mbuf = m;
1003                 msg->nm_hlen = hlen;
1004                 msg->nm_hasnexthop = (args.next_hop != NULL);
1005                 if (msg->nm_hasnexthop)
1006                         msg->nm_nexthop = *args.next_hop;  /* structure copy */
1007
1008                 ip->ip_off = htons(ip->ip_off);
1009                 ip->ip_len = htons(ip->ip_len);
1010                 port = ip_mport(m);
1011                 ip->ip_len = ntohs(ip->ip_len);
1012                 ip->ip_off = ntohs(ip->ip_off);
1013
1014                 lwkt_sendmsg(port, &msg->nm_lmsg);
1015         } else {
1016                 transport_processing_oncpu(m, hlen, ip, args.next_hop);
1017         }
1018         return;
1019
1020 bad:
1021         m_freem(m);
1022 }
1023
1024 /*
1025  * Take incoming datagram fragment and try to reassemble it into
1026  * whole datagram.  If a chain for reassembly of this datagram already
1027  * exists, then it is given as fp; otherwise have to make a chain.
1028  *
1029  * When IPDIVERT enabled, keep additional state with each packet that
1030  * tells us if we need to divert or tee the packet we're building.
1031  * In particular, *divinfo includes the port and TEE flag,
1032  * *divert_rule is the number of the matching rule.
      *
      * Arguments:
      *   m      - the incoming fragment.  m_pkthdr.header must point at its
      *            IP header (the GETIP() macro below reads it), and
      *            ip_off / ip_len are used as byte offset and payload
      *            length (ip_input converts them before calling).
      *   fp     - reassembly queue already holding fragments of this
      *            datagram, or NULL if this is the first fragment seen.
      *   where  - queue head handed to insque() when a new reassembly
      *            queue has to be created.
      *   divinfo, divert_rule - only dereferenced when IPDIVERT is defined.
      *
      * Returns the reassembled packet once all fragments are present
      * (it may be a different mbuf than the one passed in).  Returns NULL
      * when the fragment was queued or dropped; in both cases the mbuf
      * passed in no longer belongs to the caller.
1033  */
1034
1035 static struct mbuf *
1036 ip_reass(struct mbuf *m, struct ipq *fp, struct ipq *where,
1037          u_int32_t *divinfo, u_int16_t *divert_rule)
1038 {
1039         struct ip *ip = mtod(m, struct ip *);
1040         struct mbuf *p = NULL, *q, *nq;
1041         struct mbuf *t;
1042         int hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1043         int i, next;
1044
1045         /*
1046          * Presence of header sizes in mbufs
1047          * would confuse code below.
              * Only m_data/m_len are adjusted; `ip' keeps pointing at the
              * header, which is made visible again once reassembly is done.
1048          */
1049         m->m_data += hlen;
1050         m->m_len -= hlen;
1051
1052         /*
1053          * If first fragment to arrive, create a reassembly queue.
              * The queue header is stored in the data area of its own
              * MT_FTABLE mbuf and is later released with m_free(dtom(fp)).
1054          */
1055         if (fp == NULL) {
1056                 if ((t = m_get(M_DONTWAIT, MT_FTABLE)) == NULL)
1057                         goto dropfrag;
1058                 fp = mtod(t, struct ipq *);
1059                 insque(fp, where);
1060                 nipq++;
1061                 fp->ipq_nfrags = 1;
1062                 fp->ipq_ttl = IPFRAGTTL;
1063                 fp->ipq_p = ip->ip_p;
1064                 fp->ipq_id = ip->ip_id;
1065                 fp->ipq_src = ip->ip_src;
1066                 fp->ipq_dst = ip->ip_dst;
1067                 fp->ipq_frags = m;
1068                 m->m_nextpkt = NULL;
1069 #ifdef IPDIVERT
1070                 fp->ipq_div_info = 0;
1071                 fp->ipq_div_cookie = 0;
1072 #endif
                     /* Sole fragment on a fresh queue: no overlap handling needed. */
1073                 goto inserted;
1074         } else {
1075                 fp->ipq_nfrags++;
1076         }
1077
     /* Fetch the IP header pointer saved in a queued fragment's packet header. */
1078 #define GETIP(m)        ((struct ip*)((m)->m_pkthdr.header))
1079
1080         /*
1081          * Find a segment which begins after this one does.
              * On exit q is that segment (or NULL) and p is the one before it
              * (or NULL if the new fragment goes to the head of the list).
1082          */
1083         for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
1084                 if (GETIP(q)->ip_off > ip->ip_off)
1085                         break;
1086
1087         /*
1088          * If there is a preceding segment, it may provide some of
1089          * our data already.  If so, drop the data from the incoming
1090          * segment.  If it provides all of our data, drop us, otherwise
1091          * stick new segment in the proper place.
1092          *
1093          * If some of the data is dropped from the incoming
1094          * segment, then its checksum is invalidated (csum_flags is cleared).
1095          */
1096         if (p) {
                     /* i = bytes at the front of this fragment that p already covers */
1097                 i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
1098                 if (i > 0) {
1099                         if (i >= ip->ip_len)
1100                                 goto dropfrag;
1101                         m_adj(m, i);
1102                         m->m_pkthdr.csum_flags = 0;
1103                         ip->ip_off += i;
1104                         ip->ip_len -= i;
1105                 }
1106                 m->m_nextpkt = p->m_nextpkt;
1107                 p->m_nextpkt = m;
1108         } else {
1109                 m->m_nextpkt = fp->ipq_frags;
1110                 fp->ipq_frags = m;
1111         }
1112
1113         /*
1114          * While we overlap succeeding segments trim them or,
1115          * if they are completely covered, dequeue them.
1116          */
1117         for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
1118              q = nq) {
                     /* i = bytes at the front of q that the new fragment covers */
1119                 i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
1120                 if (i < GETIP(q)->ip_len) {
                             /* Partial overlap: trim q and stop, later ones start further on. */
1121                         GETIP(q)->ip_len -= i;
1122                         GETIP(q)->ip_off += i;
1123                         m_adj(q, i);
1124                         q->m_pkthdr.csum_flags = 0;
1125                         break;
1126                 }
                     /* q is completely covered: unlink it from behind m and free it. */
1127                 nq = q->m_nextpkt;
1128                 m->m_nextpkt = nq;
1129                 ipstat.ips_fragdropped++;
1130                 fp->ipq_nfrags--;
1131                 m_freem(q);
1132         }
1133
1134 inserted:
1135
1136 #ifdef IPDIVERT
1137         /*
1138          * Transfer firewall instructions to the fragment structure.
1139          * Only trust info in the fragment at offset 0.
1140          */
1141         if (ip->ip_off == 0) {
1142                 fp->ipq_div_info = *divinfo;
1143                 fp->ipq_div_cookie = *divert_rule;
1144         }
             /* Cleared here; refilled from the queue only if reassembly completes. */
1145         *divinfo = 0;
1146         *divert_rule = 0;
1147 #endif
1148
1149         /*
1150          * Check for complete reassembly and perform frag per packet
1151          * limiting.
1152          *
1153          * Frag limiting is performed here so that the nth frag has
1154          * a chance to complete the packet before we drop the packet.
1155          * As a result, n+1 frags are actually allowed per packet, but
1156          * only n will ever be stored. (n = maxfragsperpacket.)
1157          *
              * `next' is the byte offset at which the following fragment has
              * to start; a fragment starting anywhere else means there is
              * still a hole in the datagram.
1158          */
1159         next = 0;
1160         for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
1161                 if (GETIP(q)->ip_off != next) {
1162                         if (fp->ipq_nfrags > maxfragsperpacket) {
1163                                 ipstat.ips_fragdropped += fp->ipq_nfrags;
1164                                 ip_freef(fp);
1165                         }
1166                         return (NULL);
1167                 }
1168                 next += GETIP(q)->ip_len;
1169         }
1170         /* Make sure the last packet didn't have the IP_MF flag */
             /* (p is the last fragment on the list; ip_input sets M_FRAG when IP_MF is set) */
1171         if (p->m_flags & M_FRAG) {
1172                 if (fp->ipq_nfrags > maxfragsperpacket) {
1173                         ipstat.ips_fragdropped += fp->ipq_nfrags;
1174                         ip_freef(fp);
1175                 }
1176                 return (NULL);
1177         }
1178
1179         /*
1180          * Reassembly is complete.  Make sure the packet is a sane size.
              * From here on `ip' refers to the header of the first fragment.
1181          */
1182         q = fp->ipq_frags;
1183         ip = GETIP(q);
1184         if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) {
1185                 ipstat.ips_toolong++;
1186                 ipstat.ips_fragdropped += fp->ipq_nfrags;
1187                 ip_freef(fp);
1188                 return (NULL);
1189         }
1190
1191         /*
1192          * Concatenate fragments.
              * The first fragment's own mbuf chain is detached and re-appended
              * with m_cat(); every further fragment is then unlinked from the
              * packet list and appended the same way.  Only checksum flags
              * common to all fragments survive; csum_data is summed.
1193          */
1194         m = q;
1195         t = m->m_next;
1196         m->m_next = NULL;
1197         m_cat(m, t);
1198         nq = q->m_nextpkt;
1199         q->m_nextpkt = NULL;
1200         for (q = nq; q != NULL; q = nq) {
1201                 nq = q->m_nextpkt;
1202                 q->m_nextpkt = NULL;
1203                 m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
1204                 m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
1205                 m_cat(m, q);
1206         }
1207
1208 #ifdef IPDIVERT
1209         /*
1210          * Extract firewall instructions from the fragment structure.
1211          */
1212         *divinfo = fp->ipq_div_info;
1213         *divert_rule = fp->ipq_div_cookie;
1214 #endif
1215
1216         /*
1217          * Create header for new ip packet by
1218          * modifying header of first packet;
1219          * dequeue and discard fragment reassembly header.
1220          * Make header visible.
              * ip_len is set to the total payload length (header excluded).
1221          */
1222         ip->ip_len = next;
1223         ip->ip_src = fp->ipq_src;
1224         ip->ip_dst = fp->ipq_dst;
1225         remque(fp);
1226         nipq--;
1227         (void) m_free(dtom(fp));
1228         m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2);
1229         m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2);
1230         /* some debugging cruft by sklower, below, will go away soon */
             /* Recompute the packet header length from the lengths of the chain. */
1231         if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
1232                 int plen = 0;
1233
1234                 for (t = m; t; t = t->m_next)
1235                         plen += t->m_len;
1236                 m->m_pkthdr.len = plen;
1237         }
1238         return (m);
1239
     /*
      * Discard the incoming fragment only.  fp is NULL here only when the
      * allocation of a new queue header failed; otherwise the decrement
      * undoes the ipq_nfrags++ done when the fragment was accepted.
      */
1240 dropfrag:
1241 #ifdef IPDIVERT
1242         *divinfo = 0;
1243         *divert_rule = 0;
1244 #endif
1245         ipstat.ips_fragdropped++;
1246         if (fp != NULL)
1247                 fp->ipq_nfrags--;
1248         m_freem(m);
1249         return (NULL);
1250
1251 #undef GETIP
1252 }
1253
1254 /*
1255  * Free a fragment reassembly header and all
1256  * associated datagrams.
1257  */
1258 static void
1259 ip_freef(struct ipq *fp)
1260 {
1261         struct mbuf *q;
1262
1263         while (fp->ipq_frags) {
1264                 q = fp->ipq_frags;
1265                 fp->ipq_frags = q->m_nextpkt;
1266                 m_freem(q);
1267         }
1268         remque(fp);
1269         (void) m_free(dtom(fp));
1270         nipq--;
1271 }
1272
1273 /*
1274  * IP timer processing;
1275  * if a timer expires on a reassembly
1276  * queue, discard it.
1277  */
1278 void
1279 ip_slowtimo()
1280 {
1281         struct ipq *fp;
1282         int s = splnet();
1283         int i;
1284
1285         for (i = 0; i < IPREASS_NHASH; i++) {
1286                 fp = ipq[i].next;
1287                 if (fp == NULL)
1288                         continue;
1289                 while (fp != &ipq[i]) {
1290                         --fp->ipq_ttl;
1291                         fp = fp->next;
1292                         if (fp->prev->ipq_ttl == 0) {
1293                                 ipstat.ips_fragtimeout += fp->prev->ipq_nfrags;
1294                                 ip_freef(fp->prev);
1295                         }
1296                 }
1297         }
1298         /*
1299          * If we are over the maximum number of fragments
1300          * (due to the limit being lowered), drain off
1301          * enough to get down to the new limit.
1302          */
1303         if (maxnipq >= 0 && nipq > maxnipq) {
1304                 for (i = 0; i < IPREASS_NHASH; i++) {
1305                         while (nipq > maxnipq &&
1306                                 (ipq[i].next != &ipq[i])) {
1307                                 ipstat.ips_fragdropped +=
1308                                     ipq[i].next->ipq_nfrags;
1309                                 ip_freef(ipq[i].next);
1310                         }
1311                 }
1312         }
1313         ipflow_slowtimo();
1314         splx(s);
1315 }
1316
1317 /*
1318  * Drain off all datagram fragments.
1319  */
1320 void
1321 ip_drain()
1322 {
1323         int i;
1324
1325         for (i = 0; i < IPREASS_NHASH; i++) {
1326                 while (ipq[i].next != &ipq[i]) {
1327                         ipstat.ips_fragdropped += ipq[i].next->ipq_nfrags;
1328                         ip_freef(ipq[i].next);
1329                 }
1330         }
1331         in_rtqdrain();
1332 }
1333
1334 /*
1335  * Do option processing on a datagram,
1336  * possibly discarding it if bad options are encountered,
1337  * or forwarding it if source-routed.
1338  * The pass argument is used when operating in the IPSTEALTH
1339  * mode to tell what options to process:
1340  * [LS]SRR (pass 0) or the others (pass 1).
1341  * The reason for as many as two passes is that when doing IPSTEALTH,
1342  * non-routing options should be processed only if the packet is for us.
1343  * Returns 1 if packet has been forwarded/freed,
1344  * 0 if the packet should be processed further.
1345  */
1346 static int
1347 ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop)
1348 {
1349         struct ip *ip = mtod(m, struct ip *);
1350         u_char *cp;
1351         struct in_ifaddr *ia;
1352         int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB;
1353         boolean_t forward = FALSE;
1354         struct in_addr *sin, dst;
1355         n_time ntime;
1356
1357         dst = ip->ip_dst;
1358         cp = (u_char *)(ip + 1);
1359         cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
1360         for (; cnt > 0; cnt -= optlen, cp += optlen) {
1361                 opt = cp[IPOPT_OPTVAL];
1362                 if (opt == IPOPT_EOL)
1363                         break;
1364                 if (opt == IPOPT_NOP)
1365                         optlen = 1;
1366                 else {
1367                         if (cnt < IPOPT_OLEN + sizeof(*cp)) {
1368                                 code = &cp[IPOPT_OLEN] - (u_char *)ip;
1369                                 goto bad;
1370                         }
1371                         optlen = cp[IPOPT_OLEN];
1372                         if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
1373                                 code = &cp[IPOPT_OLEN] - (u_char *)ip;
1374                                 goto bad;
1375                         }
1376                 }
1377                 switch (opt) {
1378
1379                 default:
1380                         break;
1381
1382                 /*
1383                  * Source routing with record.
1384                  * Find interface with current destination address.
1385                  * If none on this machine then drop if strictly routed,
1386                  * or do nothing if loosely routed.
1387                  * Record interface address and bring up next address
1388                  * component.  If strictly routed make sure next
1389                  * address is on directly accessible net.
1390                  */
1391                 case IPOPT_LSRR:
1392                 case IPOPT_SSRR:
1393                         if (ipstealth && pass > 0)
1394                                 break;
1395                         if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1396                                 code = &cp[IPOPT_OLEN] - (u_char *)ip;
1397                                 goto bad;
1398                         }
1399                         if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1400                                 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1401                                 goto bad;
1402                         }
1403                         ipaddr.sin_addr = ip->ip_dst;
1404                         ia = (struct in_ifaddr *)
1405                                 ifa_ifwithaddr((struct sockaddr *)&ipaddr);
1406                         if (ia == NULL) {
1407                                 if (opt == IPOPT_SSRR) {
1408                                         type = ICMP_UNREACH;
1409                                         code = ICMP_UNREACH_SRCFAIL;
1410                                         goto bad;
1411                                 }
1412                                 if (!ip_dosourceroute)
1413                                         goto nosourcerouting;
1414                                 /*
1415                                  * Loose routing, and not at next destination
1416                                  * yet; nothing to do except forward.
1417                                  */
1418                                 break;
1419                         }
1420                         off--;                  /* 0 origin */
1421                         if (off > optlen - (int)sizeof(struct in_addr)) {
1422                                 /*
1423                                  * End of source route.  Should be for us.
1424                                  */
1425                                 if (!ip_acceptsourceroute)
1426                                         goto nosourcerouting;
1427                                 save_rte(cp, ip->ip_src);
1428                                 break;
1429                         }
1430                         if (ipstealth)
1431                                 goto dropit;
1432                         if (!ip_dosourceroute) {
1433                                 if (ipforwarding) {
1434                                         char buf[16]; /* aaa.bbb.ccc.ddd\0 */
1435                                         /*
1436                                          * Acting as a router, so generate ICMP
1437                                          */
1438 nosourcerouting:
1439                                         strcpy(buf, inet_ntoa(ip->ip_dst));
1440                                         log(LOG_WARNING,
1441                                             "attempted source route from %s to %s\n",
1442                                             inet_ntoa(ip->ip_src), buf);
1443                                         type = ICMP_UNREACH;
1444                                         code = ICMP_UNREACH_SRCFAIL;
1445                                         goto bad;
1446                                 } else {
1447                                         /*
1448                                          * Not acting as a router,
1449                                          * so silently drop.
1450                                          */
1451 dropit:
1452                                         ipstat.ips_cantforward++;
1453                                         m_freem(m);
1454                                         return (1);
1455                                 }
1456                         }
1457
1458                         /*
1459                          * locate outgoing interface
1460                          */
1461                         (void)memcpy(&ipaddr.sin_addr, cp + off,
1462                             sizeof(ipaddr.sin_addr));
1463
1464                         if (opt == IPOPT_SSRR) {
1465 #define INA     struct in_ifaddr *
1466 #define SA      struct sockaddr *
1467                                 if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr))
1468                                                                         == NULL)
1469                                         ia = (INA)ifa_ifwithnet((SA)&ipaddr);
1470                         } else
1471                                 ia = ip_rtaddr(ipaddr.sin_addr, &ipforward_rt);
1472                         if (ia == NULL) {
1473                                 type = ICMP_UNREACH;
1474                                 code = ICMP_UNREACH_SRCFAIL;
1475                                 goto bad;
1476                         }
1477                         ip->ip_dst = ipaddr.sin_addr;
1478                         (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
1479                             sizeof(struct in_addr));
1480                         cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1481                         /*
1482                          * Let ip_intr's mcast routing check handle mcast pkts
1483                          */
1484                         forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
1485                         break;
1486
1487                 case IPOPT_RR:
1488                         if (ipstealth && pass == 0)
1489                                 break;
1490                         if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1491                                 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1492                                 goto bad;
1493                         }
1494                         if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1495                                 code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1496                                 goto bad;
1497                         }
1498                         /*
1499                          * If no space remains, ignore.
1500                          */
1501                         off--;                  /* 0 origin */
1502                         if (off > optlen - (int)sizeof(struct in_addr))
1503                                 break;
1504                         (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst,
1505                             sizeof(ipaddr.sin_addr));
1506                         /*
1507                          * locate outgoing interface; if we're the destination,
1508                          * use the incoming interface (should be same).
1509                          */
1510                         if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL &&
1511                             (ia = ip_rtaddr(ipaddr.sin_addr, &ipforward_rt))
1512                                                                      == NULL) {
1513                                 type = ICMP_UNREACH;
1514                                 code = ICMP_UNREACH_HOST;
1515                                 goto bad;
1516                         }
1517                         (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
1518                             sizeof(struct in_addr));
1519                         cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1520                         break;
1521
1522                 case IPOPT_TS:
1523                         if (ipstealth && pass == 0)
1524                                 break;
1525                         code = cp - (u_char *)ip;
1526                         if (optlen < 4 || optlen > 40) {
1527                                 code = &cp[IPOPT_OLEN] - (u_char *)ip;
1528                                 goto bad;
1529                         }
1530                         if ((off = cp[IPOPT_OFFSET]) < 5) {
1531                                 code = &cp[IPOPT_OLEN] - (u_char *)ip;
1532                                 goto bad;
1533                         }
1534                         if (off > optlen - (int)sizeof(int32_t)) {
1535                                 cp[IPOPT_OFFSET + 1] += (1 << 4);
1536                                 if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) {
1537                                         code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1538                                         goto bad;
1539                                 }
1540                                 break;
1541                         }
1542                         off--;                          /* 0 origin */
1543                         sin = (struct in_addr *)(cp + off);
1544                         switch (cp[IPOPT_OFFSET + 1] & 0x0f) {
1545
1546                         case IPOPT_TS_TSONLY:
1547                                 break;
1548
1549                         case IPOPT_TS_TSANDADDR:
1550                                 if (off + sizeof(n_time) +
1551                                     sizeof(struct in_addr) > optlen) {
1552                                         code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1553                                         goto bad;
1554                                 }
1555                                 ipaddr.sin_addr = dst;
1556                                 ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
1557                                                             m->m_pkthdr.rcvif);
1558                                 if (ia == NULL)
1559                                         continue;
1560                                 (void)memcpy(sin, &IA_SIN(ia)->sin_addr,
1561                                     sizeof(struct in_addr));
1562                                 cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1563                                 off += sizeof(struct in_addr);
1564                                 break;
1565
1566                         case IPOPT_TS_PRESPEC:
1567                                 if (off + sizeof(n_time) +
1568                                     sizeof(struct in_addr) > optlen) {
1569                                         code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1570                                         goto bad;
1571                                 }
1572                                 (void)memcpy(&ipaddr.sin_addr, sin,
1573                                     sizeof(struct in_addr));
1574                                 if (ifa_ifwithaddr((SA)&ipaddr) == NULL)
1575                                         continue;
1576                                 cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1577                                 off += sizeof(struct in_addr);
1578                                 break;
1579
1580                         default:
1581                                 code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip;
1582                                 goto bad;
1583                         }
1584                         ntime = iptime();
1585                         (void)memcpy(cp + off, &ntime, sizeof(n_time));
1586                         cp[IPOPT_OFFSET] += sizeof(n_time);
1587                 }
1588         }
1589         if (forward && ipforwarding) {
1590                 ip_forward(m, 1, next_hop);
1591                 return (1);
1592         }
1593         return (0);
1594 bad:
1595         icmp_error(m, type, code, 0, NULL);
1596         ipstat.ips_badoptions++;
1597         return (1);
1598 }
1599
1600 /*
1601  * Given address of next destination (final or next hop),
1602  * return internet address info of interface to be used to get there.
1603  */
1604 struct in_ifaddr *
1605 ip_rtaddr(struct in_addr dst, struct route *rt)
1606 {
1607         struct sockaddr_in *sin;
1608
1609         sin = (struct sockaddr_in *)&rt->ro_dst;
1610
1611         if (rt->ro_rt == NULL || dst.s_addr != sin->sin_addr.s_addr) {
1612                 if (rt->ro_rt) {
1613                         RTFREE(rt->ro_rt);
1614                         rt->ro_rt = NULL;
1615                 }
1616                 sin->sin_family = AF_INET;
1617                 sin->sin_len = sizeof(*sin);
1618                 sin->sin_addr = dst;
1619                 rtalloc_ign(rt, RTF_PRCLONING);
1620         }
1621
1622         if (rt->ro_rt == NULL)
1623                 return (NULL);
1624
1625         return (ifatoia(rt->ro_rt->rt_ifa));
1626 }
1627
1628 /*
1629  * Save incoming source route for use in replies,
1630  * to be picked up later by ip_srcroute if the receiver is interested.
1631  */
1632 void
1633 save_rte(u_char *option, struct in_addr dst)
1634 {
1635         unsigned olen;
1636
1637         olen = option[IPOPT_OLEN];
1638 #ifdef DIAGNOSTIC
1639         if (ipprintfs)
1640                 printf("save_rte: olen %d\n", olen);
1641 #endif
1642         if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst)))
1643                 return;
1644         bcopy(option, ip_srcrt.srcopt, olen);
1645         ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
1646         ip_srcrt.dst = dst;
1647 }
1648
1649 /*
1650  * Retrieve incoming source route for use in replies,
1651  * in the same form used by setsockopt.
1652  * The first hop is placed before the options, will be removed later.
1653  */
1654 struct mbuf *
1655 ip_srcroute()
1656 {
1657         struct in_addr *p, *q;
1658         struct mbuf *m;
1659
1660         if (ip_nhops == 0)
1661                 return (NULL);
1662         m = m_get(M_DONTWAIT, MT_HEADER);
1663         if (m == NULL)
1664                 return (NULL);
1665
1666 #define OPTSIZ  (sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt))
1667
1668         /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
1669         m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) +
1670             OPTSIZ;
1671 #ifdef DIAGNOSTIC
1672         if (ipprintfs)
1673                 printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
1674 #endif
1675
1676         /*
1677          * First save first hop for return route
1678          */
1679         p = &ip_srcrt.route[ip_nhops - 1];
1680         *(mtod(m, struct in_addr *)) = *p--;
1681 #ifdef DIAGNOSTIC
1682         if (ipprintfs)
1683                 printf(" hops %lx", ntohl(mtod(m, struct in_addr *)->s_addr));
1684 #endif
1685
1686         /*
1687          * Copy option fields and padding (nop) to mbuf.
1688          */
1689         ip_srcrt.nop = IPOPT_NOP;
1690         ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
1691         (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &ip_srcrt.nop,
1692             OPTSIZ);
1693         q = (struct in_addr *)(mtod(m, caddr_t) +
1694             sizeof(struct in_addr) + OPTSIZ);
1695 #undef OPTSIZ
1696         /*
1697          * Record return path as an IP source route,
1698          * reversing the path (pointers are now aligned).
1699          */
1700         while (p >= ip_srcrt.route) {
1701 #ifdef DIAGNOSTIC
1702                 if (ipprintfs)
1703                         printf(" %lx", ntohl(q->s_addr));
1704 #endif
1705                 *q++ = *p--;
1706         }
1707         /*
1708          * Last hop goes to final destination.
1709          */
1710         *q = ip_srcrt.dst;
1711 #ifdef DIAGNOSTIC
1712         if (ipprintfs)
1713                 printf(" %lx\n", ntohl(q->s_addr));
1714 #endif
1715         return (m);
1716 }
1717
1718 /*
1719  * Strip out IP options.
1720  */
1721 void
1722 ip_stripoptions(struct mbuf *m)
1723 {
1724         int datalen;
1725         struct ip *ip = mtod(m, struct ip *);
1726         caddr_t opts;
1727         int optlen;
1728
1729         optlen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
1730         opts = (caddr_t)(ip + 1);
1731         datalen = m->m_len - (sizeof(struct ip) + optlen);
1732         bcopy(opts + optlen, opts, datalen);
1733         m->m_len -= optlen;
1734         if (m->m_flags & M_PKTHDR)
1735                 m->m_pkthdr.len -= optlen;
1736         ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2);
1737 }
1738
1739 u_char inetctlerrmap[PRC_NCMDS] = {
1740         0,              0,              0,              0,
1741         0,              EMSGSIZE,       EHOSTDOWN,      EHOSTUNREACH,
1742         EHOSTUNREACH,   EHOSTUNREACH,   ECONNREFUSED,   ECONNREFUSED,
1743         EMSGSIZE,       EHOSTUNREACH,   0,              0,
1744         0,              0,              0,              0,
1745         ENOPROTOOPT,    ECONNREFUSED
1746 };
1747
1748 /*
1749  * Forward a packet.  If some error occurs return the sender
1750  * an icmp packet.  Note we can't always generate a meaningful
1751  * icmp message because icmp doesn't have a large enough repertoire
1752  * of codes and types.
1753  *
1754  * If not forwarding, just drop the packet.  This could be confusing
1755  * if ipforwarding was zero but some routing protocol was advancing
1756  * us as a gateway to somewhere.  However, we must let the routing
1757  * protocol deal with that.
1758  *
1759  * The using_srcrt parameter indicates whether the packet is being forwarded
1760  * via a source route.
1761  */
1762 static void
1763 ip_forward(struct mbuf *m, int using_srcrt, struct sockaddr_in *next_hop)
1764 {
1765         struct ip *ip = mtod(m, struct ip *);
1766         struct sockaddr_in *sin;
1767         struct rtentry *rt;
1768         int error, type = 0, code = 0;
1769         struct mbuf *mcopy;
1770         n_long dest;
1771         struct in_addr pkt_dst;
1772         struct ifnet *destifp;
1773         struct m_hdr tag;
1774 #if defined(IPSEC) || defined(FAST_IPSEC)
1775         struct ifnet dummyifp;
1776 #endif
1777
1778         dest = 0;
1779         /*
1780          * Cache the destination address of the packet; this may be
1781          * changed by use of 'ipfw fwd'.
1782          */
1783         pkt_dst = next_hop ? next_hop->sin_addr : ip->ip_dst;
1784
1785 #ifdef DIAGNOSTIC
1786         if (ipprintfs)
1787                 printf("forward: src %lx dst %lx ttl %x\n",
1788                        ip->ip_src.s_addr, pkt_dst.s_addr, ip->ip_ttl);
1789 #endif
1790
1791         if (m->m_flags & (M_BCAST | M_MCAST) || !in_canforward(pkt_dst)) {
1792                 ipstat.ips_cantforward++;
1793                 m_freem(m);
1794                 return;
1795         }
1796         if (!ipstealth && ip->ip_ttl <= IPTTLDEC) {
1797                 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, NULL);
1798                 return;
1799         }
1800
1801         sin = (struct sockaddr_in *)&ipforward_rt.ro_dst;
1802         if ((rt = ipforward_rt.ro_rt) == NULL ||
1803             pkt_dst.s_addr != sin->sin_addr.s_addr) {
1804                 if (ipforward_rt.ro_rt) {
1805                         RTFREE(ipforward_rt.ro_rt);
1806                         ipforward_rt.ro_rt = NULL;
1807                 }
1808                 sin->sin_family = AF_INET;
1809                 sin->sin_len = sizeof(*sin);
1810                 sin->sin_addr = pkt_dst;
1811
1812                 rtalloc_ign(&ipforward_rt, RTF_PRCLONING);
1813                 if (ipforward_rt.ro_rt == NULL) {
1814                         icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest,
1815                                    NULL);
1816                         return;
1817                 }
1818                 rt = ipforward_rt.ro_rt;
1819         }
1820
1821         /*
1822          * Save the IP header and at most 8 bytes of the payload,
1823          * in case we need to generate an ICMP message to the src.
1824          *
1825          * XXX this can be optimized a lot by saving the data in a local
1826          * buffer on the stack (72 bytes at most), and only allocating the
1827          * mbuf if really necessary. The vast majority of the packets
1828          * are forwarded without having to send an ICMP back (either
1829          * because unnecessary, or because rate limited), so we are
1830          * really we are wasting a lot of work here.
1831          *
1832          * We don't use m_copy() because it might return a reference
1833          * to a shared cluster. Both this function and ip_output()
1834          * assume exclusive access to the IP header in `m', so any
1835          * data in a cluster may change before we reach icmp_error().
1836          */
1837         MGET(mcopy, M_DONTWAIT, m->m_type);
1838         if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) {
1839                 /*
1840                  * It's probably ok if the pkthdr dup fails (because
1841                  * the deep copy of the tag chain failed), but for now
1842                  * be conservative and just discard the copy since
1843                  * code below may some day want the tags.
1844                  */
1845                 m_free(mcopy);
1846                 mcopy = NULL;
1847         }
1848         if (mcopy != NULL) {
1849                 mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8,
1850                     (int)ip->ip_len);
1851                 m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
1852         }
1853
1854         if (!ipstealth)
1855                 ip->ip_ttl -= IPTTLDEC;
1856
1857         /*
1858          * If forwarding packet using same interface that it came in on,
1859          * perhaps should send a redirect to sender to shortcut a hop.
1860          * Only send redirect if source is sending directly to us,
1861          * and if packet was not source routed (or has any options).
1862          * Also, don't send redirect if forwarding using a default route
1863          * or a route modified by a redirect.
1864          */
1865         if (rt->rt_ifp == m->m_pkthdr.rcvif &&
1866             !(rt->rt_flags & (RTF_DYNAMIC | RTF_MODIFIED)) &&
1867             satosin(rt_key(rt))->sin_addr.s_addr != INADDR_ANY &&
1868             ipsendredirects && !using_srcrt && next_hop != NULL) {
1869                 u_long src = ntohl(ip->ip_src.s_addr);
1870
1871 #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa))
1872                 if (RTA(rt) != NULL &&
1873                     (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
1874                         if (rt->rt_flags & RTF_GATEWAY)
1875                                 dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
1876                         else
1877                                 dest = pkt_dst.s_addr;
1878                         /*
1879                          * Router requirements says to only send
1880                          * host redirects.
1881                          */
1882                         type = ICMP_REDIRECT;
1883                         code = ICMP_REDIRECT_HOST;
1884 #ifdef DIAGNOSTIC
1885                         if (ipprintfs)
1886                                 printf("redirect (%d) to %lx\n", code, dest);
1887 #endif
1888                 }
1889         }
1890
1891         if (next_hop) {
1892                 /* Pass IPFORWARD info if available */
1893
1894                 tag.mh_type = MT_TAG;
1895                 tag.mh_flags = PACKET_TAG_IPFORWARD;
1896                 tag.mh_data = (caddr_t)next_hop;
1897                 tag.mh_next = m;
1898                 m = (struct mbuf *)&tag;
1899         }
1900
1901         error = ip_output(m, NULL, &ipforward_rt, IP_FORWARDING, NULL, NULL);
1902
1903         if (error)
1904                 ipstat.ips_cantforward++;
1905         else {
1906                 ipstat.ips_forward++;
1907                 if (type)
1908                         ipstat.ips_redirectsent++;
1909                 else {
1910                         if (mcopy) {
1911                                 ipflow_create(&ipforward_rt, mcopy);
1912                                 m_freem(mcopy);
1913                         }
1914                         return;
1915                 }
1916         }
1917         if (mcopy == NULL)
1918                 return;
1919         destifp = NULL;
1920
1921         switch (error) {
1922
1923         case 0:                         /* forwarded, but need redirect */
1924                 /* type, code set above */
1925                 break;
1926
1927         case ENETUNREACH:               /* shouldn't happen, checked above */
1928         case EHOSTUNREACH:
1929         case ENETDOWN:
1930         case EHOSTDOWN:
1931         default:
1932                 type = ICMP_UNREACH;
1933                 code = ICMP_UNREACH_HOST;
1934                 break;
1935
1936         case EMSGSIZE:
1937                 type = ICMP_UNREACH;
1938                 code = ICMP_UNREACH_NEEDFRAG;
1939 #ifdef IPSEC
1940                 /*
1941                  * If the packet is routed over IPsec tunnel, tell the
1942                  * originator the tunnel MTU.
1943                  *      tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
1944                  * XXX quickhack!!!
1945                  */
1946                 if (ipforward_rt.ro_rt) {
1947                         struct secpolicy *sp = NULL;
1948                         int ipsecerror;
1949                         int ipsechdr;
1950                         struct route *ro;
1951
1952                         sp = ipsec4_getpolicybyaddr(mcopy,
1953                                                     IPSEC_DIR_OUTBOUND,
1954                                                     IP_FORWARDING,
1955                                                     &ipsecerror);
1956
1957                         if (sp == NULL)
1958                                 destifp = ipforward_rt.ro_rt->rt_ifp;
1959                         else {
1960                                 /* count IPsec header size */
1961                                 ipsechdr = ipsec4_hdrsiz(mcopy,
1962                                                          IPSEC_DIR_OUTBOUND,
1963                                                          NULL);
1964
1965                                 /*
1966                                  * find the correct route for outer IPv4
1967                                  * header, compute tunnel MTU.
1968                                  *
1969                                  * XXX BUG ALERT
1970                                  * The "dummyifp" code relies upon the fact
1971                                  * that icmp_error() touches only ifp->if_mtu.
1972                                  */
1973                                 /*XXX*/
1974                                 destifp = NULL;
1975                                 if (sp->req != NULL
1976                                  && sp->req->sav != NULL
1977                                  && sp->req->sav->sah != NULL) {
1978                                         ro = &sp->req->sav->sah->sa_route;
1979                                         if (ro->ro_rt && ro->ro_rt->rt_ifp) {
1980                                                 dummyifp.if_mtu =
1981                                                     ro->ro_rt->rt_ifp->if_mtu;
1982                                                 dummyifp.if_mtu -= ipsechdr;
1983                                                 destifp = &dummyifp;
1984                                         }
1985                                 }
1986
1987                                 key_freesp(sp);
1988                         }
1989                 }
1990 #elif FAST_IPSEC
1991                 /*
1992                  * If the packet is routed over IPsec tunnel, tell the
1993                  * originator the tunnel MTU.
1994                  *      tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
1995                  * XXX quickhack!!!
1996                  */
1997                 if (ipforward_rt.ro_rt) {
1998                         struct secpolicy *sp = NULL;
1999                         int ipsecerror;
2000                         int ipsechdr;
2001                         struct route *ro;
2002
2003                         sp = ipsec_getpolicybyaddr(mcopy,
2004                                                    IPSEC_DIR_OUTBOUND,
2005                                                    IP_FORWARDING,
2006                                                    &ipsecerror);
2007
2008                         if (sp == NULL)
2009                                 destifp = ipforward_rt.ro_rt->rt_ifp;
2010                         else {
2011                                 /* count IPsec header size */
2012                                 ipsechdr = ipsec4_hdrsiz(mcopy,
2013                                                          IPSEC_DIR_OUTBOUND,
2014                                                          NULL);
2015
2016                                 /*
2017                                  * find the correct route for outer IPv4
2018                                  * header, compute tunnel MTU.
2019                                  *
2020                                  * XXX BUG ALERT
2021                                  * The "dummyifp" code relies upon the fact
2022                                  * that icmp_error() touches only ifp->if_mtu.
2023                                  */
2024                                 /*XXX*/
2025                                 destifp = NULL;
2026                                 if (sp->req != NULL
2027                                  && sp->req->sav != NULL
2028                                  && sp->req->sav->sah != NULL) {
2029                                         ro = &sp->req->sav->sah->sa_route;
2030                                         if (ro->ro_rt && ro->ro_rt->rt_ifp) {
2031                                                 dummyifp.if_mtu =
2032                                                     ro->ro_rt->rt_ifp->if_mtu;
2033                                                 dummyifp.if_mtu -= ipsechdr;
2034                                                 destifp = &dummyifp;
2035                                         }
2036                                 }
2037
2038                                 KEY_FREESP(&sp);
2039                         }
2040                 }
2041 #else /* !IPSEC && !FAST_IPSEC */
2042                 if (ipforward_rt.ro_rt)
2043                         destifp = ipforward_rt.ro_rt->rt_ifp;
2044 #endif /*IPSEC*/
2045                 ipstat.ips_cantfrag++;
2046                 break;
2047
2048         case ENOBUFS:
2049                 /*
2050                  * A router should not generate ICMP_SOURCEQUENCH as
2051                  * required in RFC1812 Requirements for IP Version 4 Routers.
2052                  * Source quench could be a big problem under DoS attacks,
2053                  * or if the underlying interface is rate-limited.
2054                  * Those who need source quench packets may re-enable them
2055                  * via the net.inet.ip.sendsourcequench sysctl.
2056                  */
2057                 if (!ip_sendsourcequench) {
2058                         m_freem(mcopy);
2059                         return;
2060                 } else {
2061                         type = ICMP_SOURCEQUENCH;
2062                         code = 0;
2063                 }
2064                 break;
2065
2066         case EACCES:                    /* ipfw denied packet */
2067                 m_freem(mcopy);
2068                 return;
2069         }
2070         icmp_error(mcopy, type, code, dest, destifp);
2071 }
2072
2073 void
2074 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
2075                struct mbuf *m)
2076 {
2077         if (inp->inp_socket->so_options & SO_TIMESTAMP) {
2078                 struct timeval tv;
2079
2080                 microtime(&tv);
2081                 *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
2082                     SCM_TIMESTAMP, SOL_SOCKET);
2083                 if (*mp)
2084                         mp = &(*mp)->m_next;
2085         }
2086         if (inp->inp_flags & INP_RECVDSTADDR) {
2087                 *mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
2088                     sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
2089                 if (*mp)
2090                         mp = &(*mp)->m_next;
2091         }
2092 #ifdef notyet
2093         /* XXX
2094          * Moving these out of udp_input() made them even more broken
2095          * than they already were.
2096          */
2097         /* options were tossed already */
2098         if (inp->inp_flags & INP_RECVOPTS) {
2099                 *mp = sbcreatecontrol((caddr_t) opts_deleted_above,
2100                     sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
2101                 if (*mp)
2102                         mp = &(*mp)->m_next;
2103         }
2104         /* ip_srcroute doesn't do what we want here, need to fix */
2105         if (inp->inp_flags & INP_RECVRETOPTS) {
2106                 *mp = sbcreatecontrol((caddr_t) ip_srcroute(),
2107                     sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
2108                 if (*mp)
2109                         mp = &(*mp)->m_next;
2110         }
2111 #endif
2112         if (inp->inp_flags & INP_RECVIF) {
2113                 struct ifnet *ifp;
2114                 struct sdlbuf {
2115                         struct sockaddr_dl sdl;
2116                         u_char  pad[32];
2117                 } sdlbuf;
2118                 struct sockaddr_dl *sdp;
2119                 struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
2120
2121                 if (((ifp = m->m_pkthdr.rcvif)) &&
2122                     ((ifp->if_index != 0) && (ifp->if_index <= if_index))) {
2123                         sdp = (struct sockaddr_dl *)
2124                             ifnet_addrs[ifp->if_index - 1]->ifa_addr;
2125                         /*
2126                          * Change our mind and don't try copy.
2127                          */
2128                         if ((sdp->sdl_family != AF_LINK) ||
2129                             (sdp->sdl_len > sizeof(sdlbuf))) {
2130                                 goto makedummy;
2131                         }
2132                         bcopy(sdp, sdl2, sdp->sdl_len);
2133                 } else {
2134 makedummy:
2135                         sdl2->sdl_len =
2136                             offsetof(struct sockaddr_dl, sdl_data[0]);
2137                         sdl2->sdl_family = AF_LINK;
2138                         sdl2->sdl_index = 0;
2139                         sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
2140                 }
2141                 *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len,
2142                         IP_RECVIF, IPPROTO_IP);
2143                 if (*mp)
2144                         mp = &(*mp)->m_next;
2145         }
2146 }
2147
2148 /*
2149  * XXX these routines are called from the upper part of the kernel.
2150  *
2151  * They could also be moved to ip_mroute.c, since all the RSVP
2152  *  handling is done there already.
2153  */
2154 int
2155 ip_rsvp_init(struct socket *so)
2156 {
2157         if (so->so_type != SOCK_RAW ||
2158             so->so_proto->pr_protocol != IPPROTO_RSVP)
2159                 return EOPNOTSUPP;
2160
2161         if (ip_rsvpd != NULL)
2162                 return EADDRINUSE;
2163
2164         ip_rsvpd = so;
2165         /*
2166          * This may seem silly, but we need to be sure we don't over-increment
2167          * the RSVP counter, in case something slips up.
2168          */
2169         if (!ip_rsvp_on) {
2170                 ip_rsvp_on = 1;
2171                 rsvp_on++;
2172         }
2173
2174         return 0;
2175 }
2176
2177 int
2178 ip_rsvp_done(void)
2179 {
2180         ip_rsvpd = NULL;
2181         /*
2182          * This may seem silly, but we need to be sure we don't over-decrement
2183          * the RSVP counter, in case something slips up.
2184          */
2185         if (ip_rsvp_on) {
2186                 ip_rsvp_on = 0;
2187                 rsvp_on--;
2188         }
2189         return 0;
2190 }
2191
2192 void
2193 rsvp_input(struct mbuf *m, int off, int proto)  /* XXX must fixup manually */
2194 {
2195         if (rsvp_input_p) { /* call the real one if loaded */
2196                 rsvp_input_p(m, off, proto);
2197                 return;
2198         }
2199
2200         /* Can still get packets with rsvp_on = 0 if there is a local member
2201          * of the group to which the RSVP packet is addressed.  But in this
2202          * case we want to throw the packet away.
2203          */
2204
2205         if (!rsvp_on) {
2206                 m_freem(m);
2207                 return;
2208         }
2209
2210         if (ip_rsvpd != NULL) {
2211                 rip_input(m, off, proto);
2212                 return;
2213         }
2214         /* Drop the packet */
2215         m_freem(m);
2216 }