Add support for Protocol Independent Multicast.
[dragonfly.git] / sys / netinet / ip_output.c
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the University of
16  *      California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *      @(#)ip_output.c 8.3 (Berkeley) 1/21/94
34  * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.37 2003/04/15 06:44:45 silby Exp $
35  * $DragonFly: src/sys/netinet/ip_output.c,v 1.8 2003/08/24 23:07:07 hsu Exp $
36  */
37
38 #define _IP_VHL
39
40 #include "opt_ipfw.h"
41 #include "opt_ipdn.h"
42 #include "opt_ipdivert.h"
43 #include "opt_ipfilter.h"
44 #include "opt_ipsec.h"
45 #include "opt_random_ip_id.h"
46 #include "opt_mbuf_stress_test.h"
47
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/kernel.h>
51 #include <sys/malloc.h>
52 #include <sys/mbuf.h>
53 #include <sys/protosw.h>
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <sys/proc.h>
57 #include <sys/sysctl.h>
58
59 #include <net/if.h>
60 #include <net/route.h>
61
62 #include <netinet/in.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/ip.h>
65 #include <netinet/in_pcb.h>
66 #include <netinet/in_var.h>
67 #include <netinet/ip_var.h>
68
69 #include <machine/in_cksum.h>
70
/*
 * File-local malloc type M_IPMOPTS, registered with the short name
 * "ip_moptions" and the description "internet multicast options".
 * NOTE(review): presumably used for struct ip_moptions allocations; the
 * allocation sites are not in this part of the file -- confirm.
 */
71 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
72
73 #ifdef IPSEC
74 #include <netinet6/ipsec.h>
75 #include <netproto/key/key.h>
76 #ifdef IPSEC_DEBUG
77 #include <netproto/key/key_debug.h>
78 #else
79 #define KEYDEBUG(lev,arg)
80 #endif
81 #endif /*IPSEC*/
82
83 #ifdef FAST_IPSEC
84 #include <netipsec/ipsec.h>
85 #include <netipsec/xform.h>
86 #include <netipsec/key.h>
87 #endif /*FAST_IPSEC*/
88
89 #include <net/ipfw/ip_fw.h>
90 #include <net/dummynet/ip_dummynet.h>
91
/*
 * print_ip(x, a, y): printf() the string x, a space, the address a in
 * dotted-quad form, and then the string y.
 *
 * a must be an object with an s_addr member (it is accessed as
 * a.s_addr, not through a pointer); s_addr is converted with ntohl()
 * and printed one byte at a time, most significant byte first.
 *
 * Caveats:
 *  - a is evaluated four times, so it must not have side effects.
 *  - The arguments are not parenthesized in the expansion.
 *  - The expansion already ends in a semicolon, so the macro is not
 *    safe as the sole body of an unbraced if/else.
 */
92 #define print_ip(x, a, y)        printf("%s %d.%d.%d.%d%s",\
93                                 x, (ntohl(a.s_addr)>>24)&0xFF,\
94                                   (ntohl(a.s_addr)>>16)&0xFF,\
95                                   (ntohl(a.s_addr)>>8)&0xFF,\
96                                   (ntohl(a.s_addr))&0xFF, y);
97
/*
 * Running counter for the IP identification field.  When RANDOM_IP_ID is
 * not defined, ip_output() stores htons(ip_id++) in the header of each
 * packet sent with neither IP_FORWARDING nor IP_RAWOUTPUT set.
 */
98 u_short ip_id;
99
/*
 * Only compiled with MBUF_STRESS_TEST.  mbuf_frag_size defaults to 0 and
 * is exported as a read-write integer sysctl named mbuf_frag_size under
 * the _net_inet_ip node, described as "Fragment outgoing mbufs to this
 * size".
 * NOTE(review): the code that consumes this value is not in this part of
 * the file -- confirm how 0 is interpreted there.
 */
100 #ifdef MBUF_STRESS_TEST
101 int mbuf_frag_size = 0;
102 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
103         &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
104 #endif
105
/*
 * Forward declarations of file-local helpers.
 *
 * Of these, ip_output() as visible here calls:
 *  - ip_insertoptions(m, opt, &len): returns the mbuf to continue with
 *    and may store a new header length through the int pointer (the
 *    caller adopts it only when it is non-zero).
 *  - ip_mloopback(ifp, m, dst, hlen): used to loop back a copy of an
 *    outgoing multicast packet.
 *
 * NOTE(review): the remaining helpers (ip_multicast_if, ip_getmoptions,
 * ip_pcbopts, ip_setmoptions) have no call sites in this part of the
 * file; see their definitions for their contracts.
 */
106 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
107 static struct ifnet *ip_multicast_if(struct in_addr *, int *);
108 static void     ip_mloopback
109         (struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
110 static int      ip_getmoptions
111         (struct sockopt *, struct ip_moptions *);
112 static int      ip_pcbopts(int, struct mbuf **, struct mbuf *);
113 static int      ip_setmoptions
114         (struct sockopt *, struct ip_moptions **);
115
/*
 * Non-static declarations.
 *
 * ip_optcopy: declared here without static; its definition is not in
 * this part of the file.
 *
 * fr_checkp: packet-filter hook pointer defined elsewhere.  When it is
 * non-NULL, ip_output() calls it as (*fr_checkp)(ip, hlen, ifp, 1, &m1)
 * and stops processing the packet if it returns non-zero or sets the
 * mbuf pointer to NULL; otherwise it continues with the returned mbuf.
 *
 * inetsw: protocol switch table defined elsewhere.
 */
116 int     ip_optcopy(struct ip *, struct ip *);
117 extern int (*fr_checkp) (struct ip *, int, struct ifnet *, int, struct mbuf **);
118
119
120 extern  struct protosw inetsw[];
121
122 /*
123  * IP output.  The packet in mbuf chain m contains a skeletal IP
124  * header (with len, off, ttl, proto, tos, src, dst).
125  * The mbuf chain containing the packet will be freed.
126  * The mbuf opt, if present, will not be freed.
127  */
128 int
129 ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
130         int flags, struct ip_moptions *imo, struct inpcb *inp)
131 {
132         struct ip *ip;
133         struct ifnet *ifp = NULL;       /* keep compiler happy */
134         struct mbuf *m;
135         int hlen = sizeof (struct ip);
136         int len, off, error = 0;
137         struct sockaddr_in *dst = NULL; /* keep compiler happy */
138         struct in_ifaddr *ia = NULL;
139         int isbroadcast, sw_csum;
140         struct in_addr pkt_dst;
141 #ifdef IPSEC
142         struct route iproute;
143         struct secpolicy *sp = NULL;
144         struct socket *so = inp ? inp->inp_socket : NULL;
145 #endif
146 #ifdef FAST_IPSEC
147         struct route iproute;
148         struct m_tag *mtag;
149         struct secpolicy *sp = NULL;
150         struct tdb_ident *tdbi;
151         int s;
152 #endif /* FAST_IPSEC */
153         struct ip_fw_args args;
154         int src_was_INADDR_ANY = 0;     /* as the name says... */
155
156         args.eh = NULL;
157         args.rule = NULL;
158         args.next_hop = NULL;
159         args.divert_rule = 0;                   /* divert cookie */
160
161         /* Grab info from MT_TAG mbufs prepended to the chain. */
162         for (; m0 && m0->m_type == MT_TAG; m0 = m0->m_next) {
163                 switch(m0->_m_tag_id) {
164                 default:
165                         printf("ip_output: unrecognised MT_TAG tag %d\n",
166                             m0->_m_tag_id);
167                         break;
168
169                 case PACKET_TAG_DUMMYNET:
170                         /*
171                          * the packet was already tagged, so part of the
172                          * processing was already done, and we need to go down.
173                          * Get parameters from the header.
174                          */
175                         args.rule = ((struct dn_pkt *)m0)->rule;
176                         opt = NULL ;
177                         ro = & ( ((struct dn_pkt *)m0)->ro ) ;
178                         imo = NULL ;
179                         dst = ((struct dn_pkt *)m0)->dn_dst ;
180                         ifp = ((struct dn_pkt *)m0)->ifp ;
181                         flags = ((struct dn_pkt *)m0)->flags ;
182                         break;
183
184                 case PACKET_TAG_DIVERT:
185                         args.divert_rule = (int)m0->m_data & 0xffff;
186                         break;
187
188                 case PACKET_TAG_IPFORWARD:
189                         args.next_hop = (struct sockaddr_in *)m0->m_data;
190                         break;
191                 }
192         }
193         m = m0;
194
195         KASSERT(!m || (m->m_flags & M_PKTHDR) != 0, ("ip_output: no HDR"));
196 #ifndef FAST_IPSEC
197         KASSERT(ro != NULL, ("ip_output: no route, proto %d",
198             mtod(m, struct ip *)->ip_p));
199 #endif
200
201         if (args.rule != NULL) {        /* dummynet already saw us */
202                 ip = mtod(m, struct ip *);
203                 hlen = IP_VHL_HL(ip->ip_vhl) << 2 ;
204                 if (ro->ro_rt)
205                         ia = ifatoia(ro->ro_rt->rt_ifa);
206                 goto sendit;
207         }
208
209         if (opt) {
210                 len = 0;
211                 m = ip_insertoptions(m, opt, &len);
212                 if (len != 0)
213                         hlen = len;
214         }
215         ip = mtod(m, struct ip *);
216         pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
217
218         /*
219          * Fill in IP header.
220          */
221         if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
222                 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
223                 ip->ip_off &= IP_DF;
224 #ifdef RANDOM_IP_ID
225                 ip->ip_id = ip_randomid();
226 #else
227                 ip->ip_id = htons(ip_id++);
228 #endif
229                 ipstat.ips_localout++;
230         } else {
231                 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
232         }
233
234 #ifdef FAST_IPSEC
235         if (ro == NULL) {
236                 ro = &iproute;
237                 bzero(ro, sizeof (*ro));
238         }
239 #endif /* FAST_IPSEC */
240         dst = (struct sockaddr_in *)&ro->ro_dst;
241         /*
242          * If there is a cached route,
243          * check that it is to the same destination
244          * and is still up.  If not, free it and try again.
245          * The address family should also be checked in case of sharing the
246          * cache with IPv6.
247          */
248         if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
249                           dst->sin_family != AF_INET ||
250                           dst->sin_addr.s_addr != pkt_dst.s_addr)) {
251                 RTFREE(ro->ro_rt);
252                 ro->ro_rt = (struct rtentry *)0;
253         }
254         if (ro->ro_rt == 0) {
255                 bzero(dst, sizeof(*dst));
256                 dst->sin_family = AF_INET;
257                 dst->sin_len = sizeof(*dst);
258                 dst->sin_addr = pkt_dst;
259         }
260         /*
261          * If routing to interface only,
262          * short circuit routing lookup.
263          */
264         if (flags & IP_ROUTETOIF) {
265                 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 &&
266                     (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
267                         ipstat.ips_noroute++;
268                         error = ENETUNREACH;
269                         goto bad;
270                 }
271                 ifp = ia->ia_ifp;
272                 ip->ip_ttl = 1;
273                 isbroadcast = in_broadcast(dst->sin_addr, ifp);
274         } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
275             imo != NULL && imo->imo_multicast_ifp != NULL) {
276                 /*
277                  * Bypass the normal routing lookup for multicast
278                  * packets if the interface is specified.
279                  */
280                 ifp = imo->imo_multicast_ifp;
281                 IFP_TO_IA(ifp, ia);
282                 isbroadcast = 0;        /* fool gcc */
283         } else {
284                 /*
285                  * If this is the case, we probably don't want to allocate
286                  * a protocol-cloned route since we didn't get one from the
287                  * ULP.  This lets TCP do its thing, while not burdening
288                  * forwarding or ICMP with the overhead of cloning a route.
289                  * Of course, we still want to do any cloning requested by
290                  * the link layer, as this is probably required in all cases
291                  * for correct operation (as it is for ARP).
292                  */
293                 if (ro->ro_rt == 0)
294                         rtalloc_ign(ro, RTF_PRCLONING);
295                 if (ro->ro_rt == 0) {
296                         ipstat.ips_noroute++;
297                         error = EHOSTUNREACH;
298                         goto bad;
299                 }
300                 ia = ifatoia(ro->ro_rt->rt_ifa);
301                 ifp = ro->ro_rt->rt_ifp;
302                 ro->ro_rt->rt_use++;
303                 if (ro->ro_rt->rt_flags & RTF_GATEWAY)
304                         dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
305                 if (ro->ro_rt->rt_flags & RTF_HOST)
306                         isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
307                 else
308                         isbroadcast = in_broadcast(dst->sin_addr, ifp);
309         }
310         if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
311                 struct in_multi *inm;
312
313                 m->m_flags |= M_MCAST;
314                 /*
315                  * IP destination address is multicast.  Make sure "dst"
316                  * still points to the address in "ro".  (It may have been
317                  * changed to point to a gateway address, above.)
318                  */
319                 dst = (struct sockaddr_in *)&ro->ro_dst;
320                 /*
321                  * See if the caller provided any multicast options
322                  */
323                 if (imo != NULL) {
324                         ip->ip_ttl = imo->imo_multicast_ttl;
325                         if (imo->imo_multicast_vif != -1)
326                                 ip->ip_src.s_addr =
327                                     ip_mcast_src ?
328                                     ip_mcast_src(imo->imo_multicast_vif) :
329                                     INADDR_ANY;
330                 } else
331                         ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
332                 /*
333                  * Confirm that the outgoing interface supports multicast.
334                  */
335                 if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
336                         if ((ifp->if_flags & IFF_MULTICAST) == 0) {
337                                 ipstat.ips_noroute++;
338                                 error = ENETUNREACH;
339                                 goto bad;
340                         }
341                 }
342                 /*
343                  * If source address not specified yet, use address
344                  * of outgoing interface.
345                  */
346                 if (ip->ip_src.s_addr == INADDR_ANY) {
347                         /* Interface may have no addresses. */
348                         if (ia != NULL)
349                                 ip->ip_src = IA_SIN(ia)->sin_addr;
350                 }
351
352                 if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
353                         /*
354                          * XXX
355                          * delayed checksums are not currently
356                          * compatible with IP multicast routing
357                          */
358                         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
359                                 in_delayed_cksum(m);
360                                 m->m_pkthdr.csum_flags &=
361                                         ~CSUM_DELAY_DATA;
362                         }
363                 }
364                 IN_LOOKUP_MULTI(pkt_dst, ifp, inm);
365                 if (inm != NULL &&
366                    (imo == NULL || imo->imo_multicast_loop)) {
367                         /*
368                          * If we belong to the destination multicast group
369                          * on the outgoing interface, and the caller did not
370                          * forbid loopback, loop back a copy.
371                          */
372                         ip_mloopback(ifp, m, dst, hlen);
373                 }
374                 else {
375                         /*
376                          * If we are acting as a multicast router, perform
377                          * multicast forwarding as if the packet had just
378                          * arrived on the interface to which we are about
379                          * to send.  The multicast forwarding function
380                          * recursively calls this function, using the
381                          * IP_FORWARDING flag to prevent infinite recursion.
382                          *
383                          * Multicasts that are looped back by ip_mloopback(),
384                          * above, will be forwarded by the ip_input() routine,
385                          * if necessary.
386                          */
387                         if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
388                                 /*
389                                  * If rsvp daemon is not running, do not
390                                  * set ip_moptions. This ensures that the packet
391                                  * is multicast and not just sent down one link
392                                  * as prescribed by rsvpd.
393                                  */
394                                 if (!rsvp_on)
395                                         imo = NULL;
396                                 if (ip_mforward &&
397                                     ip_mforward(ip, ifp, m, imo) != 0) {
398                                         m_freem(m);
399                                         goto done;
400                                 }
401                         }
402                 }
403
404                 /*
405                  * Multicasts with a time-to-live of zero may be looped-
406                  * back, above, but must not be transmitted on a network.
407                  * Also, multicasts addressed to the loopback interface
408                  * are not sent -- the above call to ip_mloopback() will
409                  * loop back a copy if this host actually belongs to the
410                  * destination group on the loopback interface.
411                  */
412                 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
413                         m_freem(m);
414                         goto done;
415                 }
416
417                 goto sendit;
418         }
419 #ifndef notdef
420         /*
421          * If the source address is not specified yet, use the address
422          * of the outgoing interface. In that case, keep note we did that, so
423          * if the firewall changes the next-hop causing the output
424          * interface to change, we can fix that.
425          */
426         if (ip->ip_src.s_addr == INADDR_ANY) {
427                 /* Interface may have no addresses. */
428                 if (ia != NULL) {
429                         ip->ip_src = IA_SIN(ia)->sin_addr;
430                         src_was_INADDR_ANY = 1;
431                 }
432         }
433 #endif /* notdef */
434         /*
435          * Verify that we have any chance at all of being able to queue
436          *      the packet or packet fragments
437          */
438         if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
439                 ifp->if_snd.ifq_maxlen) {
440                         error = ENOBUFS;
441                         ipstat.ips_odropped++;
442                         goto bad;
443         }
444
445         /*
446          * Look for broadcast address and
447          * verify user is allowed to send
448          * such a packet.
449          */
450         if (isbroadcast) {
451                 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
452                         error = EADDRNOTAVAIL;
453                         goto bad;
454                 }
455                 if ((flags & IP_ALLOWBROADCAST) == 0) {
456                         error = EACCES;
457                         goto bad;
458                 }
459                 /* don't allow broadcast messages to be fragmented */
460                 if (ip->ip_len > ifp->if_mtu) {
461                         error = EMSGSIZE;
462                         goto bad;
463                 }
464                 m->m_flags |= M_BCAST;
465         } else {
466                 m->m_flags &= ~M_BCAST;
467         }
468
469 sendit:
470 #ifdef IPSEC
471         /* get SP for this packet */
472         if (so == NULL)
473                 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error);
474         else
475                 sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
476
477         if (sp == NULL) {
478                 ipsecstat.out_inval++;
479                 goto bad;
480         }
481
482         error = 0;
483
484         /* check policy */
485         switch (sp->policy) {
486         case IPSEC_POLICY_DISCARD:
487                 /*
488                  * This packet is just discarded.
489                  */
490                 ipsecstat.out_polvio++;
491                 goto bad;
492
493         case IPSEC_POLICY_BYPASS:
494         case IPSEC_POLICY_NONE:
495                 /* no need to do IPsec. */
496                 goto skip_ipsec;
497         
498         case IPSEC_POLICY_IPSEC:
499                 if (sp->req == NULL) {
500                         /* acquire a policy */
501                         error = key_spdacquire(sp);
502                         goto bad;
503                 }
504                 break;
505
506         case IPSEC_POLICY_ENTRUST:
507         default:
508                 printf("ip_output: Invalid policy found. %d\n", sp->policy);
509         }
510     {
511         struct ipsec_output_state state;
512         bzero(&state, sizeof(state));
513         state.m = m;
514         if (flags & IP_ROUTETOIF) {
515                 state.ro = &iproute;
516                 bzero(&iproute, sizeof(iproute));
517         } else
518                 state.ro = ro;
519         state.dst = (struct sockaddr *)dst;
520
521         ip->ip_sum = 0;
522
523         /*
524          * XXX
525          * delayed checksums are not currently compatible with IPsec
526          */
527         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
528                 in_delayed_cksum(m);
529                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
530         }
531
532         ip->ip_len = htons(ip->ip_len);
533         ip->ip_off = htons(ip->ip_off);
534
535         error = ipsec4_output(&state, sp, flags);
536
537         m = state.m;
538         if (flags & IP_ROUTETOIF) {
539                 /*
540                  * if we have tunnel mode SA, we may need to ignore
541                  * IP_ROUTETOIF.
542                  */
543                 if (state.ro != &iproute || state.ro->ro_rt != NULL) {
544                         flags &= ~IP_ROUTETOIF;
545                         ro = state.ro;
546                 }
547         } else
548                 ro = state.ro;
549         dst = (struct sockaddr_in *)state.dst;
550         if (error) {
551                 /* mbuf is already reclaimed in ipsec4_output. */
552                 m0 = NULL;
553                 switch (error) {
554                 case EHOSTUNREACH:
555                 case ENETUNREACH:
556                 case EMSGSIZE:
557                 case ENOBUFS:
558                 case ENOMEM:
559                         break;
560                 default:
561                         printf("ip4_output (ipsec): error code %d\n", error);
562                         /*fall through*/
563                 case ENOENT:
564                         /* don't show these error codes to the user */
565                         error = 0;
566                         break;
567                 }
568                 goto bad;
569         }
570     }
571
572         /* be sure to update variables that are affected by ipsec4_output() */
573         ip = mtod(m, struct ip *);
574 #ifdef _IP_VHL
575         hlen = IP_VHL_HL(ip->ip_vhl) << 2;
576 #else
577         hlen = ip->ip_hl << 2;
578 #endif
579         if (ro->ro_rt == NULL) {
580                 if ((flags & IP_ROUTETOIF) == 0) {
581                         printf("ip_output: "
582                                 "can't update route after IPsec processing\n");
583                         error = EHOSTUNREACH;   /*XXX*/
584                         goto bad;
585                 }
586         } else {
587                 ia = ifatoia(ro->ro_rt->rt_ifa);
588                 ifp = ro->ro_rt->rt_ifp;
589         }
590
591         /* make it flipped, again. */
592         ip->ip_len = ntohs(ip->ip_len);
593         ip->ip_off = ntohs(ip->ip_off);
594 skip_ipsec:
595 #endif /*IPSEC*/
596 #ifdef FAST_IPSEC
597         /*
598          * Check the security policy (SP) for the packet and, if
599          * required, do IPsec-related processing.  There are two
600          * cases here; the first time a packet is sent through
601          * it will be untagged and handled by ipsec4_checkpolicy.
602          * If the packet is resubmitted to ip_output (e.g. after
603          * AH, ESP, etc. processing), there will be a tag to bypass
604          * the lookup and related policy checking.
605          */
606         mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
607         s = splnet();
608         if (mtag != NULL) {
609                 tdbi = (struct tdb_ident *)(mtag + 1);
610                 sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
611                 if (sp == NULL)
612                         error = -EINVAL;        /* force silent drop */
613                 m_tag_delete(m, mtag);
614         } else {
615                 sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
616                                         &error, inp);
617         }
618         /*
619          * There are four return cases:
620          *    sp != NULL                    apply IPsec policy
621          *    sp == NULL, error == 0        no IPsec handling needed
622          *    sp == NULL, error == -EINVAL  discard packet w/o error
623          *    sp == NULL, error != 0        discard packet, report error
624          */
625         if (sp != NULL) {
626                 /* Loop detection, check if ipsec processing already done */
627                 KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
628                 for (mtag = m_tag_first(m); mtag != NULL;
629                      mtag = m_tag_next(m, mtag)) {
630                         if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
631                                 continue;
632                         if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
633                             mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
634                                 continue;
635                         /*
636                          * Check if policy has an SA associated with it.
637                          * This can happen when an SP has yet to acquire
638                          * an SA; e.g. on first reference.  If it occurs,
639                          * then we let ipsec4_process_packet do its thing.
640                          */
641                         if (sp->req->sav == NULL)
642                                 break;
643                         tdbi = (struct tdb_ident *)(mtag + 1);
644                         if (tdbi->spi == sp->req->sav->spi &&
645                             tdbi->proto == sp->req->sav->sah->saidx.proto &&
646                             bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
647                                  sizeof (union sockaddr_union)) == 0) {
648                                 /*
649                                  * No IPsec processing is needed, free
650                                  * reference to SP.
651                                  *
652                                  * NB: null pointer to avoid free at
653                                  *     done: below.
654                                  */
655                                 KEY_FREESP(&sp), sp = NULL;
656                                 splx(s);
657                                 goto spd_done;
658                         }
659                 }
660
661                 /*
662                  * Do delayed checksums now because we send before
663                  * this is done in the normal processing path.
664                  */
665                 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
666                         in_delayed_cksum(m);
667                         m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
668                 }
669
670                 ip->ip_len = htons(ip->ip_len);
671                 ip->ip_off = htons(ip->ip_off);
672
673                 /* NB: callee frees mbuf */
674                 error = ipsec4_process_packet(m, sp->req, flags, 0);
675                 /*
676                  * Preserve KAME behaviour: ENOENT can be returned
677                  * when an SA acquire is in progress.  Don't propagate
678                  * this to user-level; it confuses applications.
679                  *
680                  * XXX this will go away when the SADB is redone.
681                  */
682                 if (error == ENOENT)
683                         error = 0;
684                 splx(s);
685                 goto done;
686         } else {
687                 splx(s);
688
689                 if (error != 0) {
690                         /*
691                          * Hack: -EINVAL is used to signal that a packet
692                          * should be silently discarded.  This is typically
693                          * because we asked key management for an SA and
694                          * it was delayed (e.g. kicked up to IKE).
695                          */
696                         if (error == -EINVAL)
697                                 error = 0;
698                         goto bad;
699                 } else {
700                         /* No IPsec processing for this packet. */
701                 }
702 #ifdef notyet
703                 /*
704                  * If deferred crypto processing is needed, check that
705                  * the interface supports it.
706                  */ 
707                 mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL);
708                 if (mtag != NULL && (ifp->if_capenable & IFCAP_IPSEC) == 0) {
709                         /* notify IPsec to do its own crypto */
710                         ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
711                         error = EHOSTUNREACH;
712                         goto bad;
713                 }
714 #endif
715         }
716 spd_done:
717 #endif /* FAST_IPSEC */
718         /*
719          * IpHack's section.
720          * - Xlate: translate packet's addr/port (NAT).
721          * - Firewall: deny/allow/etc.
722          * - Wrap: fake packet's addr/port <unimpl.>
723          * - Encapsulate: put it in another IP and send out. <unimp.>
724          */ 
725         if (fr_checkp) {
726                 struct  mbuf    *m1 = m;
727
728                 if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1)
729                         goto done;
730                 ip = mtod(m = m1, struct ip *);
731         }
732
733         /*
734          * Check with the firewall...
735          * but not if we are already being fwd'd from a firewall.
736          */
737         if (fw_enable && IPFW_LOADED && !args.next_hop) {
738                 struct sockaddr_in *old = dst;
739
740                 args.m = m;
741                 args.next_hop = dst;
742                 args.oif = ifp;
743                 off = ip_fw_chk_ptr(&args);
744                 m = args.m;
745                 dst = args.next_hop;
746
747                 /*
748                  * On return we must do the following:
749                  * m == NULL    -> drop the pkt (old interface, deprecated)
750                  * (off & IP_FW_PORT_DENY_FLAG) -> drop the pkt (new interface)
751                  * 1<=off<= 0xffff              -> DIVERT
752                  * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe
753                  * (off & IP_FW_PORT_TEE_FLAG)  -> TEE the packet
754                  * dst != old                   -> IPFIREWALL_FORWARD
755                  * off==0, dst==old             -> accept
756                  * If some of the above modules are not compiled in, then
757                  * we shouldn't have to check the corresponding condition
758                  * (because the ipfw control socket should not accept
759                  * unsupported rules), but better play safe and drop
760                  * packets in case of doubt.
761                  */
762                 if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
763                         if (m)
764                                 m_freem(m);
765                         error = EACCES;
766                         goto done;
767                 }
768                 ip = mtod(m, struct ip *);
769                 if (off == 0 && dst == old)             /* common case */
770                         goto pass;
771                 if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
772                         /*
773                          * pass the pkt to dummynet. Need to include
774                          * pipe number, m, ifp, ro, dst because these are
775                          * not recomputed in the next pass.
776                          * All other parameters have been already used and
777                          * so they are not needed anymore. 
778                          * XXX note: if the ifp or ro entry are deleted
779                          * while a pkt is in dummynet, we are in trouble!
780                          */ 
781                         args.ro = ro;
782                         args.dst = dst;
783                         args.flags = flags;
784
785                         error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
786                                 &args);
787                         goto done;
788                 }
789 #ifdef IPDIVERT
790                 if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
791                         struct mbuf *clone = NULL;
792
793                         /* Clone packet if we're doing a 'tee' */
794                         if ((off & IP_FW_PORT_TEE_FLAG) != 0)
795                                 clone = m_dup(m, M_DONTWAIT);
796
797                         /*
798                          * XXX
799                          * delayed checksums are not currently compatible
800                          * with divert sockets.
801                          */
802                         if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
803                                 in_delayed_cksum(m);
804                                 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
805                         }
806
807                         /* Restore packet header fields to original values */
808                         ip->ip_len = htons(ip->ip_len);
809                         ip->ip_off = htons(ip->ip_off);
810
811                         /* Deliver packet to divert input routine */
812                         divert_packet(m, 0, off & 0xffff, args.divert_rule);
813
814                         /* If 'tee', continue with original packet */
815                         if (clone != NULL) {
816                                 m = clone;
817                                 ip = mtod(m, struct ip *);
818                                 goto pass;
819                         }
820                         goto done;
821                 }
822 #endif
823
824                 /* IPFIREWALL_FORWARD */
825                 /*
826                  * Check dst to make sure it is directly reachable on the
827                  * interface we previously thought it was.
828                  * If it isn't (which may be likely in some situations) we have
829                  * to re-route it (ie, find a route for the next-hop and the
830                  * associated interface) and set them here. This is nested
831                  * forwarding which in most cases is undesirable, except where
832                  * such control is nigh impossible. So we do it here.
833                  * And I'm babbling.
834                  */
835                 if (off == 0 && old != dst) { /* FORWARD, dst has changed */
836 #if 0
837                         /*
838                          * XXX To improve readability, this block should be
839                          * changed into a function call as below:
840                          */
841                         error = ip_ipforward(&m, &dst, &ifp);
842                         if (error)
843                                 goto bad;
844                         if (m == NULL) /* ip_input consumed the mbuf */
845                                 goto done;
846 #else
847                         struct in_ifaddr *ia;
848
849                         /*
850                          * XXX sro_fwd below is static, and a pointer
851                          * to it gets passed to routines downstream.
852                          * This could have surprisingly bad results in
853                          * practice, because its content is overwritten
854                          * by subsequent packets.
855                          */
856                         /* There must be a better way to do this next line... */
857                         static struct route sro_fwd;
858                         struct route *ro_fwd = &sro_fwd;
859
860 #if 0
861                         print_ip("IPFIREWALL_FORWARD: New dst ip: ",
862                             dst->sin_addr, "\n");
863 #endif
864
865                         /*
866                          * We need to figure out if we have been forwarded
867                          * to a local socket. If so, then we should somehow 
868                          * "loop back" to ip_input, and get directed to the
869                          * PCB as if we had received this packet. This is
870                          * because it may be difficult to identify the packets
871                          * you want to forward until they are being output
872                          * and have selected an interface. (e.g. locally
873                          * initiated packets) If we used the loopback interface,
874                          * we would not be able to control what happens 
875                          * as the packet runs through ip_input() as
876                          * it is done through a ISR.
877                          */
878                         LIST_FOREACH(ia,
879                             INADDR_HASH(dst->sin_addr.s_addr), ia_hash) {
880                                 /*
881                                  * If the addr to forward to is one
882                                  * of ours, we pretend to
883                                  * be the destination for this packet.
884                                  */
885                                 if (IA_SIN(ia)->sin_addr.s_addr ==
886                                                  dst->sin_addr.s_addr)
887                                         break;
888                         }
889                         if (ia) {       /* tell ip_input "dont filter" */
890                                 struct m_hdr tag;
891
892                                 tag.mh_type = MT_TAG;
893                                 tag.mh_flags = PACKET_TAG_IPFORWARD;
894                                 tag.mh_data = (caddr_t)args.next_hop;
895                                 tag.mh_next = m;
896
897                                 if (m->m_pkthdr.rcvif == NULL)
898                                         m->m_pkthdr.rcvif = ifunit("lo0");
899                                 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
900                                         m->m_pkthdr.csum_flags |=
901                                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
902                                         m0->m_pkthdr.csum_data = 0xffff;
903                                 }
904                                 m->m_pkthdr.csum_flags |=
905                                     CSUM_IP_CHECKED | CSUM_IP_VALID;
906                                 ip->ip_len = htons(ip->ip_len);
907                                 ip->ip_off = htons(ip->ip_off);
908                                 ip_input((struct mbuf *)&tag);
909                                 goto done;
910                         }
911                         /* Some of the logic for this was
912                          * nicked from above.
913                          *
914                          * This rewrites the cached route in a local PCB.
915                          * Is this what we want to do?
916                          */
917                         bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
918
919                         ro_fwd->ro_rt = 0;
920                         rtalloc_ign(ro_fwd, RTF_PRCLONING);
921
922                         if (ro_fwd->ro_rt == 0) {
923                                 ipstat.ips_noroute++;
924                                 error = EHOSTUNREACH;
925                                 goto bad;
926                         }
927
928                         ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
929                         ifp = ro_fwd->ro_rt->rt_ifp;
930                         ro_fwd->ro_rt->rt_use++;
931                         if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
932                                 dst = (struct sockaddr_in *)
933                                         ro_fwd->ro_rt->rt_gateway;
934                         if (ro_fwd->ro_rt->rt_flags & RTF_HOST)
935                                 isbroadcast =
936                                     (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
937                         else
938                                 isbroadcast = in_broadcast(dst->sin_addr, ifp);
939                         if (ro->ro_rt)
940                                 RTFREE(ro->ro_rt);
941                         ro->ro_rt = ro_fwd->ro_rt;
942                         dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
943
944 #endif  /* ... block to be put into a function */
945                         /*
946                          * If we added a default src ip earlier,
947                          * which would have been gotten from the-then
948                          * interface, do it again, from the new one.
949                          */
950                         if (src_was_INADDR_ANY)
951                                 ip->ip_src = IA_SIN(ia)->sin_addr;
952                         goto pass ;
953                 }
954
955                 /*
956                  * if we get here, none of the above matches, and 
957                  * we have to drop the pkt
958                  */
959                 m_freem(m);
960                 error = EACCES; /* not sure this is the right error msg */
961                 goto done;
962         }
963
964 pass:
965         /* 127/8 must not appear on wire - RFC1122. */
966         if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
967             (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
968                 if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
969                         ipstat.ips_badaddr++;
970                         error = EADDRNOTAVAIL;
971                         goto bad;
972                 }
973         }
974
975         m->m_pkthdr.csum_flags |= CSUM_IP;
976         sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
977         if (sw_csum & CSUM_DELAY_DATA) {
978                 in_delayed_cksum(m);
979                 sw_csum &= ~CSUM_DELAY_DATA;
980         }
981         m->m_pkthdr.csum_flags &= ifp->if_hwassist;
982
983         /*
984          * If small enough for interface, or the interface will take
985          * care of the fragmentation for us, can just send directly.
986          */
987         if (ip->ip_len <= ifp->if_mtu || ifp->if_hwassist & CSUM_FRAGMENT) {
988                 ip->ip_len = htons(ip->ip_len);
989                 ip->ip_off = htons(ip->ip_off);
990                 ip->ip_sum = 0;
991                 if (sw_csum & CSUM_DELAY_IP) {
992                         if (ip->ip_vhl == IP_VHL_BORING) {
993                                 ip->ip_sum = in_cksum_hdr(ip);
994                         } else {
995                                 ip->ip_sum = in_cksum(m, hlen);
996                         }
997                 }
998
999                 /* Record statistics for this interface address. */
1000                 if (!(flags & IP_FORWARDING) && ia) {
1001                         ia->ia_ifa.if_opackets++;
1002                         ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1003                 }
1004
1005 #ifdef IPSEC
1006                 /* clean ipsec history once it goes out of the node */
1007                 ipsec_delaux(m);
1008 #endif
1009
1010 #ifdef MBUF_STRESS_TEST
1011                 if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) {
1012                         struct mbuf *m1, *m2;
1013                         int length, tmp;
1014
1015                         tmp = length = m->m_pkthdr.len;
1016
1017                         while ((length -= mbuf_frag_size) >= 1) {
1018                                 m1 = m_split(m, length, M_DONTWAIT);
1019                                 if (m1 == NULL)
1020                                         break;
1021                                 m1->m_flags &= ~M_PKTHDR;
1022                                 m2 = m;
1023                                 while (m2->m_next != NULL)
1024                                         m2 = m2->m_next;
1025                                 m2->m_next = m1;
1026                         }
1027                         m->m_pkthdr.len = tmp;
1028                 }
1029 #endif
1030                 error = (*ifp->if_output)(ifp, m,
1031                                 (struct sockaddr *)dst, ro->ro_rt);
1032                 goto done;
1033         }
1034
1035         if (ip->ip_off & IP_DF) {
1036                 error = EMSGSIZE;
1037                 /*
1038                  * This case can happen if the user changed the MTU
1039                  * of an interface after enabling IP on it.  Because
1040                  * most netifs don't keep track of routes pointing to
1041                  * them, there is no way for one to update all its
1042                  * routes when the MTU is changed.
1043                  */
1044                 if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1045                     !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
1046                     (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
1047                         ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
1048                 }
1049                 ipstat.ips_cantfrag++;
1050                 goto bad;
1051         }
1052
1053         /*
1054          * Too large for interface; fragment if possible. If successful,
1055          * on return, m will point to a list of packets to be sent.
1056          */
1057         error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, sw_csum);
1058         if (error)
1059                 goto bad;
1060         for (; m; m = m0) {
1061                 m0 = m->m_nextpkt;
1062                 m->m_nextpkt = 0;
1063 #ifdef IPSEC
1064                 /* clean ipsec history once it goes out of the node */
1065                 ipsec_delaux(m);
1066 #endif
1067                 if (error == 0) {
1068                         /* Record statistics for this interface address. */
1069                         if (ia != NULL) {
1070                                 ia->ia_ifa.if_opackets++;
1071                                 ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1072                         }
1073                         
1074                         error = (*ifp->if_output)(ifp, m,
1075                             (struct sockaddr *)dst, ro->ro_rt);
1076                 } else
1077                         m_freem(m);
1078         }
1079
1080         if (error == 0)
1081                 ipstat.ips_fragmented++;
1082
1083 done:
1084 #ifdef IPSEC
1085         if (ro == &iproute && ro->ro_rt) {
1086                 RTFREE(ro->ro_rt);
1087                 ro->ro_rt = NULL;
1088         }
1089         if (sp != NULL) {
1090                 KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1091                         printf("DP ip_output call free SP:%p\n", sp));
1092                 key_freesp(sp);
1093         }
1094 #endif
1095 #ifdef FAST_IPSEC
1096         if (ro == &iproute && ro->ro_rt) {
1097                 RTFREE(ro->ro_rt);
1098                 ro->ro_rt = NULL;
1099         }
1100         if (sp != NULL)
1101                 KEY_FREESP(&sp);
1102 #endif
1103         return (error);
1104 bad:
1105         m_freem(m);
1106         goto done;
1107 }
1108
1109 /*
1110  * Create a chain of fragments which fit the given mtu. m_frag points to the
1111  * mbuf to be fragmented; on return it points to the chain with the fragments.
1112  * Return 0 if no error. If error, m_frag may contain a partially built
1113  * chain of fragments that should be freed by the caller.
1114  *
1115  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
1116  * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
1117  */
1118 int
1119 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
1120             u_long if_hwassist_flags, int sw_csum)
1121 {
1122         int error = 0;
1123         int hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1124         int len = (mtu - hlen) & ~7;    /* size of payload in each fragment */
1125         int off;
1126         struct mbuf *m0 = *m_frag;      /* the original packet          */
1127         int firstlen;
1128         struct mbuf **mnext;
1129         int nfrags;
1130
1131         if (ip->ip_off & IP_DF) {       /* Fragmentation not allowed */
1132                 ipstat.ips_cantfrag++;
1133                 return EMSGSIZE;
1134         }
1135
1136         /*
1137          * Must be able to put at least 8 bytes per fragment.
1138          */
1139         if (len < 8)
1140                 return EMSGSIZE;
1141
1142         /*
1143          * If the interface will not calculate checksums on
1144          * fragmented packets, then do it here.
1145          */
1146         if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
1147             (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
1148                 in_delayed_cksum(m0);
1149                 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1150         }
1151
1152         if (len > PAGE_SIZE) {
1153                 /* 
1154                  * Fragment large datagrams such that each segment 
1155                  * contains a multiple of PAGE_SIZE amount of data, 
1156                  * plus headers. This enables a receiver to perform 
1157                  * page-flipping zero-copy optimizations.
1158                  *
1159                  * XXX When does this help given that sender and receiver
1160                  * could have different page sizes, and also mtu could
1161                  * be less than the receiver's page size ?
1162                  */
1163                 int newlen;
1164                 struct mbuf *m;
1165
1166                 for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
1167                         off += m->m_len;
1168
1169                 /*
1170                  * firstlen (off - hlen) must be aligned on an 
1171                  * 8-byte boundary
1172                  */
1173                 if (off < hlen)
1174                         goto smart_frag_failure;
1175                 off = ((off - hlen) & ~7) + hlen;
1176                 newlen = (~PAGE_MASK) & mtu;
1177                 if ((newlen + sizeof (struct ip)) > mtu) {
1178                         /* we failed, go back the default */
1179 smart_frag_failure:
1180                         newlen = len;
1181                         off = hlen + len;
1182                 }
1183                 len = newlen;
1184
1185         } else {
1186                 off = hlen + len;
1187         }
1188
1189         firstlen = off - hlen;
1190         mnext = &m0->m_nextpkt;         /* pointer to next packet */
1191
1192         /*
1193          * Loop through length of segment after first fragment,
1194          * make new header and copy data of each part and link onto chain.
1195          * Here, m0 is the original packet, m is the fragment being created.
1196          * The fragments are linked off the m_nextpkt of the original
1197          * packet, which after processing serves as the first fragment.
1198          */
1199         for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
1200                 struct ip *mhip;        /* ip header on the fragment */
1201                 struct mbuf *m;
1202                 int mhlen = sizeof (struct ip);
1203
1204                 MGETHDR(m, M_DONTWAIT, MT_HEADER);
1205                 if (m == 0) {
1206                         error = ENOBUFS;
1207                         ipstat.ips_odropped++;
1208                         goto done;
1209                 }
1210                 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1211                 /*
1212                  * In the first mbuf, leave room for the link header, then
1213                  * copy the original IP header including options. The payload
1214                  * goes into an additional mbuf chain returned by m_copy().
1215                  */
1216                 m->m_data += max_linkhdr;
1217                 mhip = mtod(m, struct ip *);
1218                 *mhip = *ip;
1219                 if (hlen > sizeof (struct ip)) {
1220                         mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
1221                         mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
1222                 }
1223                 m->m_len = mhlen;
1224                 /* XXX do we need to add ip->ip_off below ? */
1225                 mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
1226                 if (off + len >= ip->ip_len) {  /* last fragment */
1227                         len = ip->ip_len - off;
1228                         m->m_flags |= M_LASTFRAG;
1229                 } else
1230                         mhip->ip_off |= IP_MF;
1231                 mhip->ip_len = htons((u_short)(len + mhlen));
1232                 m->m_next = m_copy(m0, off, len);
1233                 if (m->m_next == 0) {           /* copy failed */
1234                         m_free(m);
1235                         error = ENOBUFS;        /* ??? */
1236                         ipstat.ips_odropped++;
1237                         goto done;
1238                 }
1239                 m->m_pkthdr.len = mhlen + len;
1240                 m->m_pkthdr.rcvif = (struct ifnet *)0;
1241                 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1242                 mhip->ip_off = htons(mhip->ip_off);
1243                 mhip->ip_sum = 0;
1244                 if (sw_csum & CSUM_DELAY_IP)
1245                         mhip->ip_sum = in_cksum(m, mhlen);
1246                 *mnext = m;
1247                 mnext = &m->m_nextpkt;
1248         }
1249         ipstat.ips_ofragments += nfrags;
1250
1251         /* set first marker for fragment chain */
1252         m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1253         m0->m_pkthdr.csum_data = nfrags;
1254
1255         /*
1256          * Update first fragment by trimming what's been copied out
1257          * and updating header.
1258          */
1259         m_adj(m0, hlen + firstlen - ip->ip_len);
1260         m0->m_pkthdr.len = hlen + firstlen;
1261         ip->ip_len = htons((u_short)m0->m_pkthdr.len);
1262         ip->ip_off |= IP_MF;
1263         ip->ip_off = htons(ip->ip_off);
1264         ip->ip_sum = 0;
1265         if (sw_csum & CSUM_DELAY_IP)
1266                 ip->ip_sum = in_cksum(m0, hlen);
1267
1268 done:
1269         *m_frag = m0;
1270         return error;
1271 }
1272
1273 void
1274 in_delayed_cksum(struct mbuf *m)
1275 {
1276         struct ip *ip;
1277         u_short csum, offset;
1278
1279         ip = mtod(m, struct ip *);
1280         offset = IP_VHL_HL(ip->ip_vhl) << 2 ;
1281         csum = in_cksum_skip(m, ip->ip_len, offset);
1282         if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
1283                 csum = 0xffff;
1284         offset += m->m_pkthdr.csum_data;        /* checksum offset */
1285
1286         if (offset + sizeof(u_short) > m->m_len) {
1287                 printf("delayed m_pullup, m->len: %d  off: %d  p: %d\n",
1288                     m->m_len, offset, ip->ip_p);
1289                 /*
1290                  * XXX
1291                  * this shouldn't happen, but if it does, the
1292                  * correct behavior may be to insert the checksum
1293                  * in the existing chain instead of rearranging it.
1294                  */
1295                 m = m_pullup(m, offset + sizeof(u_short));
1296         }
1297         *(u_short *)(m->m_data + offset) = csum;
1298 }
1299
1300 /*
1301  * Insert IP options into preformed packet.
1302  * Adjust IP destination as required for IP source routing,
1303  * as indicated by a non-zero in_addr at the start of the options.
1304  *
1305  * XXX This routine assumes that the packet has no options in place.
1306  */
1307 static struct mbuf *
1308 ip_insertoptions(m, opt, phlen)
1309         struct mbuf *m;
1310         struct mbuf *opt;
1311         int *phlen;
1312 {
1313         struct ipoption *p = mtod(opt, struct ipoption *);
1314         struct mbuf *n;
1315         struct ip *ip = mtod(m, struct ip *);
1316         unsigned optlen;
1317
1318         optlen = opt->m_len - sizeof(p->ipopt_dst);
1319         if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) {
1320                 *phlen = 0;
1321                 return (m);             /* XXX should fail */
1322         }
1323         if (p->ipopt_dst.s_addr)
1324                 ip->ip_dst = p->ipopt_dst;
1325         if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
1326                 MGETHDR(n, M_DONTWAIT, MT_HEADER);
1327                 if (n == 0) {
1328                         *phlen = 0;
1329                         return (m);
1330                 }
1331                 n->m_pkthdr.rcvif = (struct ifnet *)0;
1332                 n->m_pkthdr.len = m->m_pkthdr.len + optlen;
1333                 m->m_len -= sizeof(struct ip);
1334                 m->m_data += sizeof(struct ip);
1335                 n->m_next = m;
1336                 m = n;
1337                 m->m_len = optlen + sizeof(struct ip);
1338                 m->m_data += max_linkhdr;
1339                 (void)memcpy(mtod(m, void *), ip, sizeof(struct ip));
1340         } else {
1341                 m->m_data -= optlen;
1342                 m->m_len += optlen;
1343                 m->m_pkthdr.len += optlen;
1344                 ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
1345         }
1346         ip = mtod(m, struct ip *);
1347         bcopy(p->ipopt_list, ip + 1, optlen);
1348         *phlen = sizeof(struct ip) + optlen;
1349         ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
1350         ip->ip_len += optlen;
1351         return (m);
1352 }
1353
1354 /*
1355  * Copy options from ip to jp,
1356  * omitting those not copied during fragmentation.
1357  */
1358 int
1359 ip_optcopy(ip, jp)
1360         struct ip *ip, *jp;
1361 {
1362         u_char *cp, *dp;
1363         int opt, optlen, cnt;
1364
1365         cp = (u_char *)(ip + 1);
1366         dp = (u_char *)(jp + 1);
1367         cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
1368         for (; cnt > 0; cnt -= optlen, cp += optlen) {
1369                 opt = cp[0];
1370                 if (opt == IPOPT_EOL)
1371                         break;
1372                 if (opt == IPOPT_NOP) {
1373                         /* Preserve for IP mcast tunnel's LSRR alignment. */
1374                         *dp++ = IPOPT_NOP;
1375                         optlen = 1;
1376                         continue;
1377                 }
1378
1379                 KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
1380                     ("ip_optcopy: malformed ipv4 option"));
1381                 optlen = cp[IPOPT_OLEN];
1382                 KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
1383                     ("ip_optcopy: malformed ipv4 option"));
1384
1385                 /* bogus lengths should have been caught by ip_dooptions */
1386                 if (optlen > cnt)
1387                         optlen = cnt;
1388                 if (IPOPT_COPIED(opt)) {
1389                         bcopy(cp, dp, optlen);
1390                         dp += optlen;
1391                 }
1392         }
1393         for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
1394                 *dp++ = IPOPT_EOL;
1395         return (optlen);
1396 }
1397
1398 /*
1399  * IP socket option processing.
1400  */
1401 int
1402 ip_ctloutput(so, sopt)
1403         struct socket *so;
1404         struct sockopt *sopt;
1405 {
1406         struct  inpcb *inp = sotoinpcb(so);
1407         int     error, optval;
1408
1409         error = optval = 0;
1410         if (sopt->sopt_level != IPPROTO_IP) {
1411                 return (EINVAL);
1412         }
1413
1414         switch (sopt->sopt_dir) {
1415         case SOPT_SET:
1416                 switch (sopt->sopt_name) {
1417                 case IP_OPTIONS:
1418 #ifdef notyet
1419                 case IP_RETOPTS:
1420 #endif
1421                 {
1422                         struct mbuf *m;
1423                         if (sopt->sopt_valsize > MLEN) {
1424                                 error = EMSGSIZE;
1425                                 break;
1426                         }
1427                         MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_HEADER);
1428                         if (m == 0) {
1429                                 error = ENOBUFS;
1430                                 break;
1431                         }
1432                         m->m_len = sopt->sopt_valsize;
1433                         error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
1434                                             m->m_len);
1435                         
1436                         return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
1437                                            m));
1438                 }
1439
1440                 case IP_TOS:
1441                 case IP_TTL:
1442                 case IP_RECVOPTS:
1443                 case IP_RECVRETOPTS:
1444                 case IP_RECVDSTADDR:
1445                 case IP_RECVIF:
1446                 case IP_FAITH:
1447                         error = sooptcopyin(sopt, &optval, sizeof optval,
1448                                             sizeof optval);
1449                         if (error)
1450                                 break;
1451
1452                         switch (sopt->sopt_name) {
1453                         case IP_TOS:
1454                                 inp->inp_ip_tos = optval;
1455                                 break;
1456
1457                         case IP_TTL:
1458                                 inp->inp_ip_ttl = optval;
1459                                 break;
1460 #define OPTSET(bit) \
1461         if (optval) \
1462                 inp->inp_flags |= bit; \
1463         else \
1464                 inp->inp_flags &= ~bit;
1465
1466                         case IP_RECVOPTS:
1467                                 OPTSET(INP_RECVOPTS);
1468                                 break;
1469
1470                         case IP_RECVRETOPTS:
1471                                 OPTSET(INP_RECVRETOPTS);
1472                                 break;
1473
1474                         case IP_RECVDSTADDR:
1475                                 OPTSET(INP_RECVDSTADDR);
1476                                 break;
1477
1478                         case IP_RECVIF:
1479                                 OPTSET(INP_RECVIF);
1480                                 break;
1481
1482                         case IP_FAITH:
1483                                 OPTSET(INP_FAITH);
1484                                 break;
1485                         }
1486                         break;
1487 #undef OPTSET
1488
1489                 case IP_MULTICAST_IF:
1490                 case IP_MULTICAST_VIF:
1491                 case IP_MULTICAST_TTL:
1492                 case IP_MULTICAST_LOOP:
1493                 case IP_ADD_MEMBERSHIP:
1494                 case IP_DROP_MEMBERSHIP:
1495                         error = ip_setmoptions(sopt, &inp->inp_moptions);
1496                         break;
1497
1498                 case IP_PORTRANGE:
1499                         error = sooptcopyin(sopt, &optval, sizeof optval,
1500                                             sizeof optval);
1501                         if (error)
1502                                 break;
1503
1504                         switch (optval) {
1505                         case IP_PORTRANGE_DEFAULT:
1506                                 inp->inp_flags &= ~(INP_LOWPORT);
1507                                 inp->inp_flags &= ~(INP_HIGHPORT);
1508                                 break;
1509
1510                         case IP_PORTRANGE_HIGH:
1511                                 inp->inp_flags &= ~(INP_LOWPORT);
1512                                 inp->inp_flags |= INP_HIGHPORT;
1513                                 break;
1514
1515                         case IP_PORTRANGE_LOW:
1516                                 inp->inp_flags &= ~(INP_HIGHPORT);
1517                                 inp->inp_flags |= INP_LOWPORT;
1518                                 break;
1519
1520                         default:
1521                                 error = EINVAL;
1522                                 break;
1523                         }
1524                         break;
1525
1526 #if defined(IPSEC) || defined(FAST_IPSEC)
1527                 case IP_IPSEC_POLICY:
1528                 {
1529                         caddr_t req;
1530                         size_t len = 0;
1531                         int priv;
1532                         struct mbuf *m;
1533                         int optname;
1534
1535                         if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
1536                                 break;
1537                         if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
1538                                 break;
1539                         priv = (sopt->sopt_td != NULL &&
1540                                 suser(sopt->sopt_td) != 0) ? 0 : 1;
1541                         req = mtod(m, caddr_t);
1542                         len = m->m_len;
1543                         optname = sopt->sopt_name;
1544                         error = ipsec4_set_policy(inp, optname, req, len, priv);
1545                         m_freem(m);
1546                         break;
1547                 }
1548 #endif /*IPSEC*/
1549
1550                 default:
1551                         error = ENOPROTOOPT;
1552                         break;
1553                 }
1554                 break;
1555
1556         case SOPT_GET:
1557                 switch (sopt->sopt_name) {
1558                 case IP_OPTIONS:
1559                 case IP_RETOPTS:
1560                         if (inp->inp_options)
1561                                 error = sooptcopyout(sopt, 
1562                                                      mtod(inp->inp_options,
1563                                                           char *),
1564                                                      inp->inp_options->m_len);
1565                         else
1566                                 sopt->sopt_valsize = 0;
1567                         break;
1568
1569                 case IP_TOS:
1570                 case IP_TTL:
1571                 case IP_RECVOPTS:
1572                 case IP_RECVRETOPTS:
1573                 case IP_RECVDSTADDR:
1574                 case IP_RECVIF:
1575                 case IP_PORTRANGE:
1576                 case IP_FAITH:
1577                         switch (sopt->sopt_name) {
1578
1579                         case IP_TOS:
1580                                 optval = inp->inp_ip_tos;
1581                                 break;
1582
1583                         case IP_TTL:
1584                                 optval = inp->inp_ip_ttl;
1585                                 break;
1586
1587 #define OPTBIT(bit)     (inp->inp_flags & bit ? 1 : 0)
1588
1589                         case IP_RECVOPTS:
1590                                 optval = OPTBIT(INP_RECVOPTS);
1591                                 break;
1592
1593                         case IP_RECVRETOPTS:
1594                                 optval = OPTBIT(INP_RECVRETOPTS);
1595                                 break;
1596
1597                         case IP_RECVDSTADDR:
1598                                 optval = OPTBIT(INP_RECVDSTADDR);
1599                                 break;
1600
1601                         case IP_RECVIF:
1602                                 optval = OPTBIT(INP_RECVIF);
1603                                 break;
1604
1605                         case IP_PORTRANGE:
1606                                 if (inp->inp_flags & INP_HIGHPORT)
1607                                         optval = IP_PORTRANGE_HIGH;
1608                                 else if (inp->inp_flags & INP_LOWPORT)
1609                                         optval = IP_PORTRANGE_LOW;
1610                                 else
1611                                         optval = 0;
1612                                 break;
1613
1614                         case IP_FAITH:
1615                                 optval = OPTBIT(INP_FAITH);
1616                                 break;
1617                         }
1618                         error = sooptcopyout(sopt, &optval, sizeof optval);
1619                         break;
1620
1621                 case IP_MULTICAST_IF:
1622                 case IP_MULTICAST_VIF:
1623                 case IP_MULTICAST_TTL:
1624                 case IP_MULTICAST_LOOP:
1625                 case IP_ADD_MEMBERSHIP:
1626                 case IP_DROP_MEMBERSHIP:
1627                         error = ip_getmoptions(sopt, inp->inp_moptions);
1628                         break;
1629
1630 #if defined(IPSEC) || defined(FAST_IPSEC)
1631                 case IP_IPSEC_POLICY:
1632                 {
1633                         struct mbuf *m = NULL;
1634                         caddr_t req = NULL;
1635                         size_t len = 0;
1636
1637                         if (m != 0) {
1638                                 req = mtod(m, caddr_t);
1639                                 len = m->m_len;
1640                         }
1641                         error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
1642                         if (error == 0)
1643                                 error = soopt_mcopyout(sopt, m); /* XXX */
1644                         if (error == 0)
1645                                 m_freem(m);
1646                         break;
1647                 }
1648 #endif /*IPSEC*/
1649
1650                 default:
1651                         error = ENOPROTOOPT;
1652                         break;
1653                 }
1654                 break;
1655         }
1656         return (error);
1657 }
1658
1659 /*
1660  * Set up IP options in pcb for insertion in output packets.
1661  * Store in mbuf with pointer in pcbopt, adding pseudo-option
1662  * with destination address if source routed.
1663  */
1664 static int
1665 ip_pcbopts(optname, pcbopt, m)
1666         int optname;
1667         struct mbuf **pcbopt;
1668         struct mbuf *m;
1669 {
1670         int cnt, optlen;
1671         u_char *cp;
1672         u_char opt;
1673
1674         /* turn off any old options */
1675         if (*pcbopt)
1676                 (void)m_free(*pcbopt);
1677         *pcbopt = 0;
1678         if (m == (struct mbuf *)0 || m->m_len == 0) {
1679                 /*
1680                  * Only turning off any previous options.
1681                  */
1682                 if (m)
1683                         (void)m_free(m);
1684                 return (0);
1685         }
1686
1687         if (m->m_len % sizeof(int32_t))
1688                 goto bad;
1689         /*
1690          * IP first-hop destination address will be stored before
1691          * actual options; move other options back
1692          * and clear it when none present.
1693          */
1694         if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
1695                 goto bad;
1696         cnt = m->m_len;
1697         m->m_len += sizeof(struct in_addr);
1698         cp = mtod(m, u_char *) + sizeof(struct in_addr);
1699         ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
1700         bzero(mtod(m, caddr_t), sizeof(struct in_addr));
1701
1702         for (; cnt > 0; cnt -= optlen, cp += optlen) {
1703                 opt = cp[IPOPT_OPTVAL];
1704                 if (opt == IPOPT_EOL)
1705                         break;
1706                 if (opt == IPOPT_NOP)
1707                         optlen = 1;
1708                 else {
1709                         if (cnt < IPOPT_OLEN + sizeof(*cp))
1710                                 goto bad;
1711                         optlen = cp[IPOPT_OLEN];
1712                         if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
1713                                 goto bad;
1714                 }
1715                 switch (opt) {
1716
1717                 default:
1718                         break;
1719
1720                 case IPOPT_LSRR:
1721                 case IPOPT_SSRR:
1722                         /*
1723                          * user process specifies route as:
1724                          *      ->A->B->C->D
1725                          * D must be our final destination (but we can't
1726                          * check that since we may not have connected yet).
1727                          * A is first hop destination, which doesn't appear in
1728                          * actual IP option, but is stored before the options.
1729                          */
1730                         if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
1731                                 goto bad;
1732                         m->m_len -= sizeof(struct in_addr);
1733                         cnt -= sizeof(struct in_addr);
1734                         optlen -= sizeof(struct in_addr);
1735                         cp[IPOPT_OLEN] = optlen;
1736                         /*
1737                          * Move first hop before start of options.
1738                          */
1739                         bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
1740                             sizeof(struct in_addr));
1741                         /*
1742                          * Then copy rest of options back
1743                          * to close up the deleted entry.
1744                          */
1745                         ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] +
1746                             sizeof(struct in_addr)),
1747                             (caddr_t)&cp[IPOPT_OFFSET+1],
1748                             (unsigned)cnt + sizeof(struct in_addr));
1749                         break;
1750                 }
1751         }
1752         if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
1753                 goto bad;
1754         *pcbopt = m;
1755         return (0);
1756
1757 bad:
1758         (void)m_free(m);
1759         return (EINVAL);
1760 }
1761
1762 /*
1763  * XXX
1764  * The whole multicast option thing needs to be re-thought.
1765  * Several of these options are equally applicable to non-multicast
1766  * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
1767  * standard option (IP_TTL).
1768  */
1769
1770 /*
1771  * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1772  */
1773 static struct ifnet *
1774 ip_multicast_if(a, ifindexp)
1775         struct in_addr *a;
1776         int *ifindexp;
1777 {
1778         int ifindex;
1779         struct ifnet *ifp;
1780
1781         if (ifindexp)
1782                 *ifindexp = 0;
1783         if (ntohl(a->s_addr) >> 24 == 0) {
1784                 ifindex = ntohl(a->s_addr) & 0xffffff;
1785                 if (ifindex < 0 || if_index < ifindex)
1786                         return NULL;
1787                 ifp = ifindex2ifnet[ifindex];
1788                 if (ifindexp)
1789                         *ifindexp = ifindex;
1790         } else {
1791                 INADDR_TO_IFP(*a, ifp);
1792         }
1793         return ifp;
1794 }
1795
1796 /*
1797  * Set the IP multicast options in response to user setsockopt().
1798  */
1799 static int
1800 ip_setmoptions(sopt, imop)
1801         struct sockopt *sopt;
1802         struct ip_moptions **imop;
1803 {
1804         int error = 0;
1805         int i;
1806         struct in_addr addr;
1807         struct ip_mreq mreq;
1808         struct ifnet *ifp;
1809         struct ip_moptions *imo = *imop;
1810         struct route ro;
1811         struct sockaddr_in *dst;
1812         int ifindex;
1813         int s;
1814
1815         if (imo == NULL) {
1816                 /*
1817                  * No multicast option buffer attached to the pcb;
1818                  * allocate one and initialize to default values.
1819                  */
1820                 imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS,
1821                     M_WAITOK);
1822
1823                 if (imo == NULL)
1824                         return (ENOBUFS);
1825                 *imop = imo;
1826                 imo->imo_multicast_ifp = NULL;
1827                 imo->imo_multicast_addr.s_addr = INADDR_ANY;
1828                 imo->imo_multicast_vif = -1;
1829                 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1830                 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1831                 imo->imo_num_memberships = 0;
1832         }
1833
1834         switch (sopt->sopt_name) {
1835         /* store an index number for the vif you wanna use in the send */
1836         case IP_MULTICAST_VIF:
1837                 if (legal_vif_num == 0) {
1838                         error = EOPNOTSUPP;
1839                         break;
1840                 }
1841                 error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
1842                 if (error)
1843                         break;
1844                 if (!legal_vif_num(i) && (i != -1)) {
1845                         error = EINVAL;
1846                         break;
1847                 }
1848                 imo->imo_multicast_vif = i;
1849                 break;
1850
1851         case IP_MULTICAST_IF:
1852                 /*
1853                  * Select the interface for outgoing multicast packets.
1854                  */
1855                 error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
1856                 if (error)
1857                         break;
1858                 /*
1859                  * INADDR_ANY is used to remove a previous selection.
1860                  * When no interface is selected, a default one is
1861                  * chosen every time a multicast packet is sent.
1862                  */
1863                 if (addr.s_addr == INADDR_ANY) {
1864                         imo->imo_multicast_ifp = NULL;
1865                         break;
1866                 }
1867                 /*
1868                  * The selected interface is identified by its local
1869                  * IP address.  Find the interface and confirm that
1870                  * it supports multicasting.
1871                  */
1872                 s = splimp();
1873                 ifp = ip_multicast_if(&addr, &ifindex);
1874                 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1875                         splx(s);
1876                         error = EADDRNOTAVAIL;
1877                         break;
1878                 }
1879                 imo->imo_multicast_ifp = ifp;
1880                 if (ifindex)
1881                         imo->imo_multicast_addr = addr;
1882                 else
1883                         imo->imo_multicast_addr.s_addr = INADDR_ANY;
1884                 splx(s);
1885                 break;
1886
1887         case IP_MULTICAST_TTL:
1888                 /*
1889                  * Set the IP time-to-live for outgoing multicast packets.
1890                  * The original multicast API required a char argument,
1891                  * which is inconsistent with the rest of the socket API.
1892                  * We allow either a char or an int.
1893                  */
1894                 if (sopt->sopt_valsize == 1) {
1895                         u_char ttl;
1896                         error = sooptcopyin(sopt, &ttl, 1, 1);
1897                         if (error)
1898                                 break;
1899                         imo->imo_multicast_ttl = ttl;
1900                 } else {
1901                         u_int ttl;
1902                         error = sooptcopyin(sopt, &ttl, sizeof ttl, 
1903                                             sizeof ttl);
1904                         if (error)
1905                                 break;
1906                         if (ttl > 255)
1907                                 error = EINVAL;
1908                         else
1909                                 imo->imo_multicast_ttl = ttl;
1910                 }
1911                 break;
1912
1913         case IP_MULTICAST_LOOP:
1914                 /*
1915                  * Set the loopback flag for outgoing multicast packets.
1916                  * Must be zero or one.  The original multicast API required a
1917                  * char argument, which is inconsistent with the rest
1918                  * of the socket API.  We allow either a char or an int.
1919                  */
1920                 if (sopt->sopt_valsize == 1) {
1921                         u_char loop;
1922                         error = sooptcopyin(sopt, &loop, 1, 1);
1923                         if (error)
1924                                 break;
1925                         imo->imo_multicast_loop = !!loop;
1926                 } else {
1927                         u_int loop;
1928                         error = sooptcopyin(sopt, &loop, sizeof loop,
1929                                             sizeof loop);
1930                         if (error)
1931                                 break;
1932                         imo->imo_multicast_loop = !!loop;
1933                 }
1934                 break;
1935
1936         case IP_ADD_MEMBERSHIP:
1937                 /*
1938                  * Add a multicast group membership.
1939                  * Group must be a valid IP multicast address.
1940                  */
1941                 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
1942                 if (error)
1943                         break;
1944
1945                 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
1946                         error = EINVAL;
1947                         break;
1948                 }
1949                 s = splimp();
1950                 /*
1951                  * If no interface address was provided, use the interface of
1952                  * the route to the given multicast address.
1953                  */
1954                 if (mreq.imr_interface.s_addr == INADDR_ANY) {
1955                         bzero((caddr_t)&ro, sizeof(ro));
1956                         dst = (struct sockaddr_in *)&ro.ro_dst;
1957                         dst->sin_len = sizeof(*dst);
1958                         dst->sin_family = AF_INET;
1959                         dst->sin_addr = mreq.imr_multiaddr;
1960                         rtalloc(&ro);
1961                         if (ro.ro_rt == NULL) {
1962                                 error = EADDRNOTAVAIL;
1963                                 splx(s);
1964                                 break;
1965                         }
1966                         ifp = ro.ro_rt->rt_ifp;
1967                         rtfree(ro.ro_rt);
1968                 }
1969                 else {
1970                         ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1971                 }
1972
1973                 /*
1974                  * See if we found an interface, and confirm that it
1975                  * supports multicast.
1976                  */
1977                 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1978                         error = EADDRNOTAVAIL;
1979                         splx(s);
1980                         break;
1981                 }
1982                 /*
1983                  * See if the membership already exists or if all the
1984                  * membership slots are full.
1985                  */
1986                 for (i = 0; i < imo->imo_num_memberships; ++i) {
1987                         if (imo->imo_membership[i]->inm_ifp == ifp &&
1988                             imo->imo_membership[i]->inm_addr.s_addr
1989                                                 == mreq.imr_multiaddr.s_addr)
1990                                 break;
1991                 }
1992                 if (i < imo->imo_num_memberships) {
1993                         error = EADDRINUSE;
1994                         splx(s);
1995                         break;
1996                 }
1997                 if (i == IP_MAX_MEMBERSHIPS) {
1998                         error = ETOOMANYREFS;
1999                         splx(s);
2000                         break;
2001                 }
2002                 /*
2003                  * Everything looks good; add a new record to the multicast
2004                  * address list for the given interface.
2005                  */
2006                 if ((imo->imo_membership[i] =
2007                     in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
2008                         error = ENOBUFS;
2009                         splx(s);
2010                         break;
2011                 }
2012                 ++imo->imo_num_memberships;
2013                 splx(s);
2014                 break;
2015
2016         case IP_DROP_MEMBERSHIP:
2017                 /*
2018                  * Drop a multicast group membership.
2019                  * Group must be a valid IP multicast address.
2020                  */
2021                 error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
2022                 if (error)
2023                         break;
2024
2025                 if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
2026                         error = EINVAL;
2027                         break;
2028                 }
2029
2030                 s = splimp();
2031                 /*
2032                  * If an interface address was specified, get a pointer
2033                  * to its ifnet structure.
2034                  */
2035                 if (mreq.imr_interface.s_addr == INADDR_ANY)
2036                         ifp = NULL;
2037                 else {
2038                         ifp = ip_multicast_if(&mreq.imr_interface, NULL);
2039                         if (ifp == NULL) {
2040                                 error = EADDRNOTAVAIL;
2041                                 splx(s);
2042                                 break;
2043                         }
2044                 }
2045                 /*
2046                  * Find the membership in the membership array.
2047                  */
2048                 for (i = 0; i < imo->imo_num_memberships; ++i) {
2049                         if ((ifp == NULL ||
2050                              imo->imo_membership[i]->inm_ifp == ifp) &&
2051                              imo->imo_membership[i]->inm_addr.s_addr ==
2052                              mreq.imr_multiaddr.s_addr)
2053                                 break;
2054                 }
2055                 if (i == imo->imo_num_memberships) {
2056                         error = EADDRNOTAVAIL;
2057                         splx(s);
2058                         break;
2059                 }
2060                 /*
2061                  * Give up the multicast address record to which the
2062                  * membership points.
2063                  */
2064                 in_delmulti(imo->imo_membership[i]);
2065                 /*
2066                  * Remove the gap in the membership array.
2067                  */
2068                 for (++i; i < imo->imo_num_memberships; ++i)
2069                         imo->imo_membership[i-1] = imo->imo_membership[i];
2070                 --imo->imo_num_memberships;
2071                 splx(s);
2072                 break;
2073
2074         default:
2075                 error = EOPNOTSUPP;
2076                 break;
2077         }
2078
2079         /*
2080          * If all options have default values, no need to keep the mbuf.
2081          */
2082         if (imo->imo_multicast_ifp == NULL &&
2083             imo->imo_multicast_vif == -1 &&
2084             imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
2085             imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
2086             imo->imo_num_memberships == 0) {
2087                 free(*imop, M_IPMOPTS);
2088                 *imop = NULL;
2089         }
2090
2091         return (error);
2092 }
2093
2094 /*
2095  * Return the IP multicast options in response to user getsockopt().
2096  */
2097 static int
2098 ip_getmoptions(sopt, imo)
2099         struct sockopt *sopt;
2100         struct ip_moptions *imo;
2101 {
2102         struct in_addr addr;
2103         struct in_ifaddr *ia;
2104         int error, optval;
2105         u_char coptval;
2106
2107         error = 0;
2108         switch (sopt->sopt_name) {
2109         case IP_MULTICAST_VIF: 
2110                 if (imo != NULL)
2111                         optval = imo->imo_multicast_vif;
2112                 else
2113                         optval = -1;
2114                 error = sooptcopyout(sopt, &optval, sizeof optval);
2115                 break;
2116
2117         case IP_MULTICAST_IF:
2118                 if (imo == NULL || imo->imo_multicast_ifp == NULL)
2119                         addr.s_addr = INADDR_ANY;
2120                 else if (imo->imo_multicast_addr.s_addr) {
2121                         /* return the value user has set */
2122                         addr = imo->imo_multicast_addr;
2123                 } else {
2124                         IFP_TO_IA(imo->imo_multicast_ifp, ia);
2125                         addr.s_addr = (ia == NULL) ? INADDR_ANY
2126                                 : IA_SIN(ia)->sin_addr.s_addr;
2127                 }
2128                 error = sooptcopyout(sopt, &addr, sizeof addr);
2129                 break;
2130
2131         case IP_MULTICAST_TTL:
2132                 if (imo == 0)
2133                         optval = coptval = IP_DEFAULT_MULTICAST_TTL;
2134                 else
2135                         optval = coptval = imo->imo_multicast_ttl;
2136                 if (sopt->sopt_valsize == 1)
2137                         error = sooptcopyout(sopt, &coptval, 1);
2138                 else
2139                         error = sooptcopyout(sopt, &optval, sizeof optval);
2140                 break;
2141
2142         case IP_MULTICAST_LOOP:
2143                 if (imo == 0)
2144                         optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
2145                 else
2146                         optval = coptval = imo->imo_multicast_loop;
2147                 if (sopt->sopt_valsize == 1)
2148                         error = sooptcopyout(sopt, &coptval, 1);
2149                 else
2150                         error = sooptcopyout(sopt, &optval, sizeof optval);
2151                 break;
2152
2153         default:
2154                 error = ENOPROTOOPT;
2155                 break;
2156         }
2157         return (error);
2158 }
2159
2160 /*
2161  * Discard the IP multicast options.
2162  */
2163 void
2164 ip_freemoptions(imo)
2165         struct ip_moptions *imo;
2166 {
2167         int i;
2168
2169         if (imo != NULL) {
2170                 for (i = 0; i < imo->imo_num_memberships; ++i)
2171                         in_delmulti(imo->imo_membership[i]);
2172                 free(imo, M_IPMOPTS);
2173         }
2174 }
2175
2176 /*
2177  * Routine called from ip_output() to loop back a copy of an IP multicast
2178  * packet to the input queue of a specified interface.  Note that this
2179  * calls the output routine of the loopback "driver", but with an interface
2180  * pointer that might NOT be a loopback interface -- evil, but easier than
2181  * replicating that code here.
2182  */
2183 static void
2184 ip_mloopback(ifp, m, dst, hlen)
2185         struct ifnet *ifp;
2186         struct mbuf *m;
2187         struct sockaddr_in *dst;
2188         int hlen;
2189 {
2190         struct ip *ip;
2191         struct mbuf *copym;
2192
2193         copym = m_copy(m, 0, M_COPYALL);
2194         if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
2195                 copym = m_pullup(copym, hlen);
2196         if (copym != NULL) {
2197                 /*
2198                  * We don't bother to fragment if the IP length is greater
2199                  * than the interface's MTU.  Can this possibly matter?
2200                  */
2201                 ip = mtod(copym, struct ip *);
2202                 ip->ip_len = htons(ip->ip_len);
2203                 ip->ip_off = htons(ip->ip_off);
2204                 ip->ip_sum = 0;
2205                 if (ip->ip_vhl == IP_VHL_BORING) {
2206                         ip->ip_sum = in_cksum_hdr(ip);
2207                 } else {
2208                         ip->ip_sum = in_cksum(copym, hlen);
2209                 }
2210                 /*
2211                  * NB:
2212                  * It's not clear whether there are any lingering
2213                  * reentrancy problems in other areas which might
2214                  * be exposed by using ip_input directly (in
2215                  * particular, everything which modifies the packet
2216                  * in-place).  Yet another option is using the
2217                  * protosw directly to deliver the looped back
2218                  * packet.  For the moment, we'll err on the side
2219                  * of safety by using if_simloop().
2220                  */
2221 #if 1 /* XXX */
2222                 if (dst->sin_family != AF_INET) {
2223                         printf("ip_mloopback: bad address family %d\n",
2224                                                 dst->sin_family);
2225                         dst->sin_family = AF_INET;
2226                 }
2227 #endif
2228
2229 #ifdef notdef
2230                 copym->m_pkthdr.rcvif = ifp;
2231                 ip_input(copym);
2232 #else
2233                 /* if the checksum hasn't been computed, mark it as valid */
2234                 if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
2235                         copym->m_pkthdr.csum_flags |=
2236                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
2237                         copym->m_pkthdr.csum_data = 0xffff;
2238                 }
2239                 if_simloop(ifp, copym, dst->sin_family, 0);
2240 #endif
2241         }
2242 }