Correct BSD License clause numbering from 1-2-4 to 1-2-3.
[dragonfly.git] / sys / net / if.c
1 /*
2  * Copyright (c) 1980, 1986, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *      @(#)if.c        8.3 (Berkeley) 1/4/94
30  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
31  */
32
33 #include "opt_compat.h"
34 #include "opt_inet6.h"
35 #include "opt_inet.h"
36 #include "opt_ifpoll.h"
37
38 #include <sys/param.h>
39 #include <sys/malloc.h>
40 #include <sys/mbuf.h>
41 #include <sys/systm.h>
42 #include <sys/proc.h>
43 #include <sys/priv.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/socketops.h>
48 #include <sys/protosw.h>
49 #include <sys/kernel.h>
50 #include <sys/ktr.h>
51 #include <sys/mutex.h>
52 #include <sys/sockio.h>
53 #include <sys/syslog.h>
54 #include <sys/sysctl.h>
55 #include <sys/domain.h>
56 #include <sys/thread.h>
57 #include <sys/serialize.h>
58 #include <sys/bus.h>
59
60 #include <sys/thread2.h>
61 #include <sys/msgport2.h>
62 #include <sys/mutex2.h>
63
64 #include <net/if.h>
65 #include <net/if_arp.h>
66 #include <net/if_dl.h>
67 #include <net/if_types.h>
68 #include <net/if_var.h>
69 #include <net/ifq_var.h>
70 #include <net/radix.h>
71 #include <net/route.h>
72 #include <net/if_clone.h>
73 #include <net/netisr2.h>
74 #include <net/netmsg2.h>
75
76 #include <machine/atomic.h>
77 #include <machine/stdarg.h>
78 #include <machine/smp.h>
79
80 #if defined(INET) || defined(INET6)
81 /*XXX*/
82 #include <netinet/in.h>
83 #include <netinet/in_var.h>
84 #include <netinet/if_ether.h>
85 #ifdef INET6
86 #include <netinet6/in6_var.h>
87 #include <netinet6/in6_ifattach.h>
88 #endif
89 #endif
90
91 #if defined(COMPAT_43)
92 #include <emulation/43bsd/43bsd_socket.h>
93 #endif /* COMPAT_43 */
94
95 struct netmsg_ifaddr {
96         struct netmsg_base base;
97         struct ifaddr   *ifa;
98         struct ifnet    *ifp;
99         int             tail;
100 };
101
102 struct ifsubq_stage_head {
103         TAILQ_HEAD(, ifsubq_stage)      stg_head;
104 } __cachealign;
105
106 /*
107  * System initialization
108  */
109 static void     if_attachdomain(void *);
110 static void     if_attachdomain1(struct ifnet *);
111 static int      ifconf(u_long, caddr_t, struct ucred *);
112 static void     ifinit(void *);
113 static void     ifnetinit(void *);
114 static void     if_slowtimo(void *);
115 static void     link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
116 static int      if_rtdel(struct radix_node *, void *);
117
118 /* Helper functions */
119 static void     ifsq_watchdog_reset(struct ifsubq_watchdog *);
120
121 #ifdef INET6
122 /*
123  * XXX: declare here to avoid to include many inet6 related files..
124  * should be more generalized?
125  */
126 extern void     nd6_setmtu(struct ifnet *);
127 #endif
128
129 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
130 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
131
132 static int ifsq_stage_cntmax = 4;
133 TUNABLE_INT("net.link.stage_cntmax", &ifsq_stage_cntmax);
134 SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
135     &ifsq_stage_cntmax, 0, "ifq staging packet count max");
136
137 static int if_stats_compat = 0;
138 SYSCTL_INT(_net_link, OID_AUTO, stats_compat, CTLFLAG_RW,
139     &if_stats_compat, 0, "Compat the old ifnet stats");
140
141 SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL)
142 /* Must be after netisr_init */
143 SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_SECOND, ifnetinit, NULL)
144
145 static  if_com_alloc_t *if_com_alloc[256];
146 static  if_com_free_t *if_com_free[256];
147
148 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
149 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
150 MALLOC_DEFINE(M_IFNET, "ifnet", "interface structure");
151
152 int                     ifqmaxlen = IFQ_MAXLEN;
153 struct ifnethead        ifnet = TAILQ_HEAD_INITIALIZER(ifnet);
154
155 struct callout          if_slowtimo_timer;
156
157 int                     if_index = 0;
158 struct ifnet            **ifindex2ifnet = NULL;
159 static struct thread    ifnet_threads[MAXCPU];
160
161 static struct ifsubq_stage_head ifsubq_stage_heads[MAXCPU];
162
163 #ifdef notyet
164 #define IFQ_KTR_STRING          "ifq=%p"
165 #define IFQ_KTR_ARGS    struct ifaltq *ifq
166 #ifndef KTR_IFQ
167 #define KTR_IFQ                 KTR_ALL
168 #endif
169 KTR_INFO_MASTER(ifq);
170 KTR_INFO(KTR_IFQ, ifq, enqueue, 0, IFQ_KTR_STRING, IFQ_KTR_ARGS);
171 KTR_INFO(KTR_IFQ, ifq, dequeue, 1, IFQ_KTR_STRING, IFQ_KTR_ARGS);
172 #define logifq(name, arg)       KTR_LOG(ifq_ ## name, arg)
173
174 #define IF_START_KTR_STRING     "ifp=%p"
175 #define IF_START_KTR_ARGS       struct ifnet *ifp
176 #ifndef KTR_IF_START
177 #define KTR_IF_START            KTR_ALL
178 #endif
179 KTR_INFO_MASTER(if_start);
180 KTR_INFO(KTR_IF_START, if_start, run, 0,
181          IF_START_KTR_STRING, IF_START_KTR_ARGS);
182 KTR_INFO(KTR_IF_START, if_start, sched, 1,
183          IF_START_KTR_STRING, IF_START_KTR_ARGS);
184 KTR_INFO(KTR_IF_START, if_start, avoid, 2,
185          IF_START_KTR_STRING, IF_START_KTR_ARGS);
186 KTR_INFO(KTR_IF_START, if_start, contend_sched, 3,
187          IF_START_KTR_STRING, IF_START_KTR_ARGS);
188 KTR_INFO(KTR_IF_START, if_start, chase_sched, 4,
189          IF_START_KTR_STRING, IF_START_KTR_ARGS);
190 #define logifstart(name, arg)   KTR_LOG(if_start_ ## name, arg)
191 #endif
192
193 TAILQ_HEAD(, ifg_group) ifg_head = TAILQ_HEAD_INITIALIZER(ifg_head);
194
195 /*
196  * Network interface utility routines.
197  *
198  * Routines with ifa_ifwith* names take sockaddr *'s as
199  * parameters.
200  */
201 /* ARGSUSED*/
202 void
203 ifinit(void *dummy)
204 {
205         struct ifnet *ifp;
206
207         callout_init(&if_slowtimo_timer);
208
209         crit_enter();
210         TAILQ_FOREACH(ifp, &ifnet, if_link) {
211                 if (ifp->if_snd.altq_maxlen == 0) {
212                         if_printf(ifp, "XXX: driver didn't set ifq_maxlen\n");
213                         ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
214                 }
215         }
216         crit_exit();
217
218         if_slowtimo(0);
219 }
220
221 static void
222 ifsq_ifstart_ipifunc(void *arg)
223 {
224         struct ifaltq_subque *ifsq = arg;
225         struct lwkt_msg *lmsg = ifsq_get_ifstart_lmsg(ifsq, mycpuid);
226
227         crit_enter();
228         if (lmsg->ms_flags & MSGF_DONE)
229                 lwkt_sendmsg(netisr_cpuport(mycpuid), lmsg);
230         crit_exit();
231 }
232
233 static __inline void
234 ifsq_stage_remove(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
235 {
236         KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
237         TAILQ_REMOVE(&head->stg_head, stage, stg_link);
238         stage->stg_flags &= ~(IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED);
239         stage->stg_cnt = 0;
240         stage->stg_len = 0;
241 }
242
243 static __inline void
244 ifsq_stage_insert(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
245 {
246         KKASSERT((stage->stg_flags &
247             (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
248         stage->stg_flags |= IFSQ_STAGE_FLAG_QUED;
249         TAILQ_INSERT_TAIL(&head->stg_head, stage, stg_link);
250 }
251
252 /*
253  * Schedule ifnet.if_start on the subqueue owner CPU
254  */
255 static void
256 ifsq_ifstart_schedule(struct ifaltq_subque *ifsq, int force)
257 {
258         int cpu;
259
260         if (!force && curthread->td_type == TD_TYPE_NETISR &&
261             ifsq_stage_cntmax > 0) {
262                 struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
263
264                 stage->stg_cnt = 0;
265                 stage->stg_len = 0;
266                 if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
267                         ifsq_stage_insert(&ifsubq_stage_heads[mycpuid], stage);
268                 stage->stg_flags |= IFSQ_STAGE_FLAG_SCHED;
269                 return;
270         }
271
272         cpu = ifsq_get_cpuid(ifsq);
273         if (cpu != mycpuid)
274                 lwkt_send_ipiq(globaldata_find(cpu), ifsq_ifstart_ipifunc, ifsq);
275         else
276                 ifsq_ifstart_ipifunc(ifsq);
277 }
278
/*
 * Decide whether ifnet.if_start has to be scheduled again for the
 * subqueue.  Returns 1 if it does, 0 if it does not.
 *
 * NOTE:
 * When 0 is returned, the ifnet.if_start subqueue interlock has been
 * released (ifsq_clr_started()); when 1 is returned it is still held.
 */
static __inline int
ifsq_ifstart_need_schedule(struct ifaltq_subque *ifsq, int running)
{
	int recheck;

	/*
	 * Fast path: hardware is able to take packets, the subqueue is
	 * not empty and (with ALTQ) no TBR is configured.
	 */
	recheck = !running || ifsq_is_empty(ifsq);
#ifdef ALTQ
	recheck = recheck || ifsq->ifsq_altq->altq_tbr != NULL;
#endif
	if (!recheck)
		return 1;

	ALTQ_SQ_LOCK(ifsq);
	/*
	 * ifnet.if_start subqueue interlock is released, if:
	 * 1) Hardware can not take any packets, due to
	 *    o  interface is marked down
	 *    o  hardware queue is full (ifsq_is_oactive)
	 *    Under the second situation, hardware interrupt
	 *    or polling(4) will call/schedule ifnet.if_start
	 *    on the subqueue when hardware queue is ready
	 * 2) There is no packet in the subqueue.
	 *    Further ifq_dispatch or ifq_handoff will call/
	 *    schedule ifnet.if_start on the subqueue.
	 * 3) TBR is used and it does not allow further
	 *    dequeueing.
	 *    TBR callout will call ifnet.if_start on the
	 *    subqueue.
	 */
	if (!running || !ifsq_data_ready(ifsq)) {
		ifsq_clr_started(ifsq);
		ALTQ_SQ_UNLOCK(ifsq);
		return 0;
	}
	ALTQ_SQ_UNLOCK(ifsq);

	return 1;
}
318
319 static void
320 ifsq_ifstart_dispatch(netmsg_t msg)
321 {
322         struct lwkt_msg *lmsg = &msg->base.lmsg;
323         struct ifaltq_subque *ifsq = lmsg->u.ms_resultp;
324         struct ifnet *ifp = ifsq_get_ifp(ifsq);
325         int running = 0, need_sched;
326
327         crit_enter();
328         lwkt_replymsg(lmsg, 0); /* reply ASAP */
329         crit_exit();
330
331         if (mycpuid != ifsq_get_cpuid(ifsq)) {
332                 /*
333                  * We need to chase the subqueue owner CPU change.
334                  */
335                 ifsq_ifstart_schedule(ifsq, 1);
336                 return;
337         }
338
339         ifsq_serialize_hw(ifsq);
340         if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
341                 ifp->if_start(ifp, ifsq);
342                 if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
343                         running = 1;
344         }
345         need_sched = ifsq_ifstart_need_schedule(ifsq, running);
346         ifsq_deserialize_hw(ifsq);
347
348         if (need_sched) {
349                 /*
350                  * More data need to be transmitted, ifnet.if_start is
351                  * scheduled on the subqueue owner CPU, and we keep going.
352                  * NOTE: ifnet.if_start subqueue interlock is not released.
353                  */
354                 ifsq_ifstart_schedule(ifsq, 0);
355         }
356 }
357
358 /* Device driver ifnet.if_start helper function */
359 void
360 ifsq_devstart(struct ifaltq_subque *ifsq)
361 {
362         struct ifnet *ifp = ifsq_get_ifp(ifsq);
363         int running = 0;
364
365         ASSERT_ALTQ_SQ_SERIALIZED_HW(ifsq);
366
367         ALTQ_SQ_LOCK(ifsq);
368         if (ifsq_is_started(ifsq) || !ifsq_data_ready(ifsq)) {
369                 ALTQ_SQ_UNLOCK(ifsq);
370                 return;
371         }
372         ifsq_set_started(ifsq);
373         ALTQ_SQ_UNLOCK(ifsq);
374
375         ifp->if_start(ifp, ifsq);
376
377         if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
378                 running = 1;
379
380         if (ifsq_ifstart_need_schedule(ifsq, running)) {
381                 /*
382                  * More data need to be transmitted, ifnet.if_start is
383                  * scheduled on ifnet's CPU, and we keep going.
384                  * NOTE: ifnet.if_start interlock is not released.
385                  */
386                 ifsq_ifstart_schedule(ifsq, 0);
387         }
388 }
389
390 void
391 if_devstart(struct ifnet *ifp)
392 {
393         ifsq_devstart(ifq_get_subq_default(&ifp->if_snd));
394 }
395
/*
 * Device driver ifnet.if_start schedule helper function.
 *
 * Always forces scheduling, i.e. the staging path of
 * ifsq_ifstart_schedule() is bypassed.
 */
void
ifsq_devstart_sched(struct ifaltq_subque *ifsq)
{
	const int force = 1;

	ifsq_ifstart_schedule(ifsq, force);
}
402
403 void
404 if_devstart_sched(struct ifnet *ifp)
405 {
406         ifsq_devstart_sched(ifq_get_subq_default(&ifp->if_snd));
407 }
408
409 static void
410 if_default_serialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
411 {
412         lwkt_serialize_enter(ifp->if_serializer);
413 }
414
415 static void
416 if_default_deserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
417 {
418         lwkt_serialize_exit(ifp->if_serializer);
419 }
420
421 static int
422 if_default_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
423 {
424         return lwkt_serialize_try(ifp->if_serializer);
425 }
426
#ifdef INVARIANTS
/*
 * Default if_serialize_assert method (INVARIANTS kernels only): assert
 * that the interface's single serializer is held when 'serialized' is
 * true and that it is not held otherwise.  The 'slz' selector is
 * ignored.
 */
static void
if_default_serialize_assert(struct ifnet *ifp,
    enum ifnet_serialize slz __unused, boolean_t serialized)
{
	if (!serialized) {
		ASSERT_NOT_SERIALIZED(ifp->if_serializer);
	} else {
		ASSERT_SERIALIZED(ifp->if_serializer);
	}
}
#endif
439
440 /*
441  * Attach an interface to the list of "active" interfaces.
442  *
443  * The serializer is optional.
444  */
445 void
446 if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
447 {
448         unsigned socksize, ifasize;
449         int namelen, masklen;
450         struct sockaddr_dl *sdl;
451         struct ifaddr *ifa;
452         struct ifaltq *ifq;
453         int i, q;
454
455         static int if_indexlim = 8;
456
457         if (ifp->if_serialize != NULL) {
458                 KASSERT(ifp->if_deserialize != NULL &&
459                         ifp->if_tryserialize != NULL &&
460                         ifp->if_serialize_assert != NULL,
461                         ("serialize functions are partially setup"));
462
463                 /*
464                  * If the device supplies serialize functions,
465                  * then clear if_serializer to catch any invalid
466                  * usage of this field.
467                  */
468                 KASSERT(serializer == NULL,
469                         ("both serialize functions and default serializer "
470                          "are supplied"));
471                 ifp->if_serializer = NULL;
472         } else {
473                 KASSERT(ifp->if_deserialize == NULL &&
474                         ifp->if_tryserialize == NULL &&
475                         ifp->if_serialize_assert == NULL,
476                         ("serialize functions are partially setup"));
477                 ifp->if_serialize = if_default_serialize;
478                 ifp->if_deserialize = if_default_deserialize;
479                 ifp->if_tryserialize = if_default_tryserialize;
480 #ifdef INVARIANTS
481                 ifp->if_serialize_assert = if_default_serialize_assert;
482 #endif
483
484                 /*
485                  * The serializer can be passed in from the device,
486                  * allowing the same serializer to be used for both
487                  * the interrupt interlock and the device queue.
488                  * If not specified, the netif structure will use an
489                  * embedded serializer.
490                  */
491                 if (serializer == NULL) {
492                         serializer = &ifp->if_default_serializer;
493                         lwkt_serialize_init(serializer);
494                 }
495                 ifp->if_serializer = serializer;
496         }
497
498         mtx_init(&ifp->if_ioctl_mtx);
499         mtx_lock(&ifp->if_ioctl_mtx);
500
501         TAILQ_INSERT_TAIL(&ifnet, ifp, if_link);
502         ifp->if_index = ++if_index;
503
504         /*
505          * XXX -
506          * The old code would work if the interface passed a pre-existing
507          * chain of ifaddrs to this code.  We don't trust our callers to
508          * properly initialize the tailq, however, so we no longer allow
509          * this unlikely case.
510          */
511         ifp->if_addrheads = kmalloc(ncpus * sizeof(struct ifaddrhead),
512                                     M_IFADDR, M_WAITOK | M_ZERO);
513         for (i = 0; i < ncpus; ++i)
514                 TAILQ_INIT(&ifp->if_addrheads[i]);
515
516         TAILQ_INIT(&ifp->if_prefixhead);
517         TAILQ_INIT(&ifp->if_multiaddrs);
518         TAILQ_INIT(&ifp->if_groups);
519         getmicrotime(&ifp->if_lastchange);
520         if (ifindex2ifnet == NULL || if_index >= if_indexlim) {
521                 unsigned int n;
522                 struct ifnet **q;
523
524                 if_indexlim <<= 1;
525
526                 /* grow ifindex2ifnet */
527                 n = if_indexlim * sizeof(*q);
528                 q = kmalloc(n, M_IFADDR, M_WAITOK | M_ZERO);
529                 if (ifindex2ifnet) {
530                         bcopy(ifindex2ifnet, q, n/2);
531                         kfree(ifindex2ifnet, M_IFADDR);
532                 }
533                 ifindex2ifnet = q;
534         }
535
536         ifindex2ifnet[if_index] = ifp;
537
538         /*
539          * create a Link Level name for this device
540          */
541         namelen = strlen(ifp->if_xname);
542         masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
543         socksize = masklen + ifp->if_addrlen;
544 #define ROUNDUP(a) (1 + (((a) - 1) | (sizeof(long) - 1)))
545         if (socksize < sizeof(*sdl))
546                 socksize = sizeof(*sdl);
547         socksize = ROUNDUP(socksize);
548 #undef ROUNDUP
549         ifasize = sizeof(struct ifaddr) + 2 * socksize;
550         ifa = ifa_create(ifasize, M_WAITOK);
551         sdl = (struct sockaddr_dl *)(ifa + 1);
552         sdl->sdl_len = socksize;
553         sdl->sdl_family = AF_LINK;
554         bcopy(ifp->if_xname, sdl->sdl_data, namelen);
555         sdl->sdl_nlen = namelen;
556         sdl->sdl_index = ifp->if_index;
557         sdl->sdl_type = ifp->if_type;
558         ifp->if_lladdr = ifa;
559         ifa->ifa_ifp = ifp;
560         ifa->ifa_rtrequest = link_rtrequest;
561         ifa->ifa_addr = (struct sockaddr *)sdl;
562         sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
563         ifa->ifa_netmask = (struct sockaddr *)sdl;
564         sdl->sdl_len = masklen;
565         while (namelen != 0)
566                 sdl->sdl_data[--namelen] = 0xff;
567         ifa_iflink(ifa, ifp, 0 /* Insert head */);
568
569         ifp->if_data_pcpu = kmalloc_cachealign(
570             ncpus * sizeof(struct ifdata_pcpu), M_DEVBUF, M_WAITOK | M_ZERO);
571
572         EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
573         devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
574
575         if (ifp->if_mapsubq == NULL)
576                 ifp->if_mapsubq = ifq_mapsubq_default;
577
578         ifq = &ifp->if_snd;
579         ifq->altq_type = 0;
580         ifq->altq_disc = NULL;
581         ifq->altq_flags &= ALTQF_CANTCHANGE;
582         ifq->altq_tbr = NULL;
583         ifq->altq_ifp = ifp;
584
585         if (ifq->altq_subq_cnt <= 0)
586                 ifq->altq_subq_cnt = 1;
587         ifq->altq_subq = kmalloc_cachealign(
588             ifq->altq_subq_cnt * sizeof(struct ifaltq_subque),
589             M_DEVBUF, M_WAITOK | M_ZERO);
590
591         if (ifq->altq_maxlen == 0) {
592                 if_printf(ifp, "driver didn't set ifq_maxlen\n");
593                 ifq_set_maxlen(ifq, ifqmaxlen);
594         }
595
596         for (q = 0; q < ifq->altq_subq_cnt; ++q) {
597                 struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
598
599                 ALTQ_SQ_LOCK_INIT(ifsq);
600                 ifsq->ifsq_index = q;
601
602                 ifsq->ifsq_altq = ifq;
603                 ifsq->ifsq_ifp = ifp;
604
605                 ifsq->ifq_maxlen = ifq->altq_maxlen;
606                 ifsq->ifsq_prepended = NULL;
607                 ifsq->ifsq_started = 0;
608                 ifsq->ifsq_hw_oactive = 0;
609                 ifsq_set_cpuid(ifsq, 0);
610                 if (ifp->if_serializer != NULL)
611                         ifsq_set_hw_serialize(ifsq, ifp->if_serializer);
612
613                 ifsq->ifsq_stage =
614                     kmalloc_cachealign(ncpus * sizeof(struct ifsubq_stage),
615                     M_DEVBUF, M_WAITOK | M_ZERO);
616                 for (i = 0; i < ncpus; ++i)
617                         ifsq->ifsq_stage[i].stg_subq = ifsq;
618
619                 ifsq->ifsq_ifstart_nmsg =
620                     kmalloc(ncpus * sizeof(struct netmsg_base),
621                     M_LWKTMSG, M_WAITOK);
622                 for (i = 0; i < ncpus; ++i) {
623                         netmsg_init(&ifsq->ifsq_ifstart_nmsg[i], NULL,
624                             &netisr_adone_rport, 0, ifsq_ifstart_dispatch);
625                         ifsq->ifsq_ifstart_nmsg[i].lmsg.u.ms_resultp = ifsq;
626                 }
627         }
628         ifq_set_classic(ifq);
629
630         if (!SLIST_EMPTY(&domains))
631                 if_attachdomain1(ifp);
632
633         /* Announce the interface. */
634         rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
635
636         mtx_unlock(&ifp->if_ioctl_mtx);
637 }
638
639 static void
640 if_attachdomain(void *dummy)
641 {
642         struct ifnet *ifp;
643
644         crit_enter();
645         TAILQ_FOREACH(ifp, &ifnet, if_list)
646                 if_attachdomain1(ifp);
647         crit_exit();
648 }
649 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST,
650         if_attachdomain, NULL);
651
652 static void
653 if_attachdomain1(struct ifnet *ifp)
654 {
655         struct domain *dp;
656
657         crit_enter();
658
659         /* address family dependent data region */
660         bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
661         SLIST_FOREACH(dp, &domains, dom_next)
662                 if (dp->dom_ifattach)
663                         ifp->if_afdata[dp->dom_family] =
664                                 (*dp->dom_ifattach)(ifp);
665         crit_exit();
666 }
667
668 /*
669  * Purge all addresses whose type is _not_ AF_LINK
670  */
671 void
672 if_purgeaddrs_nolink(struct ifnet *ifp)
673 {
674         struct ifaddr_container *ifac, *next;
675
676         TAILQ_FOREACH_MUTABLE(ifac, &ifp->if_addrheads[mycpuid],
677                               ifa_link, next) {
678                 struct ifaddr *ifa = ifac->ifa;
679
680                 /* Leave link ifaddr as it is */
681                 if (ifa->ifa_addr->sa_family == AF_LINK)
682                         continue;
683 #ifdef INET
684                 /* XXX: Ugly!! ad hoc just for INET */
685                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
686                         struct ifaliasreq ifr;
687 #ifdef IFADDR_DEBUG_VERBOSE
688                         int i;
689
690                         kprintf("purge in4 addr %p: ", ifa);
691                         for (i = 0; i < ncpus; ++i)
692                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
693                         kprintf("\n");
694 #endif
695
696                         bzero(&ifr, sizeof ifr);
697                         ifr.ifra_addr = *ifa->ifa_addr;
698                         if (ifa->ifa_dstaddr)
699                                 ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
700                         if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
701                                        NULL) == 0)
702                                 continue;
703                 }
704 #endif /* INET */
705 #ifdef INET6
706                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6) {
707 #ifdef IFADDR_DEBUG_VERBOSE
708                         int i;
709
710                         kprintf("purge in6 addr %p: ", ifa);
711                         for (i = 0; i < ncpus; ++i)
712                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
713                         kprintf("\n");
714 #endif
715
716                         in6_purgeaddr(ifa);
717                         /* ifp_addrhead is already updated */
718                         continue;
719                 }
720 #endif /* INET6 */
721                 ifa_ifunlink(ifa, ifp);
722                 ifa_destroy(ifa);
723         }
724 }
725
726 static void
727 ifq_stage_detach_handler(netmsg_t nmsg)
728 {
729         struct ifaltq *ifq = nmsg->lmsg.u.ms_resultp;
730         int q;
731
732         for (q = 0; q < ifq->altq_subq_cnt; ++q) {
733                 struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
734                 struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
735
736                 if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED)
737                         ifsq_stage_remove(&ifsubq_stage_heads[mycpuid], stage);
738         }
739         lwkt_replymsg(&nmsg->lmsg, 0);
740 }
741
742 static void
743 ifq_stage_detach(struct ifaltq *ifq)
744 {
745         struct netmsg_base base;
746         int cpu;
747
748         netmsg_init(&base, NULL, &curthread->td_msgport, 0,
749             ifq_stage_detach_handler);
750         base.lmsg.u.ms_resultp = ifq;
751
752         for (cpu = 0; cpu < ncpus; ++cpu)
753                 lwkt_domsg(netisr_cpuport(cpu), &base.lmsg, 0);
754 }
755
756 /*
757  * Detach an interface, removing it from the
758  * list of "active" interfaces.
759  */
760 void
761 if_detach(struct ifnet *ifp)
762 {
763         struct radix_node_head  *rnh;
764         int i, q;
765         int cpu, origcpu;
766         struct domain *dp;
767
768         EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
769
770         /*
771          * Remove routes and flush queues.
772          */
773         crit_enter();
774 #ifdef IFPOLL_ENABLE
775         if (ifp->if_flags & IFF_NPOLLING)
776                 ifpoll_deregister(ifp);
777 #endif
778         if_down(ifp);
779
780 #ifdef ALTQ
781         if (ifq_is_enabled(&ifp->if_snd))
782                 altq_disable(&ifp->if_snd);
783         if (ifq_is_attached(&ifp->if_snd))
784                 altq_detach(&ifp->if_snd);
785 #endif
786
787         /*
788          * Clean up all addresses.
789          */
790         ifp->if_lladdr = NULL;
791
792         if_purgeaddrs_nolink(ifp);
793         if (!TAILQ_EMPTY(&ifp->if_addrheads[mycpuid])) {
794                 struct ifaddr *ifa;
795
796                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
797                 KASSERT(ifa->ifa_addr->sa_family == AF_LINK,
798                         ("non-link ifaddr is left on if_addrheads"));
799
800                 ifa_ifunlink(ifa, ifp);
801                 ifa_destroy(ifa);
802                 KASSERT(TAILQ_EMPTY(&ifp->if_addrheads[mycpuid]),
803                         ("there are still ifaddrs left on if_addrheads"));
804         }
805
806 #ifdef INET
807         /*
808          * Remove all IPv4 kernel structures related to ifp.
809          */
810         in_ifdetach(ifp);
811 #endif
812
813 #ifdef INET6
814         /*
815          * Remove all IPv6 kernel structs related to ifp.  This should be done
816          * before removing routing entries below, since IPv6 interface direct
817          * routes are expected to be removed by the IPv6-specific kernel API.
818          * Otherwise, the kernel will detect some inconsistency and bark it.
819          */
820         in6_ifdetach(ifp);
821 #endif
822
823         /*
824          * Delete all remaining routes using this interface
825          * Unfortuneatly the only way to do this is to slog through
826          * the entire routing table looking for routes which point
827          * to this interface...oh well...
828          */
829         origcpu = mycpuid;
830         for (cpu = 0; cpu < ncpus; cpu++) {
831                 lwkt_migratecpu(cpu);
832                 for (i = 1; i <= AF_MAX; i++) {
833                         if ((rnh = rt_tables[cpu][i]) == NULL)
834                                 continue;
835                         rnh->rnh_walktree(rnh, if_rtdel, ifp);
836                 }
837         }
838         lwkt_migratecpu(origcpu);
839
840         /* Announce that the interface is gone. */
841         rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
842         devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
843
844         SLIST_FOREACH(dp, &domains, dom_next)
845                 if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
846                         (*dp->dom_ifdetach)(ifp,
847                                 ifp->if_afdata[dp->dom_family]);
848
849         /*
850          * Remove interface from ifindex2ifp[] and maybe decrement if_index.
851          */
852         ifindex2ifnet[ifp->if_index] = NULL;
853         while (if_index > 0 && ifindex2ifnet[if_index] == NULL)
854                 if_index--;
855
856         TAILQ_REMOVE(&ifnet, ifp, if_link);
857         kfree(ifp->if_addrheads, M_IFADDR);
858
859         lwkt_synchronize_ipiqs("if_detach");
860         ifq_stage_detach(&ifp->if_snd);
861
862         for (q = 0; q < ifp->if_snd.altq_subq_cnt; ++q) {
863                 struct ifaltq_subque *ifsq = &ifp->if_snd.altq_subq[q];
864
865                 kfree(ifsq->ifsq_ifstart_nmsg, M_LWKTMSG);
866                 kfree(ifsq->ifsq_stage, M_DEVBUF);
867         }
868         kfree(ifp->if_snd.altq_subq, M_DEVBUF);
869
870         kfree(ifp->if_data_pcpu, M_DEVBUF);
871
872         crit_exit();
873 }
874
875 /*
876  * Create interface group without members
877  */
878 struct ifg_group *
879 if_creategroup(const char *groupname)
880 {
881         struct ifg_group        *ifg = NULL;
882
883         if ((ifg = (struct ifg_group *)kmalloc(sizeof(struct ifg_group),
884             M_TEMP, M_NOWAIT)) == NULL)
885                 return (NULL);
886
887         strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
888         ifg->ifg_refcnt = 0;
889         ifg->ifg_carp_demoted = 0;
890         TAILQ_INIT(&ifg->ifg_members);
891 #if NPF > 0
892         pfi_attach_ifgroup(ifg);
893 #endif
894         TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next);
895
896         return (ifg);
897 }
898
899 /*
900  * Add a group to an interface
901  */
902 int
903 if_addgroup(struct ifnet *ifp, const char *groupname)
904 {
905         struct ifg_list         *ifgl;
906         struct ifg_group        *ifg = NULL;
907         struct ifg_member       *ifgm;
908
909         if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
910             groupname[strlen(groupname) - 1] <= '9')
911                 return (EINVAL);
912
913         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
914                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
915                         return (EEXIST);
916
917         if ((ifgl = kmalloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
918                 return (ENOMEM);
919
920         if ((ifgm = kmalloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
921                 kfree(ifgl, M_TEMP);
922                 return (ENOMEM);
923         }
924
925         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
926                 if (!strcmp(ifg->ifg_group, groupname))
927                         break;
928
929         if (ifg == NULL && (ifg = if_creategroup(groupname)) == NULL) {
930                 kfree(ifgl, M_TEMP);
931                 kfree(ifgm, M_TEMP);
932                 return (ENOMEM);
933         }
934
935         ifg->ifg_refcnt++;
936         ifgl->ifgl_group = ifg;
937         ifgm->ifgm_ifp = ifp;
938
939         TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
940         TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
941
942 #if NPF > 0
943         pfi_group_change(groupname);
944 #endif
945
946         return (0);
947 }
948
949 /*
950  * Remove a group from an interface
951  */
952 int
953 if_delgroup(struct ifnet *ifp, const char *groupname)
954 {
955         struct ifg_list         *ifgl;
956         struct ifg_member       *ifgm;
957
958         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
959                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
960                         break;
961         if (ifgl == NULL)
962                 return (ENOENT);
963
964         TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
965
966         TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
967                 if (ifgm->ifgm_ifp == ifp)
968                         break;
969
970         if (ifgm != NULL) {
971                 TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
972                 kfree(ifgm, M_TEMP);
973         }
974
975         if (--ifgl->ifgl_group->ifg_refcnt == 0) {
976                 TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next);
977 #if NPF > 0
978                 pfi_detach_ifgroup(ifgl->ifgl_group);
979 #endif
980                 kfree(ifgl->ifgl_group, M_TEMP);
981         }
982
983         kfree(ifgl, M_TEMP);
984
985 #if NPF > 0
986         pfi_group_change(groupname);
987 #endif
988
989         return (0);
990 }
991
992 /*
993  * Stores all groups from an interface in memory pointed
994  * to by data
995  */
996 int
997 if_getgroup(caddr_t data, struct ifnet *ifp)
998 {
999         int                      len, error;
1000         struct ifg_list         *ifgl;
1001         struct ifg_req           ifgrq, *ifgp;
1002         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
1003
1004         if (ifgr->ifgr_len == 0) {
1005                 TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1006                         ifgr->ifgr_len += sizeof(struct ifg_req);
1007                 return (0);
1008         }
1009
1010         len = ifgr->ifgr_len;
1011         ifgp = ifgr->ifgr_groups;
1012         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
1013                 if (len < sizeof(ifgrq))
1014                         return (EINVAL);
1015                 bzero(&ifgrq, sizeof ifgrq);
1016                 strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
1017                     sizeof(ifgrq.ifgrq_group));
1018                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1019                     sizeof(struct ifg_req))))
1020                         return (error);
1021                 len -= sizeof(ifgrq);
1022                 ifgp++;
1023         }
1024
1025         return (0);
1026 }
1027
1028 /*
1029  * Stores all members of a group in memory pointed to by data
1030  */
1031 int
1032 if_getgroupmembers(caddr_t data)
1033 {
1034         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
1035         struct ifg_group        *ifg;
1036         struct ifg_member       *ifgm;
1037         struct ifg_req           ifgrq, *ifgp;
1038         int                      len, error;
1039
1040         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1041                 if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
1042                         break;
1043         if (ifg == NULL)
1044                 return (ENOENT);
1045
1046         if (ifgr->ifgr_len == 0) {
1047                 TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
1048                         ifgr->ifgr_len += sizeof(ifgrq);
1049                 return (0);
1050         }
1051
1052         len = ifgr->ifgr_len;
1053         ifgp = ifgr->ifgr_groups;
1054         TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
1055                 if (len < sizeof(ifgrq))
1056                         return (EINVAL);
1057                 bzero(&ifgrq, sizeof ifgrq);
1058                 strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
1059                     sizeof(ifgrq.ifgrq_member));
1060                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1061                     sizeof(struct ifg_req))))
1062                         return (error);
1063                 len -= sizeof(ifgrq);
1064                 ifgp++;
1065         }
1066
1067         return (0);
1068 }
1069
1070 /*
1071  * Delete Routes for a Network Interface
1072  *
1073  * Called for each routing entry via the rnh->rnh_walktree() call above
1074  * to delete all route entries referencing a detaching network interface.
1075  *
1076  * Arguments:
1077  *      rn      pointer to node in the routing table
1078  *      arg     argument passed to rnh->rnh_walktree() - detaching interface
1079  *
1080  * Returns:
1081  *      0       successful
1082  *      errno   failed - reason indicated
1083  *
1084  */
1085 static int
1086 if_rtdel(struct radix_node *rn, void *arg)
1087 {
1088         struct rtentry  *rt = (struct rtentry *)rn;
1089         struct ifnet    *ifp = arg;
1090         int             err;
1091
1092         if (rt->rt_ifp == ifp) {
1093
1094                 /*
1095                  * Protect (sorta) against walktree recursion problems
1096                  * with cloned routes
1097                  */
1098                 if (!(rt->rt_flags & RTF_UP))
1099                         return (0);
1100
1101                 err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1102                                 rt_mask(rt), rt->rt_flags,
1103                                 NULL);
1104                 if (err) {
1105                         log(LOG_WARNING, "if_rtdel: error %d\n", err);
1106                 }
1107         }
1108
1109         return (0);
1110 }
1111
1112 /*
1113  * Locate an interface based on a complete address.
1114  */
1115 struct ifaddr *
1116 ifa_ifwithaddr(struct sockaddr *addr)
1117 {
1118         struct ifnet *ifp;
1119
1120         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1121                 struct ifaddr_container *ifac;
1122
1123                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1124                         struct ifaddr *ifa = ifac->ifa;
1125
1126                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1127                                 continue;
1128                         if (sa_equal(addr, ifa->ifa_addr))
1129                                 return (ifa);
1130                         if ((ifp->if_flags & IFF_BROADCAST) &&
1131                             ifa->ifa_broadaddr &&
1132                             /* IPv6 doesn't have broadcast */
1133                             ifa->ifa_broadaddr->sa_len != 0 &&
1134                             sa_equal(ifa->ifa_broadaddr, addr))
1135                                 return (ifa);
1136                 }
1137         }
1138         return (NULL);
1139 }
1140 /*
1141  * Locate the point to point interface with a given destination address.
1142  */
1143 struct ifaddr *
1144 ifa_ifwithdstaddr(struct sockaddr *addr)
1145 {
1146         struct ifnet *ifp;
1147
1148         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1149                 struct ifaddr_container *ifac;
1150
1151                 if (!(ifp->if_flags & IFF_POINTOPOINT))
1152                         continue;
1153
1154                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1155                         struct ifaddr *ifa = ifac->ifa;
1156
1157                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1158                                 continue;
1159                         if (ifa->ifa_dstaddr &&
1160                             sa_equal(addr, ifa->ifa_dstaddr))
1161                                 return (ifa);
1162                 }
1163         }
1164         return (NULL);
1165 }
1166
1167 /*
1168  * Find an interface on a specific network.  If many, choice
1169  * is most specific found.
1170  */
1171 struct ifaddr *
1172 ifa_ifwithnet(struct sockaddr *addr)
1173 {
1174         struct ifnet *ifp;
1175         struct ifaddr *ifa_maybe = NULL;
1176         u_int af = addr->sa_family;
1177         char *addr_data = addr->sa_data, *cplim;
1178
1179         /*
1180          * AF_LINK addresses can be looked up directly by their index number,
1181          * so do that if we can.
1182          */
1183         if (af == AF_LINK) {
1184                 struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
1185
1186                 if (sdl->sdl_index && sdl->sdl_index <= if_index)
1187                         return (ifindex2ifnet[sdl->sdl_index]->if_lladdr);
1188         }
1189
1190         /*
1191          * Scan though each interface, looking for ones that have
1192          * addresses in this address family.
1193          */
1194         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1195                 struct ifaddr_container *ifac;
1196
1197                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1198                         struct ifaddr *ifa = ifac->ifa;
1199                         char *cp, *cp2, *cp3;
1200
1201                         if (ifa->ifa_addr->sa_family != af)
1202 next:                           continue;
1203                         if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
1204                                 /*
1205                                  * This is a bit broken as it doesn't
1206                                  * take into account that the remote end may
1207                                  * be a single node in the network we are
1208                                  * looking for.
1209                                  * The trouble is that we don't know the
1210                                  * netmask for the remote end.
1211                                  */
1212                                 if (ifa->ifa_dstaddr != NULL &&
1213                                     sa_equal(addr, ifa->ifa_dstaddr))
1214                                         return (ifa);
1215                         } else {
1216                                 /*
1217                                  * if we have a special address handler,
1218                                  * then use it instead of the generic one.
1219                                  */
1220                                 if (ifa->ifa_claim_addr) {
1221                                         if ((*ifa->ifa_claim_addr)(ifa, addr)) {
1222                                                 return (ifa);
1223                                         } else {
1224                                                 continue;
1225                                         }
1226                                 }
1227
1228                                 /*
1229                                  * Scan all the bits in the ifa's address.
1230                                  * If a bit dissagrees with what we are
1231                                  * looking for, mask it with the netmask
1232                                  * to see if it really matters.
1233                                  * (A byte at a time)
1234                                  */
1235                                 if (ifa->ifa_netmask == 0)
1236                                         continue;
1237                                 cp = addr_data;
1238                                 cp2 = ifa->ifa_addr->sa_data;
1239                                 cp3 = ifa->ifa_netmask->sa_data;
1240                                 cplim = ifa->ifa_netmask->sa_len +
1241                                         (char *)ifa->ifa_netmask;
1242                                 while (cp3 < cplim)
1243                                         if ((*cp++ ^ *cp2++) & *cp3++)
1244                                                 goto next; /* next address! */
1245                                 /*
1246                                  * If the netmask of what we just found
1247                                  * is more specific than what we had before
1248                                  * (if we had one) then remember the new one
1249                                  * before continuing to search
1250                                  * for an even better one.
1251                                  */
1252                                 if (ifa_maybe == NULL ||
1253                                     rn_refines((char *)ifa->ifa_netmask,
1254                                                (char *)ifa_maybe->ifa_netmask))
1255                                         ifa_maybe = ifa;
1256                         }
1257                 }
1258         }
1259         return (ifa_maybe);
1260 }
1261
1262 /*
1263  * Find an interface address specific to an interface best matching
1264  * a given address.
1265  */
1266 struct ifaddr *
1267 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
1268 {
1269         struct ifaddr_container *ifac;
1270         char *cp, *cp2, *cp3;
1271         char *cplim;
1272         struct ifaddr *ifa_maybe = NULL;
1273         u_int af = addr->sa_family;
1274
1275         if (af >= AF_MAX)
1276                 return (0);
1277         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1278                 struct ifaddr *ifa = ifac->ifa;
1279
1280                 if (ifa->ifa_addr->sa_family != af)
1281                         continue;
1282                 if (ifa_maybe == NULL)
1283                         ifa_maybe = ifa;
1284                 if (ifa->ifa_netmask == NULL) {
1285                         if (sa_equal(addr, ifa->ifa_addr) ||
1286                             (ifa->ifa_dstaddr != NULL &&
1287                              sa_equal(addr, ifa->ifa_dstaddr)))
1288                                 return (ifa);
1289                         continue;
1290                 }
1291                 if (ifp->if_flags & IFF_POINTOPOINT) {
1292                         if (sa_equal(addr, ifa->ifa_dstaddr))
1293                                 return (ifa);
1294                 } else {
1295                         cp = addr->sa_data;
1296                         cp2 = ifa->ifa_addr->sa_data;
1297                         cp3 = ifa->ifa_netmask->sa_data;
1298                         cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
1299                         for (; cp3 < cplim; cp3++)
1300                                 if ((*cp++ ^ *cp2++) & *cp3)
1301                                         break;
1302                         if (cp3 == cplim)
1303                                 return (ifa);
1304                 }
1305         }
1306         return (ifa_maybe);
1307 }
1308
1309 /*
1310  * Default action when installing a route with a Link Level gateway.
1311  * Lookup an appropriate real ifa to point to.
1312  * This should be moved to /sys/net/link.c eventually.
1313  */
1314 static void
1315 link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
1316 {
1317         struct ifaddr *ifa;
1318         struct sockaddr *dst;
1319         struct ifnet *ifp;
1320
1321         if (cmd != RTM_ADD || (ifa = rt->rt_ifa) == NULL ||
1322             (ifp = ifa->ifa_ifp) == NULL || (dst = rt_key(rt)) == NULL)
1323                 return;
1324         ifa = ifaof_ifpforaddr(dst, ifp);
1325         if (ifa != NULL) {
1326                 IFAFREE(rt->rt_ifa);
1327                 IFAREF(ifa);
1328                 rt->rt_ifa = ifa;
1329                 if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
1330                         ifa->ifa_rtrequest(cmd, rt, info);
1331         }
1332 }
1333
1334 /*
1335  * Mark an interface down and notify protocols of
1336  * the transition.
1337  * NOTE: must be called at splnet or eqivalent.
1338  */
1339 void
1340 if_unroute(struct ifnet *ifp, int flag, int fam)
1341 {
1342         struct ifaddr_container *ifac;
1343
1344         ifp->if_flags &= ~flag;
1345         getmicrotime(&ifp->if_lastchange);
1346         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1347                 struct ifaddr *ifa = ifac->ifa;
1348
1349                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1350                         kpfctlinput(PRC_IFDOWN, ifa->ifa_addr);
1351         }
1352         ifq_purge_all(&ifp->if_snd);
1353         rt_ifmsg(ifp);
1354 }
1355
1356 /*
1357  * Mark an interface up and notify protocols of
1358  * the transition.
1359  * NOTE: must be called at splnet or eqivalent.
1360  */
1361 void
1362 if_route(struct ifnet *ifp, int flag, int fam)
1363 {
1364         struct ifaddr_container *ifac;
1365
1366         ifq_purge_all(&ifp->if_snd);
1367         ifp->if_flags |= flag;
1368         getmicrotime(&ifp->if_lastchange);
1369         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1370                 struct ifaddr *ifa = ifac->ifa;
1371
1372                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1373                         kpfctlinput(PRC_IFUP, ifa->ifa_addr);
1374         }
1375         rt_ifmsg(ifp);
1376 #ifdef INET6
1377         in6_if_up(ifp);
1378 #endif
1379 }
1380
1381 /*
1382  * Mark an interface down and notify protocols of the transition.  An
1383  * interface going down is also considered to be a synchronizing event.
1384  * We must ensure that all packet processing related to the interface
1385  * has completed before we return so e.g. the caller can free the ifnet
1386  * structure that the mbufs may be referencing.
1387  *
1388  * NOTE: must be called at splnet or eqivalent.
1389  */
1390 void
1391 if_down(struct ifnet *ifp)
1392 {
1393         if_unroute(ifp, IFF_UP, AF_UNSPEC);
1394         netmsg_service_sync();
1395 }
1396
1397 /*
1398  * Mark an interface up and notify protocols of
1399  * the transition.
1400  * NOTE: must be called at splnet or eqivalent.
1401  */
1402 void
1403 if_up(struct ifnet *ifp)
1404 {
1405         if_route(ifp, IFF_UP, AF_UNSPEC);
1406 }
1407
1408 /*
1409  * Process a link state change.
1410  * NOTE: must be called at splsoftnet or equivalent.
1411  */
1412 void
1413 if_link_state_change(struct ifnet *ifp)
1414 {
1415         int link_state = ifp->if_link_state;
1416
1417         rt_ifmsg(ifp);
1418         devctl_notify("IFNET", ifp->if_xname,
1419             (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
1420 }
1421
1422 /*
1423  * Handle interface watchdog timer routines.  Called
1424  * from softclock, we decrement timers (if set) and
1425  * call the appropriate interface routine on expiration.
1426  */
1427 static void
1428 if_slowtimo(void *arg)
1429 {
1430         struct ifnet *ifp;
1431
1432         crit_enter();
1433
1434         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1435                 if (if_stats_compat) {
1436                         IFNET_STAT_GET(ifp, ipackets, ifp->if_ipackets);
1437                         IFNET_STAT_GET(ifp, ierrors, ifp->if_ierrors);
1438                         IFNET_STAT_GET(ifp, opackets, ifp->if_opackets);
1439                         IFNET_STAT_GET(ifp, oerrors, ifp->if_oerrors);
1440                         IFNET_STAT_GET(ifp, collisions, ifp->if_collisions);
1441                         IFNET_STAT_GET(ifp, ibytes, ifp->if_ibytes);
1442                         IFNET_STAT_GET(ifp, obytes, ifp->if_obytes);
1443                         IFNET_STAT_GET(ifp, imcasts, ifp->if_imcasts);
1444                         IFNET_STAT_GET(ifp, omcasts, ifp->if_omcasts);
1445                         IFNET_STAT_GET(ifp, iqdrops, ifp->if_iqdrops);
1446                         IFNET_STAT_GET(ifp, noproto, ifp->if_noproto);
1447                 }
1448
1449                 if (ifp->if_timer == 0 || --ifp->if_timer)
1450                         continue;
1451                 if (ifp->if_watchdog) {
1452                         if (ifnet_tryserialize_all(ifp)) {
1453                                 (*ifp->if_watchdog)(ifp);
1454                                 ifnet_deserialize_all(ifp);
1455                         } else {
1456                                 /* try again next timeout */
1457                                 ++ifp->if_timer;
1458                         }
1459                 }
1460         }
1461
1462         crit_exit();
1463
1464         callout_reset(&if_slowtimo_timer, hz / IFNET_SLOWHZ, if_slowtimo, NULL);
1465 }
1466
1467 /*
1468  * Map interface name to
1469  * interface structure pointer.
1470  */
1471 struct ifnet *
1472 ifunit(const char *name)
1473 {
1474         struct ifnet *ifp;
1475
1476         /*
1477          * Search all the interfaces for this name/number
1478          */
1479
1480         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1481                 if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1482                         break;
1483         }
1484         return (ifp);
1485 }
1486
1487
1488 /*
1489  * Map interface name in a sockaddr_dl to
1490  * interface structure pointer.
1491  */
1492 struct ifnet *
1493 if_withname(struct sockaddr *sa)
1494 {
1495         char ifname[IFNAMSIZ+1];
1496         struct sockaddr_dl *sdl = (struct sockaddr_dl *)sa;
1497
1498         if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) ||
1499              (sdl->sdl_nlen > IFNAMSIZ) )
1500                 return NULL;
1501
1502         /*
1503          * ifunit wants a null-terminated name.  It may not be null-terminated
1504          * in the sockaddr.  We don't want to change the caller's sockaddr,
1505          * and there might not be room to put the trailing null anyway, so we
1506          * make a local copy that we know we can null terminate safely.
1507          */
1508
1509         bcopy(sdl->sdl_data, ifname, sdl->sdl_nlen);
1510         ifname[sdl->sdl_nlen] = '\0';
1511         return ifunit(ifname);
1512 }
1513
1514
1515 /*
1516  * Interface ioctls.
1517  */
1518 int
1519 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct ucred *cred)
1520 {
1521         struct ifnet *ifp;
1522         struct ifreq *ifr;
1523         struct ifstat *ifs;
1524         int error;
1525         short oif_flags;
1526         int new_flags;
1527 #ifdef COMPAT_43
1528         int ocmd;
1529 #endif
1530         size_t namelen, onamelen;
1531         char new_name[IFNAMSIZ];
1532         struct ifaddr *ifa;
1533         struct sockaddr_dl *sdl;
1534
1535         switch (cmd) {
1536         case SIOCGIFCONF:
1537         case OSIOCGIFCONF:
1538                 return (ifconf(cmd, data, cred));
1539         default:
1540                 break;
1541         }
1542
1543         ifr = (struct ifreq *)data;
1544
1545         switch (cmd) {
1546         case SIOCIFCREATE:
1547         case SIOCIFCREATE2:
1548                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1549                         return (error);
1550                 return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
1551                         cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
1552         case SIOCIFDESTROY:
1553                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1554                         return (error);
1555                 return (if_clone_destroy(ifr->ifr_name));
1556         case SIOCIFGCLONERS:
1557                 return (if_clone_list((struct if_clonereq *)data));
1558         default:
1559                 break;
1560         }
1561
1562         /*
1563          * Nominal ioctl through interface, lookup the ifp and obtain a
1564          * lock to serialize the ifconfig ioctl operation.
1565          */
1566         ifp = ifunit(ifr->ifr_name);
1567         if (ifp == NULL)
1568                 return (ENXIO);
1569         error = 0;
1570         mtx_lock(&ifp->if_ioctl_mtx);
1571
1572         switch (cmd) {
1573         case SIOCGIFINDEX:
1574                 ifr->ifr_index = ifp->if_index;
1575                 break;
1576
1577         case SIOCGIFFLAGS:
1578                 ifr->ifr_flags = ifp->if_flags;
1579                 ifr->ifr_flagshigh = ifp->if_flags >> 16;
1580                 break;
1581
1582         case SIOCGIFCAP:
1583                 ifr->ifr_reqcap = ifp->if_capabilities;
1584                 ifr->ifr_curcap = ifp->if_capenable;
1585                 break;
1586
1587         case SIOCGIFMETRIC:
1588                 ifr->ifr_metric = ifp->if_metric;
1589                 break;
1590
1591         case SIOCGIFMTU:
1592                 ifr->ifr_mtu = ifp->if_mtu;
1593                 break;
1594
1595         case SIOCGIFTSOLEN:
1596                 ifr->ifr_tsolen = ifp->if_tsolen;
1597                 break;
1598
1599         case SIOCGIFDATA:
1600                 error = copyout((caddr_t)&ifp->if_data, ifr->ifr_data,
1601                                 sizeof(ifp->if_data));
1602                 break;
1603
1604         case SIOCGIFPHYS:
1605                 ifr->ifr_phys = ifp->if_physical;
1606                 break;
1607
1608         case SIOCGIFPOLLCPU:
1609                 ifr->ifr_pollcpu = -1;
1610                 break;
1611
1612         case SIOCSIFPOLLCPU:
1613                 break;
1614
1615         case SIOCSIFFLAGS:
1616                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1617                 if (error)
1618                         break;
1619                 new_flags = (ifr->ifr_flags & 0xffff) |
1620                     (ifr->ifr_flagshigh << 16);
1621                 if (ifp->if_flags & IFF_SMART) {
1622                         /* Smart drivers twiddle their own routes */
1623                 } else if (ifp->if_flags & IFF_UP &&
1624                     (new_flags & IFF_UP) == 0) {
1625                         crit_enter();
1626                         if_down(ifp);
1627                         crit_exit();
1628                 } else if (new_flags & IFF_UP &&
1629                     (ifp->if_flags & IFF_UP) == 0) {
1630                         crit_enter();
1631                         if_up(ifp);
1632                         crit_exit();
1633                 }
1634
1635 #ifdef IFPOLL_ENABLE
1636                 if ((new_flags ^ ifp->if_flags) & IFF_NPOLLING) {
1637                         if (new_flags & IFF_NPOLLING)
1638                                 ifpoll_register(ifp);
1639                         else
1640                                 ifpoll_deregister(ifp);
1641                 }
1642 #endif
1643
1644                 ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
1645                         (new_flags &~ IFF_CANTCHANGE);
1646                 if (new_flags & IFF_PPROMISC) {
1647                         /* Permanently promiscuous mode requested */
1648                         ifp->if_flags |= IFF_PROMISC;
1649                 } else if (ifp->if_pcount == 0) {
1650                         ifp->if_flags &= ~IFF_PROMISC;
1651                 }
1652                 if (ifp->if_ioctl) {
1653                         ifnet_serialize_all(ifp);
1654                         ifp->if_ioctl(ifp, cmd, data, cred);
1655                         ifnet_deserialize_all(ifp);
1656                 }
1657                 getmicrotime(&ifp->if_lastchange);
1658                 break;
1659
1660         case SIOCSIFCAP:
1661                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1662                 if (error)
1663                         break;
1664                 if (ifr->ifr_reqcap & ~ifp->if_capabilities) {
1665                         error = EINVAL;
1666                         break;
1667                 }
1668                 ifnet_serialize_all(ifp);
1669                 ifp->if_ioctl(ifp, cmd, data, cred);
1670                 ifnet_deserialize_all(ifp);
1671                 break;
1672
1673         case SIOCSIFNAME:
1674                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1675                 if (error)
1676                         break;
1677                 error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
1678                 if (error)
1679                         break;
1680                 if (new_name[0] == '\0') {
1681                         error = EINVAL;
1682                         break;
1683                 }
1684                 if (ifunit(new_name) != NULL) {
1685                         error = EEXIST;
1686                         break;
1687                 }
1688
1689                 EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
1690
1691                 /* Announce the departure of the interface. */
1692                 rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
1693
1694                 strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
1695                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
1696                 /* XXX IFA_LOCK(ifa); */
1697                 sdl = (struct sockaddr_dl *)ifa->ifa_addr;
1698                 namelen = strlen(new_name);
1699                 onamelen = sdl->sdl_nlen;
1700                 /*
1701                  * Move the address if needed.  This is safe because we
1702                  * allocate space for a name of length IFNAMSIZ when we
1703                  * create this in if_attach().
1704                  */
1705                 if (namelen != onamelen) {
1706                         bcopy(sdl->sdl_data + onamelen,
1707                             sdl->sdl_data + namelen, sdl->sdl_alen);
1708                 }
1709                 bcopy(new_name, sdl->sdl_data, namelen);
1710                 sdl->sdl_nlen = namelen;
1711                 sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
1712                 bzero(sdl->sdl_data, onamelen);
1713                 while (namelen != 0)
1714                         sdl->sdl_data[--namelen] = 0xff;
1715                 /* XXX IFA_UNLOCK(ifa) */
1716
1717                 EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
1718
1719                 /* Announce the return of the interface. */
1720                 rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
1721                 break;
1722
1723         case SIOCSIFMETRIC:
1724                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1725                 if (error)
1726                         break;
1727                 ifp->if_metric = ifr->ifr_metric;
1728                 getmicrotime(&ifp->if_lastchange);
1729                 break;
1730
1731         case SIOCSIFPHYS:
1732                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1733                 if (error)
1734                         break;
1735                 if (ifp->if_ioctl == NULL) {
1736                         error = EOPNOTSUPP;
1737                         break;
1738                 }
1739                 ifnet_serialize_all(ifp);
1740                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1741                 ifnet_deserialize_all(ifp);
1742                 if (error == 0)
1743                         getmicrotime(&ifp->if_lastchange);
1744                 break;
1745
1746         case SIOCSIFMTU:
1747         {
1748                 u_long oldmtu = ifp->if_mtu;
1749
1750                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1751                 if (error)
1752                         break;
1753                 if (ifp->if_ioctl == NULL) {
1754                         error = EOPNOTSUPP;
1755                         break;
1756                 }
1757                 if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) {
1758                         error = EINVAL;
1759                         break;
1760                 }
1761                 ifnet_serialize_all(ifp);
1762                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1763                 ifnet_deserialize_all(ifp);
1764                 if (error == 0) {
1765                         getmicrotime(&ifp->if_lastchange);
1766                         rt_ifmsg(ifp);
1767                 }
1768                 /*
1769                  * If the link MTU changed, do network layer specific procedure.
1770                  */
1771                 if (ifp->if_mtu != oldmtu) {
1772 #ifdef INET6
1773                         nd6_setmtu(ifp);
1774 #endif
1775                 }
1776                 break;
1777         }
1778
1779         case SIOCSIFTSOLEN:
1780                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1781                 if (error)
1782                         break;
1783
1784                 /* XXX need driver supplied upper limit */
1785                 if (ifr->ifr_tsolen <= 0) {
1786                         error = EINVAL;
1787                         break;
1788                 }
1789                 ifp->if_tsolen = ifr->ifr_tsolen;
1790                 break;
1791
1792         case SIOCADDMULTI:
1793         case SIOCDELMULTI:
1794                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1795                 if (error)
1796                         break;
1797
1798                 /* Don't allow group membership on non-multicast interfaces. */
1799                 if ((ifp->if_flags & IFF_MULTICAST) == 0) {
1800                         error = EOPNOTSUPP;
1801                         break;
1802                 }
1803
1804                 /* Don't let users screw up protocols' entries. */
1805                 if (ifr->ifr_addr.sa_family != AF_LINK) {
1806                         error = EINVAL;
1807                         break;
1808                 }
1809
1810                 if (cmd == SIOCADDMULTI) {
1811                         struct ifmultiaddr *ifma;
1812                         error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
1813                 } else {
1814                         error = if_delmulti(ifp, &ifr->ifr_addr);
1815                 }
1816                 if (error == 0)
1817                         getmicrotime(&ifp->if_lastchange);
1818                 break;
1819
1820         case SIOCSIFPHYADDR:
1821         case SIOCDIFPHYADDR:
1822 #ifdef INET6
1823         case SIOCSIFPHYADDR_IN6:
1824 #endif
1825         case SIOCSLIFPHYADDR:
1826         case SIOCSIFMEDIA:
1827         case SIOCSIFGENERIC:
1828                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1829                 if (error)
1830                         break;
1831                 if (ifp->if_ioctl == 0) {
1832                         error = EOPNOTSUPP;
1833                         break;
1834                 }
1835                 ifnet_serialize_all(ifp);
1836                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1837                 ifnet_deserialize_all(ifp);
1838                 if (error == 0)
1839                         getmicrotime(&ifp->if_lastchange);
1840                 break;
1841
1842         case SIOCGIFSTATUS:
1843                 ifs = (struct ifstat *)data;
1844                 ifs->ascii[0] = '\0';
1845                 /* fall through */
1846         case SIOCGIFPSRCADDR:
1847         case SIOCGIFPDSTADDR:
1848         case SIOCGLIFPHYADDR:
1849         case SIOCGIFMEDIA:
1850         case SIOCGIFGENERIC:
1851                 if (ifp->if_ioctl == NULL) {
1852                         error = EOPNOTSUPP;
1853                         break;
1854                 }
1855                 ifnet_serialize_all(ifp);
1856                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1857                 ifnet_deserialize_all(ifp);
1858                 break;
1859
1860         case SIOCSIFLLADDR:
1861                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1862                 if (error)
1863                         break;
1864                 error = if_setlladdr(ifp, ifr->ifr_addr.sa_data,
1865                                      ifr->ifr_addr.sa_len);
1866                 EVENTHANDLER_INVOKE(iflladdr_event, ifp);
1867                 break;
1868
1869         default:
1870                 oif_flags = ifp->if_flags;
1871                 if (so->so_proto == 0) {
1872                         error = EOPNOTSUPP;
1873                         break;
1874                 }
1875 #ifndef COMPAT_43
1876                 error = so_pru_control_direct(so, cmd, data, ifp);
1877 #else
1878                 ocmd = cmd;
1879
1880                 switch (cmd) {
1881                 case SIOCSIFDSTADDR:
1882                 case SIOCSIFADDR:
1883                 case SIOCSIFBRDADDR:
1884                 case SIOCSIFNETMASK:
1885 #if BYTE_ORDER != BIG_ENDIAN
1886                         if (ifr->ifr_addr.sa_family == 0 &&
1887                             ifr->ifr_addr.sa_len < 16) {
1888                                 ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
1889                                 ifr->ifr_addr.sa_len = 16;
1890                         }
1891 #else
1892                         if (ifr->ifr_addr.sa_len == 0)
1893                                 ifr->ifr_addr.sa_len = 16;
1894 #endif
1895                         break;
1896                 case OSIOCGIFADDR:
1897                         cmd = SIOCGIFADDR;
1898                         break;
1899                 case OSIOCGIFDSTADDR:
1900                         cmd = SIOCGIFDSTADDR;
1901                         break;
1902                 case OSIOCGIFBRDADDR:
1903                         cmd = SIOCGIFBRDADDR;
1904                         break;
1905                 case OSIOCGIFNETMASK:
1906                         cmd = SIOCGIFNETMASK;
1907                         break;
1908                 default:
1909                         break;
1910                 }
1911
1912                 error = so_pru_control_direct(so, cmd, data, ifp);
1913
1914                 switch (ocmd) {
1915                 case OSIOCGIFADDR:
1916                 case OSIOCGIFDSTADDR:
1917                 case OSIOCGIFBRDADDR:
1918                 case OSIOCGIFNETMASK:
1919                         *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
1920                         break;
1921                 }
1922 #endif /* COMPAT_43 */
1923
1924                 if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
1925 #ifdef INET6
1926                         DELAY(100);/* XXX: temporary workaround for fxp issue*/
1927                         if (ifp->if_flags & IFF_UP) {
1928                                 crit_enter();
1929                                 in6_if_up(ifp);
1930                                 crit_exit();
1931                         }
1932 #endif
1933                 }
1934                 break;
1935         }
1936
1937         mtx_unlock(&ifp->if_ioctl_mtx);
1938         return (error);
1939 }
1940
1941 /*
1942  * Set/clear promiscuous mode on interface ifp based on the truth value
1943  * of pswitch.  The calls are reference counted so that only the first
1944  * "on" request actually has an effect, as does the final "off" request.
1945  * Results are undefined if the "off" and "on" requests are not matched.
1946  */
1947 int
1948 ifpromisc(struct ifnet *ifp, int pswitch)
1949 {
1950         struct ifreq ifr;
1951         int error;
1952         int oldflags;
1953
1954         oldflags = ifp->if_flags;
1955         if (ifp->if_flags & IFF_PPROMISC) {
1956                 /* Do nothing if device is in permanently promiscuous mode */
1957                 ifp->if_pcount += pswitch ? 1 : -1;
1958                 return (0);
1959         }
1960         if (pswitch) {
1961                 /*
1962                  * If the device is not configured up, we cannot put it in
1963                  * promiscuous mode.
1964                  */
1965                 if ((ifp->if_flags & IFF_UP) == 0)
1966                         return (ENETDOWN);
1967                 if (ifp->if_pcount++ != 0)
1968                         return (0);
1969                 ifp->if_flags |= IFF_PROMISC;
1970                 log(LOG_INFO, "%s: promiscuous mode enabled\n",
1971                     ifp->if_xname);
1972         } else {
1973                 if (--ifp->if_pcount > 0)
1974                         return (0);
1975                 ifp->if_flags &= ~IFF_PROMISC;
1976                 log(LOG_INFO, "%s: promiscuous mode disabled\n",
1977                     ifp->if_xname);
1978         }
1979         ifr.ifr_flags = ifp->if_flags;
1980         ifr.ifr_flagshigh = ifp->if_flags >> 16;
1981         ifnet_serialize_all(ifp);
1982         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, NULL);
1983         ifnet_deserialize_all(ifp);
1984         if (error == 0)
1985                 rt_ifmsg(ifp);
1986         else
1987                 ifp->if_flags = oldflags;
1988         return error;
1989 }
1990
1991 /*
1992  * Return interface configuration
1993  * of system.  List may be used
1994  * in later ioctl's (above) to get
1995  * other information.
1996  */
1997 static int
1998 ifconf(u_long cmd, caddr_t data, struct ucred *cred)
1999 {
2000         struct ifconf *ifc = (struct ifconf *)data;
2001         struct ifnet *ifp;
2002         struct sockaddr *sa;
2003         struct ifreq ifr, *ifrp;
2004         int space = ifc->ifc_len, error = 0;
2005
2006         ifrp = ifc->ifc_req;
2007         TAILQ_FOREACH(ifp, &ifnet, if_link) {
2008                 struct ifaddr_container *ifac;
2009                 int addrs;
2010
2011                 if (space <= sizeof ifr)
2012                         break;
2013
2014                 /*
2015                  * Zero the stack declared structure first to prevent
2016                  * memory disclosure.
2017                  */
2018                 bzero(&ifr, sizeof(ifr));
2019                 if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
2020                     >= sizeof(ifr.ifr_name)) {
2021                         error = ENAMETOOLONG;
2022                         break;
2023                 }
2024
2025                 addrs = 0;
2026                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2027                         struct ifaddr *ifa = ifac->ifa;
2028
2029                         if (space <= sizeof ifr)
2030                                 break;
2031                         sa = ifa->ifa_addr;
2032                         if (cred->cr_prison &&
2033                             prison_if(cred, sa))
2034                                 continue;
2035                         addrs++;
2036 #ifdef COMPAT_43
2037                         if (cmd == OSIOCGIFCONF) {
2038                                 struct osockaddr *osa =
2039                                          (struct osockaddr *)&ifr.ifr_addr;
2040                                 ifr.ifr_addr = *sa;
2041                                 osa->sa_family = sa->sa_family;
2042                                 error = copyout(&ifr, ifrp, sizeof ifr);
2043                                 ifrp++;
2044                         } else
2045 #endif
2046                         if (sa->sa_len <= sizeof(*sa)) {
2047                                 ifr.ifr_addr = *sa;
2048                                 error = copyout(&ifr, ifrp, sizeof ifr);
2049                                 ifrp++;
2050                         } else {
2051                                 if (space < (sizeof ifr) + sa->sa_len -
2052                                             sizeof(*sa))
2053                                         break;
2054                                 space -= sa->sa_len - sizeof(*sa);
2055                                 error = copyout(&ifr, ifrp,
2056                                                 sizeof ifr.ifr_name);
2057                                 if (error == 0)
2058                                         error = copyout(sa, &ifrp->ifr_addr,
2059                                                         sa->sa_len);
2060                                 ifrp = (struct ifreq *)
2061                                         (sa->sa_len + (caddr_t)&ifrp->ifr_addr);
2062                         }
2063                         if (error)
2064                                 break;
2065                         space -= sizeof ifr;
2066                 }
2067                 if (error)
2068                         break;
2069                 if (!addrs) {
2070                         bzero(&ifr.ifr_addr, sizeof ifr.ifr_addr);
2071                         error = copyout(&ifr, ifrp, sizeof ifr);
2072                         if (error)
2073                                 break;
2074                         space -= sizeof ifr;
2075                         ifrp++;
2076                 }
2077         }
2078         ifc->ifc_len -= space;
2079         return (error);
2080 }
2081
2082 /*
2083  * Just like if_promisc(), but for all-multicast-reception mode.
2084  */
2085 int
2086 if_allmulti(struct ifnet *ifp, int onswitch)
2087 {
2088         int error = 0;
2089         struct ifreq ifr;
2090
2091         crit_enter();
2092
2093         if (onswitch) {
2094                 if (ifp->if_amcount++ == 0) {
2095                         ifp->if_flags |= IFF_ALLMULTI;
2096                         ifr.ifr_flags = ifp->if_flags;
2097                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2098                         ifnet_serialize_all(ifp);
2099                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2100                                               NULL);
2101                         ifnet_deserialize_all(ifp);
2102                 }
2103         } else {
2104                 if (ifp->if_amcount > 1) {
2105                         ifp->if_amcount--;
2106                 } else {
2107                         ifp->if_amcount = 0;
2108                         ifp->if_flags &= ~IFF_ALLMULTI;
2109                         ifr.ifr_flags = ifp->if_flags;
2110                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2111                         ifnet_serialize_all(ifp);
2112                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2113                                               NULL);
2114                         ifnet_deserialize_all(ifp);
2115                 }
2116         }
2117
2118         crit_exit();
2119
2120         if (error == 0)
2121                 rt_ifmsg(ifp);
2122         return error;
2123 }
2124
2125 /*
2126  * Add a multicast listenership to the interface in question.
2127  * The link layer provides a routine which converts
2128  */
2129 int
2130 if_addmulti(
2131         struct ifnet *ifp,      /* interface to manipulate */
2132         struct sockaddr *sa,    /* address to add */
2133         struct ifmultiaddr **retifma)
2134 {
2135         struct sockaddr *llsa, *dupsa;
2136         int error;
2137         struct ifmultiaddr *ifma;
2138
2139         /*
2140          * If the matching multicast address already exists
2141          * then don't add a new one, just add a reference
2142          */
2143         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2144                 if (sa_equal(sa, ifma->ifma_addr)) {
2145                         ifma->ifma_refcount++;
2146                         if (retifma)
2147                                 *retifma = ifma;
2148                         return 0;
2149                 }
2150         }
2151
2152         /*
2153          * Give the link layer a chance to accept/reject it, and also
2154          * find out which AF_LINK address this maps to, if it isn't one
2155          * already.
2156          */
2157         if (ifp->if_resolvemulti) {
2158                 ifnet_serialize_all(ifp);
2159                 error = ifp->if_resolvemulti(ifp, &llsa, sa);
2160                 ifnet_deserialize_all(ifp);
2161                 if (error) 
2162                         return error;
2163         } else {
2164                 llsa = NULL;
2165         }
2166
2167         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2168         dupsa = kmalloc(sa->sa_len, M_IFMADDR, M_WAITOK);
2169         bcopy(sa, dupsa, sa->sa_len);
2170
2171         ifma->ifma_addr = dupsa;
2172         ifma->ifma_lladdr = llsa;
2173         ifma->ifma_ifp = ifp;
2174         ifma->ifma_refcount = 1;
2175         ifma->ifma_protospec = 0;
2176         rt_newmaddrmsg(RTM_NEWMADDR, ifma);
2177
2178         /*
2179          * Some network interfaces can scan the address list at
2180          * interrupt time; lock them out.
2181          */
2182         crit_enter();
2183         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2184         crit_exit();
2185         if (retifma)
2186                 *retifma = ifma;
2187
2188         if (llsa != NULL) {
2189                 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2190                         if (sa_equal(ifma->ifma_addr, llsa))
2191                                 break;
2192                 }
2193                 if (ifma) {
2194                         ifma->ifma_refcount++;
2195                 } else {
2196                         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2197                         dupsa = kmalloc(llsa->sa_len, M_IFMADDR, M_WAITOK);
2198                         bcopy(llsa, dupsa, llsa->sa_len);
2199                         ifma->ifma_addr = dupsa;
2200                         ifma->ifma_ifp = ifp;
2201                         ifma->ifma_refcount = 1;
2202                         crit_enter();
2203                         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2204                         crit_exit();
2205                 }
2206         }
2207         /*
2208          * We are certain we have added something, so call down to the
2209          * interface to let them know about it.
2210          */
2211         crit_enter();
2212         ifnet_serialize_all(ifp);
2213         if (ifp->if_ioctl)
2214                 ifp->if_ioctl(ifp, SIOCADDMULTI, 0, NULL);
2215         ifnet_deserialize_all(ifp);
2216         crit_exit();
2217
2218         return 0;
2219 }
2220
2221 /*
2222  * Remove a reference to a multicast address on this interface.  Yell
2223  * if the request does not match an existing membership.
2224  */
2225 int
2226 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
2227 {
2228         struct ifmultiaddr *ifma;
2229
2230         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2231                 if (sa_equal(sa, ifma->ifma_addr))
2232                         break;
2233         if (ifma == NULL)
2234                 return ENOENT;
2235
2236         if (ifma->ifma_refcount > 1) {
2237                 ifma->ifma_refcount--;
2238                 return 0;
2239         }
2240
2241         rt_newmaddrmsg(RTM_DELMADDR, ifma);
2242         sa = ifma->ifma_lladdr;
2243         crit_enter();
2244         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2245         /*
2246          * Make sure the interface driver is notified
2247          * in the case of a link layer mcast group being left.
2248          */
2249         if (ifma->ifma_addr->sa_family == AF_LINK && sa == NULL) {
2250                 ifnet_serialize_all(ifp);
2251                 ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2252                 ifnet_deserialize_all(ifp);
2253         }
2254         crit_exit();
2255         kfree(ifma->ifma_addr, M_IFMADDR);
2256         kfree(ifma, M_IFMADDR);
2257         if (sa == NULL)
2258                 return 0;
2259
2260         /*
2261          * Now look for the link-layer address which corresponds to
2262          * this network address.  It had been squirreled away in
2263          * ifma->ifma_lladdr for this purpose (so we don't have
2264          * to call ifp->if_resolvemulti() again), and we saved that
2265          * value in sa above.  If some nasty deleted the
2266          * link-layer address out from underneath us, we can deal because
2267          * the address we stored was is not the same as the one which was
2268          * in the record for the link-layer address.  (So we don't complain
2269          * in that case.)
2270          */
2271         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2272                 if (sa_equal(sa, ifma->ifma_addr))
2273                         break;
2274         if (ifma == NULL)
2275                 return 0;
2276
2277         if (ifma->ifma_refcount > 1) {
2278                 ifma->ifma_refcount--;
2279                 return 0;
2280         }
2281
2282         crit_enter();
2283         ifnet_serialize_all(ifp);
2284         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2285         ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2286         ifnet_deserialize_all(ifp);
2287         crit_exit();
2288         kfree(ifma->ifma_addr, M_IFMADDR);
2289         kfree(sa, M_IFMADDR);
2290         kfree(ifma, M_IFMADDR);
2291
2292         return 0;
2293 }
2294
2295 /*
2296  * Delete all multicast group membership for an interface.
2297  * Should be used to quickly flush all multicast filters.
2298  */
2299 void
2300 if_delallmulti(struct ifnet *ifp)
2301 {
2302         struct ifmultiaddr *ifma;
2303         struct ifmultiaddr *next;
2304
2305         TAILQ_FOREACH_MUTABLE(ifma, &ifp->if_multiaddrs, ifma_link, next)
2306                 if_delmulti(ifp, ifma->ifma_addr);
2307 }
2308
2309
2310 /*
2311  * Set the link layer address on an interface.
2312  *
2313  * At this time we only support certain types of interfaces,
2314  * and we don't allow the length of the address to change.
2315  */
2316 int
2317 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
2318 {
2319         struct sockaddr_dl *sdl;
2320         struct ifreq ifr;
2321
2322         sdl = IF_LLSOCKADDR(ifp);
2323         if (sdl == NULL)
2324                 return (EINVAL);
2325         if (len != sdl->sdl_alen)       /* don't allow length to change */
2326                 return (EINVAL);
2327         switch (ifp->if_type) {
2328         case IFT_ETHER:                 /* these types use struct arpcom */
2329         case IFT_XETHER:
2330         case IFT_L2VLAN:
2331                 bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
2332                 bcopy(lladdr, LLADDR(sdl), len);
2333                 break;
2334         default:
2335                 return (ENODEV);
2336         }
2337         /*
2338          * If the interface is already up, we need
2339          * to re-init it in order to reprogram its
2340          * address filter.
2341          */
2342         ifnet_serialize_all(ifp);
2343         if ((ifp->if_flags & IFF_UP) != 0) {
2344 #ifdef INET
2345                 struct ifaddr_container *ifac;
2346 #endif
2347
2348                 ifp->if_flags &= ~IFF_UP;
2349                 ifr.ifr_flags = ifp->if_flags;
2350                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2351                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2352                               NULL);
2353                 ifp->if_flags |= IFF_UP;
2354                 ifr.ifr_flags = ifp->if_flags;
2355                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2356                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2357                                  NULL);
2358 #ifdef INET
2359                 /*
2360                  * Also send gratuitous ARPs to notify other nodes about
2361                  * the address change.
2362                  */
2363                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2364                         struct ifaddr *ifa = ifac->ifa;
2365
2366                         if (ifa->ifa_addr != NULL &&
2367                             ifa->ifa_addr->sa_family == AF_INET)
2368                                 arp_gratuitous(ifp, ifa);
2369                 }
2370 #endif
2371         }
2372         ifnet_deserialize_all(ifp);
2373         return (0);
2374 }
2375
2376 struct ifmultiaddr *
2377 ifmaof_ifpforaddr(struct sockaddr *sa, struct ifnet *ifp)
2378 {
2379         struct ifmultiaddr *ifma;
2380
2381         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2382                 if (sa_equal(ifma->ifma_addr, sa))
2383                         break;
2384
2385         return ifma;
2386 }
2387
2388 /*
2389  * This function locates the first real ethernet MAC from a network
2390  * card and loads it into node, returning 0 on success or ENOENT if
2391  * no suitable interfaces were found.  It is used by the uuid code to
2392  * generate a unique 6-byte number.
2393  */
2394 int
2395 if_getanyethermac(uint16_t *node, int minlen)
2396 {
2397         struct ifnet *ifp;
2398         struct sockaddr_dl *sdl;
2399
2400         TAILQ_FOREACH(ifp, &ifnet, if_link) {
2401                 if (ifp->if_type != IFT_ETHER)
2402                         continue;
2403                 sdl = IF_LLSOCKADDR(ifp);
2404                 if (sdl->sdl_alen < minlen)
2405                         continue;
2406                 bcopy(((struct arpcom *)ifp->if_softc)->ac_enaddr, node,
2407                       minlen);
2408                 return(0);
2409         }
2410         return (ENOENT);
2411 }
2412
2413 /*
2414  * The name argument must be a pointer to storage which will last as
2415  * long as the interface does.  For physical devices, the result of
2416  * device_get_name(dev) is a good choice and for pseudo-devices a
2417  * static string works well.
2418  */
2419 void
2420 if_initname(struct ifnet *ifp, const char *name, int unit)
2421 {
2422         ifp->if_dname = name;
2423         ifp->if_dunit = unit;
2424         if (unit != IF_DUNIT_NONE)
2425                 ksnprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
2426         else
2427                 strlcpy(ifp->if_xname, name, IFNAMSIZ);
2428 }
2429
2430 int
2431 if_printf(struct ifnet *ifp, const char *fmt, ...)
2432 {
2433         __va_list ap;
2434         int retval;
2435
2436         retval = kprintf("%s: ", ifp->if_xname);
2437         __va_start(ap, fmt);
2438         retval += kvprintf(fmt, ap);
2439         __va_end(ap);
2440         return (retval);
2441 }
2442
2443 struct ifnet *
2444 if_alloc(uint8_t type)
2445 {
2446         struct ifnet *ifp;
2447         size_t size;
2448
2449         /*
2450          * XXX temporary hack until arpcom is setup in if_l2com
2451          */
2452         if (type == IFT_ETHER)
2453                 size = sizeof(struct arpcom);
2454         else
2455                 size = sizeof(struct ifnet);
2456
2457         ifp = kmalloc(size, M_IFNET, M_WAITOK|M_ZERO);
2458
2459         ifp->if_type = type;
2460
2461         if (if_com_alloc[type] != NULL) {
2462                 ifp->if_l2com = if_com_alloc[type](type, ifp);
2463                 if (ifp->if_l2com == NULL) {
2464                         kfree(ifp, M_IFNET);
2465                         return (NULL);
2466                 }
2467         }
2468         return (ifp);
2469 }
2470
/*
 * Release an interface structure by returning it to the M_IFNET
 * malloc type (the type if_alloc() allocates from).
 *
 * NOTE(review): only the ifnet itself is freed here; an if_l2com
 * obtained through if_com_alloc[] in if_alloc() is not released by
 * this function -- confirm that callers take care of it.
 */
2471 void
2472 if_free(struct ifnet *ifp)
2473 {
2474         kfree(ifp, M_IFNET);
2475 }
2476
2477 void
2478 ifq_set_classic(struct ifaltq *ifq)
2479 {
2480         ifq_set_methods(ifq, ifq->altq_ifp->if_mapsubq,
2481             ifsq_classic_enqueue, ifsq_classic_dequeue, ifsq_classic_request);
2482 }
2483
2484 void
2485 ifq_set_methods(struct ifaltq *ifq, altq_mapsubq_t mapsubq,
2486     ifsq_enqueue_t enqueue, ifsq_dequeue_t dequeue, ifsq_request_t request)
2487 {
2488         int q;
2489
2490         KASSERT(mapsubq != NULL, ("mapsubq is not specified"));
2491         KASSERT(enqueue != NULL, ("enqueue is not specified"));
2492         KASSERT(dequeue != NULL, ("dequeue is not specified"));
2493         KASSERT(request != NULL, ("request is not specified"));
2494
2495         ifq->altq_mapsubq = mapsubq;
2496         for (q = 0; q < ifq->altq_subq_cnt; ++q) {
2497                 struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
2498
2499                 ifsq->ifsq_enqueue = enqueue;
2500                 ifsq->ifsq_dequeue = dequeue;
2501                 ifsq->ifsq_request = request;
2502         }
2503 }
2504
2505 int
2506 ifsq_classic_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m,
2507     struct altq_pktattr *pa __unused)
2508 {
2509         if (ifsq->ifq_len >= ifsq->ifq_maxlen) {
2510                 m_freem(m);
2511                 return ENOBUFS;
2512         } else {
2513                 m->m_nextpkt = NULL;
2514                 if (ifsq->ifq_tail == NULL)
2515                         ifsq->ifq_head = m;
2516                 else
2517                         ifsq->ifq_tail->m_nextpkt = m;
2518                 ifsq->ifq_tail = m;
2519                 ifsq->ifq_len++;
2520                 return 0;
2521         }       
2522 }
2523
2524 struct mbuf *
2525 ifsq_classic_dequeue(struct ifaltq_subque *ifsq, struct mbuf *mpolled, int op)
2526 {
2527         struct mbuf *m;
2528
2529         switch (op) {
2530         case ALTDQ_POLL:
2531                 m = ifsq->ifq_head;
2532                 break;
2533
2534         case ALTDQ_REMOVE:
2535                 m = ifsq->ifq_head;
2536                 if (m != NULL) {
2537                         if ((ifsq->ifq_head = m->m_nextpkt) == NULL)
2538                                 ifsq->ifq_tail = NULL;
2539                         m->m_nextpkt = NULL;
2540                         ifsq->ifq_len--;
2541                 }
2542                 break;
2543
2544         default:
2545                 panic("unsupported ALTQ dequeue op: %d", op);
2546         }
2547         KKASSERT(mpolled == NULL || mpolled == m);
2548         return m;
2549 }
2550
2551 int
2552 ifsq_classic_request(struct ifaltq_subque *ifsq, int req, void *arg)
2553 {
2554         switch (req) {
2555         case ALTRQ_PURGE:
2556                 for (;;) {
2557                         struct mbuf *m;
2558
2559                         m = ifsq_classic_dequeue(ifsq, NULL, ALTDQ_REMOVE);
2560                         if (m == NULL)
2561                                 break;
2562                         m_freem(m);
2563                 }
2564                 break;
2565
2566         default:
2567                 panic("unsupported ALTQ request: %d", req);
2568         }
2569         return 0;
2570 }
2571
2572 static void
2573 ifsq_ifstart_try(struct ifaltq_subque *ifsq, int force_sched)
2574 {
2575         struct ifnet *ifp = ifsq_get_ifp(ifsq);
2576         int running = 0, need_sched;
2577
2578         /*
2579          * Try to do direct ifnet.if_start on the subqueue first, if there is
2580          * contention on the subqueue hardware serializer, ifnet.if_start on
2581          * the subqueue will be scheduled on the subqueue owner CPU.
2582          */
2583         if (!ifsq_tryserialize_hw(ifsq)) {
2584                 /*
2585                  * Subqueue hardware serializer contention happened,
2586                  * ifnet.if_start on the subqueue is scheduled on
2587                  * the subqueue owner CPU, and we keep going.
2588                  */
2589                 ifsq_ifstart_schedule(ifsq, 1);
2590                 return;
2591         }
2592
2593         if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
2594                 ifp->if_start(ifp, ifsq);
2595                 if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
2596                         running = 1;
2597         }
2598         need_sched = ifsq_ifstart_need_schedule(ifsq, running);
2599
2600         ifsq_deserialize_hw(ifsq);
2601
2602         if (need_sched) {
2603                 /*
2604                  * More data need to be transmitted, ifnet.if_start on the
2605                  * subqueue is scheduled on the subqueue owner CPU, and we
2606                  * keep going.
2607                  * NOTE: ifnet.if_start subqueue interlock is not released.
2608                  */
2609                 ifsq_ifstart_schedule(ifsq, force_sched);
2610         }
2611 }
2612
/*
 * Subqueue packets staging mechanism:
 *
 * The packets enqueued into the subqueue are staged to a certain amount
 * before the ifnet.if_start on the subqueue is called.  In this way, the
 * driver could avoid writing to hardware registers upon every packet,
 * instead, hardware registers could be written when a certain amount of
 * packets are put onto hardware TX ring.  The measurement on several modern
 * NICs (emx(4), igb(4), bnx(4), bge(4), jme(4)) shows that the hardware
 * registers writing aggregation could save ~20% CPU time when 18-byte UDP
 * datagrams are transmitted at 1.48Mpps.  The performance improvement by
 * hardware registers writing aggregation is also mentioned by Luigi Rizzo's
 * netmap paper (http://info.iet.unipi.it/~luigi/netmap/).
 *
 * Subqueue packets staging is performed for two entry points into drivers'
 * transmission function:
 * - Direct ifnet.if_start calling on the subqueue, i.e. ifsq_ifstart_try()
 * - ifnet.if_start scheduling on the subqueue, i.e. ifsq_ifstart_schedule()
 *
 * Subqueue packets staging will be stopped upon any of the following
 * conditions:
 * - If the count of packets enqueued on the current CPU is greater than or
 *   equal to ifsq_stage_cntmax. (XXX this should be per-interface)
 * - If the total length of packets enqueued on the current CPU is greater
 *   than or equal to the hardware's MTU - max_protohdr.  max_protohdr is
 *   cut from the hardware's MTU mainly because a full TCP segment's size
 *   is usually less than hardware's MTU.
 * - ifsq_ifstart_schedule() is not pending on the current CPU and
 *   ifnet.if_start subqueue interlock (ifaltq_subq.ifsq_started) is not
 *   released.
 * - The if_start_rollup(), which is registered as low priority netisr
 *   rollup function, is called; probably because no more work is pending
 *   for netisr.
 *
 * NOTE:
 * Currently subqueue packet staging is only performed in netisr threads.
 */
2650 int
2651 ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
2652 {
2653         struct ifaltq *ifq = &ifp->if_snd;
2654         struct ifaltq_subque *ifsq;
2655         int error, start = 0, len, mcast = 0, avoid_start = 0;
2656         struct ifsubq_stage_head *head = NULL;
2657         struct ifsubq_stage *stage = NULL;
2658
2659         ifsq = ifq_map_subq(ifq, mycpuid);
2660         ASSERT_ALTQ_SQ_NOT_SERIALIZED_HW(ifsq);
2661
2662         len = m->m_pkthdr.len;
2663         if (m->m_flags & M_MCAST)
2664                 mcast = 1;
2665
2666         if (curthread->td_type == TD_TYPE_NETISR) {
2667                 head = &ifsubq_stage_heads[mycpuid];
2668                 stage = ifsq_get_stage(ifsq, mycpuid);
2669
2670                 stage->stg_cnt++;
2671                 stage->stg_len += len;
2672                 if (stage->stg_cnt < ifsq_stage_cntmax &&
2673                     stage->stg_len < (ifp->if_mtu - max_protohdr))
2674                         avoid_start = 1;
2675         }
2676
2677         ALTQ_SQ_LOCK(ifsq);
2678         error = ifsq_enqueue_locked(ifsq, m, pa);
2679         if (error) {
2680                 if (!ifsq_data_ready(ifsq)) {
2681                         ALTQ_SQ_UNLOCK(ifsq);
2682                         return error;
2683                 }
2684                 avoid_start = 0;
2685         }
2686         if (!ifsq_is_started(ifsq)) {
2687                 if (avoid_start) {
2688                         ALTQ_SQ_UNLOCK(ifsq);
2689
2690                         KKASSERT(!error);
2691                         if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
2692                                 ifsq_stage_insert(head, stage);
2693
2694                         IFNET_STAT_INC(ifp, obytes, len);
2695                         if (mcast)
2696                                 IFNET_STAT_INC(ifp, omcasts, 1);
2697                         return error;
2698                 }
2699
2700                 /*
2701                  * Hold the subqueue interlock of ifnet.if_start
2702                  */
2703                 ifsq_set_started(ifsq);
2704                 start = 1;
2705         }
2706         ALTQ_SQ_UNLOCK(ifsq);
2707
2708         if (!error) {
2709                 IFNET_STAT_INC(ifp, obytes, len);
2710                 if (mcast)
2711                         IFNET_STAT_INC(ifp, omcasts, 1);
2712         }
2713
2714         if (stage != NULL) {
2715                 if (!start && (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)) {
2716                         KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
2717                         if (!avoid_start) {
2718                                 ifsq_stage_remove(head, stage);
2719                                 ifsq_ifstart_schedule(ifsq, 1);
2720                         }
2721                         return error;
2722                 }
2723
2724                 if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED) {
2725                         ifsq_stage_remove(head, stage);
2726                 } else {
2727                         stage->stg_cnt = 0;
2728                         stage->stg_len = 0;
2729                 }
2730         }
2731
2732         if (!start)
2733                 return error;
2734
2735         ifsq_ifstart_try(ifsq, 0);
2736         return error;
2737 }
2738
2739 void *
2740 ifa_create(int size, int flags)
2741 {
2742         struct ifaddr *ifa;
2743         int i;
2744
2745         KASSERT(size >= sizeof(*ifa), ("ifaddr size too small"));
2746
2747         ifa = kmalloc(size, M_IFADDR, flags | M_ZERO);
2748         if (ifa == NULL)
2749                 return NULL;
2750
2751         ifa->ifa_containers =
2752             kmalloc_cachealign(ncpus * sizeof(struct ifaddr_container),
2753                 M_IFADDR, M_WAITOK | M_ZERO);
2754         ifa->ifa_ncnt = ncpus;
2755         for (i = 0; i < ncpus; ++i) {
2756                 struct ifaddr_container *ifac = &ifa->ifa_containers[i];
2757
2758                 ifac->ifa_magic = IFA_CONTAINER_MAGIC;
2759                 ifac->ifa = ifa;
2760                 ifac->ifa_refcnt = 1;
2761         }
2762 #ifdef IFADDR_DEBUG
2763         kprintf("alloc ifa %p %d\n", ifa, size);
2764 #endif
2765         return ifa;
2766 }
2767
2768 void
2769 ifac_free(struct ifaddr_container *ifac, int cpu_id)
2770 {
2771         struct ifaddr *ifa = ifac->ifa;
2772
2773         KKASSERT(ifac->ifa_magic == IFA_CONTAINER_MAGIC);
2774         KKASSERT(ifac->ifa_refcnt == 0);
2775         KASSERT(ifac->ifa_listmask == 0,
2776                 ("ifa is still on %#x lists", ifac->ifa_listmask));
2777
2778         ifac->ifa_magic = IFA_CONTAINER_DEAD;
2779
2780 #ifdef IFADDR_DEBUG_VERBOSE
2781         kprintf("try free ifa %p cpu_id %d\n", ifac->ifa, cpu_id);
2782 #endif
2783
2784         KASSERT(ifa->ifa_ncnt > 0 && ifa->ifa_ncnt <= ncpus,
2785                 ("invalid # of ifac, %d", ifa->ifa_ncnt));
2786         if (atomic_fetchadd_int(&ifa->ifa_ncnt, -1) == 1) {
2787 #ifdef IFADDR_DEBUG
2788                 kprintf("free ifa %p\n", ifa);
2789 #endif
2790                 kfree(ifa->ifa_containers, M_IFADDR);
2791                 kfree(ifa, M_IFADDR);
2792         }
2793 }
2794
2795 static void
2796 ifa_iflink_dispatch(netmsg_t nmsg)
2797 {
2798         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2799         struct ifaddr *ifa = msg->ifa;
2800         struct ifnet *ifp = msg->ifp;
2801         int cpu = mycpuid;
2802         struct ifaddr_container *ifac;
2803
2804         crit_enter();
2805
2806         ifac = &ifa->ifa_containers[cpu];
2807         ASSERT_IFAC_VALID(ifac);
2808         KASSERT((ifac->ifa_listmask & IFA_LIST_IFADDRHEAD) == 0,
2809                 ("ifaddr is on if_addrheads"));
2810
2811         ifac->ifa_listmask |= IFA_LIST_IFADDRHEAD;
2812         if (msg->tail)
2813                 TAILQ_INSERT_TAIL(&ifp->if_addrheads[cpu], ifac, ifa_link);
2814         else
2815                 TAILQ_INSERT_HEAD(&ifp->if_addrheads[cpu], ifac, ifa_link);
2816
2817         crit_exit();
2818
2819         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2820 }
2821
2822 void
2823 ifa_iflink(struct ifaddr *ifa, struct ifnet *ifp, int tail)
2824 {
2825         struct netmsg_ifaddr msg;
2826
2827         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2828                     0, ifa_iflink_dispatch);
2829         msg.ifa = ifa;
2830         msg.ifp = ifp;
2831         msg.tail = tail;
2832
2833         ifa_domsg(&msg.base.lmsg, 0);
2834 }
2835
2836 static void
2837 ifa_ifunlink_dispatch(netmsg_t nmsg)
2838 {
2839         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2840         struct ifaddr *ifa = msg->ifa;
2841         struct ifnet *ifp = msg->ifp;
2842         int cpu = mycpuid;
2843         struct ifaddr_container *ifac;
2844
2845         crit_enter();
2846
2847         ifac = &ifa->ifa_containers[cpu];
2848         ASSERT_IFAC_VALID(ifac);
2849         KASSERT(ifac->ifa_listmask & IFA_LIST_IFADDRHEAD,
2850                 ("ifaddr is not on if_addrhead"));
2851
2852         TAILQ_REMOVE(&ifp->if_addrheads[cpu], ifac, ifa_link);
2853         ifac->ifa_listmask &= ~IFA_LIST_IFADDRHEAD;
2854
2855         crit_exit();
2856
2857         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2858 }
2859
2860 void
2861 ifa_ifunlink(struct ifaddr *ifa, struct ifnet *ifp)
2862 {
2863         struct netmsg_ifaddr msg;
2864
2865         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2866                     0, ifa_ifunlink_dispatch);
2867         msg.ifa = ifa;
2868         msg.ifp = ifp;
2869
2870         ifa_domsg(&msg.base.lmsg, 0);
2871 }
2872
2873 static void
2874 ifa_destroy_dispatch(netmsg_t nmsg)
2875 {
2876         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2877
2878         IFAFREE(msg->ifa);
2879         ifa_forwardmsg(&nmsg->lmsg, mycpuid + 1);
2880 }
2881
2882 void
2883 ifa_destroy(struct ifaddr *ifa)
2884 {
2885         struct netmsg_ifaddr msg;
2886
2887         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2888                     0, ifa_destroy_dispatch);
2889         msg.ifa = ifa;
2890
2891         ifa_domsg(&msg.base.lmsg, 0);
2892 }
2893
2894 struct lwkt_port *
2895 ifnet_portfn(int cpu)
2896 {
2897         return &ifnet_threads[cpu].td_msgport;
2898 }
2899
2900 void
2901 ifnet_forwardmsg(struct lwkt_msg *lmsg, int next_cpu)
2902 {
2903         KKASSERT(next_cpu > mycpuid && next_cpu <= ncpus);
2904
2905         if (next_cpu < ncpus)
2906                 lwkt_forwardmsg(ifnet_portfn(next_cpu), lmsg);
2907         else
2908                 lwkt_replymsg(lmsg, 0);
2909 }
2910
2911 int
2912 ifnet_domsg(struct lwkt_msg *lmsg, int cpu)
2913 {
2914         KKASSERT(cpu < ncpus);
2915         return lwkt_domsg(ifnet_portfn(cpu), lmsg, 0);
2916 }
2917
2918 void
2919 ifnet_sendmsg(struct lwkt_msg *lmsg, int cpu)
2920 {
2921         KKASSERT(cpu < ncpus);
2922         lwkt_sendmsg(ifnet_portfn(cpu), lmsg);
2923 }
2924
2925 /*
2926  * Generic netmsg service loop.  Some protocols may roll their own but all
2927  * must do the basic command dispatch function call done here.
2928  */
2929 static void
2930 ifnet_service_loop(void *arg __unused)
2931 {
2932         netmsg_t msg;
2933
2934         while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) {
2935                 KASSERT(msg->base.nm_dispatch, ("ifnet_service: badmsg"));
2936                 msg->base.nm_dispatch(msg);
2937         }
2938 }
2939
2940 static void
2941 if_start_rollup(void)
2942 {
2943         struct ifsubq_stage_head *head = &ifsubq_stage_heads[mycpuid];
2944         struct ifsubq_stage *stage;
2945
2946         while ((stage = TAILQ_FIRST(&head->stg_head)) != NULL) {
2947                 struct ifaltq_subque *ifsq = stage->stg_subq;
2948                 int is_sched = 0;
2949
2950                 if (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)
2951                         is_sched = 1;
2952                 ifsq_stage_remove(head, stage);
2953
2954                 if (is_sched) {
2955                         ifsq_ifstart_schedule(ifsq, 1);
2956                 } else {
2957                         int start = 0;
2958
2959                         ALTQ_SQ_LOCK(ifsq);
2960                         if (!ifsq_is_started(ifsq)) {
2961                                 /*
2962                                  * Hold the subqueue interlock of
2963                                  * ifnet.if_start
2964                                  */
2965                                 ifsq_set_started(ifsq);
2966                                 start = 1;
2967                         }
2968                         ALTQ_SQ_UNLOCK(ifsq);
2969
2970                         if (start)
2971                                 ifsq_ifstart_try(ifsq, 1);
2972                 }
2973                 KKASSERT((stage->stg_flags &
2974                     (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
2975         }
2976 }
2977
2978 static void
2979 ifnetinit(void *dummy __unused)
2980 {
2981         int i;
2982
2983         for (i = 0; i < ncpus; ++i) {
2984                 struct thread *thr = &ifnet_threads[i];
2985
2986                 lwkt_create(ifnet_service_loop, NULL, NULL,
2987                             thr, TDF_NOSTART|TDF_FORCE_SPINPORT,
2988                             i, "ifnet %d", i);
2989                 netmsg_service_port_init(&thr->td_msgport);
2990                 lwkt_schedule(thr);
2991         }
2992
2993         for (i = 0; i < ncpus; ++i)
2994                 TAILQ_INIT(&ifsubq_stage_heads[i].stg_head);
2995         netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
2996 }
2997
2998 struct ifnet *
2999 ifnet_byindex(unsigned short idx)
3000 {
3001         if (idx > if_index)
3002                 return NULL;
3003         return ifindex2ifnet[idx];
3004 }
3005
3006 struct ifaddr *
3007 ifaddr_byindex(unsigned short idx)
3008 {
3009         struct ifnet *ifp;
3010
3011         ifp = ifnet_byindex(idx);
3012         if (!ifp)
3013                 return NULL;
3014         return TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
3015 }
3016
3017 void
3018 if_register_com_alloc(u_char type,
3019     if_com_alloc_t *a, if_com_free_t *f)
3020 {
3021
3022         KASSERT(if_com_alloc[type] == NULL,
3023             ("if_register_com_alloc: %d already registered", type));
3024         KASSERT(if_com_free[type] == NULL,
3025             ("if_register_com_alloc: %d free already registered", type));
3026
3027         if_com_alloc[type] = a;
3028         if_com_free[type] = f;
3029 }
3030
3031 void
3032 if_deregister_com_alloc(u_char type)
3033 {
3034
3035         KASSERT(if_com_alloc[type] != NULL,
3036             ("if_deregister_com_alloc: %d not registered", type));
3037         KASSERT(if_com_free[type] != NULL,
3038             ("if_deregister_com_alloc: %d free not registered", type));
3039         if_com_alloc[type] = NULL;
3040         if_com_free[type] = NULL;
3041 }
3042
3043 int
3044 if_ring_count2(int cnt, int cnt_max)
3045 {
3046         int shift = 0;
3047
3048         KASSERT(cnt_max >= 1 && powerof2(cnt_max),
3049             ("invalid ring count max %d", cnt_max));
3050
3051         if (cnt <= 0)
3052                 cnt = cnt_max;
3053         if (cnt > ncpus2)
3054                 cnt = ncpus2;
3055         if (cnt > cnt_max)
3056                 cnt = cnt_max;
3057
3058         while ((1 << (shift + 1)) <= cnt)
3059                 ++shift;
3060         cnt = 1 << shift;
3061
3062         KASSERT(cnt >= 1 && cnt <= ncpus2 && cnt <= cnt_max,
3063             ("calculate cnt %d, ncpus2 %d, cnt max %d",
3064              cnt, ncpus2, cnt_max));
3065         return cnt;
3066 }
3067
3068 void
3069 ifq_set_maxlen(struct ifaltq *ifq, int len)
3070 {
3071         ifq->altq_maxlen = len + (ncpus * ifsq_stage_cntmax);
3072 }
3073
3074 int
3075 ifq_mapsubq_default(struct ifaltq *ifq __unused, int cpuid __unused)
3076 {
3077         return ALTQ_SUBQ_INDEX_DEFAULT;
3078 }
3079
3080 int
3081 ifq_mapsubq_mask(struct ifaltq *ifq, int cpuid)
3082 {
3083         return (cpuid & ifq->altq_subq_mask);
3084 }
3085
3086 static void
3087 ifsq_watchdog(void *arg)
3088 {
3089         struct ifsubq_watchdog *wd = arg;
3090         struct ifnet *ifp;
3091
3092         if (__predict_true(wd->wd_timer == 0 || --wd->wd_timer))
3093                 goto done;
3094
3095         ifp = ifsq_get_ifp(wd->wd_subq);
3096         if (ifnet_tryserialize_all(ifp)) {
3097                 wd->wd_watchdog(wd->wd_subq);
3098                 ifnet_deserialize_all(ifp);
3099         } else {
3100                 /* try again next timeout */
3101                 wd->wd_timer = 1;
3102         }
3103 done:
3104         ifsq_watchdog_reset(wd);
3105 }
3106
3107 static void
3108 ifsq_watchdog_reset(struct ifsubq_watchdog *wd)
3109 {
3110         callout_reset_bycpu(&wd->wd_callout, hz, ifsq_watchdog, wd,
3111             ifsq_get_cpuid(wd->wd_subq));
3112 }
3113
3114 void
3115 ifsq_watchdog_init(struct ifsubq_watchdog *wd, struct ifaltq_subque *ifsq,
3116     ifsq_watchdog_t watchdog)
3117 {
3118         callout_init_mp(&wd->wd_callout);
3119         wd->wd_timer = 0;
3120         wd->wd_subq = ifsq;
3121         wd->wd_watchdog = watchdog;
3122 }
3123
3124 void
3125 ifsq_watchdog_start(struct ifsubq_watchdog *wd)
3126 {
3127         wd->wd_timer = 0;
3128         ifsq_watchdog_reset(wd);
3129 }
3130
3131 void
3132 ifsq_watchdog_stop(struct ifsubq_watchdog *wd)
3133 {
3134         wd->wd_timer = 0;
3135         callout_stop(&wd->wd_callout);
3136 }