209e8f47470606b5055ff3e793c735a424659c9c
[dragonfly.git] / sys / net / if.c
1 /*
2  * Copyright (c) 1980, 1986, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *      @(#)if.c        8.3 (Berkeley) 1/4/94
30  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
31  */
32
33 #include "opt_compat.h"
34 #include "opt_inet6.h"
35 #include "opt_inet.h"
36 #include "opt_ifpoll.h"
37
38 #include <sys/param.h>
39 #include <sys/malloc.h>
40 #include <sys/mbuf.h>
41 #include <sys/systm.h>
42 #include <sys/proc.h>
43 #include <sys/priv.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/socketops.h>
48 #include <sys/kernel.h>
49 #include <sys/ktr.h>
50 #include <sys/mutex.h>
51 #include <sys/sockio.h>
52 #include <sys/syslog.h>
53 #include <sys/sysctl.h>
54 #include <sys/domain.h>
55 #include <sys/thread.h>
56 #include <sys/serialize.h>
57 #include <sys/bus.h>
58
59 #include <sys/thread2.h>
60 #include <sys/msgport2.h>
61 #include <sys/mutex2.h>
62
63 #include <net/if.h>
64 #include <net/if_arp.h>
65 #include <net/if_dl.h>
66 #include <net/if_types.h>
67 #include <net/if_var.h>
68 #include <net/ifq_var.h>
69 #include <net/radix.h>
70 #include <net/route.h>
71 #include <net/if_clone.h>
72 #include <net/netisr2.h>
73 #include <net/netmsg2.h>
74
75 #include <machine/atomic.h>
76 #include <machine/stdarg.h>
77 #include <machine/smp.h>
78
79 #if defined(INET) || defined(INET6)
80 /*XXX*/
81 #include <netinet/in.h>
82 #include <netinet/in_var.h>
83 #include <netinet/if_ether.h>
84 #ifdef INET6
85 #include <netinet6/in6_var.h>
86 #include <netinet6/in6_ifattach.h>
87 #endif
88 #endif
89
90 #if defined(COMPAT_43)
91 #include <emulation/43bsd/43bsd_socket.h>
92 #endif /* COMPAT_43 */
93
94 struct netmsg_ifaddr {
95         struct netmsg_base base;
96         struct ifaddr   *ifa;
97         struct ifnet    *ifp;
98         int             tail;
99 };
100
101 struct ifsubq_stage_head {
102         TAILQ_HEAD(, ifsubq_stage)      stg_head;
103 } __cachealign;
104
105 /*
106  * System initialization
107  */
108 static void     if_attachdomain(void *);
109 static void     if_attachdomain1(struct ifnet *);
110 static int      ifconf(u_long, caddr_t, struct ucred *);
111 static void     ifinit(void *);
112 static void     ifnetinit(void *);
113 static void     if_slowtimo(void *);
114 static void     link_rtrequest(int, struct rtentry *);
115 static int      if_rtdel(struct radix_node *, void *);
116 static void     if_slowtimo_dispatch(netmsg_t);
117
118 /* Helper functions */
119 static void     ifsq_watchdog_reset(struct ifsubq_watchdog *);
120 static int      if_delmulti_serialized(struct ifnet *, struct sockaddr *);
121
122 #ifdef INET6
/*
 * XXX: declared here to avoid including many inet6-related files.
 * Should this be more generalized?
 */
127 extern void     nd6_setmtu(struct ifnet *);
128 #endif
129
130 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
131 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
132
133 static int ifsq_stage_cntmax = 4;
134 TUNABLE_INT("net.link.stage_cntmax", &ifsq_stage_cntmax);
135 SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
136     &ifsq_stage_cntmax, 0, "ifq staging packet count max");
137
138 static int if_stats_compat = 0;
139 SYSCTL_INT(_net_link, OID_AUTO, stats_compat, CTLFLAG_RW,
140     &if_stats_compat, 0, "Compat the old ifnet stats");
141
142 SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL)
143 /* Must be after netisr_init */
144 SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_SECOND, ifnetinit, NULL)
145
146 static  if_com_alloc_t *if_com_alloc[256];
147 static  if_com_free_t *if_com_free[256];
148
149 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
150 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
151 MALLOC_DEFINE(M_IFNET, "ifnet", "interface structure");
152
153 int                     ifqmaxlen = IFQ_MAXLEN;
154 struct ifnethead        ifnet = TAILQ_HEAD_INITIALIZER(ifnet);
155
156 static struct callout           if_slowtimo_timer;
157 static struct netmsg_base       if_slowtimo_netmsg;
158
159 int                     if_index = 0;
160 struct ifnet            **ifindex2ifnet = NULL;
161 static struct thread    ifnet_threads[MAXCPU];
162
163 static struct ifsubq_stage_head ifsubq_stage_heads[MAXCPU];
164
/*
 * KTR trace points for ifq enqueue/dequeue and if_start scheduling.
 * Everything here is compiled only when "notyet" is defined.
 */
#ifdef notyet
#define IFQ_KTR_STRING		"ifq=%p"
#define IFQ_KTR_ARGS	struct ifaltq *ifq
#ifndef KTR_IFQ
#define KTR_IFQ			KTR_ALL
#endif
KTR_INFO_MASTER(ifq);
KTR_INFO(KTR_IFQ, ifq, enqueue, 0, IFQ_KTR_STRING, IFQ_KTR_ARGS);
KTR_INFO(KTR_IFQ, ifq, dequeue, 1, IFQ_KTR_STRING, IFQ_KTR_ARGS);
#define logifq(name, arg)	KTR_LOG(ifq_ ## name, arg)

#define IF_START_KTR_STRING	"ifp=%p"
#define IF_START_KTR_ARGS	struct ifnet *ifp
#ifndef KTR_IF_START
#define KTR_IF_START		KTR_ALL
#endif
KTR_INFO_MASTER(if_start);
KTR_INFO(KTR_IF_START, if_start, run, 0,
	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
KTR_INFO(KTR_IF_START, if_start, sched, 1,
	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
KTR_INFO(KTR_IF_START, if_start, avoid, 2,
	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
KTR_INFO(KTR_IF_START, if_start, contend_sched, 3,
	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
KTR_INFO(KTR_IF_START, if_start, chase_sched, 4,
	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
#define logifstart(name, arg)	KTR_LOG(if_start_ ## name, arg)
#endif	/* notyet */
194
195 TAILQ_HEAD(, ifg_group) ifg_head = TAILQ_HEAD_INITIALIZER(ifg_head);
196
197 /*
198  * Network interface utility routines.
199  *
200  * Routines with ifa_ifwith* names take sockaddr *'s as
201  * parameters.
202  */
203 /* ARGSUSED*/
204 void
205 ifinit(void *dummy)
206 {
207         struct ifnet *ifp;
208
209         callout_init_mp(&if_slowtimo_timer);
210         netmsg_init(&if_slowtimo_netmsg, NULL, &netisr_adone_rport,
211             MSGF_PRIORITY, if_slowtimo_dispatch);
212
213         crit_enter();
214         TAILQ_FOREACH(ifp, &ifnet, if_link) {
215                 if (ifp->if_snd.altq_maxlen == 0) {
216                         if_printf(ifp, "XXX: driver didn't set altq_maxlen\n");
217                         ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
218                 }
219         }
220         crit_exit();
221
222         /* Start if_slowtimo */
223         lwkt_sendmsg(netisr_cpuport(0), &if_slowtimo_netmsg.lmsg);
224 }
225
226 static void
227 ifsq_ifstart_ipifunc(void *arg)
228 {
229         struct ifaltq_subque *ifsq = arg;
230         struct lwkt_msg *lmsg = ifsq_get_ifstart_lmsg(ifsq, mycpuid);
231
232         crit_enter();
233         if (lmsg->ms_flags & MSGF_DONE)
234                 lwkt_sendmsg_oncpu(netisr_cpuport(mycpuid), lmsg);
235         crit_exit();
236 }
237
238 static __inline void
239 ifsq_stage_remove(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
240 {
241         KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
242         TAILQ_REMOVE(&head->stg_head, stage, stg_link);
243         stage->stg_flags &= ~(IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED);
244         stage->stg_cnt = 0;
245         stage->stg_len = 0;
246 }
247
248 static __inline void
249 ifsq_stage_insert(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
250 {
251         KKASSERT((stage->stg_flags &
252             (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
253         stage->stg_flags |= IFSQ_STAGE_FLAG_QUED;
254         TAILQ_INSERT_TAIL(&head->stg_head, stage, stg_link);
255 }
256
257 /*
258  * Schedule ifnet.if_start on the subqueue owner CPU
259  */
260 static void
261 ifsq_ifstart_schedule(struct ifaltq_subque *ifsq, int force)
262 {
263         int cpu;
264
265         if (!force && curthread->td_type == TD_TYPE_NETISR &&
266             ifsq_stage_cntmax > 0) {
267                 struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
268
269                 stage->stg_cnt = 0;
270                 stage->stg_len = 0;
271                 if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
272                         ifsq_stage_insert(&ifsubq_stage_heads[mycpuid], stage);
273                 stage->stg_flags |= IFSQ_STAGE_FLAG_SCHED;
274                 return;
275         }
276
277         cpu = ifsq_get_cpuid(ifsq);
278         if (cpu != mycpuid)
279                 lwkt_send_ipiq(globaldata_find(cpu), ifsq_ifstart_ipifunc, ifsq);
280         else
281                 ifsq_ifstart_ipifunc(ifsq);
282 }
283
/*
 * Decide whether ifnet.if_start has to be scheduled again for the
 * subqueue.  Returns 1 if so, 0 otherwise.
 *
 * NOTE:
 * When 0 is returned, the ifnet.if_start subqueue interlock has been
 * released (ifsq_clr_started) by this function.
 */
static __inline int
ifsq_ifstart_need_schedule(struct ifaltq_subque *ifsq, int running)
{
	int recheck;

	/*
	 * Cheap unlocked test first; the locked re-check below is only
	 * needed if the hardware can not run, the subqueue looks empty
	 * or (with ALTQ) a token bucket regulator is attached.
	 */
	recheck = (!running || ifsq_is_empty(ifsq));
#ifdef ALTQ
	if (!recheck)
		recheck = (ifsq->ifsq_altq->altq_tbr != NULL);
#endif
	if (!recheck)
		return 1;

	ALTQ_SQ_LOCK(ifsq);
	if (running && ifsq_data_ready(ifsq)) {
		ALTQ_SQ_UNLOCK(ifsq);
		return 1;
	}

	/*
	 * ifnet.if_start subqueue interlock is released, if:
	 * 1) Hardware can not take any packets, due to
	 *    o  interface is marked down
	 *    o  hardware queue is full (ifsq_is_oactive)
	 *    Under the second situation, hardware interrupt
	 *    or polling(4) will call/schedule ifnet.if_start
	 *    on the subqueue when hardware queue is ready
	 * 2) There is no packet in the subqueue.
	 *    Further ifq_dispatch or ifq_handoff will call/
	 *    schedule ifnet.if_start on the subqueue.
	 * 3) TBR is used and it does not allow further
	 *    dequeueing.
	 *    TBR callout will call ifnet.if_start on the
	 *    subqueue.
	 */
	ifsq_clr_started(ifsq);
	ALTQ_SQ_UNLOCK(ifsq);
	return 0;
}
323
324 static void
325 ifsq_ifstart_dispatch(netmsg_t msg)
326 {
327         struct lwkt_msg *lmsg = &msg->base.lmsg;
328         struct ifaltq_subque *ifsq = lmsg->u.ms_resultp;
329         struct ifnet *ifp = ifsq_get_ifp(ifsq);
330         struct globaldata *gd = mycpu;
331         int running = 0, need_sched;
332
333         crit_enter_gd(gd);
334
335         lwkt_replymsg(lmsg, 0); /* reply ASAP */
336
337         if (gd->gd_cpuid != ifsq_get_cpuid(ifsq)) {
338                 /*
339                  * We need to chase the subqueue owner CPU change.
340                  */
341                 ifsq_ifstart_schedule(ifsq, 1);
342                 crit_exit_gd(gd);
343                 return;
344         }
345
346         ifsq_serialize_hw(ifsq);
347         if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
348                 ifp->if_start(ifp, ifsq);
349                 if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
350                         running = 1;
351         }
352         need_sched = ifsq_ifstart_need_schedule(ifsq, running);
353         ifsq_deserialize_hw(ifsq);
354
355         if (need_sched) {
356                 /*
357                  * More data need to be transmitted, ifnet.if_start is
358                  * scheduled on the subqueue owner CPU, and we keep going.
359                  * NOTE: ifnet.if_start subqueue interlock is not released.
360                  */
361                 ifsq_ifstart_schedule(ifsq, 0);
362         }
363
364         crit_exit_gd(gd);
365 }
366
367 /* Device driver ifnet.if_start helper function */
368 void
369 ifsq_devstart(struct ifaltq_subque *ifsq)
370 {
371         struct ifnet *ifp = ifsq_get_ifp(ifsq);
372         int running = 0;
373
374         ASSERT_ALTQ_SQ_SERIALIZED_HW(ifsq);
375
376         ALTQ_SQ_LOCK(ifsq);
377         if (ifsq_is_started(ifsq) || !ifsq_data_ready(ifsq)) {
378                 ALTQ_SQ_UNLOCK(ifsq);
379                 return;
380         }
381         ifsq_set_started(ifsq);
382         ALTQ_SQ_UNLOCK(ifsq);
383
384         ifp->if_start(ifp, ifsq);
385
386         if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
387                 running = 1;
388
389         if (ifsq_ifstart_need_schedule(ifsq, running)) {
390                 /*
391                  * More data need to be transmitted, ifnet.if_start is
392                  * scheduled on ifnet's CPU, and we keep going.
393                  * NOTE: ifnet.if_start interlock is not released.
394                  */
395                 ifsq_ifstart_schedule(ifsq, 0);
396         }
397 }
398
399 void
400 if_devstart(struct ifnet *ifp)
401 {
402         ifsq_devstart(ifq_get_subq_default(&ifp->if_snd));
403 }
404
/*
 * Device driver ifnet.if_start schedule helper function.
 * Always forces scheduling, i.e. the staging path is never taken.
 */
void
ifsq_devstart_sched(struct ifaltq_subque *ifsq)
{
	const int force = 1;

	ifsq_ifstart_schedule(ifsq, force);
}
411
412 void
413 if_devstart_sched(struct ifnet *ifp)
414 {
415         ifsq_devstart_sched(ifq_get_subq_default(&ifp->if_snd));
416 }
417
418 static void
419 if_default_serialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
420 {
421         lwkt_serialize_enter(ifp->if_serializer);
422 }
423
424 static void
425 if_default_deserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
426 {
427         lwkt_serialize_exit(ifp->if_serializer);
428 }
429
430 static int
431 if_default_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
432 {
433         return lwkt_serialize_try(ifp->if_serializer);
434 }
435
#ifdef INVARIANTS
/*
 * Default if_serialize_assert method (INVARIANTS kernels only):
 * asserts that ifp->if_serializer is held when `serialized' is true,
 * and that it is not held otherwise.  `slz' is ignored.
 */
static void
if_default_serialize_assert(struct ifnet *ifp,
			    enum ifnet_serialize slz __unused,
			    boolean_t serialized)
{
	if (!serialized) {
		ASSERT_NOT_SERIALIZED(ifp->if_serializer);
		return;
	}
	ASSERT_SERIALIZED(ifp->if_serializer);
}
#endif
448
449 /*
450  * Attach an interface to the list of "active" interfaces.
451  *
452  * The serializer is optional.
453  */
454 void
455 if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
456 {
457         unsigned socksize, ifasize;
458         int namelen, masklen;
459         struct sockaddr_dl *sdl;
460         struct ifaddr *ifa;
461         struct ifaltq *ifq;
462         int i, q;
463
464         static int if_indexlim = 8;
465
466         if (ifp->if_serialize != NULL) {
467                 KASSERT(ifp->if_deserialize != NULL &&
468                         ifp->if_tryserialize != NULL &&
469                         ifp->if_serialize_assert != NULL,
470                         ("serialize functions are partially setup"));
471
472                 /*
473                  * If the device supplies serialize functions,
474                  * then clear if_serializer to catch any invalid
475                  * usage of this field.
476                  */
477                 KASSERT(serializer == NULL,
478                         ("both serialize functions and default serializer "
479                          "are supplied"));
480                 ifp->if_serializer = NULL;
481         } else {
482                 KASSERT(ifp->if_deserialize == NULL &&
483                         ifp->if_tryserialize == NULL &&
484                         ifp->if_serialize_assert == NULL,
485                         ("serialize functions are partially setup"));
486                 ifp->if_serialize = if_default_serialize;
487                 ifp->if_deserialize = if_default_deserialize;
488                 ifp->if_tryserialize = if_default_tryserialize;
489 #ifdef INVARIANTS
490                 ifp->if_serialize_assert = if_default_serialize_assert;
491 #endif
492
493                 /*
494                  * The serializer can be passed in from the device,
495                  * allowing the same serializer to be used for both
496                  * the interrupt interlock and the device queue.
497                  * If not specified, the netif structure will use an
498                  * embedded serializer.
499                  */
500                 if (serializer == NULL) {
501                         serializer = &ifp->if_default_serializer;
502                         lwkt_serialize_init(serializer);
503                 }
504                 ifp->if_serializer = serializer;
505         }
506
507         mtx_init(&ifp->if_ioctl_mtx);
508         mtx_lock(&ifp->if_ioctl_mtx);
509
510         lwkt_gettoken(&ifnet_token);    /* protect if_index and ifnet tailq */
511         ifp->if_index = ++if_index;
512
513         /*
514          * XXX -
515          * The old code would work if the interface passed a pre-existing
516          * chain of ifaddrs to this code.  We don't trust our callers to
517          * properly initialize the tailq, however, so we no longer allow
518          * this unlikely case.
519          */
520         ifp->if_addrheads = kmalloc(ncpus * sizeof(struct ifaddrhead),
521                                     M_IFADDR, M_WAITOK | M_ZERO);
522         for (i = 0; i < ncpus; ++i)
523                 TAILQ_INIT(&ifp->if_addrheads[i]);
524
525         TAILQ_INIT(&ifp->if_multiaddrs);
526         TAILQ_INIT(&ifp->if_groups);
527         getmicrotime(&ifp->if_lastchange);
528         if (ifindex2ifnet == NULL || if_index >= if_indexlim) {
529                 unsigned int n;
530                 struct ifnet **q;
531
532                 if_indexlim <<= 1;
533
534                 /* grow ifindex2ifnet */
535                 n = if_indexlim * sizeof(*q);
536                 q = kmalloc(n, M_IFADDR, M_WAITOK | M_ZERO);
537                 if (ifindex2ifnet) {
538                         bcopy(ifindex2ifnet, q, n/2);
539                         kfree(ifindex2ifnet, M_IFADDR);
540                 }
541                 ifindex2ifnet = q;
542         }
543
544         ifindex2ifnet[if_index] = ifp;
545
546         /*
547          * create a Link Level name for this device
548          */
549         namelen = strlen(ifp->if_xname);
550         masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
551         socksize = masklen + ifp->if_addrlen;
552         if (socksize < sizeof(*sdl))
553                 socksize = sizeof(*sdl);
554         socksize = RT_ROUNDUP(socksize);
555         ifasize = sizeof(struct ifaddr) + 2 * socksize;
556         ifa = ifa_create(ifasize, M_WAITOK);
557         sdl = (struct sockaddr_dl *)(ifa + 1);
558         sdl->sdl_len = socksize;
559         sdl->sdl_family = AF_LINK;
560         bcopy(ifp->if_xname, sdl->sdl_data, namelen);
561         sdl->sdl_nlen = namelen;
562         sdl->sdl_index = ifp->if_index;
563         sdl->sdl_type = ifp->if_type;
564         ifp->if_lladdr = ifa;
565         ifa->ifa_ifp = ifp;
566         ifa->ifa_rtrequest = link_rtrequest;
567         ifa->ifa_addr = (struct sockaddr *)sdl;
568         sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
569         ifa->ifa_netmask = (struct sockaddr *)sdl;
570         sdl->sdl_len = masklen;
571         while (namelen != 0)
572                 sdl->sdl_data[--namelen] = 0xff;
573         ifa_iflink(ifa, ifp, 0 /* Insert head */);
574
575         ifp->if_data_pcpu = kmalloc_cachealign(
576             ncpus * sizeof(struct ifdata_pcpu), M_DEVBUF, M_WAITOK | M_ZERO);
577
578         if (ifp->if_mapsubq == NULL)
579                 ifp->if_mapsubq = ifq_mapsubq_default;
580
581         ifq = &ifp->if_snd;
582         ifq->altq_type = 0;
583         ifq->altq_disc = NULL;
584         ifq->altq_flags &= ALTQF_CANTCHANGE;
585         ifq->altq_tbr = NULL;
586         ifq->altq_ifp = ifp;
587
588         if (ifq->altq_subq_cnt <= 0)
589                 ifq->altq_subq_cnt = 1;
590         ifq->altq_subq = kmalloc_cachealign(
591             ifq->altq_subq_cnt * sizeof(struct ifaltq_subque),
592             M_DEVBUF, M_WAITOK | M_ZERO);
593
594         if (ifq->altq_maxlen == 0) {
595                 if_printf(ifp, "driver didn't set altq_maxlen\n");
596                 ifq_set_maxlen(ifq, ifqmaxlen);
597         }
598
599         for (q = 0; q < ifq->altq_subq_cnt; ++q) {
600                 struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
601
602                 ALTQ_SQ_LOCK_INIT(ifsq);
603                 ifsq->ifsq_index = q;
604
605                 ifsq->ifsq_altq = ifq;
606                 ifsq->ifsq_ifp = ifp;
607
608                 ifsq->ifsq_maxlen = ifq->altq_maxlen;
609                 ifsq->ifsq_maxbcnt = ifsq->ifsq_maxlen * MCLBYTES;
610                 ifsq->ifsq_prepended = NULL;
611                 ifsq->ifsq_started = 0;
612                 ifsq->ifsq_hw_oactive = 0;
613                 ifsq_set_cpuid(ifsq, 0);
614                 if (ifp->if_serializer != NULL)
615                         ifsq_set_hw_serialize(ifsq, ifp->if_serializer);
616
617                 ifsq->ifsq_stage =
618                     kmalloc_cachealign(ncpus * sizeof(struct ifsubq_stage),
619                     M_DEVBUF, M_WAITOK | M_ZERO);
620                 for (i = 0; i < ncpus; ++i)
621                         ifsq->ifsq_stage[i].stg_subq = ifsq;
622
623                 ifsq->ifsq_ifstart_nmsg =
624                     kmalloc(ncpus * sizeof(struct netmsg_base),
625                     M_LWKTMSG, M_WAITOK);
626                 for (i = 0; i < ncpus; ++i) {
627                         netmsg_init(&ifsq->ifsq_ifstart_nmsg[i], NULL,
628                             &netisr_adone_rport, 0, ifsq_ifstart_dispatch);
629                         ifsq->ifsq_ifstart_nmsg[i].lmsg.u.ms_resultp = ifsq;
630                 }
631         }
632         ifq_set_classic(ifq);
633
634         if (!SLIST_EMPTY(&domains))
635                 if_attachdomain1(ifp);
636
637         TAILQ_INSERT_TAIL(&ifnet, ifp, if_link);
638         lwkt_reltoken(&ifnet_token);
639
640         /* Announce the interface. */
641         EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
642         devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
643         rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
644
645         mtx_unlock(&ifp->if_ioctl_mtx);
646 }
647
648 static void
649 if_attachdomain(void *dummy)
650 {
651         struct ifnet *ifp;
652
653         crit_enter();
654         TAILQ_FOREACH(ifp, &ifnet, if_list)
655                 if_attachdomain1(ifp);
656         crit_exit();
657 }
658 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST,
659         if_attachdomain, NULL);
660
661 static void
662 if_attachdomain1(struct ifnet *ifp)
663 {
664         struct domain *dp;
665
666         crit_enter();
667
668         /* address family dependent data region */
669         bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
670         SLIST_FOREACH(dp, &domains, dom_next)
671                 if (dp->dom_ifattach)
672                         ifp->if_afdata[dp->dom_family] =
673                                 (*dp->dom_ifattach)(ifp);
674         crit_exit();
675 }
676
677 /*
678  * Purge all addresses whose type is _not_ AF_LINK
679  */
680 void
681 if_purgeaddrs_nolink(struct ifnet *ifp)
682 {
683         struct ifaddr_container *ifac, *next;
684
685         TAILQ_FOREACH_MUTABLE(ifac, &ifp->if_addrheads[mycpuid],
686                               ifa_link, next) {
687                 struct ifaddr *ifa = ifac->ifa;
688
689                 /* Leave link ifaddr as it is */
690                 if (ifa->ifa_addr->sa_family == AF_LINK)
691                         continue;
692 #ifdef INET
693                 /* XXX: Ugly!! ad hoc just for INET */
694                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
695                         struct ifaliasreq ifr;
696 #ifdef IFADDR_DEBUG_VERBOSE
697                         int i;
698
699                         kprintf("purge in4 addr %p: ", ifa);
700                         for (i = 0; i < ncpus; ++i)
701                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
702                         kprintf("\n");
703 #endif
704
705                         bzero(&ifr, sizeof ifr);
706                         ifr.ifra_addr = *ifa->ifa_addr;
707                         if (ifa->ifa_dstaddr)
708                                 ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
709                         if (in_control(SIOCDIFADDR, (caddr_t)&ifr, ifp,
710                                        NULL) == 0)
711                                 continue;
712                 }
713 #endif /* INET */
714 #ifdef INET6
715                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6) {
716 #ifdef IFADDR_DEBUG_VERBOSE
717                         int i;
718
719                         kprintf("purge in6 addr %p: ", ifa);
720                         for (i = 0; i < ncpus; ++i)
721                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
722                         kprintf("\n");
723 #endif
724
725                         in6_purgeaddr(ifa);
726                         /* ifp_addrhead is already updated */
727                         continue;
728                 }
729 #endif /* INET6 */
730                 ifa_ifunlink(ifa, ifp);
731                 ifa_destroy(ifa);
732         }
733 }
734
735 static void
736 ifq_stage_detach_handler(netmsg_t nmsg)
737 {
738         struct ifaltq *ifq = nmsg->lmsg.u.ms_resultp;
739         int q;
740
741         for (q = 0; q < ifq->altq_subq_cnt; ++q) {
742                 struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
743                 struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
744
745                 if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED)
746                         ifsq_stage_remove(&ifsubq_stage_heads[mycpuid], stage);
747         }
748         lwkt_replymsg(&nmsg->lmsg, 0);
749 }
750
751 static void
752 ifq_stage_detach(struct ifaltq *ifq)
753 {
754         struct netmsg_base base;
755         int cpu;
756
757         netmsg_init(&base, NULL, &curthread->td_msgport, 0,
758             ifq_stage_detach_handler);
759         base.lmsg.u.ms_resultp = ifq;
760
761         for (cpu = 0; cpu < ncpus; ++cpu)
762                 lwkt_domsg(netisr_cpuport(cpu), &base.lmsg, 0);
763 }
764
765 struct netmsg_if_rtdel {
766         struct netmsg_base      base;
767         struct ifnet            *ifp;
768 };
769
770 static void
771 if_rtdel_dispatch(netmsg_t msg)
772 {
773         struct netmsg_if_rtdel *rmsg = (void *)msg;
774         int i, nextcpu, cpu;
775
776         cpu = mycpuid;
777         for (i = 1; i <= AF_MAX; i++) {
778                 struct radix_node_head  *rnh;
779
780                 if ((rnh = rt_tables[cpu][i]) == NULL)
781                         continue;
782                 rnh->rnh_walktree(rnh, if_rtdel, rmsg->ifp);
783         }
784
785         nextcpu = cpu + 1;
786         if (nextcpu < ncpus)
787                 lwkt_forwardmsg(netisr_cpuport(nextcpu), &rmsg->base.lmsg);
788         else
789                 lwkt_replymsg(&rmsg->base.lmsg, 0);
790 }
791
792 /*
793  * Detach an interface, removing it from the
794  * list of "active" interfaces.
795  */
796 void
797 if_detach(struct ifnet *ifp)
798 {
799         struct netmsg_if_rtdel msg;
800         struct domain *dp;
801         int q;
802
803         EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
804
805         /*
806          * Remove routes and flush queues.
807          */
808         crit_enter();
809 #ifdef IFPOLL_ENABLE
810         if (ifp->if_flags & IFF_NPOLLING)
811                 ifpoll_deregister(ifp);
812 #endif
813         if_down(ifp);
814
815 #ifdef ALTQ
816         if (ifq_is_enabled(&ifp->if_snd))
817                 altq_disable(&ifp->if_snd);
818         if (ifq_is_attached(&ifp->if_snd))
819                 altq_detach(&ifp->if_snd);
820 #endif
821
822         /*
823          * Clean up all addresses.
824          */
825         ifp->if_lladdr = NULL;
826
827         if_purgeaddrs_nolink(ifp);
828         if (!TAILQ_EMPTY(&ifp->if_addrheads[mycpuid])) {
829                 struct ifaddr *ifa;
830
831                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
832                 KASSERT(ifa->ifa_addr->sa_family == AF_LINK,
833                         ("non-link ifaddr is left on if_addrheads"));
834
835                 ifa_ifunlink(ifa, ifp);
836                 ifa_destroy(ifa);
837                 KASSERT(TAILQ_EMPTY(&ifp->if_addrheads[mycpuid]),
838                         ("there are still ifaddrs left on if_addrheads"));
839         }
840
841 #ifdef INET
842         /*
843          * Remove all IPv4 kernel structures related to ifp.
844          */
845         in_ifdetach(ifp);
846 #endif
847
848 #ifdef INET6
849         /*
850          * Remove all IPv6 kernel structs related to ifp.  This should be done
851          * before removing routing entries below, since IPv6 interface direct
852          * routes are expected to be removed by the IPv6-specific kernel API.
853          * Otherwise, the kernel will detect some inconsistency and bark it.
854          */
855         in6_ifdetach(ifp);
856 #endif
857
858         /*
859          * Delete all remaining routes using this interface
860          */
861         netmsg_init(&msg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
862             if_rtdel_dispatch);
863         msg.ifp = ifp;
864         rt_domsg_global(&msg.base);
865
866         /* Announce that the interface is gone. */
867         rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
868         devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
869
870         SLIST_FOREACH(dp, &domains, dom_next)
871                 if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
872                         (*dp->dom_ifdetach)(ifp,
873                                 ifp->if_afdata[dp->dom_family]);
874
875         /*
876          * Remove interface from ifindex2ifp[] and maybe decrement if_index.
877          */
878         lwkt_gettoken(&ifnet_token);
879         ifindex2ifnet[ifp->if_index] = NULL;
880         while (if_index > 0 && ifindex2ifnet[if_index] == NULL)
881                 if_index--;
882         TAILQ_REMOVE(&ifnet, ifp, if_link);
883         lwkt_reltoken(&ifnet_token);
884
885         kfree(ifp->if_addrheads, M_IFADDR);
886
887         lwkt_synchronize_ipiqs("if_detach");
888         ifq_stage_detach(&ifp->if_snd);
889
890         for (q = 0; q < ifp->if_snd.altq_subq_cnt; ++q) {
891                 struct ifaltq_subque *ifsq = &ifp->if_snd.altq_subq[q];
892
893                 kfree(ifsq->ifsq_ifstart_nmsg, M_LWKTMSG);
894                 kfree(ifsq->ifsq_stage, M_DEVBUF);
895         }
896         kfree(ifp->if_snd.altq_subq, M_DEVBUF);
897
898         kfree(ifp->if_data_pcpu, M_DEVBUF);
899
900         crit_exit();
901 }
902
903 /*
904  * Create interface group without members
905  */
906 struct ifg_group *
907 if_creategroup(const char *groupname)
908 {
909         struct ifg_group        *ifg = NULL;
910
911         if ((ifg = (struct ifg_group *)kmalloc(sizeof(struct ifg_group),
912             M_TEMP, M_NOWAIT)) == NULL)
913                 return (NULL);
914
915         strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
916         ifg->ifg_refcnt = 0;
917         ifg->ifg_carp_demoted = 0;
918         TAILQ_INIT(&ifg->ifg_members);
919 #if NPF > 0
920         pfi_attach_ifgroup(ifg);
921 #endif
922         TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next);
923
924         return (ifg);
925 }
926
927 /*
928  * Add a group to an interface
929  */
930 int
931 if_addgroup(struct ifnet *ifp, const char *groupname)
932 {
933         struct ifg_list         *ifgl;
934         struct ifg_group        *ifg = NULL;
935         struct ifg_member       *ifgm;
936
937         if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
938             groupname[strlen(groupname) - 1] <= '9')
939                 return (EINVAL);
940
941         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
942                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
943                         return (EEXIST);
944
945         if ((ifgl = kmalloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
946                 return (ENOMEM);
947
948         if ((ifgm = kmalloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
949                 kfree(ifgl, M_TEMP);
950                 return (ENOMEM);
951         }
952
953         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
954                 if (!strcmp(ifg->ifg_group, groupname))
955                         break;
956
957         if (ifg == NULL && (ifg = if_creategroup(groupname)) == NULL) {
958                 kfree(ifgl, M_TEMP);
959                 kfree(ifgm, M_TEMP);
960                 return (ENOMEM);
961         }
962
963         ifg->ifg_refcnt++;
964         ifgl->ifgl_group = ifg;
965         ifgm->ifgm_ifp = ifp;
966
967         TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
968         TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
969
970 #if NPF > 0
971         pfi_group_change(groupname);
972 #endif
973
974         return (0);
975 }
976
977 /*
978  * Remove a group from an interface
979  */
980 int
981 if_delgroup(struct ifnet *ifp, const char *groupname)
982 {
983         struct ifg_list         *ifgl;
984         struct ifg_member       *ifgm;
985
986         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
987                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
988                         break;
989         if (ifgl == NULL)
990                 return (ENOENT);
991
992         TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
993
994         TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
995                 if (ifgm->ifgm_ifp == ifp)
996                         break;
997
998         if (ifgm != NULL) {
999                 TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
1000                 kfree(ifgm, M_TEMP);
1001         }
1002
1003         if (--ifgl->ifgl_group->ifg_refcnt == 0) {
1004                 TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next);
1005 #if NPF > 0
1006                 pfi_detach_ifgroup(ifgl->ifgl_group);
1007 #endif
1008                 kfree(ifgl->ifgl_group, M_TEMP);
1009         }
1010
1011         kfree(ifgl, M_TEMP);
1012
1013 #if NPF > 0
1014         pfi_group_change(groupname);
1015 #endif
1016
1017         return (0);
1018 }
1019
1020 /*
1021  * Stores all groups from an interface in memory pointed
1022  * to by data
1023  */
1024 int
1025 if_getgroup(caddr_t data, struct ifnet *ifp)
1026 {
1027         int                      len, error;
1028         struct ifg_list         *ifgl;
1029         struct ifg_req           ifgrq, *ifgp;
1030         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
1031
1032         if (ifgr->ifgr_len == 0) {
1033                 TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1034                         ifgr->ifgr_len += sizeof(struct ifg_req);
1035                 return (0);
1036         }
1037
1038         len = ifgr->ifgr_len;
1039         ifgp = ifgr->ifgr_groups;
1040         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
1041                 if (len < sizeof(ifgrq))
1042                         return (EINVAL);
1043                 bzero(&ifgrq, sizeof ifgrq);
1044                 strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
1045                     sizeof(ifgrq.ifgrq_group));
1046                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1047                     sizeof(struct ifg_req))))
1048                         return (error);
1049                 len -= sizeof(ifgrq);
1050                 ifgp++;
1051         }
1052
1053         return (0);
1054 }
1055
1056 /*
1057  * Stores all members of a group in memory pointed to by data
1058  */
1059 int
1060 if_getgroupmembers(caddr_t data)
1061 {
1062         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
1063         struct ifg_group        *ifg;
1064         struct ifg_member       *ifgm;
1065         struct ifg_req           ifgrq, *ifgp;
1066         int                      len, error;
1067
1068         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1069                 if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
1070                         break;
1071         if (ifg == NULL)
1072                 return (ENOENT);
1073
1074         if (ifgr->ifgr_len == 0) {
1075                 TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
1076                         ifgr->ifgr_len += sizeof(ifgrq);
1077                 return (0);
1078         }
1079
1080         len = ifgr->ifgr_len;
1081         ifgp = ifgr->ifgr_groups;
1082         TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
1083                 if (len < sizeof(ifgrq))
1084                         return (EINVAL);
1085                 bzero(&ifgrq, sizeof ifgrq);
1086                 strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
1087                     sizeof(ifgrq.ifgrq_member));
1088                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1089                     sizeof(struct ifg_req))))
1090                         return (error);
1091                 len -= sizeof(ifgrq);
1092                 ifgp++;
1093         }
1094
1095         return (0);
1096 }
1097
1098 /*
1099  * Delete Routes for a Network Interface
1100  *
1101  * Called for each routing entry via the rnh->rnh_walktree() call above
1102  * to delete all route entries referencing a detaching network interface.
1103  *
1104  * Arguments:
1105  *      rn      pointer to node in the routing table
1106  *      arg     argument passed to rnh->rnh_walktree() - detaching interface
1107  *
1108  * Returns:
1109  *      0       successful
1110  *      errno   failed - reason indicated
1111  *
1112  */
1113 static int
1114 if_rtdel(struct radix_node *rn, void *arg)
1115 {
1116         struct rtentry  *rt = (struct rtentry *)rn;
1117         struct ifnet    *ifp = arg;
1118         int             err;
1119
1120         if (rt->rt_ifp == ifp) {
1121
1122                 /*
1123                  * Protect (sorta) against walktree recursion problems
1124                  * with cloned routes
1125                  */
1126                 if (!(rt->rt_flags & RTF_UP))
1127                         return (0);
1128
1129                 err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1130                                 rt_mask(rt), rt->rt_flags,
1131                                 NULL);
1132                 if (err) {
1133                         log(LOG_WARNING, "if_rtdel: error %d\n", err);
1134                 }
1135         }
1136
1137         return (0);
1138 }
1139
1140 /*
1141  * Locate an interface based on a complete address.
1142  */
1143 struct ifaddr *
1144 ifa_ifwithaddr(struct sockaddr *addr)
1145 {
1146         struct ifnet *ifp;
1147
1148         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1149                 struct ifaddr_container *ifac;
1150
1151                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1152                         struct ifaddr *ifa = ifac->ifa;
1153
1154                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1155                                 continue;
1156                         if (sa_equal(addr, ifa->ifa_addr))
1157                                 return (ifa);
1158                         if ((ifp->if_flags & IFF_BROADCAST) &&
1159                             ifa->ifa_broadaddr &&
1160                             /* IPv6 doesn't have broadcast */
1161                             ifa->ifa_broadaddr->sa_len != 0 &&
1162                             sa_equal(ifa->ifa_broadaddr, addr))
1163                                 return (ifa);
1164                 }
1165         }
1166         return (NULL);
1167 }
1168 /*
1169  * Locate the point to point interface with a given destination address.
1170  */
1171 struct ifaddr *
1172 ifa_ifwithdstaddr(struct sockaddr *addr)
1173 {
1174         struct ifnet *ifp;
1175
1176         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1177                 struct ifaddr_container *ifac;
1178
1179                 if (!(ifp->if_flags & IFF_POINTOPOINT))
1180                         continue;
1181
1182                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1183                         struct ifaddr *ifa = ifac->ifa;
1184
1185                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1186                                 continue;
1187                         if (ifa->ifa_dstaddr &&
1188                             sa_equal(addr, ifa->ifa_dstaddr))
1189                                 return (ifa);
1190                 }
1191         }
1192         return (NULL);
1193 }
1194
1195 /*
1196  * Find an interface on a specific network.  If many, choice
1197  * is most specific found.
1198  */
1199 struct ifaddr *
1200 ifa_ifwithnet(struct sockaddr *addr)
1201 {
1202         struct ifnet *ifp;
1203         struct ifaddr *ifa_maybe = NULL;
1204         u_int af = addr->sa_family;
1205         char *addr_data = addr->sa_data, *cplim;
1206
1207         /*
1208          * AF_LINK addresses can be looked up directly by their index number,
1209          * so do that if we can.
1210          */
1211         if (af == AF_LINK) {
1212                 struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
1213
1214                 if (sdl->sdl_index && sdl->sdl_index <= if_index)
1215                         return (ifindex2ifnet[sdl->sdl_index]->if_lladdr);
1216         }
1217
1218         /*
1219          * Scan though each interface, looking for ones that have
1220          * addresses in this address family.
1221          */
1222         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1223                 struct ifaddr_container *ifac;
1224
1225                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1226                         struct ifaddr *ifa = ifac->ifa;
1227                         char *cp, *cp2, *cp3;
1228
1229                         if (ifa->ifa_addr->sa_family != af)
1230 next:                           continue;
1231                         if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
1232                                 /*
1233                                  * This is a bit broken as it doesn't
1234                                  * take into account that the remote end may
1235                                  * be a single node in the network we are
1236                                  * looking for.
1237                                  * The trouble is that we don't know the
1238                                  * netmask for the remote end.
1239                                  */
1240                                 if (ifa->ifa_dstaddr != NULL &&
1241                                     sa_equal(addr, ifa->ifa_dstaddr))
1242                                         return (ifa);
1243                         } else {
1244                                 /*
1245                                  * if we have a special address handler,
1246                                  * then use it instead of the generic one.
1247                                  */
1248                                 if (ifa->ifa_claim_addr) {
1249                                         if ((*ifa->ifa_claim_addr)(ifa, addr)) {
1250                                                 return (ifa);
1251                                         } else {
1252                                                 continue;
1253                                         }
1254                                 }
1255
1256                                 /*
1257                                  * Scan all the bits in the ifa's address.
1258                                  * If a bit dissagrees with what we are
1259                                  * looking for, mask it with the netmask
1260                                  * to see if it really matters.
1261                                  * (A byte at a time)
1262                                  */
1263                                 if (ifa->ifa_netmask == 0)
1264                                         continue;
1265                                 cp = addr_data;
1266                                 cp2 = ifa->ifa_addr->sa_data;
1267                                 cp3 = ifa->ifa_netmask->sa_data;
1268                                 cplim = ifa->ifa_netmask->sa_len +
1269                                         (char *)ifa->ifa_netmask;
1270                                 while (cp3 < cplim)
1271                                         if ((*cp++ ^ *cp2++) & *cp3++)
1272                                                 goto next; /* next address! */
1273                                 /*
1274                                  * If the netmask of what we just found
1275                                  * is more specific than what we had before
1276                                  * (if we had one) then remember the new one
1277                                  * before continuing to search
1278                                  * for an even better one.
1279                                  */
1280                                 if (ifa_maybe == NULL ||
1281                                     rn_refines((char *)ifa->ifa_netmask,
1282                                                (char *)ifa_maybe->ifa_netmask))
1283                                         ifa_maybe = ifa;
1284                         }
1285                 }
1286         }
1287         return (ifa_maybe);
1288 }
1289
1290 /*
1291  * Find an interface address specific to an interface best matching
1292  * a given address.
1293  */
1294 struct ifaddr *
1295 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
1296 {
1297         struct ifaddr_container *ifac;
1298         char *cp, *cp2, *cp3;
1299         char *cplim;
1300         struct ifaddr *ifa_maybe = NULL;
1301         u_int af = addr->sa_family;
1302
1303         if (af >= AF_MAX)
1304                 return (0);
1305         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1306                 struct ifaddr *ifa = ifac->ifa;
1307
1308                 if (ifa->ifa_addr->sa_family != af)
1309                         continue;
1310                 if (ifa_maybe == NULL)
1311                         ifa_maybe = ifa;
1312                 if (ifa->ifa_netmask == NULL) {
1313                         if (sa_equal(addr, ifa->ifa_addr) ||
1314                             (ifa->ifa_dstaddr != NULL &&
1315                              sa_equal(addr, ifa->ifa_dstaddr)))
1316                                 return (ifa);
1317                         continue;
1318                 }
1319                 if (ifp->if_flags & IFF_POINTOPOINT) {
1320                         if (sa_equal(addr, ifa->ifa_dstaddr))
1321                                 return (ifa);
1322                 } else {
1323                         cp = addr->sa_data;
1324                         cp2 = ifa->ifa_addr->sa_data;
1325                         cp3 = ifa->ifa_netmask->sa_data;
1326                         cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
1327                         for (; cp3 < cplim; cp3++)
1328                                 if ((*cp++ ^ *cp2++) & *cp3)
1329                                         break;
1330                         if (cp3 == cplim)
1331                                 return (ifa);
1332                 }
1333         }
1334         return (ifa_maybe);
1335 }
1336
1337 /*
1338  * Default action when installing a route with a Link Level gateway.
1339  * Lookup an appropriate real ifa to point to.
1340  * This should be moved to /sys/net/link.c eventually.
1341  */
1342 static void
1343 link_rtrequest(int cmd, struct rtentry *rt)
1344 {
1345         struct ifaddr *ifa;
1346         struct sockaddr *dst;
1347         struct ifnet *ifp;
1348
1349         if (cmd != RTM_ADD || (ifa = rt->rt_ifa) == NULL ||
1350             (ifp = ifa->ifa_ifp) == NULL || (dst = rt_key(rt)) == NULL)
1351                 return;
1352         ifa = ifaof_ifpforaddr(dst, ifp);
1353         if (ifa != NULL) {
1354                 IFAFREE(rt->rt_ifa);
1355                 IFAREF(ifa);
1356                 rt->rt_ifa = ifa;
1357                 if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
1358                         ifa->ifa_rtrequest(cmd, rt);
1359         }
1360 }
1361
1362 /*
1363  * Mark an interface down and notify protocols of
1364  * the transition.
1365  * NOTE: must be called at splnet or eqivalent.
1366  */
1367 void
1368 if_unroute(struct ifnet *ifp, int flag, int fam)
1369 {
1370         struct ifaddr_container *ifac;
1371
1372         ifp->if_flags &= ~flag;
1373         getmicrotime(&ifp->if_lastchange);
1374         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1375                 struct ifaddr *ifa = ifac->ifa;
1376
1377                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1378                         kpfctlinput(PRC_IFDOWN, ifa->ifa_addr);
1379         }
1380         ifq_purge_all(&ifp->if_snd);
1381         rt_ifmsg(ifp);
1382 }
1383
1384 /*
1385  * Mark an interface up and notify protocols of
1386  * the transition.
1387  * NOTE: must be called at splnet or eqivalent.
1388  */
1389 void
1390 if_route(struct ifnet *ifp, int flag, int fam)
1391 {
1392         struct ifaddr_container *ifac;
1393
1394         ifq_purge_all(&ifp->if_snd);
1395         ifp->if_flags |= flag;
1396         getmicrotime(&ifp->if_lastchange);
1397         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1398                 struct ifaddr *ifa = ifac->ifa;
1399
1400                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1401                         kpfctlinput(PRC_IFUP, ifa->ifa_addr);
1402         }
1403         rt_ifmsg(ifp);
1404 #ifdef INET6
1405         in6_if_up(ifp);
1406 #endif
1407 }
1408
1409 /*
1410  * Mark an interface down and notify protocols of the transition.  An
1411  * interface going down is also considered to be a synchronizing event.
1412  * We must ensure that all packet processing related to the interface
1413  * has completed before we return so e.g. the caller can free the ifnet
1414  * structure that the mbufs may be referencing.
1415  *
1416  * NOTE: must be called at splnet or eqivalent.
1417  */
1418 void
1419 if_down(struct ifnet *ifp)
1420 {
1421         if_unroute(ifp, IFF_UP, AF_UNSPEC);
1422         netmsg_service_sync();
1423 }
1424
1425 /*
1426  * Mark an interface up and notify protocols of
1427  * the transition.
1428  * NOTE: must be called at splnet or eqivalent.
1429  */
1430 void
1431 if_up(struct ifnet *ifp)
1432 {
1433         if_route(ifp, IFF_UP, AF_UNSPEC);
1434 }
1435
1436 /*
1437  * Process a link state change.
1438  * NOTE: must be called at splsoftnet or equivalent.
1439  */
1440 void
1441 if_link_state_change(struct ifnet *ifp)
1442 {
1443         int link_state = ifp->if_link_state;
1444
1445         rt_ifmsg(ifp);
1446         devctl_notify("IFNET", ifp->if_xname,
1447             (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
1448 }
1449
1450 /*
1451  * Handle interface watchdog timer routines.  Called
1452  * from softclock, we decrement timers (if set) and
1453  * call the appropriate interface routine on expiration.
1454  */
1455 static void
1456 if_slowtimo_dispatch(netmsg_t nmsg)
1457 {
1458         struct globaldata *gd = mycpu;
1459         struct ifnet *ifp;
1460
1461         KASSERT(&curthread->td_msgport == netisr_cpuport(0),
1462             ("not in netisr0"));
1463
1464         crit_enter_gd(gd);
1465         lwkt_replymsg(&nmsg->lmsg, 0);  /* reply ASAP */
1466         crit_exit_gd(gd);
1467
1468         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1469                 crit_enter_gd(gd);
1470
1471                 if (if_stats_compat) {
1472                         IFNET_STAT_GET(ifp, ipackets, ifp->if_ipackets);
1473                         IFNET_STAT_GET(ifp, ierrors, ifp->if_ierrors);
1474                         IFNET_STAT_GET(ifp, opackets, ifp->if_opackets);
1475                         IFNET_STAT_GET(ifp, oerrors, ifp->if_oerrors);
1476                         IFNET_STAT_GET(ifp, collisions, ifp->if_collisions);
1477                         IFNET_STAT_GET(ifp, ibytes, ifp->if_ibytes);
1478                         IFNET_STAT_GET(ifp, obytes, ifp->if_obytes);
1479                         IFNET_STAT_GET(ifp, imcasts, ifp->if_imcasts);
1480                         IFNET_STAT_GET(ifp, omcasts, ifp->if_omcasts);
1481                         IFNET_STAT_GET(ifp, iqdrops, ifp->if_iqdrops);
1482                         IFNET_STAT_GET(ifp, noproto, ifp->if_noproto);
1483                 }
1484
1485                 if (ifp->if_timer == 0 || --ifp->if_timer) {
1486                         crit_exit_gd(gd);
1487                         continue;
1488                 }
1489                 if (ifp->if_watchdog) {
1490                         if (ifnet_tryserialize_all(ifp)) {
1491                                 (*ifp->if_watchdog)(ifp);
1492                                 ifnet_deserialize_all(ifp);
1493                         } else {
1494                                 /* try again next timeout */
1495                                 ++ifp->if_timer;
1496                         }
1497                 }
1498
1499                 crit_exit_gd(gd);
1500         }
1501
1502         callout_reset(&if_slowtimo_timer, hz / IFNET_SLOWHZ, if_slowtimo, NULL);
1503 }
1504
1505 static void
1506 if_slowtimo(void *arg __unused)
1507 {
1508         struct lwkt_msg *lmsg = &if_slowtimo_netmsg.lmsg;
1509
1510         KASSERT(mycpuid == 0, ("not on cpu0"));
1511         crit_enter();
1512         if (lmsg->ms_flags & MSGF_DONE)
1513                 lwkt_sendmsg_oncpu(netisr_cpuport(0), lmsg);
1514         crit_exit();
1515 }
1516
1517 /*
1518  * Map interface name to
1519  * interface structure pointer.
1520  */
1521 struct ifnet *
1522 ifunit(const char *name)
1523 {
1524         struct ifnet *ifp;
1525
1526         /*
1527          * Search all the interfaces for this name/number
1528          */
1529
1530         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1531                 if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1532                         break;
1533         }
1534         return (ifp);
1535 }
1536
1537
1538 /*
1539  * Map interface name in a sockaddr_dl to
1540  * interface structure pointer.
1541  */
1542 struct ifnet *
1543 if_withname(struct sockaddr *sa)
1544 {
1545         char ifname[IFNAMSIZ+1];
1546         struct sockaddr_dl *sdl = (struct sockaddr_dl *)sa;
1547
1548         if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) ||
1549              (sdl->sdl_nlen > IFNAMSIZ) )
1550                 return NULL;
1551
1552         /*
1553          * ifunit wants a null-terminated name.  It may not be null-terminated
1554          * in the sockaddr.  We don't want to change the caller's sockaddr,
1555          * and there might not be room to put the trailing null anyway, so we
1556          * make a local copy that we know we can null terminate safely.
1557          */
1558
1559         bcopy(sdl->sdl_data, ifname, sdl->sdl_nlen);
1560         ifname[sdl->sdl_nlen] = '\0';
1561         return ifunit(ifname);
1562 }
1563
1564
1565 /*
1566  * Interface ioctls.
1567  */
1568 int
1569 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct ucred *cred)
1570 {
1571         struct ifnet *ifp;
1572         struct ifreq *ifr;
1573         struct ifstat *ifs;
1574         int error;
1575         short oif_flags;
1576         int new_flags;
1577 #ifdef COMPAT_43
1578         int ocmd;
1579 #endif
1580         size_t namelen, onamelen;
1581         char new_name[IFNAMSIZ];
1582         struct ifaddr *ifa;
1583         struct sockaddr_dl *sdl;
1584
1585         switch (cmd) {
1586         case SIOCGIFCONF:
1587         case OSIOCGIFCONF:
1588                 return (ifconf(cmd, data, cred));
1589         default:
1590                 break;
1591         }
1592
1593         ifr = (struct ifreq *)data;
1594
1595         switch (cmd) {
1596         case SIOCIFCREATE:
1597         case SIOCIFCREATE2:
1598                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1599                         return (error);
1600                 return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
1601                         cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
1602         case SIOCIFDESTROY:
1603                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1604                         return (error);
1605                 return (if_clone_destroy(ifr->ifr_name));
1606         case SIOCIFGCLONERS:
1607                 return (if_clone_list((struct if_clonereq *)data));
1608         default:
1609                 break;
1610         }
1611
1612         /*
1613          * Nominal ioctl through interface, lookup the ifp and obtain a
1614          * lock to serialize the ifconfig ioctl operation.
1615          */
1616         ifp = ifunit(ifr->ifr_name);
1617         if (ifp == NULL)
1618                 return (ENXIO);
1619         error = 0;
1620         mtx_lock(&ifp->if_ioctl_mtx);
1621
1622         switch (cmd) {
1623         case SIOCGIFINDEX:
1624                 ifr->ifr_index = ifp->if_index;
1625                 break;
1626
1627         case SIOCGIFFLAGS:
1628                 ifr->ifr_flags = ifp->if_flags;
1629                 ifr->ifr_flagshigh = ifp->if_flags >> 16;
1630                 break;
1631
1632         case SIOCGIFCAP:
1633                 ifr->ifr_reqcap = ifp->if_capabilities;
1634                 ifr->ifr_curcap = ifp->if_capenable;
1635                 break;
1636
1637         case SIOCGIFMETRIC:
1638                 ifr->ifr_metric = ifp->if_metric;
1639                 break;
1640
1641         case SIOCGIFMTU:
1642                 ifr->ifr_mtu = ifp->if_mtu;
1643                 break;
1644
1645         case SIOCGIFTSOLEN:
1646                 ifr->ifr_tsolen = ifp->if_tsolen;
1647                 break;
1648
1649         case SIOCGIFDATA:
1650                 error = copyout((caddr_t)&ifp->if_data, ifr->ifr_data,
1651                                 sizeof(ifp->if_data));
1652                 break;
1653
1654         case SIOCGIFPHYS:
1655                 ifr->ifr_phys = ifp->if_physical;
1656                 break;
1657
1658         case SIOCGIFPOLLCPU:
1659                 ifr->ifr_pollcpu = -1;
1660                 break;
1661
1662         case SIOCSIFPOLLCPU:
1663                 break;
1664
1665         case SIOCSIFFLAGS:
1666                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1667                 if (error)
1668                         break;
1669                 new_flags = (ifr->ifr_flags & 0xffff) |
1670                     (ifr->ifr_flagshigh << 16);
1671                 if (ifp->if_flags & IFF_SMART) {
1672                         /* Smart drivers twiddle their own routes */
1673                 } else if (ifp->if_flags & IFF_UP &&
1674                     (new_flags & IFF_UP) == 0) {
1675                         crit_enter();
1676                         if_down(ifp);
1677                         crit_exit();
1678                 } else if (new_flags & IFF_UP &&
1679                     (ifp->if_flags & IFF_UP) == 0) {
1680                         crit_enter();
1681                         if_up(ifp);
1682                         crit_exit();
1683                 }
1684
1685 #ifdef IFPOLL_ENABLE
1686                 if ((new_flags ^ ifp->if_flags) & IFF_NPOLLING) {
1687                         if (new_flags & IFF_NPOLLING)
1688                                 ifpoll_register(ifp);
1689                         else
1690                                 ifpoll_deregister(ifp);
1691                 }
1692 #endif
1693
1694                 ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
1695                         (new_flags &~ IFF_CANTCHANGE);
1696                 if (new_flags & IFF_PPROMISC) {
1697                         /* Permanently promiscuous mode requested */
1698                         ifp->if_flags |= IFF_PROMISC;
1699                 } else if (ifp->if_pcount == 0) {
1700                         ifp->if_flags &= ~IFF_PROMISC;
1701                 }
1702                 if (ifp->if_ioctl) {
1703                         ifnet_serialize_all(ifp);
1704                         ifp->if_ioctl(ifp, cmd, data, cred);
1705                         ifnet_deserialize_all(ifp);
1706                 }
1707                 getmicrotime(&ifp->if_lastchange);
1708                 break;
1709
1710         case SIOCSIFCAP:
1711                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1712                 if (error)
1713                         break;
1714                 if (ifr->ifr_reqcap & ~ifp->if_capabilities) {
1715                         error = EINVAL;
1716                         break;
1717                 }
1718                 ifnet_serialize_all(ifp);
1719                 ifp->if_ioctl(ifp, cmd, data, cred);
1720                 ifnet_deserialize_all(ifp);
1721                 break;
1722
1723         case SIOCSIFNAME:
1724                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1725                 if (error)
1726                         break;
1727                 error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
1728                 if (error)
1729                         break;
1730                 if (new_name[0] == '\0') {
1731                         error = EINVAL;
1732                         break;
1733                 }
1734                 if (ifunit(new_name) != NULL) {
1735                         error = EEXIST;
1736                         break;
1737                 }
1738
1739                 EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
1740
1741                 /* Announce the departure of the interface. */
1742                 rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
1743
1744                 strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
1745                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
1746                 /* XXX IFA_LOCK(ifa); */
1747                 sdl = (struct sockaddr_dl *)ifa->ifa_addr;
1748                 namelen = strlen(new_name);
1749                 onamelen = sdl->sdl_nlen;
1750                 /*
1751                  * Move the address if needed.  This is safe because we
1752                  * allocate space for a name of length IFNAMSIZ when we
1753                  * create this in if_attach().
1754                  */
1755                 if (namelen != onamelen) {
1756                         bcopy(sdl->sdl_data + onamelen,
1757                             sdl->sdl_data + namelen, sdl->sdl_alen);
1758                 }
1759                 bcopy(new_name, sdl->sdl_data, namelen);
1760                 sdl->sdl_nlen = namelen;
1761                 sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
1762                 bzero(sdl->sdl_data, onamelen);
1763                 while (namelen != 0)
1764                         sdl->sdl_data[--namelen] = 0xff;
1765                 /* XXX IFA_UNLOCK(ifa) */
1766
1767                 EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
1768
1769                 /* Announce the return of the interface. */
1770                 rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
1771                 break;
1772
1773         case SIOCSIFMETRIC:
1774                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1775                 if (error)
1776                         break;
1777                 ifp->if_metric = ifr->ifr_metric;
1778                 getmicrotime(&ifp->if_lastchange);
1779                 break;
1780
1781         case SIOCSIFPHYS:
1782                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1783                 if (error)
1784                         break;
1785                 if (ifp->if_ioctl == NULL) {
1786                         error = EOPNOTSUPP;
1787                         break;
1788                 }
1789                 ifnet_serialize_all(ifp);
1790                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1791                 ifnet_deserialize_all(ifp);
1792                 if (error == 0)
1793                         getmicrotime(&ifp->if_lastchange);
1794                 break;
1795
1796         case SIOCSIFMTU:
1797         {
1798                 u_long oldmtu = ifp->if_mtu;
1799
1800                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1801                 if (error)
1802                         break;
1803                 if (ifp->if_ioctl == NULL) {
1804                         error = EOPNOTSUPP;
1805                         break;
1806                 }
1807                 if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) {
1808                         error = EINVAL;
1809                         break;
1810                 }
1811                 ifnet_serialize_all(ifp);
1812                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1813                 ifnet_deserialize_all(ifp);
1814                 if (error == 0) {
1815                         getmicrotime(&ifp->if_lastchange);
1816                         rt_ifmsg(ifp);
1817                 }
1818                 /*
1819                  * If the link MTU changed, do network layer specific procedure.
1820                  */
1821                 if (ifp->if_mtu != oldmtu) {
1822 #ifdef INET6
1823                         nd6_setmtu(ifp);
1824 #endif
1825                 }
1826                 break;
1827         }
1828
1829         case SIOCSIFTSOLEN:
1830                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1831                 if (error)
1832                         break;
1833
1834                 /* XXX need driver supplied upper limit */
1835                 if (ifr->ifr_tsolen <= 0) {
1836                         error = EINVAL;
1837                         break;
1838                 }
1839                 ifp->if_tsolen = ifr->ifr_tsolen;
1840                 break;
1841
1842         case SIOCADDMULTI:
1843         case SIOCDELMULTI:
1844                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1845                 if (error)
1846                         break;
1847
1848                 /* Don't allow group membership on non-multicast interfaces. */
1849                 if ((ifp->if_flags & IFF_MULTICAST) == 0) {
1850                         error = EOPNOTSUPP;
1851                         break;
1852                 }
1853
1854                 /* Don't let users screw up protocols' entries. */
1855                 if (ifr->ifr_addr.sa_family != AF_LINK) {
1856                         error = EINVAL;
1857                         break;
1858                 }
1859
1860                 if (cmd == SIOCADDMULTI) {
1861                         struct ifmultiaddr *ifma;
1862                         error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
1863                 } else {
1864                         error = if_delmulti(ifp, &ifr->ifr_addr);
1865                 }
1866                 if (error == 0)
1867                         getmicrotime(&ifp->if_lastchange);
1868                 break;
1869
1870         case SIOCSIFPHYADDR:
1871         case SIOCDIFPHYADDR:
1872 #ifdef INET6
1873         case SIOCSIFPHYADDR_IN6:
1874 #endif
1875         case SIOCSLIFPHYADDR:
1876         case SIOCSIFMEDIA:
1877         case SIOCSIFGENERIC:
1878                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1879                 if (error)
1880                         break;
1881                 if (ifp->if_ioctl == 0) {
1882                         error = EOPNOTSUPP;
1883                         break;
1884                 }
1885                 ifnet_serialize_all(ifp);
1886                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1887                 ifnet_deserialize_all(ifp);
1888                 if (error == 0)
1889                         getmicrotime(&ifp->if_lastchange);
1890                 break;
1891
1892         case SIOCGIFSTATUS:
1893                 ifs = (struct ifstat *)data;
1894                 ifs->ascii[0] = '\0';
1895                 /* fall through */
1896         case SIOCGIFPSRCADDR:
1897         case SIOCGIFPDSTADDR:
1898         case SIOCGLIFPHYADDR:
1899         case SIOCGIFMEDIA:
1900         case SIOCGIFGENERIC:
1901                 if (ifp->if_ioctl == NULL) {
1902                         error = EOPNOTSUPP;
1903                         break;
1904                 }
1905                 ifnet_serialize_all(ifp);
1906                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1907                 ifnet_deserialize_all(ifp);
1908                 break;
1909
1910         case SIOCSIFLLADDR:
1911                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1912                 if (error)
1913                         break;
1914                 error = if_setlladdr(ifp, ifr->ifr_addr.sa_data,
1915                                      ifr->ifr_addr.sa_len);
1916                 EVENTHANDLER_INVOKE(iflladdr_event, ifp);
1917                 break;
1918
1919         default:
1920                 oif_flags = ifp->if_flags;
1921                 if (so->so_proto == 0) {
1922                         error = EOPNOTSUPP;
1923                         break;
1924                 }
1925 #ifndef COMPAT_43
1926                 error = so_pru_control_direct(so, cmd, data, ifp);
1927 #else
1928                 ocmd = cmd;
1929
1930                 switch (cmd) {
1931                 case SIOCSIFDSTADDR:
1932                 case SIOCSIFADDR:
1933                 case SIOCSIFBRDADDR:
1934                 case SIOCSIFNETMASK:
1935 #if BYTE_ORDER != BIG_ENDIAN
1936                         if (ifr->ifr_addr.sa_family == 0 &&
1937                             ifr->ifr_addr.sa_len < 16) {
1938                                 ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
1939                                 ifr->ifr_addr.sa_len = 16;
1940                         }
1941 #else
1942                         if (ifr->ifr_addr.sa_len == 0)
1943                                 ifr->ifr_addr.sa_len = 16;
1944 #endif
1945                         break;
1946                 case OSIOCGIFADDR:
1947                         cmd = SIOCGIFADDR;
1948                         break;
1949                 case OSIOCGIFDSTADDR:
1950                         cmd = SIOCGIFDSTADDR;
1951                         break;
1952                 case OSIOCGIFBRDADDR:
1953                         cmd = SIOCGIFBRDADDR;
1954                         break;
1955                 case OSIOCGIFNETMASK:
1956                         cmd = SIOCGIFNETMASK;
1957                         break;
1958                 default:
1959                         break;
1960                 }
1961
1962                 error = so_pru_control_direct(so, cmd, data, ifp);
1963
1964                 switch (ocmd) {
1965                 case OSIOCGIFADDR:
1966                 case OSIOCGIFDSTADDR:
1967                 case OSIOCGIFBRDADDR:
1968                 case OSIOCGIFNETMASK:
1969                         *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
1970                         break;
1971                 }
1972 #endif /* COMPAT_43 */
1973
1974                 if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
1975 #ifdef INET6
1976                         DELAY(100);/* XXX: temporary workaround for fxp issue*/
1977                         if (ifp->if_flags & IFF_UP) {
1978                                 crit_enter();
1979                                 in6_if_up(ifp);
1980                                 crit_exit();
1981                         }
1982 #endif
1983                 }
1984                 break;
1985         }
1986
1987         mtx_unlock(&ifp->if_ioctl_mtx);
1988         return (error);
1989 }
1990
1991 /*
1992  * Set/clear promiscuous mode on interface ifp based on the truth value
1993  * of pswitch.  The calls are reference counted so that only the first
1994  * "on" request actually has an effect, as does the final "off" request.
1995  * Results are undefined if the "off" and "on" requests are not matched.
1996  */
1997 int
1998 ifpromisc(struct ifnet *ifp, int pswitch)
1999 {
2000         struct ifreq ifr;
2001         int error;
2002         int oldflags;
2003
2004         oldflags = ifp->if_flags;
2005         if (ifp->if_flags & IFF_PPROMISC) {
2006                 /* Do nothing if device is in permanently promiscuous mode */
2007                 ifp->if_pcount += pswitch ? 1 : -1;
2008                 return (0);
2009         }
2010         if (pswitch) {
2011                 /*
2012                  * If the device is not configured up, we cannot put it in
2013                  * promiscuous mode.
2014                  */
2015                 if ((ifp->if_flags & IFF_UP) == 0)
2016                         return (ENETDOWN);
2017                 if (ifp->if_pcount++ != 0)
2018                         return (0);
2019                 ifp->if_flags |= IFF_PROMISC;
2020                 log(LOG_INFO, "%s: promiscuous mode enabled\n",
2021                     ifp->if_xname);
2022         } else {
2023                 if (--ifp->if_pcount > 0)
2024                         return (0);
2025                 ifp->if_flags &= ~IFF_PROMISC;
2026                 log(LOG_INFO, "%s: promiscuous mode disabled\n",
2027                     ifp->if_xname);
2028         }
2029         ifr.ifr_flags = ifp->if_flags;
2030         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2031         ifnet_serialize_all(ifp);
2032         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, NULL);
2033         ifnet_deserialize_all(ifp);
2034         if (error == 0)
2035                 rt_ifmsg(ifp);
2036         else
2037                 ifp->if_flags = oldflags;
2038         return error;
2039 }
2040
2041 /*
2042  * Return interface configuration
2043  * of system.  List may be used
2044  * in later ioctl's (above) to get
2045  * other information.
2046  */
2047 static int
2048 ifconf(u_long cmd, caddr_t data, struct ucred *cred)
2049 {
2050         struct ifconf *ifc = (struct ifconf *)data;
2051         struct ifnet *ifp;
2052         struct sockaddr *sa;
2053         struct ifreq ifr, *ifrp;
2054         int space = ifc->ifc_len, error = 0;
2055
2056         ifrp = ifc->ifc_req;
2057         TAILQ_FOREACH(ifp, &ifnet, if_link) {
2058                 struct ifaddr_container *ifac;
2059                 int addrs;
2060
2061                 if (space <= sizeof ifr)
2062                         break;
2063
2064                 /*
2065                  * Zero the stack declared structure first to prevent
2066                  * memory disclosure.
2067                  */
2068                 bzero(&ifr, sizeof(ifr));
2069                 if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
2070                     >= sizeof(ifr.ifr_name)) {
2071                         error = ENAMETOOLONG;
2072                         break;
2073                 }
2074
2075                 addrs = 0;
2076                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2077                         struct ifaddr *ifa = ifac->ifa;
2078
2079                         if (space <= sizeof ifr)
2080                                 break;
2081                         sa = ifa->ifa_addr;
2082                         if (cred->cr_prison &&
2083                             prison_if(cred, sa))
2084                                 continue;
2085                         addrs++;
2086 #ifdef COMPAT_43
2087                         if (cmd == OSIOCGIFCONF) {
2088                                 struct osockaddr *osa =
2089                                          (struct osockaddr *)&ifr.ifr_addr;
2090                                 ifr.ifr_addr = *sa;
2091                                 osa->sa_family = sa->sa_family;
2092                                 error = copyout(&ifr, ifrp, sizeof ifr);
2093                                 ifrp++;
2094                         } else
2095 #endif
2096                         if (sa->sa_len <= sizeof(*sa)) {
2097                                 ifr.ifr_addr = *sa;
2098                                 error = copyout(&ifr, ifrp, sizeof ifr);
2099                                 ifrp++;
2100                         } else {
2101                                 if (space < (sizeof ifr) + sa->sa_len -
2102                                             sizeof(*sa))
2103                                         break;
2104                                 space -= sa->sa_len - sizeof(*sa);
2105                                 error = copyout(&ifr, ifrp,
2106                                                 sizeof ifr.ifr_name);
2107                                 if (error == 0)
2108                                         error = copyout(sa, &ifrp->ifr_addr,
2109                                                         sa->sa_len);
2110                                 ifrp = (struct ifreq *)
2111                                         (sa->sa_len + (caddr_t)&ifrp->ifr_addr);
2112                         }
2113                         if (error)
2114                                 break;
2115                         space -= sizeof ifr;
2116                 }
2117                 if (error)
2118                         break;
2119                 if (!addrs) {
2120                         bzero(&ifr.ifr_addr, sizeof ifr.ifr_addr);
2121                         error = copyout(&ifr, ifrp, sizeof ifr);
2122                         if (error)
2123                                 break;
2124                         space -= sizeof ifr;
2125                         ifrp++;
2126                 }
2127         }
2128         ifc->ifc_len -= space;
2129         return (error);
2130 }
2131
2132 /*
2133  * Just like if_promisc(), but for all-multicast-reception mode.
2134  */
2135 int
2136 if_allmulti(struct ifnet *ifp, int onswitch)
2137 {
2138         int error = 0;
2139         struct ifreq ifr;
2140
2141         crit_enter();
2142
2143         if (onswitch) {
2144                 if (ifp->if_amcount++ == 0) {
2145                         ifp->if_flags |= IFF_ALLMULTI;
2146                         ifr.ifr_flags = ifp->if_flags;
2147                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2148                         ifnet_serialize_all(ifp);
2149                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2150                                               NULL);
2151                         ifnet_deserialize_all(ifp);
2152                 }
2153         } else {
2154                 if (ifp->if_amcount > 1) {
2155                         ifp->if_amcount--;
2156                 } else {
2157                         ifp->if_amcount = 0;
2158                         ifp->if_flags &= ~IFF_ALLMULTI;
2159                         ifr.ifr_flags = ifp->if_flags;
2160                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2161                         ifnet_serialize_all(ifp);
2162                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2163                                               NULL);
2164                         ifnet_deserialize_all(ifp);
2165                 }
2166         }
2167
2168         crit_exit();
2169
2170         if (error == 0)
2171                 rt_ifmsg(ifp);
2172         return error;
2173 }
2174
2175 /*
2176  * Add a multicast listenership to the interface in question.
2177  * The link layer provides a routine which converts
2178  */
2179 int
2180 if_addmulti_serialized(struct ifnet *ifp, struct sockaddr *sa,
2181     struct ifmultiaddr **retifma)
2182 {
2183         struct sockaddr *llsa, *dupsa;
2184         int error;
2185         struct ifmultiaddr *ifma;
2186
2187         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2188
2189         /*
2190          * If the matching multicast address already exists
2191          * then don't add a new one, just add a reference
2192          */
2193         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2194                 if (sa_equal(sa, ifma->ifma_addr)) {
2195                         ifma->ifma_refcount++;
2196                         if (retifma)
2197                                 *retifma = ifma;
2198                         return 0;
2199                 }
2200         }
2201
2202         /*
2203          * Give the link layer a chance to accept/reject it, and also
2204          * find out which AF_LINK address this maps to, if it isn't one
2205          * already.
2206          */
2207         if (ifp->if_resolvemulti) {
2208                 error = ifp->if_resolvemulti(ifp, &llsa, sa);
2209                 if (error)
2210                         return error;
2211         } else {
2212                 llsa = NULL;
2213         }
2214
2215         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2216         dupsa = kmalloc(sa->sa_len, M_IFMADDR, M_WAITOK);
2217         bcopy(sa, dupsa, sa->sa_len);
2218
2219         ifma->ifma_addr = dupsa;
2220         ifma->ifma_lladdr = llsa;
2221         ifma->ifma_ifp = ifp;
2222         ifma->ifma_refcount = 1;
2223         ifma->ifma_protospec = NULL;
2224         rt_newmaddrmsg(RTM_NEWMADDR, ifma);
2225
2226         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2227         if (retifma)
2228                 *retifma = ifma;
2229
2230         if (llsa != NULL) {
2231                 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2232                         if (sa_equal(ifma->ifma_addr, llsa))
2233                                 break;
2234                 }
2235                 if (ifma) {
2236                         ifma->ifma_refcount++;
2237                 } else {
2238                         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2239                         dupsa = kmalloc(llsa->sa_len, M_IFMADDR, M_WAITOK);
2240                         bcopy(llsa, dupsa, llsa->sa_len);
2241                         ifma->ifma_addr = dupsa;
2242                         ifma->ifma_ifp = ifp;
2243                         ifma->ifma_refcount = 1;
2244                         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2245                 }
2246         }
2247         /*
2248          * We are certain we have added something, so call down to the
2249          * interface to let them know about it.
2250          */
2251         if (ifp->if_ioctl)
2252                 ifp->if_ioctl(ifp, SIOCADDMULTI, 0, NULL);
2253
2254         return 0;
2255 }
2256
/*
 * Wrapper around if_addmulti_serialized() that takes all of the
 * interface's serializers for the duration of the call and passes the
 * result straight through.
 */
int
if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
    struct ifmultiaddr **retifma)
{
        int rc;

        ifnet_serialize_all(ifp);
        rc = if_addmulti_serialized(ifp, sa, retifma);
        ifnet_deserialize_all(ifp);
        return (rc);
}
2269
2270 /*
2271  * Remove a reference to a multicast address on this interface.  Yell
2272  * if the request does not match an existing membership.
2273  */
2274 static int
2275 if_delmulti_serialized(struct ifnet *ifp, struct sockaddr *sa)
2276 {
2277         struct ifmultiaddr *ifma;
2278
2279         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2280
2281         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2282                 if (sa_equal(sa, ifma->ifma_addr))
2283                         break;
2284         if (ifma == NULL)
2285                 return ENOENT;
2286
2287         if (ifma->ifma_refcount > 1) {
2288                 ifma->ifma_refcount--;
2289                 return 0;
2290         }
2291
2292         rt_newmaddrmsg(RTM_DELMADDR, ifma);
2293         sa = ifma->ifma_lladdr;
2294         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2295         /*
2296          * Make sure the interface driver is notified
2297          * in the case of a link layer mcast group being left.
2298          */
2299         if (ifma->ifma_addr->sa_family == AF_LINK && sa == NULL)
2300                 ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2301         kfree(ifma->ifma_addr, M_IFMADDR);
2302         kfree(ifma, M_IFMADDR);
2303         if (sa == NULL)
2304                 return 0;
2305
2306         /*
2307          * Now look for the link-layer address which corresponds to
2308          * this network address.  It had been squirreled away in
2309          * ifma->ifma_lladdr for this purpose (so we don't have
2310          * to call ifp->if_resolvemulti() again), and we saved that
2311          * value in sa above.  If some nasty deleted the
2312          * link-layer address out from underneath us, we can deal because
2313          * the address we stored was is not the same as the one which was
2314          * in the record for the link-layer address.  (So we don't complain
2315          * in that case.)
2316          */
2317         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2318                 if (sa_equal(sa, ifma->ifma_addr))
2319                         break;
2320         if (ifma == NULL)
2321                 return 0;
2322
2323         if (ifma->ifma_refcount > 1) {
2324                 ifma->ifma_refcount--;
2325                 return 0;
2326         }
2327
2328         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2329         ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2330         kfree(ifma->ifma_addr, M_IFMADDR);
2331         kfree(sa, M_IFMADDR);
2332         kfree(ifma, M_IFMADDR);
2333
2334         return 0;
2335 }
2336
/*
 * Wrapper around if_delmulti_serialized() that takes all of the
 * interface's serializers for the duration of the call and passes the
 * result straight through.
 */
int
if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
{
        int rc;

        ifnet_serialize_all(ifp);
        rc = if_delmulti_serialized(ifp, sa);
        ifnet_deserialize_all(ifp);
        return (rc);
}
2348
2349 /*
2350  * Delete all multicast group membership for an interface.
2351  * Should be used to quickly flush all multicast filters.
2352  */
2353 void
2354 if_delallmulti_serialized(struct ifnet *ifp)
2355 {
2356         struct ifmultiaddr *ifma, mark;
2357         struct sockaddr sa;
2358
2359         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2360
2361         bzero(&sa, sizeof(sa));
2362         sa.sa_family = AF_UNSPEC;
2363         sa.sa_len = sizeof(sa);
2364
2365         bzero(&mark, sizeof(mark));
2366         mark.ifma_addr = &sa;
2367
2368         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, &mark, ifma_link);
2369         while ((ifma = TAILQ_NEXT(&mark, ifma_link)) != NULL) {
2370                 TAILQ_REMOVE(&ifp->if_multiaddrs, &mark, ifma_link);
2371                 TAILQ_INSERT_AFTER(&ifp->if_multiaddrs, ifma, &mark,
2372                     ifma_link);
2373
2374                 if (ifma->ifma_addr->sa_family == AF_UNSPEC)
2375                         continue;
2376
2377                 if_delmulti_serialized(ifp, ifma->ifma_addr);
2378         }
2379         TAILQ_REMOVE(&ifp->if_multiaddrs, &mark, ifma_link);
2380 }
2381
2382
2383 /*
2384  * Set the link layer address on an interface.
2385  *
2386  * At this time we only support certain types of interfaces,
2387  * and we don't allow the length of the address to change.
2388  */
2389 int
2390 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
2391 {
2392         struct sockaddr_dl *sdl;
2393         struct ifreq ifr;
2394
2395         sdl = IF_LLSOCKADDR(ifp);
2396         if (sdl == NULL)
2397                 return (EINVAL);
2398         if (len != sdl->sdl_alen)       /* don't allow length to change */
2399                 return (EINVAL);
2400         switch (ifp->if_type) {
2401         case IFT_ETHER:                 /* these types use struct arpcom */
2402         case IFT_XETHER:
2403         case IFT_L2VLAN:
2404         case IFT_IEEE8023ADLAG:
2405                 bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
2406                 bcopy(lladdr, LLADDR(sdl), len);
2407                 break;
2408         default:
2409                 return (ENODEV);
2410         }
2411         /*
2412          * If the interface is already up, we need
2413          * to re-init it in order to reprogram its
2414          * address filter.
2415          */
2416         ifnet_serialize_all(ifp);
2417         if ((ifp->if_flags & IFF_UP) != 0) {
2418 #ifdef INET
2419                 struct ifaddr_container *ifac;
2420 #endif
2421
2422                 ifp->if_flags &= ~IFF_UP;
2423                 ifr.ifr_flags = ifp->if_flags;
2424                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2425                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2426                               NULL);
2427                 ifp->if_flags |= IFF_UP;
2428                 ifr.ifr_flags = ifp->if_flags;
2429                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2430                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2431                                  NULL);
2432 #ifdef INET
2433                 /*
2434                  * Also send gratuitous ARPs to notify other nodes about
2435                  * the address change.
2436                  */
2437                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2438                         struct ifaddr *ifa = ifac->ifa;
2439
2440                         if (ifa->ifa_addr != NULL &&
2441                             ifa->ifa_addr->sa_family == AF_INET)
2442                                 arp_gratuitous(ifp, ifa);
2443                 }
2444 #endif
2445         }
2446         ifnet_deserialize_all(ifp);
2447         return (0);
2448 }
2449
2450 struct ifmultiaddr *
2451 ifmaof_ifpforaddr(struct sockaddr *sa, struct ifnet *ifp)
2452 {
2453         struct ifmultiaddr *ifma;
2454
2455         /* TODO: need ifnet_serialize_main */
2456         ifnet_serialize_all(ifp);
2457         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2458                 if (sa_equal(ifma->ifma_addr, sa))
2459                         break;
2460         ifnet_deserialize_all(ifp);
2461
2462         return ifma;
2463 }
2464
2465 /*
2466  * This function locates the first real ethernet MAC from a network
2467  * card and loads it into node, returning 0 on success or ENOENT if
2468  * no suitable interfaces were found.  It is used by the uuid code to
2469  * generate a unique 6-byte number.
2470  */
2471 int
2472 if_getanyethermac(uint16_t *node, int minlen)
2473 {
2474         struct ifnet *ifp;
2475         struct sockaddr_dl *sdl;
2476
2477         TAILQ_FOREACH(ifp, &ifnet, if_link) {
2478                 if (ifp->if_type != IFT_ETHER)
2479                         continue;
2480                 sdl = IF_LLSOCKADDR(ifp);
2481                 if (sdl->sdl_alen < minlen)
2482                         continue;
2483                 bcopy(((struct arpcom *)ifp->if_softc)->ac_enaddr, node,
2484                       minlen);
2485                 return(0);
2486         }
2487         return (ENOENT);
2488 }
2489
2490 /*
2491  * The name argument must be a pointer to storage which will last as
2492  * long as the interface does.  For physical devices, the result of
2493  * device_get_name(dev) is a good choice and for pseudo-devices a
2494  * static string works well.
2495  */
2496 void
2497 if_initname(struct ifnet *ifp, const char *name, int unit)
2498 {
2499         ifp->if_dname = name;
2500         ifp->if_dunit = unit;
2501         if (unit != IF_DUNIT_NONE)
2502                 ksnprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
2503         else
2504                 strlcpy(ifp->if_xname, name, IFNAMSIZ);
2505 }
2506
2507 int
2508 if_printf(struct ifnet *ifp, const char *fmt, ...)
2509 {
2510         __va_list ap;
2511         int retval;
2512
2513         retval = kprintf("%s: ", ifp->if_xname);
2514         __va_start(ap, fmt);
2515         retval += kvprintf(fmt, ap);
2516         __va_end(ap);
2517         return (retval);
2518 }
2519
2520 struct ifnet *
2521 if_alloc(uint8_t type)
2522 {
2523         struct ifnet *ifp;
2524         size_t size;
2525
2526         /*
2527          * XXX temporary hack until arpcom is setup in if_l2com
2528          */
2529         if (type == IFT_ETHER)
2530                 size = sizeof(struct arpcom);
2531         else
2532                 size = sizeof(struct ifnet);
2533
2534         ifp = kmalloc(size, M_IFNET, M_WAITOK|M_ZERO);
2535
2536         ifp->if_type = type;
2537
2538         if (if_com_alloc[type] != NULL) {
2539                 ifp->if_l2com = if_com_alloc[type](type, ifp);
2540                 if (ifp->if_l2com == NULL) {
2541                         kfree(ifp, M_IFNET);
2542                         return (NULL);
2543                 }
2544         }
2545         return (ifp);
2546 }
2547
2548 void
2549 if_free(struct ifnet *ifp)
2550 {
2551         kfree(ifp, M_IFNET);
2552 }
2553
2554 void
2555 ifq_set_classic(struct ifaltq *ifq)
2556 {
2557         ifq_set_methods(ifq, ifq->altq_ifp->if_mapsubq,
2558             ifsq_classic_enqueue, ifsq_classic_dequeue, ifsq_classic_request);
2559 }
2560
2561 void
2562 ifq_set_methods(struct ifaltq *ifq, altq_mapsubq_t mapsubq,
2563     ifsq_enqueue_t enqueue, ifsq_dequeue_t dequeue, ifsq_request_t request)
2564 {
2565         int q;
2566
2567         KASSERT(mapsubq != NULL, ("mapsubq is not specified"));
2568         KASSERT(enqueue != NULL, ("enqueue is not specified"));
2569         KASSERT(dequeue != NULL, ("dequeue is not specified"));
2570         KASSERT(request != NULL, ("request is not specified"));
2571
2572         ifq->altq_mapsubq = mapsubq;
2573         for (q = 0; q < ifq->altq_subq_cnt; ++q) {
2574                 struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
2575
2576                 ifsq->ifsq_enqueue = enqueue;
2577                 ifsq->ifsq_dequeue = dequeue;
2578                 ifsq->ifsq_request = request;
2579         }
2580 }
2581
2582 static void
2583 ifsq_norm_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
2584 {
2585         m->m_nextpkt = NULL;
2586         if (ifsq->ifsq_norm_tail == NULL)
2587                 ifsq->ifsq_norm_head = m;
2588         else
2589                 ifsq->ifsq_norm_tail->m_nextpkt = m;
2590         ifsq->ifsq_norm_tail = m;
2591         ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
2592 }
2593
2594 static void
2595 ifsq_prio_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
2596 {
2597         m->m_nextpkt = NULL;
2598         if (ifsq->ifsq_prio_tail == NULL)
2599                 ifsq->ifsq_prio_head = m;
2600         else
2601                 ifsq->ifsq_prio_tail->m_nextpkt = m;
2602         ifsq->ifsq_prio_tail = m;
2603         ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
2604         ALTQ_SQ_PRIO_CNTR_INC(ifsq, m->m_pkthdr.len);
2605 }
2606
2607 static struct mbuf *
2608 ifsq_norm_dequeue(struct ifaltq_subque *ifsq)
2609 {
2610         struct mbuf *m;
2611
2612         m = ifsq->ifsq_norm_head;
2613         if (m != NULL) {
2614                 if ((ifsq->ifsq_norm_head = m->m_nextpkt) == NULL)
2615                         ifsq->ifsq_norm_tail = NULL;
2616                 m->m_nextpkt = NULL;
2617                 ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
2618         }
2619         return m;
2620 }
2621
2622 static struct mbuf *
2623 ifsq_prio_dequeue(struct ifaltq_subque *ifsq)
2624 {
2625         struct mbuf *m;
2626
2627         m = ifsq->ifsq_prio_head;
2628         if (m != NULL) {
2629                 if ((ifsq->ifsq_prio_head = m->m_nextpkt) == NULL)
2630                         ifsq->ifsq_prio_tail = NULL;
2631                 m->m_nextpkt = NULL;
2632                 ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
2633                 ALTQ_SQ_PRIO_CNTR_DEC(ifsq, m->m_pkthdr.len);
2634         }
2635         return m;
2636 }
2637
2638 int
2639 ifsq_classic_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m,
2640     struct altq_pktattr *pa __unused)
2641 {
2642         M_ASSERTPKTHDR(m);
2643         if (ifsq->ifsq_len >= ifsq->ifsq_maxlen ||
2644             ifsq->ifsq_bcnt >= ifsq->ifsq_maxbcnt) {
2645                 if ((m->m_flags & M_PRIO) &&
2646                     ifsq->ifsq_prio_len < (ifsq->ifsq_maxlen / 2) &&
2647                     ifsq->ifsq_prio_bcnt < (ifsq->ifsq_maxbcnt / 2)) {
2648                         struct mbuf *m_drop;
2649
2650                         /*
2651                          * Perform drop-head on normal queue
2652                          */
2653                         m_drop = ifsq_norm_dequeue(ifsq);
2654                         if (m_drop != NULL) {
2655                                 m_freem(m_drop);
2656                                 ifsq_prio_enqueue(ifsq, m);
2657                                 return 0;
2658                         }
2659                         /* XXX nothing could be dropped? */
2660                 }
2661                 m_freem(m);
2662                 return ENOBUFS;
2663         } else {
2664                 if (m->m_flags & M_PRIO)
2665                         ifsq_prio_enqueue(ifsq, m);
2666                 else
2667                         ifsq_norm_enqueue(ifsq, m);
2668                 return 0;
2669         }
2670 }
2671
2672 struct mbuf *
2673 ifsq_classic_dequeue(struct ifaltq_subque *ifsq, int op)
2674 {
2675         struct mbuf *m;
2676
2677         switch (op) {
2678         case ALTDQ_POLL:
2679                 m = ifsq->ifsq_prio_head;
2680                 if (m == NULL)
2681                         m = ifsq->ifsq_norm_head;
2682                 break;
2683
2684         case ALTDQ_REMOVE:
2685                 m = ifsq_prio_dequeue(ifsq);
2686                 if (m == NULL)
2687                         m = ifsq_norm_dequeue(ifsq);
2688                 break;
2689
2690         default:
2691                 panic("unsupported ALTQ dequeue op: %d", op);
2692         }
2693         return m;
2694 }
2695
2696 int
2697 ifsq_classic_request(struct ifaltq_subque *ifsq, int req, void *arg)
2698 {
2699         switch (req) {
2700         case ALTRQ_PURGE:
2701                 for (;;) {
2702                         struct mbuf *m;
2703
2704                         m = ifsq_classic_dequeue(ifsq, ALTDQ_REMOVE);
2705                         if (m == NULL)
2706                                 break;
2707                         m_freem(m);
2708                 }
2709                 break;
2710
2711         default:
2712                 panic("unsupported ALTQ request: %d", req);
2713         }
2714         return 0;
2715 }
2716
2717 static void
2718 ifsq_ifstart_try(struct ifaltq_subque *ifsq, int force_sched)
2719 {
2720         struct ifnet *ifp = ifsq_get_ifp(ifsq);
2721         int running = 0, need_sched;
2722
2723         /*
2724          * Try to do direct ifnet.if_start on the subqueue first, if there is
2725          * contention on the subqueue hardware serializer, ifnet.if_start on
2726          * the subqueue will be scheduled on the subqueue owner CPU.
2727          */
2728         if (!ifsq_tryserialize_hw(ifsq)) {
2729                 /*
2730                  * Subqueue hardware serializer contention happened,
2731                  * ifnet.if_start on the subqueue is scheduled on
2732                  * the subqueue owner CPU, and we keep going.
2733                  */
2734                 ifsq_ifstart_schedule(ifsq, 1);
2735                 return;
2736         }
2737
2738         if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
2739                 ifp->if_start(ifp, ifsq);
2740                 if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
2741                         running = 1;
2742         }
2743         need_sched = ifsq_ifstart_need_schedule(ifsq, running);
2744
2745         ifsq_deserialize_hw(ifsq);
2746
2747         if (need_sched) {
2748                 /*
2749                  * More data need to be transmitted, ifnet.if_start on the
2750                  * subqueue is scheduled on the subqueue owner CPU, and we
2751                  * keep going.
2752                  * NOTE: ifnet.if_start subqueue interlock is not released.
2753                  */
2754                 ifsq_ifstart_schedule(ifsq, force_sched);
2755         }
2756 }
2757
2758 /*
 * Subqueue packets staging mechanism:
2760  *
2761  * The packets enqueued into the subqueue are staged to a certain amount
2762  * before the ifnet.if_start on the subqueue is called.  In this way, the
2763  * driver could avoid writing to hardware registers upon every packet,
2764  * instead, hardware registers could be written when certain amount of
2765  * packets are put onto hardware TX ring.  The measurement on several modern
2766  * NICs (emx(4), igb(4), bnx(4), bge(4), jme(4)) shows that the hardware
2767  * registers writing aggregation could save ~20% CPU time when 18bytes UDP
2768  * datagrams are transmitted at 1.48Mpps.  The performance improvement by
 * hardware registers writing aggregation is also mentioned by Luigi Rizzo's
2770  * netmap paper (http://info.iet.unipi.it/~luigi/netmap/).
2771  *
2772  * Subqueue packets staging is performed for two entry points into drivers'
2773  * transmission function:
2774  * - Direct ifnet.if_start calling on the subqueue, i.e. ifsq_ifstart_try()
2775  * - ifnet.if_start scheduling on the subqueue, i.e. ifsq_ifstart_schedule()
2776  *
2777  * Subqueue packets staging will be stopped upon any of the following
2778  * conditions:
 * - If the count of packets enqueued on the current CPU is greater than or
 *   equal to ifsq_stage_cntmax. (XXX this should be per-interface)
 * - If the total length of packets enqueued on the current CPU is greater
 *   than or equal to the hardware's MTU - max_protohdr.  max_protohdr is
 *   cut from the hardware's MTU mainly because a full TCP segment's size
 *   is usually less than hardware's MTU.
2785  * - ifsq_ifstart_schedule() is not pending on the current CPU and
2786  *   ifnet.if_start subqueue interlock (ifaltq_subq.ifsq_started) is not
2787  *   released.
2788  * - The if_start_rollup(), which is registered as low priority netisr
2789  *   rollup function, is called; probably because no more work is pending
2790  *   for netisr.
2791  *
2792  * NOTE:
2793  * Currently subqueue packet staging is only performed in netisr threads.
2794  */
2795 int
2796 ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
2797 {
2798         struct ifaltq *ifq = &ifp->if_snd;
2799         struct ifaltq_subque *ifsq;
2800         int error, start = 0, len, mcast = 0, avoid_start = 0;
2801         struct ifsubq_stage_head *head = NULL;
2802         struct ifsubq_stage *stage = NULL;
2803         struct globaldata *gd = mycpu;
2804         struct thread *td = gd->gd_curthread;
2805
2806         crit_enter_quick(td);
2807
2808         ifsq = ifq_map_subq(ifq, gd->gd_cpuid);
2809         ASSERT_ALTQ_SQ_NOT_SERIALIZED_HW(ifsq);
2810
2811         len = m->m_pkthdr.len;
2812         if (m->m_flags & M_MCAST)
2813                 mcast = 1;
2814
2815         if (td->td_type == TD_TYPE_NETISR) {
2816                 head = &ifsubq_stage_heads[mycpuid];
2817                 stage = ifsq_get_stage(ifsq, mycpuid);
2818
2819                 stage->stg_cnt++;
2820                 stage->stg_len += len;
2821                 if (stage->stg_cnt < ifsq_stage_cntmax &&
2822                     stage->stg_len < (ifp->if_mtu - max_protohdr))
2823                         avoid_start = 1;
2824         }
2825
2826         ALTQ_SQ_LOCK(ifsq);
2827         error = ifsq_enqueue_locked(ifsq, m, pa);
2828         if (error) {
2829                 if (!ifsq_data_ready(ifsq)) {
2830                         ALTQ_SQ_UNLOCK(ifsq);
2831                         crit_exit_quick(td);
2832                         return error;
2833                 }
2834                 avoid_start = 0;
2835         }
2836         if (!ifsq_is_started(ifsq)) {
2837                 if (avoid_start) {
2838                         ALTQ_SQ_UNLOCK(ifsq);
2839
2840                         KKASSERT(!error);
2841                         if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
2842                                 ifsq_stage_insert(head, stage);
2843
2844                         IFNET_STAT_INC(ifp, obytes, len);
2845                         if (mcast)
2846                                 IFNET_STAT_INC(ifp, omcasts, 1);
2847                         crit_exit_quick(td);
2848                         return error;
2849                 }
2850
2851                 /*
2852                  * Hold the subqueue interlock of ifnet.if_start
2853                  */
2854                 ifsq_set_started(ifsq);
2855                 start = 1;
2856         }
2857         ALTQ_SQ_UNLOCK(ifsq);
2858
2859         if (!error) {
2860                 IFNET_STAT_INC(ifp, obytes, len);
2861                 if (mcast)
2862                         IFNET_STAT_INC(ifp, omcasts, 1);
2863         }
2864
2865         if (stage != NULL) {
2866                 if (!start && (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)) {
2867                         KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
2868                         if (!avoid_start) {
2869                                 ifsq_stage_remove(head, stage);
2870                                 ifsq_ifstart_schedule(ifsq, 1);
2871                         }
2872                         crit_exit_quick(td);
2873                         return error;
2874                 }
2875
2876                 if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED) {
2877                         ifsq_stage_remove(head, stage);
2878                 } else {
2879                         stage->stg_cnt = 0;
2880                         stage->stg_len = 0;
2881                 }
2882         }
2883
2884         if (!start) {
2885                 crit_exit_quick(td);
2886                 return error;
2887         }
2888
2889         ifsq_ifstart_try(ifsq, 0);
2890
2891         crit_exit_quick(td);
2892         return error;
2893 }
2894
2895 void *
2896 ifa_create(int size, int flags)
2897 {
2898         struct ifaddr *ifa;
2899         int i;
2900
2901         KASSERT(size >= sizeof(*ifa), ("ifaddr size too small"));
2902
2903         ifa = kmalloc(size, M_IFADDR, flags | M_ZERO);
2904         if (ifa == NULL)
2905                 return NULL;
2906
2907         ifa->ifa_containers =
2908             kmalloc_cachealign(ncpus * sizeof(struct ifaddr_container),
2909                 M_IFADDR, M_WAITOK | M_ZERO);
2910         ifa->ifa_ncnt = ncpus;
2911         for (i = 0; i < ncpus; ++i) {
2912                 struct ifaddr_container *ifac = &ifa->ifa_containers[i];
2913
2914                 ifac->ifa_magic = IFA_CONTAINER_MAGIC;
2915                 ifac->ifa = ifa;
2916                 ifac->ifa_refcnt = 1;
2917         }
2918 #ifdef IFADDR_DEBUG
2919         kprintf("alloc ifa %p %d\n", ifa, size);
2920 #endif
2921         return ifa;
2922 }
2923
2924 void
2925 ifac_free(struct ifaddr_container *ifac, int cpu_id)
2926 {
2927         struct ifaddr *ifa = ifac->ifa;
2928
2929         KKASSERT(ifac->ifa_magic == IFA_CONTAINER_MAGIC);
2930         KKASSERT(ifac->ifa_refcnt == 0);
2931         KASSERT(ifac->ifa_listmask == 0,
2932                 ("ifa is still on %#x lists", ifac->ifa_listmask));
2933
2934         ifac->ifa_magic = IFA_CONTAINER_DEAD;
2935
2936 #ifdef IFADDR_DEBUG_VERBOSE
2937         kprintf("try free ifa %p cpu_id %d\n", ifac->ifa, cpu_id);
2938 #endif
2939
2940         KASSERT(ifa->ifa_ncnt > 0 && ifa->ifa_ncnt <= ncpus,
2941                 ("invalid # of ifac, %d", ifa->ifa_ncnt));
2942         if (atomic_fetchadd_int(&ifa->ifa_ncnt, -1) == 1) {
2943 #ifdef IFADDR_DEBUG
2944                 kprintf("free ifa %p\n", ifa);
2945 #endif
2946                 kfree(ifa->ifa_containers, M_IFADDR);
2947                 kfree(ifa, M_IFADDR);
2948         }
2949 }
2950
2951 static void
2952 ifa_iflink_dispatch(netmsg_t nmsg)
2953 {
2954         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2955         struct ifaddr *ifa = msg->ifa;
2956         struct ifnet *ifp = msg->ifp;
2957         int cpu = mycpuid;
2958         struct ifaddr_container *ifac;
2959
2960         crit_enter();
2961
2962         ifac = &ifa->ifa_containers[cpu];
2963         ASSERT_IFAC_VALID(ifac);
2964         KASSERT((ifac->ifa_listmask & IFA_LIST_IFADDRHEAD) == 0,
2965                 ("ifaddr is on if_addrheads"));
2966
2967         ifac->ifa_listmask |= IFA_LIST_IFADDRHEAD;
2968         if (msg->tail)
2969                 TAILQ_INSERT_TAIL(&ifp->if_addrheads[cpu], ifac, ifa_link);
2970         else
2971                 TAILQ_INSERT_HEAD(&ifp->if_addrheads[cpu], ifac, ifa_link);
2972
2973         crit_exit();
2974
2975         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2976 }
2977
2978 void
2979 ifa_iflink(struct ifaddr *ifa, struct ifnet *ifp, int tail)
2980 {
2981         struct netmsg_ifaddr msg;
2982
2983         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2984                     0, ifa_iflink_dispatch);
2985         msg.ifa = ifa;
2986         msg.ifp = ifp;
2987         msg.tail = tail;
2988
2989         ifa_domsg(&msg.base.lmsg, 0);
2990 }
2991
2992 static void
2993 ifa_ifunlink_dispatch(netmsg_t nmsg)
2994 {
2995         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2996         struct ifaddr *ifa = msg->ifa;
2997         struct ifnet *ifp = msg->ifp;
2998         int cpu = mycpuid;
2999         struct ifaddr_container *ifac;
3000
3001         crit_enter();
3002
3003         ifac = &ifa->ifa_containers[cpu];
3004         ASSERT_IFAC_VALID(ifac);
3005         KASSERT(ifac->ifa_listmask & IFA_LIST_IFADDRHEAD,
3006                 ("ifaddr is not on if_addrhead"));
3007
3008         TAILQ_REMOVE(&ifp->if_addrheads[cpu], ifac, ifa_link);
3009         ifac->ifa_listmask &= ~IFA_LIST_IFADDRHEAD;
3010
3011         crit_exit();
3012
3013         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
3014 }
3015
3016 void
3017 ifa_ifunlink(struct ifaddr *ifa, struct ifnet *ifp)
3018 {
3019         struct netmsg_ifaddr msg;
3020
3021         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3022                     0, ifa_ifunlink_dispatch);
3023         msg.ifa = ifa;
3024         msg.ifp = ifp;
3025
3026         ifa_domsg(&msg.base.lmsg, 0);
3027 }
3028
3029 static void
3030 ifa_destroy_dispatch(netmsg_t nmsg)
3031 {
3032         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
3033
3034         IFAFREE(msg->ifa);
3035         ifa_forwardmsg(&nmsg->lmsg, mycpuid + 1);
3036 }
3037
3038 void
3039 ifa_destroy(struct ifaddr *ifa)
3040 {
3041         struct netmsg_ifaddr msg;
3042
3043         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3044                     0, ifa_destroy_dispatch);
3045         msg.ifa = ifa;
3046
3047         ifa_domsg(&msg.base.lmsg, 0);
3048 }
3049
3050 struct lwkt_port *
3051 ifnet_portfn(int cpu)
3052 {
3053         return &ifnet_threads[cpu].td_msgport;
3054 }
3055
3056 void
3057 ifnet_forwardmsg(struct lwkt_msg *lmsg, int next_cpu)
3058 {
3059         KKASSERT(next_cpu > mycpuid && next_cpu <= ncpus);
3060
3061         if (next_cpu < ncpus)
3062                 lwkt_forwardmsg(ifnet_portfn(next_cpu), lmsg);
3063         else
3064                 lwkt_replymsg(lmsg, 0);
3065 }
3066
3067 int
3068 ifnet_domsg(struct lwkt_msg *lmsg, int cpu)
3069 {
3070         KKASSERT(cpu < ncpus);
3071         return lwkt_domsg(ifnet_portfn(cpu), lmsg, 0);
3072 }
3073
3074 void
3075 ifnet_sendmsg(struct lwkt_msg *lmsg, int cpu)
3076 {
3077         KKASSERT(cpu < ncpus);
3078         lwkt_sendmsg(ifnet_portfn(cpu), lmsg);
3079 }
3080
3081 /*
3082  * Generic netmsg service loop.  Some protocols may roll their own but all
3083  * must do the basic command dispatch function call done here.
3084  */
3085 static void
3086 ifnet_service_loop(void *arg __unused)
3087 {
3088         netmsg_t msg;
3089
3090         while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) {
3091                 KASSERT(msg->base.nm_dispatch, ("ifnet_service: badmsg"));
3092                 msg->base.nm_dispatch(msg);
3093         }
3094 }
3095
3096 static void
3097 if_start_rollup(void)
3098 {
3099         struct ifsubq_stage_head *head = &ifsubq_stage_heads[mycpuid];
3100         struct ifsubq_stage *stage;
3101
3102         crit_enter();
3103
3104         while ((stage = TAILQ_FIRST(&head->stg_head)) != NULL) {
3105                 struct ifaltq_subque *ifsq = stage->stg_subq;
3106                 int is_sched = 0;
3107
3108                 if (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)
3109                         is_sched = 1;
3110                 ifsq_stage_remove(head, stage);
3111
3112                 if (is_sched) {
3113                         ifsq_ifstart_schedule(ifsq, 1);
3114                 } else {
3115                         int start = 0;
3116
3117                         ALTQ_SQ_LOCK(ifsq);
3118                         if (!ifsq_is_started(ifsq)) {
3119                                 /*
3120                                  * Hold the subqueue interlock of
3121                                  * ifnet.if_start
3122                                  */
3123                                 ifsq_set_started(ifsq);
3124                                 start = 1;
3125                         }
3126                         ALTQ_SQ_UNLOCK(ifsq);
3127
3128                         if (start)
3129                                 ifsq_ifstart_try(ifsq, 1);
3130                 }
3131                 KKASSERT((stage->stg_flags &
3132                     (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
3133         }
3134
3135         crit_exit();
3136 }
3137
3138 static void
3139 ifnetinit(void *dummy __unused)
3140 {
3141         int i;
3142
3143         for (i = 0; i < ncpus; ++i) {
3144                 struct thread *thr = &ifnet_threads[i];
3145
3146                 lwkt_create(ifnet_service_loop, NULL, NULL,
3147                             thr, TDF_NOSTART|TDF_FORCE_SPINPORT|TDF_FIXEDCPU,
3148                             i, "ifnet %d", i);
3149                 netmsg_service_port_init(&thr->td_msgport);
3150                 lwkt_schedule(thr);
3151         }
3152
3153         for (i = 0; i < ncpus; ++i)
3154                 TAILQ_INIT(&ifsubq_stage_heads[i].stg_head);
3155         netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
3156 }
3157
3158 void
3159 if_register_com_alloc(u_char type,
3160     if_com_alloc_t *a, if_com_free_t *f)
3161 {
3162
3163         KASSERT(if_com_alloc[type] == NULL,
3164             ("if_register_com_alloc: %d already registered", type));
3165         KASSERT(if_com_free[type] == NULL,
3166             ("if_register_com_alloc: %d free already registered", type));
3167
3168         if_com_alloc[type] = a;
3169         if_com_free[type] = f;
3170 }
3171
3172 void
3173 if_deregister_com_alloc(u_char type)
3174 {
3175
3176         KASSERT(if_com_alloc[type] != NULL,
3177             ("if_deregister_com_alloc: %d not registered", type));
3178         KASSERT(if_com_free[type] != NULL,
3179             ("if_deregister_com_alloc: %d free not registered", type));
3180         if_com_alloc[type] = NULL;
3181         if_com_free[type] = NULL;
3182 }
3183
3184 int
3185 if_ring_count2(int cnt, int cnt_max)
3186 {
3187         int shift = 0;
3188
3189         KASSERT(cnt_max >= 1 && powerof2(cnt_max),
3190             ("invalid ring count max %d", cnt_max));
3191
3192         if (cnt <= 0)
3193                 cnt = cnt_max;
3194         if (cnt > ncpus2)
3195                 cnt = ncpus2;
3196         if (cnt > cnt_max)
3197                 cnt = cnt_max;
3198
3199         while ((1 << (shift + 1)) <= cnt)
3200                 ++shift;
3201         cnt = 1 << shift;
3202
3203         KASSERT(cnt >= 1 && cnt <= ncpus2 && cnt <= cnt_max,
3204             ("calculate cnt %d, ncpus2 %d, cnt max %d",
3205              cnt, ncpus2, cnt_max));
3206         return cnt;
3207 }
3208
3209 void
3210 ifq_set_maxlen(struct ifaltq *ifq, int len)
3211 {
3212         ifq->altq_maxlen = len + (ncpus * ifsq_stage_cntmax);
3213 }
3214
3215 int
3216 ifq_mapsubq_default(struct ifaltq *ifq __unused, int cpuid __unused)
3217 {
3218         return ALTQ_SUBQ_INDEX_DEFAULT;
3219 }
3220
3221 int
3222 ifq_mapsubq_mask(struct ifaltq *ifq, int cpuid)
3223 {
3224         return (cpuid & ifq->altq_subq_mask);
3225 }
3226
3227 static void
3228 ifsq_watchdog(void *arg)
3229 {
3230         struct ifsubq_watchdog *wd = arg;
3231         struct ifnet *ifp;
3232
3233         if (__predict_true(wd->wd_timer == 0 || --wd->wd_timer))
3234                 goto done;
3235
3236         ifp = ifsq_get_ifp(wd->wd_subq);
3237         if (ifnet_tryserialize_all(ifp)) {
3238                 wd->wd_watchdog(wd->wd_subq);
3239                 ifnet_deserialize_all(ifp);
3240         } else {
3241                 /* try again next timeout */
3242                 wd->wd_timer = 1;
3243         }
3244 done:
3245         ifsq_watchdog_reset(wd);
3246 }
3247
3248 static void
3249 ifsq_watchdog_reset(struct ifsubq_watchdog *wd)
3250 {
3251         callout_reset_bycpu(&wd->wd_callout, hz, ifsq_watchdog, wd,
3252             ifsq_get_cpuid(wd->wd_subq));
3253 }
3254
3255 void
3256 ifsq_watchdog_init(struct ifsubq_watchdog *wd, struct ifaltq_subque *ifsq,
3257     ifsq_watchdog_t watchdog)
3258 {
3259         callout_init_mp(&wd->wd_callout);
3260         wd->wd_timer = 0;
3261         wd->wd_subq = ifsq;
3262         wd->wd_watchdog = watchdog;
3263 }
3264
3265 void
3266 ifsq_watchdog_start(struct ifsubq_watchdog *wd)
3267 {
3268         wd->wd_timer = 0;
3269         ifsq_watchdog_reset(wd);
3270 }
3271
3272 void
3273 ifsq_watchdog_stop(struct ifsubq_watchdog *wd)
3274 {
3275         wd->wd_timer = 0;
3276         callout_stop(&wd->wd_callout);
3277 }