net: import FreeBSD's if_lagg
[dragonfly.git] / sys / net / if.c
1 /*
2  * Copyright (c) 1980, 1986, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *      @(#)if.c        8.3 (Berkeley) 1/4/94
30  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
31  */
32
33 #include "opt_compat.h"
34 #include "opt_inet6.h"
35 #include "opt_inet.h"
36 #include "opt_ifpoll.h"
37
38 #include <sys/param.h>
39 #include <sys/malloc.h>
40 #include <sys/mbuf.h>
41 #include <sys/systm.h>
42 #include <sys/proc.h>
43 #include <sys/priv.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/socketops.h>
48 #include <sys/kernel.h>
49 #include <sys/ktr.h>
50 #include <sys/mutex.h>
51 #include <sys/sockio.h>
52 #include <sys/syslog.h>
53 #include <sys/sysctl.h>
54 #include <sys/domain.h>
55 #include <sys/thread.h>
56 #include <sys/serialize.h>
57 #include <sys/bus.h>
58
59 #include <sys/thread2.h>
60 #include <sys/msgport2.h>
61 #include <sys/mutex2.h>
62
63 #include <net/if.h>
64 #include <net/if_arp.h>
65 #include <net/if_dl.h>
66 #include <net/if_types.h>
67 #include <net/if_var.h>
68 #include <net/ifq_var.h>
69 #include <net/radix.h>
70 #include <net/route.h>
71 #include <net/if_clone.h>
72 #include <net/netisr2.h>
73 #include <net/netmsg2.h>
74
75 #include <machine/atomic.h>
76 #include <machine/stdarg.h>
77 #include <machine/smp.h>
78
79 #if defined(INET) || defined(INET6)
80 /*XXX*/
81 #include <netinet/in.h>
82 #include <netinet/in_var.h>
83 #include <netinet/if_ether.h>
84 #ifdef INET6
85 #include <netinet6/in6_var.h>
86 #include <netinet6/in6_ifattach.h>
87 #endif
88 #endif
89
90 #if defined(COMPAT_43)
91 #include <emulation/43bsd/43bsd_socket.h>
92 #endif /* COMPAT_43 */
93
/*
 * Network message that carries an interface address together with the
 * interface it belongs to.
 * NOTE(review): the handlers consuming this message are not visible in
 * this part of the file; 'tail' presumably selects head vs. tail
 * insertion -- confirm against its users.
 */
94 struct netmsg_ifaddr {
95         struct netmsg_base base;        /* embedded netmsg header */
96         struct ifaddr   *ifa;           /* interface address */
97         struct ifnet    *ifp;           /* interface */
98         int             tail;
99 };
100
/*
 * Cache-aligned head of a list of subqueue stages.  This file keeps one
 * such head per CPU in ifsubq_stage_heads[] and indexes it by mycpuid.
 */
101 struct ifsubq_stage_head {
102         TAILQ_HEAD(, ifsubq_stage)      stg_head;       /* linked through stg_link */
103 } __cachealign;
104
105 /*
106  * System initialization
 *
 * Forward declarations of file-local routines.
107  */
108 static void     if_attachdomain(void *);
109 static void     if_attachdomain1(struct ifnet *);
110 static int      ifconf(u_long, caddr_t, struct ucred *);
111 static void     ifinit(void *);
112 static void     ifnetinit(void *);
113 static void     if_slowtimo(void *);
114 static void     link_rtrequest(int, struct rtentry *);
115 static int      if_rtdel(struct radix_node *, void *);
116
117 /* Helper functions */
118 static void     ifsq_watchdog_reset(struct ifsubq_watchdog *);
119 static int      if_delmulti_serialized(struct ifnet *, struct sockaddr *);
120
121 #ifdef INET6
122 /*
123  * XXX: declared here to avoid including many inet6 related files.
124  * Should this be more generalized?
125  */
126 extern void     nd6_setmtu(struct ifnet *);
127 #endif
128
128
/* sysctl tree: net.link and net.link.generic */
129 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
130 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
131
/*
 * net.link.stage_cntmax (tunable and read-write sysctl).
 * ifsq_ifstart_schedule() only uses the per-CPU staging path while this
 * value is greater than zero.
 */
132 static int ifsq_stage_cntmax = 4;
133 TUNABLE_INT("net.link.stage_cntmax", &ifsq_stage_cntmax);
134 SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
135     &ifsq_stage_cntmax, 0, "ifq staging packet count max");
136
/* net.link.stats_compat (read-write sysctl), off by default */
137 static int if_stats_compat = 0;
138 SYSCTL_INT(_net_link, OID_AUTO, stats_compat, CTLFLAG_RW,
139     &if_stats_compat, 0, "Compat the old ifnet stats");
140
/* Register ifinit() and ifnetinit() as boot-time initializers */
141 SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL)
142 /* Must be after netisr_init */
143 SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_SECOND, ifnetinit, NULL)
144
/*
 * Tables of if_com allocation/free callbacks, 256 slots each.
 * NOTE(review): presumably indexed by interface type -- the code that
 * fills and reads them is not visible here; confirm.
 */
145 static  if_com_alloc_t *if_com_alloc[256];
146 static  if_com_free_t *if_com_free[256];
147
/* Kernel malloc types defined by this file */
148 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
149 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
150 MALLOC_DEFINE(M_IFNET, "ifnet", "interface structure");
151
/* Default send queue length, applied when a driver leaves altq_maxlen 0 */
152 int                     ifqmaxlen = IFQ_MAXLEN;
/* List of attached interfaces; if_attach() appends, if_detach() removes */
153 struct ifnethead        ifnet = TAILQ_HEAD_INITIALIZER(ifnet);
154
/* Callout initialized by ifinit() */
155 struct callout          if_slowtimo_timer;
156
/* Highest interface index in use; if_attach() pre-increments it */
157 int                     if_index = 0;
/* Interface index -> ifnet table; allocated and grown by if_attach() */
158 struct ifnet            **ifindex2ifnet = NULL;
159 static struct thread    ifnet_threads[MAXCPU];
160
/* Per-CPU lists of staged subqueues, indexed by mycpuid */
161 static struct ifsubq_stage_head ifsubq_stage_heads[MAXCPU];
162
/*
 * KTR trace points for ifq enqueue/dequeue and for if_start scheduling.
 * The whole section is compiled only when 'notyet' is defined.
 */
163 #ifdef notyet
164 #define IFQ_KTR_STRING          "ifq=%p"
165 #define IFQ_KTR_ARGS    struct ifaltq *ifq
166 #ifndef KTR_IFQ
167 #define KTR_IFQ                 KTR_ALL
168 #endif
169 KTR_INFO_MASTER(ifq);
170 KTR_INFO(KTR_IFQ, ifq, enqueue, 0, IFQ_KTR_STRING, IFQ_KTR_ARGS);
171 KTR_INFO(KTR_IFQ, ifq, dequeue, 1, IFQ_KTR_STRING, IFQ_KTR_ARGS);
172 #define logifq(name, arg)       KTR_LOG(ifq_ ## name, arg)
173
174 #define IF_START_KTR_STRING     "ifp=%p"
175 #define IF_START_KTR_ARGS       struct ifnet *ifp
176 #ifndef KTR_IF_START
177 #define KTR_IF_START            KTR_ALL
178 #endif
179 KTR_INFO_MASTER(if_start);
180 KTR_INFO(KTR_IF_START, if_start, run, 0,
181          IF_START_KTR_STRING, IF_START_KTR_ARGS);
182 KTR_INFO(KTR_IF_START, if_start, sched, 1,
183          IF_START_KTR_STRING, IF_START_KTR_ARGS);
184 KTR_INFO(KTR_IF_START, if_start, avoid, 2,
185          IF_START_KTR_STRING, IF_START_KTR_ARGS);
186 KTR_INFO(KTR_IF_START, if_start, contend_sched, 3,
187          IF_START_KTR_STRING, IF_START_KTR_ARGS);
188 KTR_INFO(KTR_IF_START, if_start, chase_sched, 4,
189          IF_START_KTR_STRING, IF_START_KTR_ARGS);
190 #define logifstart(name, arg)   KTR_LOG(if_start_ ## name, arg)
191 #endif
192
/* List of all interface groups; if_creategroup() appends new groups here */
193 TAILQ_HEAD(, ifg_group) ifg_head = TAILQ_HEAD_INITIALIZER(ifg_head);
194
195 /*
196  * Network interface utility routines.
197  *
198  * Routines with ifa_ifwith* names take sockaddr *'s as
199  * parameters.
200  */
201 /* ARGSUSED*/
202 void
203 ifinit(void *dummy)
204 {
205         struct ifnet *ifp;
206
207         callout_init(&if_slowtimo_timer);
208
209         crit_enter();
210         TAILQ_FOREACH(ifp, &ifnet, if_link) {
211                 if (ifp->if_snd.altq_maxlen == 0) {
212                         if_printf(ifp, "XXX: driver didn't set altq_maxlen\n");
213                         ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
214                 }
215         }
216         crit_exit();
217
218         if_slowtimo(0);
219 }
220
221 static void
222 ifsq_ifstart_ipifunc(void *arg)
223 {
224         struct ifaltq_subque *ifsq = arg;
225         struct lwkt_msg *lmsg = ifsq_get_ifstart_lmsg(ifsq, mycpuid);
226
227         crit_enter();
228         if (lmsg->ms_flags & MSGF_DONE)
229                 lwkt_sendmsg_oncpu(netisr_cpuport(mycpuid), lmsg);
230         crit_exit();
231 }
232
233 static __inline void
234 ifsq_stage_remove(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
235 {
236         KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
237         TAILQ_REMOVE(&head->stg_head, stage, stg_link);
238         stage->stg_flags &= ~(IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED);
239         stage->stg_cnt = 0;
240         stage->stg_len = 0;
241 }
242
243 static __inline void
244 ifsq_stage_insert(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
245 {
246         KKASSERT((stage->stg_flags &
247             (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
248         stage->stg_flags |= IFSQ_STAGE_FLAG_QUED;
249         TAILQ_INSERT_TAIL(&head->stg_head, stage, stg_link);
250 }
251
252 /*
253  * Schedule ifnet.if_start on the subqueue owner CPU
254  */
255 static void
256 ifsq_ifstart_schedule(struct ifaltq_subque *ifsq, int force)
257 {
258         int cpu;
259
260         if (!force && curthread->td_type == TD_TYPE_NETISR &&
261             ifsq_stage_cntmax > 0) {
262                 struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
263
264                 stage->stg_cnt = 0;
265                 stage->stg_len = 0;
266                 if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
267                         ifsq_stage_insert(&ifsubq_stage_heads[mycpuid], stage);
268                 stage->stg_flags |= IFSQ_STAGE_FLAG_SCHED;
269                 return;
270         }
271
272         cpu = ifsq_get_cpuid(ifsq);
273         if (cpu != mycpuid)
274                 lwkt_send_ipiq(globaldata_find(cpu), ifsq_ifstart_ipifunc, ifsq);
275         else
276                 ifsq_ifstart_ipifunc(ifsq);
277 }
278
/*
 * Decide whether ifnet.if_start has to be scheduled again for the
 * subqueue.  'running' is non-zero when the caller found the interface
 * running and the subqueue not oactive.
 *
 * Returns 1 if another if_start run is needed, 0 otherwise.
 *
 * NOTE:
 * When 0 is returned the ifnet.if_start subqueue interlock (the
 * "started" state) has been released under the subqueue lock.
 */
static __inline int
ifsq_ifstart_need_schedule(struct ifaltq_subque *ifsq, int running)
{
        int again;

        /*
         * Fast path: hardware can take packets, the subqueue is not
         * empty and (with ALTQ) no TBR is configured.
         */
        if (running && !ifsq_is_empty(ifsq)
#ifdef ALTQ
            && ifsq->ifsq_altq->altq_tbr == NULL
#endif
        )
                return 1;

        ALTQ_SQ_LOCK(ifsq);
        /*
         * The interlock is released, if:
         * 1) Hardware can not take any packets, because
         *    o  the interface is marked down, or
         *    o  the hardware queue is full (ifsq_is_oactive).
         *    In the second case a hardware interrupt or polling(4)
         *    will call/schedule ifnet.if_start on the subqueue once
         *    the hardware queue is ready again.
         * 2) There is no packet in the subqueue.
         *    A later ifq_dispatch or ifq_handoff will call/schedule
         *    ifnet.if_start on the subqueue.
         * 3) TBR is used and does not allow further dequeueing.
         *    The TBR callout will call ifnet.if_start on the subqueue.
         */
        again = (running && ifsq_data_ready(ifsq));
        if (!again)
                ifsq_clr_started(ifsq);
        ALTQ_SQ_UNLOCK(ifsq);

        return again;
}
318
319 static void
320 ifsq_ifstart_dispatch(netmsg_t msg)
321 {
322         struct lwkt_msg *lmsg = &msg->base.lmsg;
323         struct ifaltq_subque *ifsq = lmsg->u.ms_resultp;
324         struct ifnet *ifp = ifsq_get_ifp(ifsq);
325         struct globaldata *gd = mycpu;
326         int running = 0, need_sched;
327
328         crit_enter_gd(gd);
329
330         lwkt_replymsg(lmsg, 0); /* reply ASAP */
331
332         if (gd->gd_cpuid != ifsq_get_cpuid(ifsq)) {
333                 /*
334                  * We need to chase the subqueue owner CPU change.
335                  */
336                 ifsq_ifstart_schedule(ifsq, 1);
337                 crit_exit_gd(gd);
338                 return;
339         }
340
341         ifsq_serialize_hw(ifsq);
342         if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
343                 ifp->if_start(ifp, ifsq);
344                 if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
345                         running = 1;
346         }
347         need_sched = ifsq_ifstart_need_schedule(ifsq, running);
348         ifsq_deserialize_hw(ifsq);
349
350         if (need_sched) {
351                 /*
352                  * More data need to be transmitted, ifnet.if_start is
353                  * scheduled on the subqueue owner CPU, and we keep going.
354                  * NOTE: ifnet.if_start subqueue interlock is not released.
355                  */
356                 ifsq_ifstart_schedule(ifsq, 0);
357         }
358
359         crit_exit_gd(gd);
360 }
361
362 /* Device driver ifnet.if_start helper function */
363 void
364 ifsq_devstart(struct ifaltq_subque *ifsq)
365 {
366         struct ifnet *ifp = ifsq_get_ifp(ifsq);
367         int running = 0;
368
369         ASSERT_ALTQ_SQ_SERIALIZED_HW(ifsq);
370
371         ALTQ_SQ_LOCK(ifsq);
372         if (ifsq_is_started(ifsq) || !ifsq_data_ready(ifsq)) {
373                 ALTQ_SQ_UNLOCK(ifsq);
374                 return;
375         }
376         ifsq_set_started(ifsq);
377         ALTQ_SQ_UNLOCK(ifsq);
378
379         ifp->if_start(ifp, ifsq);
380
381         if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
382                 running = 1;
383
384         if (ifsq_ifstart_need_schedule(ifsq, running)) {
385                 /*
386                  * More data need to be transmitted, ifnet.if_start is
387                  * scheduled on ifnet's CPU, and we keep going.
388                  * NOTE: ifnet.if_start interlock is not released.
389                  */
390                 ifsq_ifstart_schedule(ifsq, 0);
391         }
392 }
393
394 void
395 if_devstart(struct ifnet *ifp)
396 {
397         ifsq_devstart(ifq_get_subq_default(&ifp->if_snd));
398 }
399
/*
 * Device driver ifnet.if_start schedule helper function.
 *
 * Always uses the forced (non-staged) scheduling path.
 */
void
ifsq_devstart_sched(struct ifaltq_subque *ifsq)
{
        const int force = 1;

        ifsq_ifstart_schedule(ifsq, force);
}
406
407 void
408 if_devstart_sched(struct ifnet *ifp)
409 {
410         ifsq_devstart_sched(ifq_get_subq_default(&ifp->if_snd));
411 }
412
413 static void
414 if_default_serialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
415 {
416         lwkt_serialize_enter(ifp->if_serializer);
417 }
418
419 static void
420 if_default_deserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
421 {
422         lwkt_serialize_exit(ifp->if_serializer);
423 }
424
425 static int
426 if_default_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
427 {
428         return lwkt_serialize_try(ifp->if_serializer);
429 }
430
#ifdef INVARIANTS
/*
 * Default if_serialize_assert method (INVARIANTS kernels only): assert
 * that the interface's single serializer is held when 'serialized' is
 * true, or not held when it is false.  The serialize class argument is
 * ignored.
 */
static void
if_default_serialize_assert(struct ifnet *ifp,
                            enum ifnet_serialize slz __unused,
                            boolean_t serialized)
{
        if (!serialized) {
                ASSERT_NOT_SERIALIZED(ifp->if_serializer);
                return;
        }
        ASSERT_SERIALIZED(ifp->if_serializer);
}
#endif
443
444 /*
445  * Attach an interface to the list of "active" interfaces.
446  *
447  * The serializer is optional.
448  */
449 void
450 if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
451 {
452         unsigned socksize, ifasize;
453         int namelen, masklen;
454         struct sockaddr_dl *sdl;
455         struct ifaddr *ifa;
456         struct ifaltq *ifq;
457         int i, q;
458
459         static int if_indexlim = 8;
460
461         if (ifp->if_serialize != NULL) {
462                 KASSERT(ifp->if_deserialize != NULL &&
463                         ifp->if_tryserialize != NULL &&
464                         ifp->if_serialize_assert != NULL,
465                         ("serialize functions are partially setup"));
466
467                 /*
468                  * If the device supplies serialize functions,
469                  * then clear if_serializer to catch any invalid
470                  * usage of this field.
471                  */
472                 KASSERT(serializer == NULL,
473                         ("both serialize functions and default serializer "
474                          "are supplied"));
475                 ifp->if_serializer = NULL;
476         } else {
477                 KASSERT(ifp->if_deserialize == NULL &&
478                         ifp->if_tryserialize == NULL &&
479                         ifp->if_serialize_assert == NULL,
480                         ("serialize functions are partially setup"));
481                 ifp->if_serialize = if_default_serialize;
482                 ifp->if_deserialize = if_default_deserialize;
483                 ifp->if_tryserialize = if_default_tryserialize;
484 #ifdef INVARIANTS
485                 ifp->if_serialize_assert = if_default_serialize_assert;
486 #endif
487
488                 /*
489                  * The serializer can be passed in from the device,
490                  * allowing the same serializer to be used for both
491                  * the interrupt interlock and the device queue.
492                  * If not specified, the netif structure will use an
493                  * embedded serializer.
494                  */
495                 if (serializer == NULL) {
496                         serializer = &ifp->if_default_serializer;
497                         lwkt_serialize_init(serializer);
498                 }
499                 ifp->if_serializer = serializer;
500         }
501
502         mtx_init(&ifp->if_ioctl_mtx);
503         mtx_lock(&ifp->if_ioctl_mtx);
504
505         lwkt_gettoken(&ifnet_token);    /* protect if_index and ifnet tailq */
506         ifp->if_index = ++if_index;
507
508         /*
509          * XXX -
510          * The old code would work if the interface passed a pre-existing
511          * chain of ifaddrs to this code.  We don't trust our callers to
512          * properly initialize the tailq, however, so we no longer allow
513          * this unlikely case.
514          */
515         ifp->if_addrheads = kmalloc(ncpus * sizeof(struct ifaddrhead),
516                                     M_IFADDR, M_WAITOK | M_ZERO);
517         for (i = 0; i < ncpus; ++i)
518                 TAILQ_INIT(&ifp->if_addrheads[i]);
519
520         TAILQ_INIT(&ifp->if_prefixhead);
521         TAILQ_INIT(&ifp->if_multiaddrs);
522         TAILQ_INIT(&ifp->if_groups);
523         getmicrotime(&ifp->if_lastchange);
524         if (ifindex2ifnet == NULL || if_index >= if_indexlim) {
525                 unsigned int n;
526                 struct ifnet **q;
527
528                 if_indexlim <<= 1;
529
530                 /* grow ifindex2ifnet */
531                 n = if_indexlim * sizeof(*q);
532                 q = kmalloc(n, M_IFADDR, M_WAITOK | M_ZERO);
533                 if (ifindex2ifnet) {
534                         bcopy(ifindex2ifnet, q, n/2);
535                         kfree(ifindex2ifnet, M_IFADDR);
536                 }
537                 ifindex2ifnet = q;
538         }
539
540         ifindex2ifnet[if_index] = ifp;
541
542         /*
543          * create a Link Level name for this device
544          */
545         namelen = strlen(ifp->if_xname);
546         masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
547         socksize = masklen + ifp->if_addrlen;
548         if (socksize < sizeof(*sdl))
549                 socksize = sizeof(*sdl);
550         socksize = RT_ROUNDUP(socksize);
551         ifasize = sizeof(struct ifaddr) + 2 * socksize;
552         ifa = ifa_create(ifasize, M_WAITOK);
553         sdl = (struct sockaddr_dl *)(ifa + 1);
554         sdl->sdl_len = socksize;
555         sdl->sdl_family = AF_LINK;
556         bcopy(ifp->if_xname, sdl->sdl_data, namelen);
557         sdl->sdl_nlen = namelen;
558         sdl->sdl_index = ifp->if_index;
559         sdl->sdl_type = ifp->if_type;
560         ifp->if_lladdr = ifa;
561         ifa->ifa_ifp = ifp;
562         ifa->ifa_rtrequest = link_rtrequest;
563         ifa->ifa_addr = (struct sockaddr *)sdl;
564         sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
565         ifa->ifa_netmask = (struct sockaddr *)sdl;
566         sdl->sdl_len = masklen;
567         while (namelen != 0)
568                 sdl->sdl_data[--namelen] = 0xff;
569         ifa_iflink(ifa, ifp, 0 /* Insert head */);
570
571         ifp->if_data_pcpu = kmalloc_cachealign(
572             ncpus * sizeof(struct ifdata_pcpu), M_DEVBUF, M_WAITOK | M_ZERO);
573
574         if (ifp->if_mapsubq == NULL)
575                 ifp->if_mapsubq = ifq_mapsubq_default;
576
577         ifq = &ifp->if_snd;
578         ifq->altq_type = 0;
579         ifq->altq_disc = NULL;
580         ifq->altq_flags &= ALTQF_CANTCHANGE;
581         ifq->altq_tbr = NULL;
582         ifq->altq_ifp = ifp;
583
584         if (ifq->altq_subq_cnt <= 0)
585                 ifq->altq_subq_cnt = 1;
586         ifq->altq_subq = kmalloc_cachealign(
587             ifq->altq_subq_cnt * sizeof(struct ifaltq_subque),
588             M_DEVBUF, M_WAITOK | M_ZERO);
589
590         if (ifq->altq_maxlen == 0) {
591                 if_printf(ifp, "driver didn't set altq_maxlen\n");
592                 ifq_set_maxlen(ifq, ifqmaxlen);
593         }
594
595         for (q = 0; q < ifq->altq_subq_cnt; ++q) {
596                 struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
597
598                 ALTQ_SQ_LOCK_INIT(ifsq);
599                 ifsq->ifsq_index = q;
600
601                 ifsq->ifsq_altq = ifq;
602                 ifsq->ifsq_ifp = ifp;
603
604                 ifsq->ifsq_maxlen = ifq->altq_maxlen;
605                 ifsq->ifsq_maxbcnt = ifsq->ifsq_maxlen * MCLBYTES;
606                 ifsq->ifsq_prepended = NULL;
607                 ifsq->ifsq_started = 0;
608                 ifsq->ifsq_hw_oactive = 0;
609                 ifsq_set_cpuid(ifsq, 0);
610                 if (ifp->if_serializer != NULL)
611                         ifsq_set_hw_serialize(ifsq, ifp->if_serializer);
612
613                 ifsq->ifsq_stage =
614                     kmalloc_cachealign(ncpus * sizeof(struct ifsubq_stage),
615                     M_DEVBUF, M_WAITOK | M_ZERO);
616                 for (i = 0; i < ncpus; ++i)
617                         ifsq->ifsq_stage[i].stg_subq = ifsq;
618
619                 ifsq->ifsq_ifstart_nmsg =
620                     kmalloc(ncpus * sizeof(struct netmsg_base),
621                     M_LWKTMSG, M_WAITOK);
622                 for (i = 0; i < ncpus; ++i) {
623                         netmsg_init(&ifsq->ifsq_ifstart_nmsg[i], NULL,
624                             &netisr_adone_rport, 0, ifsq_ifstart_dispatch);
625                         ifsq->ifsq_ifstart_nmsg[i].lmsg.u.ms_resultp = ifsq;
626                 }
627         }
628         ifq_set_classic(ifq);
629
630         if (!SLIST_EMPTY(&domains))
631                 if_attachdomain1(ifp);
632
633         TAILQ_INSERT_TAIL(&ifnet, ifp, if_link);
634         lwkt_reltoken(&ifnet_token);
635
636         /* Announce the interface. */
637         EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
638         devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
639         rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
640
641         mtx_unlock(&ifp->if_ioctl_mtx);
642 }
643
644 static void
645 if_attachdomain(void *dummy)
646 {
647         struct ifnet *ifp;
648
649         crit_enter();
650         TAILQ_FOREACH(ifp, &ifnet, if_list)
651                 if_attachdomain1(ifp);
652         crit_exit();
653 }
654 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST,
655         if_attachdomain, NULL);
656
657 static void
658 if_attachdomain1(struct ifnet *ifp)
659 {
660         struct domain *dp;
661
662         crit_enter();
663
664         /* address family dependent data region */
665         bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
666         SLIST_FOREACH(dp, &domains, dom_next)
667                 if (dp->dom_ifattach)
668                         ifp->if_afdata[dp->dom_family] =
669                                 (*dp->dom_ifattach)(ifp);
670         crit_exit();
671 }
672
673 /*
674  * Purge all addresses whose type is _not_ AF_LINK
675  */
676 void
677 if_purgeaddrs_nolink(struct ifnet *ifp)
678 {
679         struct ifaddr_container *ifac, *next;
680
681         TAILQ_FOREACH_MUTABLE(ifac, &ifp->if_addrheads[mycpuid],
682                               ifa_link, next) {
683                 struct ifaddr *ifa = ifac->ifa;
684
685                 /* Leave link ifaddr as it is */
686                 if (ifa->ifa_addr->sa_family == AF_LINK)
687                         continue;
688 #ifdef INET
689                 /* XXX: Ugly!! ad hoc just for INET */
690                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
691                         struct ifaliasreq ifr;
692 #ifdef IFADDR_DEBUG_VERBOSE
693                         int i;
694
695                         kprintf("purge in4 addr %p: ", ifa);
696                         for (i = 0; i < ncpus; ++i)
697                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
698                         kprintf("\n");
699 #endif
700
701                         bzero(&ifr, sizeof ifr);
702                         ifr.ifra_addr = *ifa->ifa_addr;
703                         if (ifa->ifa_dstaddr)
704                                 ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
705                         if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
706                                        NULL) == 0)
707                                 continue;
708                 }
709 #endif /* INET */
710 #ifdef INET6
711                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6) {
712 #ifdef IFADDR_DEBUG_VERBOSE
713                         int i;
714
715                         kprintf("purge in6 addr %p: ", ifa);
716                         for (i = 0; i < ncpus; ++i)
717                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
718                         kprintf("\n");
719 #endif
720
721                         in6_purgeaddr(ifa);
722                         /* ifp_addrhead is already updated */
723                         continue;
724                 }
725 #endif /* INET6 */
726                 ifa_ifunlink(ifa, ifp);
727                 ifa_destroy(ifa);
728         }
729 }
730
731 static void
732 ifq_stage_detach_handler(netmsg_t nmsg)
733 {
734         struct ifaltq *ifq = nmsg->lmsg.u.ms_resultp;
735         int q;
736
737         for (q = 0; q < ifq->altq_subq_cnt; ++q) {
738                 struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
739                 struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
740
741                 if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED)
742                         ifsq_stage_remove(&ifsubq_stage_heads[mycpuid], stage);
743         }
744         lwkt_replymsg(&nmsg->lmsg, 0);
745 }
746
747 static void
748 ifq_stage_detach(struct ifaltq *ifq)
749 {
750         struct netmsg_base base;
751         int cpu;
752
753         netmsg_init(&base, NULL, &curthread->td_msgport, 0,
754             ifq_stage_detach_handler);
755         base.lmsg.u.ms_resultp = ifq;
756
757         for (cpu = 0; cpu < ncpus; ++cpu)
758                 lwkt_domsg(netisr_cpuport(cpu), &base.lmsg, 0);
759 }
760
/*
 * Message used by if_detach() to have if_rtdel_dispatch() walk the
 * routing tables on behalf of one interface.
 */
761 struct netmsg_if_rtdel {
762         struct netmsg_base      base;   /* embedded netmsg header */
763         struct ifnet            *ifp;   /* interface handed to if_rtdel() */
764 };
765
766 static void
767 if_rtdel_dispatch(netmsg_t msg)
768 {
769         struct netmsg_if_rtdel *rmsg = (void *)msg;
770         int i, nextcpu, cpu;
771
772         cpu = mycpuid;
773         for (i = 1; i <= AF_MAX; i++) {
774                 struct radix_node_head  *rnh;
775
776                 if ((rnh = rt_tables[cpu][i]) == NULL)
777                         continue;
778                 rnh->rnh_walktree(rnh, if_rtdel, rmsg->ifp);
779         }
780
781         nextcpu = cpu + 1;
782         if (nextcpu < ncpus)
783                 lwkt_forwardmsg(netisr_cpuport(nextcpu), &rmsg->base.lmsg);
784         else
785                 lwkt_replymsg(&rmsg->base.lmsg, 0);
786 }
787
788 /*
789  * Detach an interface, removing it from the
790  * list of "active" interfaces.
791  */
792 void
793 if_detach(struct ifnet *ifp)
794 {
795         struct netmsg_if_rtdel msg;
796         struct domain *dp;
797         int q;
798
799         EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
800
801         /*
802          * Remove routes and flush queues.
803          */
804         crit_enter();
805 #ifdef IFPOLL_ENABLE
806         if (ifp->if_flags & IFF_NPOLLING)
807                 ifpoll_deregister(ifp);
808 #endif
809         if_down(ifp);
810
811 #ifdef ALTQ
812         if (ifq_is_enabled(&ifp->if_snd))
813                 altq_disable(&ifp->if_snd);
814         if (ifq_is_attached(&ifp->if_snd))
815                 altq_detach(&ifp->if_snd);
816 #endif
817
818         /*
819          * Clean up all addresses.
820          */
821         ifp->if_lladdr = NULL;
822
823         if_purgeaddrs_nolink(ifp);
824         if (!TAILQ_EMPTY(&ifp->if_addrheads[mycpuid])) {
825                 struct ifaddr *ifa;
826
827                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
828                 KASSERT(ifa->ifa_addr->sa_family == AF_LINK,
829                         ("non-link ifaddr is left on if_addrheads"));
830
831                 ifa_ifunlink(ifa, ifp);
832                 ifa_destroy(ifa);
833                 KASSERT(TAILQ_EMPTY(&ifp->if_addrheads[mycpuid]),
834                         ("there are still ifaddrs left on if_addrheads"));
835         }
836
837 #ifdef INET
838         /*
839          * Remove all IPv4 kernel structures related to ifp.
840          */
841         in_ifdetach(ifp);
842 #endif
843
844 #ifdef INET6
845         /*
846          * Remove all IPv6 kernel structs related to ifp.  This should be done
847          * before removing routing entries below, since IPv6 interface direct
848          * routes are expected to be removed by the IPv6-specific kernel API.
849          * Otherwise, the kernel will detect some inconsistency and bark it.
850          */
851         in6_ifdetach(ifp);
852 #endif
853
854         /*
855          * Delete all remaining routes using this interface
856          */
857         netmsg_init(&msg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
858             if_rtdel_dispatch);
859         msg.ifp = ifp;
860         rt_domsg_global(&msg.base);
861
862         /* Announce that the interface is gone. */
863         rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
864         devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
865
866         SLIST_FOREACH(dp, &domains, dom_next)
867                 if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
868                         (*dp->dom_ifdetach)(ifp,
869                                 ifp->if_afdata[dp->dom_family]);
870
871         /*
872          * Remove interface from ifindex2ifp[] and maybe decrement if_index.
873          */
874         lwkt_gettoken(&ifnet_token);
875         ifindex2ifnet[ifp->if_index] = NULL;
876         while (if_index > 0 && ifindex2ifnet[if_index] == NULL)
877                 if_index--;
878         TAILQ_REMOVE(&ifnet, ifp, if_link);
879         lwkt_reltoken(&ifnet_token);
880
881         kfree(ifp->if_addrheads, M_IFADDR);
882
883         lwkt_synchronize_ipiqs("if_detach");
884         ifq_stage_detach(&ifp->if_snd);
885
886         for (q = 0; q < ifp->if_snd.altq_subq_cnt; ++q) {
887                 struct ifaltq_subque *ifsq = &ifp->if_snd.altq_subq[q];
888
889                 kfree(ifsq->ifsq_ifstart_nmsg, M_LWKTMSG);
890                 kfree(ifsq->ifsq_stage, M_DEVBUF);
891         }
892         kfree(ifp->if_snd.altq_subq, M_DEVBUF);
893
894         kfree(ifp->if_data_pcpu, M_DEVBUF);
895
896         crit_exit();
897 }
898
899 /*
900  * Create interface group without members
901  */
902 struct ifg_group *
903 if_creategroup(const char *groupname)
904 {
905         struct ifg_group        *ifg = NULL;
906
907         if ((ifg = (struct ifg_group *)kmalloc(sizeof(struct ifg_group),
908             M_TEMP, M_NOWAIT)) == NULL)
909                 return (NULL);
910
911         strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
912         ifg->ifg_refcnt = 0;
913         ifg->ifg_carp_demoted = 0;
914         TAILQ_INIT(&ifg->ifg_members);
915 #if NPF > 0
916         pfi_attach_ifgroup(ifg);
917 #endif
918         TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next);
919
920         return (ifg);
921 }
922
923 /*
924  * Add a group to an interface
925  */
926 int
927 if_addgroup(struct ifnet *ifp, const char *groupname)
928 {
929         struct ifg_list         *ifgl;
930         struct ifg_group        *ifg = NULL;
931         struct ifg_member       *ifgm;
932
933         if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
934             groupname[strlen(groupname) - 1] <= '9')
935                 return (EINVAL);
936
937         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
938                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
939                         return (EEXIST);
940
941         if ((ifgl = kmalloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
942                 return (ENOMEM);
943
944         if ((ifgm = kmalloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
945                 kfree(ifgl, M_TEMP);
946                 return (ENOMEM);
947         }
948
949         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
950                 if (!strcmp(ifg->ifg_group, groupname))
951                         break;
952
953         if (ifg == NULL && (ifg = if_creategroup(groupname)) == NULL) {
954                 kfree(ifgl, M_TEMP);
955                 kfree(ifgm, M_TEMP);
956                 return (ENOMEM);
957         }
958
959         ifg->ifg_refcnt++;
960         ifgl->ifgl_group = ifg;
961         ifgm->ifgm_ifp = ifp;
962
963         TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
964         TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
965
966 #if NPF > 0
967         pfi_group_change(groupname);
968 #endif
969
970         return (0);
971 }
972
973 /*
974  * Remove a group from an interface
975  */
976 int
977 if_delgroup(struct ifnet *ifp, const char *groupname)
978 {
979         struct ifg_list         *ifgl;
980         struct ifg_member       *ifgm;
981
982         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
983                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
984                         break;
985         if (ifgl == NULL)
986                 return (ENOENT);
987
988         TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
989
990         TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
991                 if (ifgm->ifgm_ifp == ifp)
992                         break;
993
994         if (ifgm != NULL) {
995                 TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
996                 kfree(ifgm, M_TEMP);
997         }
998
999         if (--ifgl->ifgl_group->ifg_refcnt == 0) {
1000                 TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next);
1001 #if NPF > 0
1002                 pfi_detach_ifgroup(ifgl->ifgl_group);
1003 #endif
1004                 kfree(ifgl->ifgl_group, M_TEMP);
1005         }
1006
1007         kfree(ifgl, M_TEMP);
1008
1009 #if NPF > 0
1010         pfi_group_change(groupname);
1011 #endif
1012
1013         return (0);
1014 }
1015
1016 /*
1017  * Stores all groups from an interface in memory pointed
1018  * to by data
1019  */
1020 int
1021 if_getgroup(caddr_t data, struct ifnet *ifp)
1022 {
1023         int                      len, error;
1024         struct ifg_list         *ifgl;
1025         struct ifg_req           ifgrq, *ifgp;
1026         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
1027
1028         if (ifgr->ifgr_len == 0) {
1029                 TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1030                         ifgr->ifgr_len += sizeof(struct ifg_req);
1031                 return (0);
1032         }
1033
1034         len = ifgr->ifgr_len;
1035         ifgp = ifgr->ifgr_groups;
1036         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
1037                 if (len < sizeof(ifgrq))
1038                         return (EINVAL);
1039                 bzero(&ifgrq, sizeof ifgrq);
1040                 strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
1041                     sizeof(ifgrq.ifgrq_group));
1042                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1043                     sizeof(struct ifg_req))))
1044                         return (error);
1045                 len -= sizeof(ifgrq);
1046                 ifgp++;
1047         }
1048
1049         return (0);
1050 }
1051
1052 /*
1053  * Stores all members of a group in memory pointed to by data
1054  */
1055 int
1056 if_getgroupmembers(caddr_t data)
1057 {
1058         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
1059         struct ifg_group        *ifg;
1060         struct ifg_member       *ifgm;
1061         struct ifg_req           ifgrq, *ifgp;
1062         int                      len, error;
1063
1064         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1065                 if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
1066                         break;
1067         if (ifg == NULL)
1068                 return (ENOENT);
1069
1070         if (ifgr->ifgr_len == 0) {
1071                 TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
1072                         ifgr->ifgr_len += sizeof(ifgrq);
1073                 return (0);
1074         }
1075
1076         len = ifgr->ifgr_len;
1077         ifgp = ifgr->ifgr_groups;
1078         TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
1079                 if (len < sizeof(ifgrq))
1080                         return (EINVAL);
1081                 bzero(&ifgrq, sizeof ifgrq);
1082                 strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
1083                     sizeof(ifgrq.ifgrq_member));
1084                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1085                     sizeof(struct ifg_req))))
1086                         return (error);
1087                 len -= sizeof(ifgrq);
1088                 ifgp++;
1089         }
1090
1091         return (0);
1092 }
1093
1094 /*
1095  * Delete Routes for a Network Interface
1096  *
1097  * Called for each routing entry via the rnh->rnh_walktree() call above
1098  * to delete all route entries referencing a detaching network interface.
1099  *
1100  * Arguments:
1101  *      rn      pointer to node in the routing table
1102  *      arg     argument passed to rnh->rnh_walktree() - detaching interface
1103  *
1104  * Returns:
1105  *      0       successful
1106  *      errno   failed - reason indicated
1107  *
1108  */
1109 static int
1110 if_rtdel(struct radix_node *rn, void *arg)
1111 {
1112         struct rtentry  *rt = (struct rtentry *)rn;
1113         struct ifnet    *ifp = arg;
1114         int             err;
1115
1116         if (rt->rt_ifp == ifp) {
1117
1118                 /*
1119                  * Protect (sorta) against walktree recursion problems
1120                  * with cloned routes
1121                  */
1122                 if (!(rt->rt_flags & RTF_UP))
1123                         return (0);
1124
1125                 err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1126                                 rt_mask(rt), rt->rt_flags,
1127                                 NULL);
1128                 if (err) {
1129                         log(LOG_WARNING, "if_rtdel: error %d\n", err);
1130                 }
1131         }
1132
1133         return (0);
1134 }
1135
1136 /*
1137  * Locate an interface based on a complete address.
1138  */
1139 struct ifaddr *
1140 ifa_ifwithaddr(struct sockaddr *addr)
1141 {
1142         struct ifnet *ifp;
1143
1144         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1145                 struct ifaddr_container *ifac;
1146
1147                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1148                         struct ifaddr *ifa = ifac->ifa;
1149
1150                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1151                                 continue;
1152                         if (sa_equal(addr, ifa->ifa_addr))
1153                                 return (ifa);
1154                         if ((ifp->if_flags & IFF_BROADCAST) &&
1155                             ifa->ifa_broadaddr &&
1156                             /* IPv6 doesn't have broadcast */
1157                             ifa->ifa_broadaddr->sa_len != 0 &&
1158                             sa_equal(ifa->ifa_broadaddr, addr))
1159                                 return (ifa);
1160                 }
1161         }
1162         return (NULL);
1163 }
1164 /*
1165  * Locate the point to point interface with a given destination address.
1166  */
1167 struct ifaddr *
1168 ifa_ifwithdstaddr(struct sockaddr *addr)
1169 {
1170         struct ifnet *ifp;
1171
1172         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1173                 struct ifaddr_container *ifac;
1174
1175                 if (!(ifp->if_flags & IFF_POINTOPOINT))
1176                         continue;
1177
1178                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1179                         struct ifaddr *ifa = ifac->ifa;
1180
1181                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1182                                 continue;
1183                         if (ifa->ifa_dstaddr &&
1184                             sa_equal(addr, ifa->ifa_dstaddr))
1185                                 return (ifa);
1186                 }
1187         }
1188         return (NULL);
1189 }
1190
1191 /*
1192  * Find an interface on a specific network.  If many, choice
1193  * is most specific found.
1194  */
1195 struct ifaddr *
1196 ifa_ifwithnet(struct sockaddr *addr)
1197 {
1198         struct ifnet *ifp;
1199         struct ifaddr *ifa_maybe = NULL;
1200         u_int af = addr->sa_family;
1201         char *addr_data = addr->sa_data, *cplim;
1202
1203         /*
1204          * AF_LINK addresses can be looked up directly by their index number,
1205          * so do that if we can.
1206          */
1207         if (af == AF_LINK) {
1208                 struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
1209
1210                 if (sdl->sdl_index && sdl->sdl_index <= if_index)
1211                         return (ifindex2ifnet[sdl->sdl_index]->if_lladdr);
1212         }
1213
1214         /*
1215          * Scan though each interface, looking for ones that have
1216          * addresses in this address family.
1217          */
1218         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1219                 struct ifaddr_container *ifac;
1220
1221                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1222                         struct ifaddr *ifa = ifac->ifa;
1223                         char *cp, *cp2, *cp3;
1224
1225                         if (ifa->ifa_addr->sa_family != af)
1226 next:                           continue;
1227                         if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
1228                                 /*
1229                                  * This is a bit broken as it doesn't
1230                                  * take into account that the remote end may
1231                                  * be a single node in the network we are
1232                                  * looking for.
1233                                  * The trouble is that we don't know the
1234                                  * netmask for the remote end.
1235                                  */
1236                                 if (ifa->ifa_dstaddr != NULL &&
1237                                     sa_equal(addr, ifa->ifa_dstaddr))
1238                                         return (ifa);
1239                         } else {
1240                                 /*
1241                                  * if we have a special address handler,
1242                                  * then use it instead of the generic one.
1243                                  */
1244                                 if (ifa->ifa_claim_addr) {
1245                                         if ((*ifa->ifa_claim_addr)(ifa, addr)) {
1246                                                 return (ifa);
1247                                         } else {
1248                                                 continue;
1249                                         }
1250                                 }
1251
1252                                 /*
1253                                  * Scan all the bits in the ifa's address.
1254                                  * If a bit dissagrees with what we are
1255                                  * looking for, mask it with the netmask
1256                                  * to see if it really matters.
1257                                  * (A byte at a time)
1258                                  */
1259                                 if (ifa->ifa_netmask == 0)
1260                                         continue;
1261                                 cp = addr_data;
1262                                 cp2 = ifa->ifa_addr->sa_data;
1263                                 cp3 = ifa->ifa_netmask->sa_data;
1264                                 cplim = ifa->ifa_netmask->sa_len +
1265                                         (char *)ifa->ifa_netmask;
1266                                 while (cp3 < cplim)
1267                                         if ((*cp++ ^ *cp2++) & *cp3++)
1268                                                 goto next; /* next address! */
1269                                 /*
1270                                  * If the netmask of what we just found
1271                                  * is more specific than what we had before
1272                                  * (if we had one) then remember the new one
1273                                  * before continuing to search
1274                                  * for an even better one.
1275                                  */
1276                                 if (ifa_maybe == NULL ||
1277                                     rn_refines((char *)ifa->ifa_netmask,
1278                                                (char *)ifa_maybe->ifa_netmask))
1279                                         ifa_maybe = ifa;
1280                         }
1281                 }
1282         }
1283         return (ifa_maybe);
1284 }
1285
1286 /*
1287  * Find an interface address specific to an interface best matching
1288  * a given address.
1289  */
1290 struct ifaddr *
1291 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
1292 {
1293         struct ifaddr_container *ifac;
1294         char *cp, *cp2, *cp3;
1295         char *cplim;
1296         struct ifaddr *ifa_maybe = NULL;
1297         u_int af = addr->sa_family;
1298
1299         if (af >= AF_MAX)
1300                 return (0);
1301         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1302                 struct ifaddr *ifa = ifac->ifa;
1303
1304                 if (ifa->ifa_addr->sa_family != af)
1305                         continue;
1306                 if (ifa_maybe == NULL)
1307                         ifa_maybe = ifa;
1308                 if (ifa->ifa_netmask == NULL) {
1309                         if (sa_equal(addr, ifa->ifa_addr) ||
1310                             (ifa->ifa_dstaddr != NULL &&
1311                              sa_equal(addr, ifa->ifa_dstaddr)))
1312                                 return (ifa);
1313                         continue;
1314                 }
1315                 if (ifp->if_flags & IFF_POINTOPOINT) {
1316                         if (sa_equal(addr, ifa->ifa_dstaddr))
1317                                 return (ifa);
1318                 } else {
1319                         cp = addr->sa_data;
1320                         cp2 = ifa->ifa_addr->sa_data;
1321                         cp3 = ifa->ifa_netmask->sa_data;
1322                         cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
1323                         for (; cp3 < cplim; cp3++)
1324                                 if ((*cp++ ^ *cp2++) & *cp3)
1325                                         break;
1326                         if (cp3 == cplim)
1327                                 return (ifa);
1328                 }
1329         }
1330         return (ifa_maybe);
1331 }
1332
1333 /*
1334  * Default action when installing a route with a Link Level gateway.
1335  * Lookup an appropriate real ifa to point to.
1336  * This should be moved to /sys/net/link.c eventually.
1337  */
1338 static void
1339 link_rtrequest(int cmd, struct rtentry *rt)
1340 {
1341         struct ifaddr *ifa;
1342         struct sockaddr *dst;
1343         struct ifnet *ifp;
1344
1345         if (cmd != RTM_ADD || (ifa = rt->rt_ifa) == NULL ||
1346             (ifp = ifa->ifa_ifp) == NULL || (dst = rt_key(rt)) == NULL)
1347                 return;
1348         ifa = ifaof_ifpforaddr(dst, ifp);
1349         if (ifa != NULL) {
1350                 IFAFREE(rt->rt_ifa);
1351                 IFAREF(ifa);
1352                 rt->rt_ifa = ifa;
1353                 if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
1354                         ifa->ifa_rtrequest(cmd, rt);
1355         }
1356 }
1357
1358 /*
1359  * Mark an interface down and notify protocols of
1360  * the transition.
1361  * NOTE: must be called at splnet or eqivalent.
1362  */
1363 void
1364 if_unroute(struct ifnet *ifp, int flag, int fam)
1365 {
1366         struct ifaddr_container *ifac;
1367
1368         ifp->if_flags &= ~flag;
1369         getmicrotime(&ifp->if_lastchange);
1370         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1371                 struct ifaddr *ifa = ifac->ifa;
1372
1373                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1374                         kpfctlinput(PRC_IFDOWN, ifa->ifa_addr);
1375         }
1376         ifq_purge_all(&ifp->if_snd);
1377         rt_ifmsg(ifp);
1378 }
1379
1380 /*
1381  * Mark an interface up and notify protocols of
1382  * the transition.
1383  * NOTE: must be called at splnet or eqivalent.
1384  */
1385 void
1386 if_route(struct ifnet *ifp, int flag, int fam)
1387 {
1388         struct ifaddr_container *ifac;
1389
1390         ifq_purge_all(&ifp->if_snd);
1391         ifp->if_flags |= flag;
1392         getmicrotime(&ifp->if_lastchange);
1393         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1394                 struct ifaddr *ifa = ifac->ifa;
1395
1396                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1397                         kpfctlinput(PRC_IFUP, ifa->ifa_addr);
1398         }
1399         rt_ifmsg(ifp);
1400 #ifdef INET6
1401         in6_if_up(ifp);
1402 #endif
1403 }
1404
/*
 * Mark an interface down and notify protocols of the transition.  An
 * interface going down is also considered to be a synchronizing event.
 * We must ensure that all packet processing related to the interface
 * has completed before we return so e.g. the caller can free the ifnet
 * structure that the mbufs may be referencing.
 *
 * NOTE: must be called at splnet or equivalent.
 */
void
if_down(struct ifnet *ifp)
{
	/* Clear IFF_UP and notify the addresses of every family. */
	if_unroute(ifp, IFF_UP, AF_UNSPEC);
	/* Synchronization point described in the header comment above. */
	netmsg_service_sync();
}
1420
/*
 * Mark an interface up and notify protocols of
 * the transition.
 * NOTE: must be called at splnet or equivalent.
 */
void
if_up(struct ifnet *ifp)
{
	/* Set IFF_UP and notify the addresses of every family. */
	if_route(ifp, IFF_UP, AF_UNSPEC);
}
1431
1432 /*
1433  * Process a link state change.
1434  * NOTE: must be called at splsoftnet or equivalent.
1435  */
1436 void
1437 if_link_state_change(struct ifnet *ifp)
1438 {
1439         int link_state = ifp->if_link_state;
1440
1441         rt_ifmsg(ifp);
1442         devctl_notify("IFNET", ifp->if_xname,
1443             (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
1444 }
1445
1446 /*
1447  * Handle interface watchdog timer routines.  Called
1448  * from softclock, we decrement timers (if set) and
1449  * call the appropriate interface routine on expiration.
1450  */
1451 static void
1452 if_slowtimo(void *arg)
1453 {
1454         struct ifnet *ifp;
1455
1456         crit_enter();
1457
1458         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1459                 if (if_stats_compat) {
1460                         IFNET_STAT_GET(ifp, ipackets, ifp->if_ipackets);
1461                         IFNET_STAT_GET(ifp, ierrors, ifp->if_ierrors);
1462                         IFNET_STAT_GET(ifp, opackets, ifp->if_opackets);
1463                         IFNET_STAT_GET(ifp, oerrors, ifp->if_oerrors);
1464                         IFNET_STAT_GET(ifp, collisions, ifp->if_collisions);
1465                         IFNET_STAT_GET(ifp, ibytes, ifp->if_ibytes);
1466                         IFNET_STAT_GET(ifp, obytes, ifp->if_obytes);
1467                         IFNET_STAT_GET(ifp, imcasts, ifp->if_imcasts);
1468                         IFNET_STAT_GET(ifp, omcasts, ifp->if_omcasts);
1469                         IFNET_STAT_GET(ifp, iqdrops, ifp->if_iqdrops);
1470                         IFNET_STAT_GET(ifp, noproto, ifp->if_noproto);
1471                 }
1472
1473                 if (ifp->if_timer == 0 || --ifp->if_timer)
1474                         continue;
1475                 if (ifp->if_watchdog) {
1476                         if (ifnet_tryserialize_all(ifp)) {
1477                                 (*ifp->if_watchdog)(ifp);
1478                                 ifnet_deserialize_all(ifp);
1479                         } else {
1480                                 /* try again next timeout */
1481                                 ++ifp->if_timer;
1482                         }
1483                 }
1484         }
1485
1486         crit_exit();
1487
1488         callout_reset(&if_slowtimo_timer, hz / IFNET_SLOWHZ, if_slowtimo, NULL);
1489 }
1490
1491 /*
1492  * Map interface name to
1493  * interface structure pointer.
1494  */
1495 struct ifnet *
1496 ifunit(const char *name)
1497 {
1498         struct ifnet *ifp;
1499
1500         /*
1501          * Search all the interfaces for this name/number
1502          */
1503
1504         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1505                 if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1506                         break;
1507         }
1508         return (ifp);
1509 }
1510
1511
1512 /*
1513  * Map interface name in a sockaddr_dl to
1514  * interface structure pointer.
1515  */
1516 struct ifnet *
1517 if_withname(struct sockaddr *sa)
1518 {
1519         char ifname[IFNAMSIZ+1];
1520         struct sockaddr_dl *sdl = (struct sockaddr_dl *)sa;
1521
1522         if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) ||
1523              (sdl->sdl_nlen > IFNAMSIZ) )
1524                 return NULL;
1525
1526         /*
1527          * ifunit wants a null-terminated name.  It may not be null-terminated
1528          * in the sockaddr.  We don't want to change the caller's sockaddr,
1529          * and there might not be room to put the trailing null anyway, so we
1530          * make a local copy that we know we can null terminate safely.
1531          */
1532
1533         bcopy(sdl->sdl_data, ifname, sdl->sdl_nlen);
1534         ifname[sdl->sdl_nlen] = '\0';
1535         return ifunit(ifname);
1536 }
1537
1538
1539 /*
1540  * Interface ioctls.
1541  */
1542 int
1543 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct ucred *cred)
1544 {
1545         struct ifnet *ifp;
1546         struct ifreq *ifr;
1547         struct ifstat *ifs;
1548         int error;
1549         short oif_flags;
1550         int new_flags;
1551 #ifdef COMPAT_43
1552         int ocmd;
1553 #endif
1554         size_t namelen, onamelen;
1555         char new_name[IFNAMSIZ];
1556         struct ifaddr *ifa;
1557         struct sockaddr_dl *sdl;
1558
1559         switch (cmd) {
1560         case SIOCGIFCONF:
1561         case OSIOCGIFCONF:
1562                 return (ifconf(cmd, data, cred));
1563         default:
1564                 break;
1565         }
1566
1567         ifr = (struct ifreq *)data;
1568
1569         switch (cmd) {
1570         case SIOCIFCREATE:
1571         case SIOCIFCREATE2:
1572                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1573                         return (error);
1574                 return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
1575                         cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
1576         case SIOCIFDESTROY:
1577                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1578                         return (error);
1579                 return (if_clone_destroy(ifr->ifr_name));
1580         case SIOCIFGCLONERS:
1581                 return (if_clone_list((struct if_clonereq *)data));
1582         default:
1583                 break;
1584         }
1585
1586         /*
1587          * Nominal ioctl through interface, lookup the ifp and obtain a
1588          * lock to serialize the ifconfig ioctl operation.
1589          */
1590         ifp = ifunit(ifr->ifr_name);
1591         if (ifp == NULL)
1592                 return (ENXIO);
1593         error = 0;
1594         mtx_lock(&ifp->if_ioctl_mtx);
1595
1596         switch (cmd) {
1597         case SIOCGIFINDEX:
1598                 ifr->ifr_index = ifp->if_index;
1599                 break;
1600
1601         case SIOCGIFFLAGS:
1602                 ifr->ifr_flags = ifp->if_flags;
1603                 ifr->ifr_flagshigh = ifp->if_flags >> 16;
1604                 break;
1605
1606         case SIOCGIFCAP:
1607                 ifr->ifr_reqcap = ifp->if_capabilities;
1608                 ifr->ifr_curcap = ifp->if_capenable;
1609                 break;
1610
1611         case SIOCGIFMETRIC:
1612                 ifr->ifr_metric = ifp->if_metric;
1613                 break;
1614
1615         case SIOCGIFMTU:
1616                 ifr->ifr_mtu = ifp->if_mtu;
1617                 break;
1618
1619         case SIOCGIFTSOLEN:
1620                 ifr->ifr_tsolen = ifp->if_tsolen;
1621                 break;
1622
1623         case SIOCGIFDATA:
1624                 error = copyout((caddr_t)&ifp->if_data, ifr->ifr_data,
1625                                 sizeof(ifp->if_data));
1626                 break;
1627
1628         case SIOCGIFPHYS:
1629                 ifr->ifr_phys = ifp->if_physical;
1630                 break;
1631
1632         case SIOCGIFPOLLCPU:
1633                 ifr->ifr_pollcpu = -1;
1634                 break;
1635
1636         case SIOCSIFPOLLCPU:
1637                 break;
1638
1639         case SIOCSIFFLAGS:
1640                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1641                 if (error)
1642                         break;
1643                 new_flags = (ifr->ifr_flags & 0xffff) |
1644                     (ifr->ifr_flagshigh << 16);
1645                 if (ifp->if_flags & IFF_SMART) {
1646                         /* Smart drivers twiddle their own routes */
1647                 } else if (ifp->if_flags & IFF_UP &&
1648                     (new_flags & IFF_UP) == 0) {
1649                         crit_enter();
1650                         if_down(ifp);
1651                         crit_exit();
1652                 } else if (new_flags & IFF_UP &&
1653                     (ifp->if_flags & IFF_UP) == 0) {
1654                         crit_enter();
1655                         if_up(ifp);
1656                         crit_exit();
1657                 }
1658
1659 #ifdef IFPOLL_ENABLE
1660                 if ((new_flags ^ ifp->if_flags) & IFF_NPOLLING) {
1661                         if (new_flags & IFF_NPOLLING)
1662                                 ifpoll_register(ifp);
1663                         else
1664                                 ifpoll_deregister(ifp);
1665                 }
1666 #endif
1667
1668                 ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
1669                         (new_flags &~ IFF_CANTCHANGE);
1670                 if (new_flags & IFF_PPROMISC) {
1671                         /* Permanently promiscuous mode requested */
1672                         ifp->if_flags |= IFF_PROMISC;
1673                 } else if (ifp->if_pcount == 0) {
1674                         ifp->if_flags &= ~IFF_PROMISC;
1675                 }
1676                 if (ifp->if_ioctl) {
1677                         ifnet_serialize_all(ifp);
1678                         ifp->if_ioctl(ifp, cmd, data, cred);
1679                         ifnet_deserialize_all(ifp);
1680                 }
1681                 getmicrotime(&ifp->if_lastchange);
1682                 break;
1683
1684         case SIOCSIFCAP:
1685                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1686                 if (error)
1687                         break;
1688                 if (ifr->ifr_reqcap & ~ifp->if_capabilities) {
1689                         error = EINVAL;
1690                         break;
1691                 }
1692                 ifnet_serialize_all(ifp);
1693                 ifp->if_ioctl(ifp, cmd, data, cred);
1694                 ifnet_deserialize_all(ifp);
1695                 break;
1696
1697         case SIOCSIFNAME:
1698                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1699                 if (error)
1700                         break;
1701                 error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
1702                 if (error)
1703                         break;
1704                 if (new_name[0] == '\0') {
1705                         error = EINVAL;
1706                         break;
1707                 }
1708                 if (ifunit(new_name) != NULL) {
1709                         error = EEXIST;
1710                         break;
1711                 }
1712
1713                 EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
1714
1715                 /* Announce the departure of the interface. */
1716                 rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
1717
1718                 strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
1719                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
1720                 /* XXX IFA_LOCK(ifa); */
1721                 sdl = (struct sockaddr_dl *)ifa->ifa_addr;
1722                 namelen = strlen(new_name);
1723                 onamelen = sdl->sdl_nlen;
1724                 /*
1725                  * Move the address if needed.  This is safe because we
1726                  * allocate space for a name of length IFNAMSIZ when we
1727                  * create this in if_attach().
1728                  */
1729                 if (namelen != onamelen) {
1730                         bcopy(sdl->sdl_data + onamelen,
1731                             sdl->sdl_data + namelen, sdl->sdl_alen);
1732                 }
1733                 bcopy(new_name, sdl->sdl_data, namelen);
1734                 sdl->sdl_nlen = namelen;
1735                 sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
1736                 bzero(sdl->sdl_data, onamelen);
1737                 while (namelen != 0)
1738                         sdl->sdl_data[--namelen] = 0xff;
1739                 /* XXX IFA_UNLOCK(ifa) */
1740
1741                 EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
1742
1743                 /* Announce the return of the interface. */
1744                 rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
1745                 break;
1746
1747         case SIOCSIFMETRIC:
1748                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1749                 if (error)
1750                         break;
1751                 ifp->if_metric = ifr->ifr_metric;
1752                 getmicrotime(&ifp->if_lastchange);
1753                 break;
1754
1755         case SIOCSIFPHYS:
1756                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1757                 if (error)
1758                         break;
1759                 if (ifp->if_ioctl == NULL) {
1760                         error = EOPNOTSUPP;
1761                         break;
1762                 }
1763                 ifnet_serialize_all(ifp);
1764                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1765                 ifnet_deserialize_all(ifp);
1766                 if (error == 0)
1767                         getmicrotime(&ifp->if_lastchange);
1768                 break;
1769
1770         case SIOCSIFMTU:
1771         {
1772                 u_long oldmtu = ifp->if_mtu;
1773
1774                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1775                 if (error)
1776                         break;
1777                 if (ifp->if_ioctl == NULL) {
1778                         error = EOPNOTSUPP;
1779                         break;
1780                 }
1781                 if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) {
1782                         error = EINVAL;
1783                         break;
1784                 }
1785                 ifnet_serialize_all(ifp);
1786                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1787                 ifnet_deserialize_all(ifp);
1788                 if (error == 0) {
1789                         getmicrotime(&ifp->if_lastchange);
1790                         rt_ifmsg(ifp);
1791                 }
1792                 /*
1793                  * If the link MTU changed, do network layer specific procedure.
1794                  */
1795                 if (ifp->if_mtu != oldmtu) {
1796 #ifdef INET6
1797                         nd6_setmtu(ifp);
1798 #endif
1799                 }
1800                 break;
1801         }
1802
1803         case SIOCSIFTSOLEN:
1804                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1805                 if (error)
1806                         break;
1807
1808                 /* XXX need driver supplied upper limit */
1809                 if (ifr->ifr_tsolen <= 0) {
1810                         error = EINVAL;
1811                         break;
1812                 }
1813                 ifp->if_tsolen = ifr->ifr_tsolen;
1814                 break;
1815
1816         case SIOCADDMULTI:
1817         case SIOCDELMULTI:
1818                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1819                 if (error)
1820                         break;
1821
1822                 /* Don't allow group membership on non-multicast interfaces. */
1823                 if ((ifp->if_flags & IFF_MULTICAST) == 0) {
1824                         error = EOPNOTSUPP;
1825                         break;
1826                 }
1827
1828                 /* Don't let users screw up protocols' entries. */
1829                 if (ifr->ifr_addr.sa_family != AF_LINK) {
1830                         error = EINVAL;
1831                         break;
1832                 }
1833
1834                 if (cmd == SIOCADDMULTI) {
1835                         struct ifmultiaddr *ifma;
1836                         error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
1837                 } else {
1838                         error = if_delmulti(ifp, &ifr->ifr_addr);
1839                 }
1840                 if (error == 0)
1841                         getmicrotime(&ifp->if_lastchange);
1842                 break;
1843
1844         case SIOCSIFPHYADDR:
1845         case SIOCDIFPHYADDR:
1846 #ifdef INET6
1847         case SIOCSIFPHYADDR_IN6:
1848 #endif
1849         case SIOCSLIFPHYADDR:
1850         case SIOCSIFMEDIA:
1851         case SIOCSIFGENERIC:
1852                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1853                 if (error)
1854                         break;
1855                 if (ifp->if_ioctl == 0) {
1856                         error = EOPNOTSUPP;
1857                         break;
1858                 }
1859                 ifnet_serialize_all(ifp);
1860                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1861                 ifnet_deserialize_all(ifp);
1862                 if (error == 0)
1863                         getmicrotime(&ifp->if_lastchange);
1864                 break;
1865
1866         case SIOCGIFSTATUS:
1867                 ifs = (struct ifstat *)data;
1868                 ifs->ascii[0] = '\0';
1869                 /* fall through */
1870         case SIOCGIFPSRCADDR:
1871         case SIOCGIFPDSTADDR:
1872         case SIOCGLIFPHYADDR:
1873         case SIOCGIFMEDIA:
1874         case SIOCGIFGENERIC:
1875                 if (ifp->if_ioctl == NULL) {
1876                         error = EOPNOTSUPP;
1877                         break;
1878                 }
1879                 ifnet_serialize_all(ifp);
1880                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1881                 ifnet_deserialize_all(ifp);
1882                 break;
1883
1884         case SIOCSIFLLADDR:
1885                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1886                 if (error)
1887                         break;
1888                 error = if_setlladdr(ifp, ifr->ifr_addr.sa_data,
1889                                      ifr->ifr_addr.sa_len);
1890                 EVENTHANDLER_INVOKE(iflladdr_event, ifp);
1891                 break;
1892
1893         default:
1894                 oif_flags = ifp->if_flags;
1895                 if (so->so_proto == 0) {
1896                         error = EOPNOTSUPP;
1897                         break;
1898                 }
1899 #ifndef COMPAT_43
1900                 error = so_pru_control_direct(so, cmd, data, ifp);
1901 #else
1902                 ocmd = cmd;
1903
1904                 switch (cmd) {
1905                 case SIOCSIFDSTADDR:
1906                 case SIOCSIFADDR:
1907                 case SIOCSIFBRDADDR:
1908                 case SIOCSIFNETMASK:
1909 #if BYTE_ORDER != BIG_ENDIAN
1910                         if (ifr->ifr_addr.sa_family == 0 &&
1911                             ifr->ifr_addr.sa_len < 16) {
1912                                 ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
1913                                 ifr->ifr_addr.sa_len = 16;
1914                         }
1915 #else
1916                         if (ifr->ifr_addr.sa_len == 0)
1917                                 ifr->ifr_addr.sa_len = 16;
1918 #endif
1919                         break;
1920                 case OSIOCGIFADDR:
1921                         cmd = SIOCGIFADDR;
1922                         break;
1923                 case OSIOCGIFDSTADDR:
1924                         cmd = SIOCGIFDSTADDR;
1925                         break;
1926                 case OSIOCGIFBRDADDR:
1927                         cmd = SIOCGIFBRDADDR;
1928                         break;
1929                 case OSIOCGIFNETMASK:
1930                         cmd = SIOCGIFNETMASK;
1931                         break;
1932                 default:
1933                         break;
1934                 }
1935
1936                 error = so_pru_control_direct(so, cmd, data, ifp);
1937
1938                 switch (ocmd) {
1939                 case OSIOCGIFADDR:
1940                 case OSIOCGIFDSTADDR:
1941                 case OSIOCGIFBRDADDR:
1942                 case OSIOCGIFNETMASK:
1943                         *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
1944                         break;
1945                 }
1946 #endif /* COMPAT_43 */
1947
1948                 if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
1949 #ifdef INET6
1950                         DELAY(100);/* XXX: temporary workaround for fxp issue*/
1951                         if (ifp->if_flags & IFF_UP) {
1952                                 crit_enter();
1953                                 in6_if_up(ifp);
1954                                 crit_exit();
1955                         }
1956 #endif
1957                 }
1958                 break;
1959         }
1960
1961         mtx_unlock(&ifp->if_ioctl_mtx);
1962         return (error);
1963 }
1964
1965 /*
1966  * Set/clear promiscuous mode on interface ifp based on the truth value
1967  * of pswitch.  The calls are reference counted so that only the first
1968  * "on" request actually has an effect, as does the final "off" request.
1969  * Results are undefined if the "off" and "on" requests are not matched.
1970  */
1971 int
1972 ifpromisc(struct ifnet *ifp, int pswitch)
1973 {
1974         struct ifreq ifr;
1975         int error;
1976         int oldflags;
1977
1978         oldflags = ifp->if_flags;
1979         if (ifp->if_flags & IFF_PPROMISC) {
1980                 /* Do nothing if device is in permanently promiscuous mode */
1981                 ifp->if_pcount += pswitch ? 1 : -1;
1982                 return (0);
1983         }
1984         if (pswitch) {
1985                 /*
1986                  * If the device is not configured up, we cannot put it in
1987                  * promiscuous mode.
1988                  */
1989                 if ((ifp->if_flags & IFF_UP) == 0)
1990                         return (ENETDOWN);
1991                 if (ifp->if_pcount++ != 0)
1992                         return (0);
1993                 ifp->if_flags |= IFF_PROMISC;
1994                 log(LOG_INFO, "%s: promiscuous mode enabled\n",
1995                     ifp->if_xname);
1996         } else {
1997                 if (--ifp->if_pcount > 0)
1998                         return (0);
1999                 ifp->if_flags &= ~IFF_PROMISC;
2000                 log(LOG_INFO, "%s: promiscuous mode disabled\n",
2001                     ifp->if_xname);
2002         }
2003         ifr.ifr_flags = ifp->if_flags;
2004         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2005         ifnet_serialize_all(ifp);
2006         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, NULL);
2007         ifnet_deserialize_all(ifp);
2008         if (error == 0)
2009                 rt_ifmsg(ifp);
2010         else
2011                 ifp->if_flags = oldflags;
2012         return error;
2013 }
2014
2015 /*
2016  * Return interface configuration
2017  * of system.  List may be used
2018  * in later ioctl's (above) to get
2019  * other information.
2020  */
2021 static int
2022 ifconf(u_long cmd, caddr_t data, struct ucred *cred)
2023 {
2024         struct ifconf *ifc = (struct ifconf *)data;
2025         struct ifnet *ifp;
2026         struct sockaddr *sa;
2027         struct ifreq ifr, *ifrp;
2028         int space = ifc->ifc_len, error = 0;
2029
2030         ifrp = ifc->ifc_req;
2031         TAILQ_FOREACH(ifp, &ifnet, if_link) {
2032                 struct ifaddr_container *ifac;
2033                 int addrs;
2034
2035                 if (space <= sizeof ifr)
2036                         break;
2037
2038                 /*
2039                  * Zero the stack declared structure first to prevent
2040                  * memory disclosure.
2041                  */
2042                 bzero(&ifr, sizeof(ifr));
2043                 if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
2044                     >= sizeof(ifr.ifr_name)) {
2045                         error = ENAMETOOLONG;
2046                         break;
2047                 }
2048
2049                 addrs = 0;
2050                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2051                         struct ifaddr *ifa = ifac->ifa;
2052
2053                         if (space <= sizeof ifr)
2054                                 break;
2055                         sa = ifa->ifa_addr;
2056                         if (cred->cr_prison &&
2057                             prison_if(cred, sa))
2058                                 continue;
2059                         addrs++;
2060 #ifdef COMPAT_43
2061                         if (cmd == OSIOCGIFCONF) {
2062                                 struct osockaddr *osa =
2063                                          (struct osockaddr *)&ifr.ifr_addr;
2064                                 ifr.ifr_addr = *sa;
2065                                 osa->sa_family = sa->sa_family;
2066                                 error = copyout(&ifr, ifrp, sizeof ifr);
2067                                 ifrp++;
2068                         } else
2069 #endif
2070                         if (sa->sa_len <= sizeof(*sa)) {
2071                                 ifr.ifr_addr = *sa;
2072                                 error = copyout(&ifr, ifrp, sizeof ifr);
2073                                 ifrp++;
2074                         } else {
2075                                 if (space < (sizeof ifr) + sa->sa_len -
2076                                             sizeof(*sa))
2077                                         break;
2078                                 space -= sa->sa_len - sizeof(*sa);
2079                                 error = copyout(&ifr, ifrp,
2080                                                 sizeof ifr.ifr_name);
2081                                 if (error == 0)
2082                                         error = copyout(sa, &ifrp->ifr_addr,
2083                                                         sa->sa_len);
2084                                 ifrp = (struct ifreq *)
2085                                         (sa->sa_len + (caddr_t)&ifrp->ifr_addr);
2086                         }
2087                         if (error)
2088                                 break;
2089                         space -= sizeof ifr;
2090                 }
2091                 if (error)
2092                         break;
2093                 if (!addrs) {
2094                         bzero(&ifr.ifr_addr, sizeof ifr.ifr_addr);
2095                         error = copyout(&ifr, ifrp, sizeof ifr);
2096                         if (error)
2097                                 break;
2098                         space -= sizeof ifr;
2099                         ifrp++;
2100                 }
2101         }
2102         ifc->ifc_len -= space;
2103         return (error);
2104 }
2105
2106 /*
2107  * Just like if_promisc(), but for all-multicast-reception mode.
2108  */
2109 int
2110 if_allmulti(struct ifnet *ifp, int onswitch)
2111 {
2112         int error = 0;
2113         struct ifreq ifr;
2114
2115         crit_enter();
2116
2117         if (onswitch) {
2118                 if (ifp->if_amcount++ == 0) {
2119                         ifp->if_flags |= IFF_ALLMULTI;
2120                         ifr.ifr_flags = ifp->if_flags;
2121                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2122                         ifnet_serialize_all(ifp);
2123                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2124                                               NULL);
2125                         ifnet_deserialize_all(ifp);
2126                 }
2127         } else {
2128                 if (ifp->if_amcount > 1) {
2129                         ifp->if_amcount--;
2130                 } else {
2131                         ifp->if_amcount = 0;
2132                         ifp->if_flags &= ~IFF_ALLMULTI;
2133                         ifr.ifr_flags = ifp->if_flags;
2134                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2135                         ifnet_serialize_all(ifp);
2136                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2137                                               NULL);
2138                         ifnet_deserialize_all(ifp);
2139                 }
2140         }
2141
2142         crit_exit();
2143
2144         if (error == 0)
2145                 rt_ifmsg(ifp);
2146         return error;
2147 }
2148
2149 /*
2150  * Add a multicast listenership to the interface in question.
2151  * The link layer provides a routine which converts
2152  */
2153 int
2154 if_addmulti_serialized(struct ifnet *ifp, struct sockaddr *sa,
2155     struct ifmultiaddr **retifma)
2156 {
2157         struct sockaddr *llsa, *dupsa;
2158         int error;
2159         struct ifmultiaddr *ifma;
2160
2161         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2162
2163         /*
2164          * If the matching multicast address already exists
2165          * then don't add a new one, just add a reference
2166          */
2167         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2168                 if (sa_equal(sa, ifma->ifma_addr)) {
2169                         ifma->ifma_refcount++;
2170                         if (retifma)
2171                                 *retifma = ifma;
2172                         return 0;
2173                 }
2174         }
2175
2176         /*
2177          * Give the link layer a chance to accept/reject it, and also
2178          * find out which AF_LINK address this maps to, if it isn't one
2179          * already.
2180          */
2181         if (ifp->if_resolvemulti) {
2182                 error = ifp->if_resolvemulti(ifp, &llsa, sa);
2183                 if (error)
2184                         return error;
2185         } else {
2186                 llsa = NULL;
2187         }
2188
2189         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2190         dupsa = kmalloc(sa->sa_len, M_IFMADDR, M_WAITOK);
2191         bcopy(sa, dupsa, sa->sa_len);
2192
2193         ifma->ifma_addr = dupsa;
2194         ifma->ifma_lladdr = llsa;
2195         ifma->ifma_ifp = ifp;
2196         ifma->ifma_refcount = 1;
2197         ifma->ifma_protospec = NULL;
2198         rt_newmaddrmsg(RTM_NEWMADDR, ifma);
2199
2200         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2201         if (retifma)
2202                 *retifma = ifma;
2203
2204         if (llsa != NULL) {
2205                 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2206                         if (sa_equal(ifma->ifma_addr, llsa))
2207                                 break;
2208                 }
2209                 if (ifma) {
2210                         ifma->ifma_refcount++;
2211                 } else {
2212                         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2213                         dupsa = kmalloc(llsa->sa_len, M_IFMADDR, M_WAITOK);
2214                         bcopy(llsa, dupsa, llsa->sa_len);
2215                         ifma->ifma_addr = dupsa;
2216                         ifma->ifma_ifp = ifp;
2217                         ifma->ifma_refcount = 1;
2218                         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2219                 }
2220         }
2221         /*
2222          * We are certain we have added something, so call down to the
2223          * interface to let them know about it.
2224          */
2225         if (ifp->if_ioctl)
2226                 ifp->if_ioctl(ifp, SIOCADDMULTI, 0, NULL);
2227
2228         return 0;
2229 }
2230
/*
 * Convenience wrapper around if_addmulti_serialized() for callers that
 * do not hold the interface serializers: takes all of them, performs
 * the add, releases them and passes the result through.
 */
int
if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
    struct ifmultiaddr **retifma)
{
        int rc;

        ifnet_serialize_all(ifp);
        rc = if_addmulti_serialized(ifp, sa, retifma);
        ifnet_deserialize_all(ifp);

        return (rc);
}
2243
2244 /*
2245  * Remove a reference to a multicast address on this interface.  Yell
2246  * if the request does not match an existing membership.
2247  */
2248 static int
2249 if_delmulti_serialized(struct ifnet *ifp, struct sockaddr *sa)
2250 {
2251         struct ifmultiaddr *ifma;
2252
2253         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2254
2255         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2256                 if (sa_equal(sa, ifma->ifma_addr))
2257                         break;
2258         if (ifma == NULL)
2259                 return ENOENT;
2260
2261         if (ifma->ifma_refcount > 1) {
2262                 ifma->ifma_refcount--;
2263                 return 0;
2264         }
2265
2266         rt_newmaddrmsg(RTM_DELMADDR, ifma);
2267         sa = ifma->ifma_lladdr;
2268         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2269         /*
2270          * Make sure the interface driver is notified
2271          * in the case of a link layer mcast group being left.
2272          */
2273         if (ifma->ifma_addr->sa_family == AF_LINK && sa == NULL)
2274                 ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2275         kfree(ifma->ifma_addr, M_IFMADDR);
2276         kfree(ifma, M_IFMADDR);
2277         if (sa == NULL)
2278                 return 0;
2279
2280         /*
2281          * Now look for the link-layer address which corresponds to
2282          * this network address.  It had been squirreled away in
2283          * ifma->ifma_lladdr for this purpose (so we don't have
2284          * to call ifp->if_resolvemulti() again), and we saved that
2285          * value in sa above.  If some nasty deleted the
2286          * link-layer address out from underneath us, we can deal because
2287          * the address we stored was is not the same as the one which was
2288          * in the record for the link-layer address.  (So we don't complain
2289          * in that case.)
2290          */
2291         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2292                 if (sa_equal(sa, ifma->ifma_addr))
2293                         break;
2294         if (ifma == NULL)
2295                 return 0;
2296
2297         if (ifma->ifma_refcount > 1) {
2298                 ifma->ifma_refcount--;
2299                 return 0;
2300         }
2301
2302         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2303         ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2304         kfree(ifma->ifma_addr, M_IFMADDR);
2305         kfree(sa, M_IFMADDR);
2306         kfree(ifma, M_IFMADDR);
2307
2308         return 0;
2309 }
2310
/*
 * Convenience wrapper around if_delmulti_serialized() for callers that
 * do not hold the interface serializers: takes all of them, performs
 * the delete, releases them and passes the result through.
 */
int
if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
{
        int rc;

        ifnet_serialize_all(ifp);
        rc = if_delmulti_serialized(ifp, sa);
        ifnet_deserialize_all(ifp);

        return (rc);
}
2322
2323 /*
2324  * Delete all multicast group membership for an interface.
2325  * Should be used to quickly flush all multicast filters.
2326  */
2327 void
2328 if_delallmulti_serialized(struct ifnet *ifp)
2329 {
2330         struct ifmultiaddr *ifma, mark;
2331         struct sockaddr sa;
2332
2333         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2334
2335         bzero(&sa, sizeof(sa));
2336         sa.sa_family = AF_UNSPEC;
2337         sa.sa_len = sizeof(sa);
2338
2339         bzero(&mark, sizeof(mark));
2340         mark.ifma_addr = &sa;
2341
2342         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, &mark, ifma_link);
2343         while ((ifma = TAILQ_NEXT(&mark, ifma_link)) != NULL) {
2344                 TAILQ_REMOVE(&ifp->if_multiaddrs, &mark, ifma_link);
2345                 TAILQ_INSERT_AFTER(&ifp->if_multiaddrs, ifma, &mark,
2346                     ifma_link);
2347
2348                 if (ifma->ifma_addr->sa_family == AF_UNSPEC)
2349                         continue;
2350
2351                 if_delmulti_serialized(ifp, ifma->ifma_addr);
2352         }
2353         TAILQ_REMOVE(&ifp->if_multiaddrs, &mark, ifma_link);
2354 }
2355
2356
2357 /*
2358  * Set the link layer address on an interface.
2359  *
2360  * At this time we only support certain types of interfaces,
2361  * and we don't allow the length of the address to change.
2362  */
2363 int
2364 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
2365 {
2366         struct sockaddr_dl *sdl;
2367         struct ifreq ifr;
2368
2369         sdl = IF_LLSOCKADDR(ifp);
2370         if (sdl == NULL)
2371                 return (EINVAL);
2372         if (len != sdl->sdl_alen)       /* don't allow length to change */
2373                 return (EINVAL);
2374         switch (ifp->if_type) {
2375         case IFT_ETHER:                 /* these types use struct arpcom */
2376         case IFT_XETHER:
2377         case IFT_L2VLAN:
2378         case IFT_IEEE8023ADLAG:
2379                 bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
2380                 bcopy(lladdr, LLADDR(sdl), len);
2381                 break;
2382         default:
2383                 return (ENODEV);
2384         }
2385         /*
2386          * If the interface is already up, we need
2387          * to re-init it in order to reprogram its
2388          * address filter.
2389          */
2390         ifnet_serialize_all(ifp);
2391         if ((ifp->if_flags & IFF_UP) != 0) {
2392 #ifdef INET
2393                 struct ifaddr_container *ifac;
2394 #endif
2395
2396                 ifp->if_flags &= ~IFF_UP;
2397                 ifr.ifr_flags = ifp->if_flags;
2398                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2399                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2400                               NULL);
2401                 ifp->if_flags |= IFF_UP;
2402                 ifr.ifr_flags = ifp->if_flags;
2403                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2404                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2405                                  NULL);
2406 #ifdef INET
2407                 /*
2408                  * Also send gratuitous ARPs to notify other nodes about
2409                  * the address change.
2410                  */
2411                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2412                         struct ifaddr *ifa = ifac->ifa;
2413
2414                         if (ifa->ifa_addr != NULL &&
2415                             ifa->ifa_addr->sa_family == AF_INET)
2416                                 arp_gratuitous(ifp, ifa);
2417                 }
2418 #endif
2419         }
2420         ifnet_deserialize_all(ifp);
2421         return (0);
2422 }
2423
2424 struct ifmultiaddr *
2425 ifmaof_ifpforaddr(struct sockaddr *sa, struct ifnet *ifp)
2426 {
2427         struct ifmultiaddr *ifma;
2428
2429         /* TODO: need ifnet_serialize_main */
2430         ifnet_serialize_all(ifp);
2431         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2432                 if (sa_equal(ifma->ifma_addr, sa))
2433                         break;
2434         ifnet_deserialize_all(ifp);
2435
2436         return ifma;
2437 }
2438
2439 /*
2440  * This function locates the first real ethernet MAC from a network
2441  * card and loads it into node, returning 0 on success or ENOENT if
2442  * no suitable interfaces were found.  It is used by the uuid code to
2443  * generate a unique 6-byte number.
2444  */
2445 int
2446 if_getanyethermac(uint16_t *node, int minlen)
2447 {
2448         struct ifnet *ifp;
2449         struct sockaddr_dl *sdl;
2450
2451         TAILQ_FOREACH(ifp, &ifnet, if_link) {
2452                 if (ifp->if_type != IFT_ETHER)
2453                         continue;
2454                 sdl = IF_LLSOCKADDR(ifp);
2455                 if (sdl->sdl_alen < minlen)
2456                         continue;
2457                 bcopy(((struct arpcom *)ifp->if_softc)->ac_enaddr, node,
2458                       minlen);
2459                 return(0);
2460         }
2461         return (ENOENT);
2462 }
2463
2464 /*
2465  * The name argument must be a pointer to storage which will last as
2466  * long as the interface does.  For physical devices, the result of
2467  * device_get_name(dev) is a good choice and for pseudo-devices a
2468  * static string works well.
2469  */
2470 void
2471 if_initname(struct ifnet *ifp, const char *name, int unit)
2472 {
2473         ifp->if_dname = name;
2474         ifp->if_dunit = unit;
2475         if (unit != IF_DUNIT_NONE)
2476                 ksnprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
2477         else
2478                 strlcpy(ifp->if_xname, name, IFNAMSIZ);
2479 }
2480
2481 int
2482 if_printf(struct ifnet *ifp, const char *fmt, ...)
2483 {
2484         __va_list ap;
2485         int retval;
2486
2487         retval = kprintf("%s: ", ifp->if_xname);
2488         __va_start(ap, fmt);
2489         retval += kvprintf(fmt, ap);
2490         __va_end(ap);
2491         return (retval);
2492 }
2493
2494 struct ifnet *
2495 if_alloc(uint8_t type)
2496 {
2497         struct ifnet *ifp;
2498         size_t size;
2499
2500         /*
2501          * XXX temporary hack until arpcom is setup in if_l2com
2502          */
2503         if (type == IFT_ETHER)
2504                 size = sizeof(struct arpcom);
2505         else
2506                 size = sizeof(struct ifnet);
2507
2508         ifp = kmalloc(size, M_IFNET, M_WAITOK|M_ZERO);
2509
2510         ifp->if_type = type;
2511
2512         if (if_com_alloc[type] != NULL) {
2513                 ifp->if_l2com = if_com_alloc[type](type, ifp);
2514                 if (ifp->if_l2com == NULL) {
2515                         kfree(ifp, M_IFNET);
2516                         return (NULL);
2517                 }
2518         }
2519         return (ifp);
2520 }
2521
2522 void
2523 if_free(struct ifnet *ifp)
2524 {
2525         kfree(ifp, M_IFNET);
2526 }
2527
2528 void
2529 ifq_set_classic(struct ifaltq *ifq)
2530 {
2531         ifq_set_methods(ifq, ifq->altq_ifp->if_mapsubq,
2532             ifsq_classic_enqueue, ifsq_classic_dequeue, ifsq_classic_request);
2533 }
2534
2535 void
2536 ifq_set_methods(struct ifaltq *ifq, altq_mapsubq_t mapsubq,
2537     ifsq_enqueue_t enqueue, ifsq_dequeue_t dequeue, ifsq_request_t request)
2538 {
2539         int q;
2540
2541         KASSERT(mapsubq != NULL, ("mapsubq is not specified"));
2542         KASSERT(enqueue != NULL, ("enqueue is not specified"));
2543         KASSERT(dequeue != NULL, ("dequeue is not specified"));
2544         KASSERT(request != NULL, ("request is not specified"));
2545
2546         ifq->altq_mapsubq = mapsubq;
2547         for (q = 0; q < ifq->altq_subq_cnt; ++q) {
2548                 struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
2549
2550                 ifsq->ifsq_enqueue = enqueue;
2551                 ifsq->ifsq_dequeue = dequeue;
2552                 ifsq->ifsq_request = request;
2553         }
2554 }
2555
2556 static void
2557 ifsq_norm_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
2558 {
2559         m->m_nextpkt = NULL;
2560         if (ifsq->ifsq_norm_tail == NULL)
2561                 ifsq->ifsq_norm_head = m;
2562         else
2563                 ifsq->ifsq_norm_tail->m_nextpkt = m;
2564         ifsq->ifsq_norm_tail = m;
2565         ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
2566 }
2567
2568 static void
2569 ifsq_prio_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
2570 {
2571         m->m_nextpkt = NULL;
2572         if (ifsq->ifsq_prio_tail == NULL)
2573                 ifsq->ifsq_prio_head = m;
2574         else
2575                 ifsq->ifsq_prio_tail->m_nextpkt = m;
2576         ifsq->ifsq_prio_tail = m;
2577         ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
2578         ALTQ_SQ_PRIO_CNTR_INC(ifsq, m->m_pkthdr.len);
2579 }
2580
2581 static struct mbuf *
2582 ifsq_norm_dequeue(struct ifaltq_subque *ifsq)
2583 {
2584         struct mbuf *m;
2585
2586         m = ifsq->ifsq_norm_head;
2587         if (m != NULL) {
2588                 if ((ifsq->ifsq_norm_head = m->m_nextpkt) == NULL)
2589                         ifsq->ifsq_norm_tail = NULL;
2590                 m->m_nextpkt = NULL;
2591                 ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
2592         }
2593         return m;
2594 }
2595
2596 static struct mbuf *
2597 ifsq_prio_dequeue(struct ifaltq_subque *ifsq)
2598 {
2599         struct mbuf *m;
2600
2601         m = ifsq->ifsq_prio_head;
2602         if (m != NULL) {
2603                 if ((ifsq->ifsq_prio_head = m->m_nextpkt) == NULL)
2604                         ifsq->ifsq_prio_tail = NULL;
2605                 m->m_nextpkt = NULL;
2606                 ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
2607                 ALTQ_SQ_PRIO_CNTR_DEC(ifsq, m->m_pkthdr.len);
2608         }
2609         return m;
2610 }
2611
2612 int
2613 ifsq_classic_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m,
2614     struct altq_pktattr *pa __unused)
2615 {
2616         M_ASSERTPKTHDR(m);
2617         if (ifsq->ifsq_len >= ifsq->ifsq_maxlen ||
2618             ifsq->ifsq_bcnt >= ifsq->ifsq_maxbcnt) {
2619                 if ((m->m_flags & M_PRIO) &&
2620                     ifsq->ifsq_prio_len < (ifsq->ifsq_maxlen / 2) &&
2621                     ifsq->ifsq_prio_bcnt < (ifsq->ifsq_maxbcnt / 2)) {
2622                         struct mbuf *m_drop;
2623
2624                         /*
2625                          * Perform drop-head on normal queue
2626                          */
2627                         m_drop = ifsq_norm_dequeue(ifsq);
2628                         if (m_drop != NULL) {
2629                                 m_freem(m_drop);
2630                                 ifsq_prio_enqueue(ifsq, m);
2631                                 return 0;
2632                         }
2633                         /* XXX nothing could be dropped? */
2634                 }
2635                 m_freem(m);
2636                 return ENOBUFS;
2637         } else {
2638                 if (m->m_flags & M_PRIO)
2639                         ifsq_prio_enqueue(ifsq, m);
2640                 else
2641                         ifsq_norm_enqueue(ifsq, m);
2642                 return 0;
2643         }
2644 }
2645
2646 struct mbuf *
2647 ifsq_classic_dequeue(struct ifaltq_subque *ifsq, int op)
2648 {
2649         struct mbuf *m;
2650
2651         switch (op) {
2652         case ALTDQ_POLL:
2653                 m = ifsq->ifsq_prio_head;
2654                 if (m == NULL)
2655                         m = ifsq->ifsq_norm_head;
2656                 break;
2657
2658         case ALTDQ_REMOVE:
2659                 m = ifsq_prio_dequeue(ifsq);
2660                 if (m == NULL)
2661                         m = ifsq_norm_dequeue(ifsq);
2662                 break;
2663
2664         default:
2665                 panic("unsupported ALTQ dequeue op: %d", op);
2666         }
2667         return m;
2668 }
2669
2670 int
2671 ifsq_classic_request(struct ifaltq_subque *ifsq, int req, void *arg)
2672 {
2673         switch (req) {
2674         case ALTRQ_PURGE:
2675                 for (;;) {
2676                         struct mbuf *m;
2677
2678                         m = ifsq_classic_dequeue(ifsq, ALTDQ_REMOVE);
2679                         if (m == NULL)
2680                                 break;
2681                         m_freem(m);
2682                 }
2683                 break;
2684
2685         default:
2686                 panic("unsupported ALTQ request: %d", req);
2687         }
2688         return 0;
2689 }
2690
2691 static void
2692 ifsq_ifstart_try(struct ifaltq_subque *ifsq, int force_sched)
2693 {
2694         struct ifnet *ifp = ifsq_get_ifp(ifsq);
2695         int running = 0, need_sched;
2696
2697         /*
2698          * Try to do direct ifnet.if_start on the subqueue first, if there is
2699          * contention on the subqueue hardware serializer, ifnet.if_start on
2700          * the subqueue will be scheduled on the subqueue owner CPU.
2701          */
2702         if (!ifsq_tryserialize_hw(ifsq)) {
2703                 /*
2704                  * Subqueue hardware serializer contention happened,
2705                  * ifnet.if_start on the subqueue is scheduled on
2706                  * the subqueue owner CPU, and we keep going.
2707                  */
2708                 ifsq_ifstart_schedule(ifsq, 1);
2709                 return;
2710         }
2711
2712         if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
2713                 ifp->if_start(ifp, ifsq);
2714                 if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
2715                         running = 1;
2716         }
2717         need_sched = ifsq_ifstart_need_schedule(ifsq, running);
2718
2719         ifsq_deserialize_hw(ifsq);
2720
2721         if (need_sched) {
2722                 /*
2723                  * More data need to be transmitted, ifnet.if_start on the
2724                  * subqueue is scheduled on the subqueue owner CPU, and we
2725                  * keep going.
2726                  * NOTE: ifnet.if_start subqueue interlock is not released.
2727                  */
2728                 ifsq_ifstart_schedule(ifsq, force_sched);
2729         }
2730 }
2731
2732 /*
2733  * Subqeue packets staging mechanism:
2734  *
2735  * The packets enqueued into the subqueue are staged to a certain amount
2736  * before the ifnet.if_start on the subqueue is called.  In this way, the
2737  * driver could avoid writing to hardware registers upon every packet,
2738  * instead, hardware registers could be written when certain amount of
2739  * packets are put onto hardware TX ring.  The measurement on several modern
2740  * NICs (emx(4), igb(4), bnx(4), bge(4), jme(4)) shows that the hardware
2741  * registers writing aggregation could save ~20% CPU time when 18bytes UDP
2742  * datagrams are transmitted at 1.48Mpps.  The performance improvement by
2743  * hardware registers writing aggeregation is also mentioned by Luigi Rizzo's
2744  * netmap paper (http://info.iet.unipi.it/~luigi/netmap/).
2745  *
2746  * Subqueue packets staging is performed for two entry points into drivers'
2747  * transmission function:
2748  * - Direct ifnet.if_start calling on the subqueue, i.e. ifsq_ifstart_try()
2749  * - ifnet.if_start scheduling on the subqueue, i.e. ifsq_ifstart_schedule()
2750  *
2751  * Subqueue packets staging will be stopped upon any of the following
2752  * conditions:
2753  * - If the count of packets enqueued on the current CPU is great than or
2754  *   equal to ifsq_stage_cntmax. (XXX this should be per-interface)
2755  * - If the total length of packets enqueued on the current CPU is great
2756  *   than or equal to the hardware's MTU - max_protohdr.  max_protohdr is
2757  *   cut from the hardware's MTU mainly bacause a full TCP segment's size
2758  *   is usually less than hardware's MTU.
2759  * - ifsq_ifstart_schedule() is not pending on the current CPU and
2760  *   ifnet.if_start subqueue interlock (ifaltq_subq.ifsq_started) is not
2761  *   released.
2762  * - The if_start_rollup(), which is registered as low priority netisr
2763  *   rollup function, is called; probably because no more work is pending
2764  *   for netisr.
2765  *
2766  * NOTE:
2767  * Currently subqueue packet staging is only performed in netisr threads.
2768  */
2769 int
2770 ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
2771 {
2772         struct ifaltq *ifq = &ifp->if_snd;
2773         struct ifaltq_subque *ifsq;
2774         int error, start = 0, len, mcast = 0, avoid_start = 0;
2775         struct ifsubq_stage_head *head = NULL;
2776         struct ifsubq_stage *stage = NULL;
2777         struct globaldata *gd = mycpu;
2778         struct thread *td = gd->gd_curthread;
2779
2780         crit_enter_quick(td);
2781
2782         ifsq = ifq_map_subq(ifq, gd->gd_cpuid);
2783         ASSERT_ALTQ_SQ_NOT_SERIALIZED_HW(ifsq);
2784
2785         len = m->m_pkthdr.len;
2786         if (m->m_flags & M_MCAST)
2787                 mcast = 1;
2788
2789         if (td->td_type == TD_TYPE_NETISR) {
2790                 head = &ifsubq_stage_heads[mycpuid];
2791                 stage = ifsq_get_stage(ifsq, mycpuid);
2792
2793                 stage->stg_cnt++;
2794                 stage->stg_len += len;
2795                 if (stage->stg_cnt < ifsq_stage_cntmax &&
2796                     stage->stg_len < (ifp->if_mtu - max_protohdr))
2797                         avoid_start = 1;
2798         }
2799
2800         ALTQ_SQ_LOCK(ifsq);
2801         error = ifsq_enqueue_locked(ifsq, m, pa);
2802         if (error) {
2803                 if (!ifsq_data_ready(ifsq)) {
2804                         ALTQ_SQ_UNLOCK(ifsq);
2805                         crit_exit_quick(td);
2806                         return error;
2807                 }
2808                 avoid_start = 0;
2809         }
2810         if (!ifsq_is_started(ifsq)) {
2811                 if (avoid_start) {
2812                         ALTQ_SQ_UNLOCK(ifsq);
2813
2814                         KKASSERT(!error);
2815                         if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
2816                                 ifsq_stage_insert(head, stage);
2817
2818                         IFNET_STAT_INC(ifp, obytes, len);
2819                         if (mcast)
2820                                 IFNET_STAT_INC(ifp, omcasts, 1);
2821                         crit_exit_quick(td);
2822                         return error;
2823                 }
2824
2825                 /*
2826                  * Hold the subqueue interlock of ifnet.if_start
2827                  */
2828                 ifsq_set_started(ifsq);
2829                 start = 1;
2830         }
2831         ALTQ_SQ_UNLOCK(ifsq);
2832
2833         if (!error) {
2834                 IFNET_STAT_INC(ifp, obytes, len);
2835                 if (mcast)
2836                         IFNET_STAT_INC(ifp, omcasts, 1);
2837         }
2838
2839         if (stage != NULL) {
2840                 if (!start && (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)) {
2841                         KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
2842                         if (!avoid_start) {
2843                                 ifsq_stage_remove(head, stage);
2844                                 ifsq_ifstart_schedule(ifsq, 1);
2845                         }
2846                         crit_exit_quick(td);
2847                         return error;
2848                 }
2849
2850                 if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED) {
2851                         ifsq_stage_remove(head, stage);
2852                 } else {
2853                         stage->stg_cnt = 0;
2854                         stage->stg_len = 0;
2855                 }
2856         }
2857
2858         if (!start) {
2859                 crit_exit_quick(td);
2860                 return error;
2861         }
2862
2863         ifsq_ifstart_try(ifsq, 0);
2864
2865         crit_exit_quick(td);
2866         return error;
2867 }
2868
2869 void *
2870 ifa_create(int size, int flags)
2871 {
2872         struct ifaddr *ifa;
2873         int i;
2874
2875         KASSERT(size >= sizeof(*ifa), ("ifaddr size too small"));
2876
2877         ifa = kmalloc(size, M_IFADDR, flags | M_ZERO);
2878         if (ifa == NULL)
2879                 return NULL;
2880
2881         ifa->ifa_containers =
2882             kmalloc_cachealign(ncpus * sizeof(struct ifaddr_container),
2883                 M_IFADDR, M_WAITOK | M_ZERO);
2884         ifa->ifa_ncnt = ncpus;
2885         for (i = 0; i < ncpus; ++i) {
2886                 struct ifaddr_container *ifac = &ifa->ifa_containers[i];
2887
2888                 ifac->ifa_magic = IFA_CONTAINER_MAGIC;
2889                 ifac->ifa = ifa;
2890                 ifac->ifa_refcnt = 1;
2891         }
2892 #ifdef IFADDR_DEBUG
2893         kprintf("alloc ifa %p %d\n", ifa, size);
2894 #endif
2895         return ifa;
2896 }
2897
2898 void
2899 ifac_free(struct ifaddr_container *ifac, int cpu_id)
2900 {
2901         struct ifaddr *ifa = ifac->ifa;
2902
2903         KKASSERT(ifac->ifa_magic == IFA_CONTAINER_MAGIC);
2904         KKASSERT(ifac->ifa_refcnt == 0);
2905         KASSERT(ifac->ifa_listmask == 0,
2906                 ("ifa is still on %#x lists", ifac->ifa_listmask));
2907
2908         ifac->ifa_magic = IFA_CONTAINER_DEAD;
2909
2910 #ifdef IFADDR_DEBUG_VERBOSE
2911         kprintf("try free ifa %p cpu_id %d\n", ifac->ifa, cpu_id);
2912 #endif
2913
2914         KASSERT(ifa->ifa_ncnt > 0 && ifa->ifa_ncnt <= ncpus,
2915                 ("invalid # of ifac, %d", ifa->ifa_ncnt));
2916         if (atomic_fetchadd_int(&ifa->ifa_ncnt, -1) == 1) {
2917 #ifdef IFADDR_DEBUG
2918                 kprintf("free ifa %p\n", ifa);
2919 #endif
2920                 kfree(ifa->ifa_containers, M_IFADDR);
2921                 kfree(ifa, M_IFADDR);
2922         }
2923 }
2924
2925 static void
2926 ifa_iflink_dispatch(netmsg_t nmsg)
2927 {
2928         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2929         struct ifaddr *ifa = msg->ifa;
2930         struct ifnet *ifp = msg->ifp;
2931         int cpu = mycpuid;
2932         struct ifaddr_container *ifac;
2933
2934         crit_enter();
2935
2936         ifac = &ifa->ifa_containers[cpu];
2937         ASSERT_IFAC_VALID(ifac);
2938         KASSERT((ifac->ifa_listmask & IFA_LIST_IFADDRHEAD) == 0,
2939                 ("ifaddr is on if_addrheads"));
2940
2941         ifac->ifa_listmask |= IFA_LIST_IFADDRHEAD;
2942         if (msg->tail)
2943                 TAILQ_INSERT_TAIL(&ifp->if_addrheads[cpu], ifac, ifa_link);
2944         else
2945                 TAILQ_INSERT_HEAD(&ifp->if_addrheads[cpu], ifac, ifa_link);
2946
2947         crit_exit();
2948
2949         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2950 }
2951
2952 void
2953 ifa_iflink(struct ifaddr *ifa, struct ifnet *ifp, int tail)
2954 {
2955         struct netmsg_ifaddr msg;
2956
2957         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2958                     0, ifa_iflink_dispatch);
2959         msg.ifa = ifa;
2960         msg.ifp = ifp;
2961         msg.tail = tail;
2962
2963         ifa_domsg(&msg.base.lmsg, 0);
2964 }
2965
2966 static void
2967 ifa_ifunlink_dispatch(netmsg_t nmsg)
2968 {
2969         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2970         struct ifaddr *ifa = msg->ifa;
2971         struct ifnet *ifp = msg->ifp;
2972         int cpu = mycpuid;
2973         struct ifaddr_container *ifac;
2974
2975         crit_enter();
2976
2977         ifac = &ifa->ifa_containers[cpu];
2978         ASSERT_IFAC_VALID(ifac);
2979         KASSERT(ifac->ifa_listmask & IFA_LIST_IFADDRHEAD,
2980                 ("ifaddr is not on if_addrhead"));
2981
2982         TAILQ_REMOVE(&ifp->if_addrheads[cpu], ifac, ifa_link);
2983         ifac->ifa_listmask &= ~IFA_LIST_IFADDRHEAD;
2984
2985         crit_exit();
2986
2987         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2988 }
2989
2990 void
2991 ifa_ifunlink(struct ifaddr *ifa, struct ifnet *ifp)
2992 {
2993         struct netmsg_ifaddr msg;
2994
2995         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2996                     0, ifa_ifunlink_dispatch);
2997         msg.ifa = ifa;
2998         msg.ifp = ifp;
2999
3000         ifa_domsg(&msg.base.lmsg, 0);
3001 }
3002
3003 static void
3004 ifa_destroy_dispatch(netmsg_t nmsg)
3005 {
3006         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
3007
3008         IFAFREE(msg->ifa);
3009         ifa_forwardmsg(&nmsg->lmsg, mycpuid + 1);
3010 }
3011
3012 void
3013 ifa_destroy(struct ifaddr *ifa)
3014 {
3015         struct netmsg_ifaddr msg;
3016
3017         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3018                     0, ifa_destroy_dispatch);
3019         msg.ifa = ifa;
3020
3021         ifa_domsg(&msg.base.lmsg, 0);
3022 }
3023
3024 struct lwkt_port *
3025 ifnet_portfn(int cpu)
3026 {
3027         return &ifnet_threads[cpu].td_msgport;
3028 }
3029
3030 void
3031 ifnet_forwardmsg(struct lwkt_msg *lmsg, int next_cpu)
3032 {
3033         KKASSERT(next_cpu > mycpuid && next_cpu <= ncpus);
3034
3035         if (next_cpu < ncpus)
3036                 lwkt_forwardmsg(ifnet_portfn(next_cpu), lmsg);
3037         else
3038                 lwkt_replymsg(lmsg, 0);
3039 }
3040
3041 int
3042 ifnet_domsg(struct lwkt_msg *lmsg, int cpu)
3043 {
3044         KKASSERT(cpu < ncpus);
3045         return lwkt_domsg(ifnet_portfn(cpu), lmsg, 0);
3046 }
3047
3048 void
3049 ifnet_sendmsg(struct lwkt_msg *lmsg, int cpu)
3050 {
3051         KKASSERT(cpu < ncpus);
3052         lwkt_sendmsg(ifnet_portfn(cpu), lmsg);
3053 }
3054
3055 /*
3056  * Generic netmsg service loop.  Some protocols may roll their own but all
3057  * must do the basic command dispatch function call done here.
3058  */
3059 static void
3060 ifnet_service_loop(void *arg __unused)
3061 {
3062         netmsg_t msg;
3063
3064         while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) {
3065                 KASSERT(msg->base.nm_dispatch, ("ifnet_service: badmsg"));
3066                 msg->base.nm_dispatch(msg);
3067         }
3068 }
3069
3070 static void
3071 if_start_rollup(void)
3072 {
3073         struct ifsubq_stage_head *head = &ifsubq_stage_heads[mycpuid];
3074         struct ifsubq_stage *stage;
3075
3076         crit_enter();
3077
3078         while ((stage = TAILQ_FIRST(&head->stg_head)) != NULL) {
3079                 struct ifaltq_subque *ifsq = stage->stg_subq;
3080                 int is_sched = 0;
3081
3082                 if (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)
3083                         is_sched = 1;
3084                 ifsq_stage_remove(head, stage);
3085
3086                 if (is_sched) {
3087                         ifsq_ifstart_schedule(ifsq, 1);
3088                 } else {
3089                         int start = 0;
3090
3091                         ALTQ_SQ_LOCK(ifsq);
3092                         if (!ifsq_is_started(ifsq)) {
3093                                 /*
3094                                  * Hold the subqueue interlock of
3095                                  * ifnet.if_start
3096                                  */
3097                                 ifsq_set_started(ifsq);
3098                                 start = 1;
3099                         }
3100                         ALTQ_SQ_UNLOCK(ifsq);
3101
3102                         if (start)
3103                                 ifsq_ifstart_try(ifsq, 1);
3104                 }
3105                 KKASSERT((stage->stg_flags &
3106                     (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
3107         }
3108
3109         crit_exit();
3110 }
3111
3112 static void
3113 ifnetinit(void *dummy __unused)
3114 {
3115         int i;
3116
3117         for (i = 0; i < ncpus; ++i) {
3118                 struct thread *thr = &ifnet_threads[i];
3119
3120                 lwkt_create(ifnet_service_loop, NULL, NULL,
3121                             thr, TDF_NOSTART|TDF_FORCE_SPINPORT|TDF_FIXEDCPU,
3122                             i, "ifnet %d", i);
3123                 netmsg_service_port_init(&thr->td_msgport);
3124                 lwkt_schedule(thr);
3125         }
3126
3127         for (i = 0; i < ncpus; ++i)
3128                 TAILQ_INIT(&ifsubq_stage_heads[i].stg_head);
3129         netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
3130 }
3131
3132 struct ifnet *
3133 ifnet_byindex(unsigned short idx)
3134 {
3135         if (idx > if_index)
3136                 return NULL;
3137         return ifindex2ifnet[idx];
3138 }
3139
3140 struct ifaddr *
3141 ifaddr_byindex(unsigned short idx)
3142 {
3143         struct ifnet *ifp;
3144
3145         ifp = ifnet_byindex(idx);
3146         if (!ifp)
3147                 return NULL;
3148         return TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
3149 }
3150
3151 void
3152 if_register_com_alloc(u_char type,
3153     if_com_alloc_t *a, if_com_free_t *f)
3154 {
3155
3156         KASSERT(if_com_alloc[type] == NULL,
3157             ("if_register_com_alloc: %d already registered", type));
3158         KASSERT(if_com_free[type] == NULL,
3159             ("if_register_com_alloc: %d free already registered", type));
3160
3161         if_com_alloc[type] = a;
3162         if_com_free[type] = f;
3163 }
3164
3165 void
3166 if_deregister_com_alloc(u_char type)
3167 {
3168
3169         KASSERT(if_com_alloc[type] != NULL,
3170             ("if_deregister_com_alloc: %d not registered", type));
3171         KASSERT(if_com_free[type] != NULL,
3172             ("if_deregister_com_alloc: %d free not registered", type));
3173         if_com_alloc[type] = NULL;
3174         if_com_free[type] = NULL;
3175 }
3176
3177 int
3178 if_ring_count2(int cnt, int cnt_max)
3179 {
3180         int shift = 0;
3181
3182         KASSERT(cnt_max >= 1 && powerof2(cnt_max),
3183             ("invalid ring count max %d", cnt_max));
3184
3185         if (cnt <= 0)
3186                 cnt = cnt_max;
3187         if (cnt > ncpus2)
3188                 cnt = ncpus2;
3189         if (cnt > cnt_max)
3190                 cnt = cnt_max;
3191
3192         while ((1 << (shift + 1)) <= cnt)
3193                 ++shift;
3194         cnt = 1 << shift;
3195
3196         KASSERT(cnt >= 1 && cnt <= ncpus2 && cnt <= cnt_max,
3197             ("calculate cnt %d, ncpus2 %d, cnt max %d",
3198              cnt, ncpus2, cnt_max));
3199         return cnt;
3200 }
3201
3202 void
3203 ifq_set_maxlen(struct ifaltq *ifq, int len)
3204 {
3205         ifq->altq_maxlen = len + (ncpus * ifsq_stage_cntmax);
3206 }
3207
3208 int
3209 ifq_mapsubq_default(struct ifaltq *ifq __unused, int cpuid __unused)
3210 {
3211         return ALTQ_SUBQ_INDEX_DEFAULT;
3212 }
3213
3214 int
3215 ifq_mapsubq_mask(struct ifaltq *ifq, int cpuid)
3216 {
3217         return (cpuid & ifq->altq_subq_mask);
3218 }
3219
3220 static void
3221 ifsq_watchdog(void *arg)
3222 {
3223         struct ifsubq_watchdog *wd = arg;
3224         struct ifnet *ifp;
3225
3226         if (__predict_true(wd->wd_timer == 0 || --wd->wd_timer))
3227                 goto done;
3228
3229         ifp = ifsq_get_ifp(wd->wd_subq);
3230         if (ifnet_tryserialize_all(ifp)) {
3231                 wd->wd_watchdog(wd->wd_subq);
3232                 ifnet_deserialize_all(ifp);
3233         } else {
3234                 /* try again next timeout */
3235                 wd->wd_timer = 1;
3236         }
3237 done:
3238         ifsq_watchdog_reset(wd);
3239 }
3240
3241 static void
3242 ifsq_watchdog_reset(struct ifsubq_watchdog *wd)
3243 {
3244         callout_reset_bycpu(&wd->wd_callout, hz, ifsq_watchdog, wd,
3245             ifsq_get_cpuid(wd->wd_subq));
3246 }
3247
3248 void
3249 ifsq_watchdog_init(struct ifsubq_watchdog *wd, struct ifaltq_subque *ifsq,
3250     ifsq_watchdog_t watchdog)
3251 {
3252         callout_init_mp(&wd->wd_callout);
3253         wd->wd_timer = 0;
3254         wd->wd_subq = ifsq;
3255         wd->wd_watchdog = watchdog;
3256 }
3257
3258 void
3259 ifsq_watchdog_start(struct ifsubq_watchdog *wd)
3260 {
3261         wd->wd_timer = 0;
3262         ifsq_watchdog_reset(wd);
3263 }
3264
3265 void
3266 ifsq_watchdog_stop(struct ifsubq_watchdog *wd)
3267 {
3268         wd->wd_timer = 0;
3269         callout_stop(&wd->wd_callout);
3270 }