if: if_start_schedule -> ifq_ifstart_schedule
1 /*
2  * Copyright (c) 1980, 1986, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the University of
16  *      California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *      @(#)if.c        8.3 (Berkeley) 1/4/94
34  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
35  */
36
37 #include "opt_compat.h"
38 #include "opt_inet6.h"
39 #include "opt_inet.h"
40 #include "opt_ifpoll.h"
41
42 #include <sys/param.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/systm.h>
46 #include <sys/proc.h>
47 #include <sys/priv.h>
48 #include <sys/protosw.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/socketops.h>
52 #include <sys/protosw.h>
53 #include <sys/kernel.h>
54 #include <sys/ktr.h>
55 #include <sys/mutex.h>
56 #include <sys/sockio.h>
57 #include <sys/syslog.h>
58 #include <sys/sysctl.h>
59 #include <sys/domain.h>
60 #include <sys/thread.h>
61 #include <sys/serialize.h>
62 #include <sys/bus.h>
63
64 #include <sys/thread2.h>
65 #include <sys/msgport2.h>
66 #include <sys/mutex2.h>
67
68 #include <net/if.h>
69 #include <net/if_arp.h>
70 #include <net/if_dl.h>
71 #include <net/if_types.h>
72 #include <net/if_var.h>
73 #include <net/ifq_var.h>
74 #include <net/radix.h>
75 #include <net/route.h>
76 #include <net/if_clone.h>
77 #include <net/netisr.h>
78 #include <net/netmsg2.h>
79
80 #include <machine/atomic.h>
81 #include <machine/stdarg.h>
82 #include <machine/smp.h>
83
84 #if defined(INET) || defined(INET6)
85 /*XXX*/
86 #include <netinet/in.h>
87 #include <netinet/in_var.h>
88 #include <netinet/if_ether.h>
89 #ifdef INET6
90 #include <netinet6/in6_var.h>
91 #include <netinet6/in6_ifattach.h>
92 #endif
93 #endif
94
95 #if defined(COMPAT_43)
96 #include <emulation/43bsd/43bsd_socket.h>
97 #endif /* COMPAT_43 */
98
99 struct netmsg_ifaddr {
100         struct netmsg_base base;
101         struct ifaddr   *ifa;
102         struct ifnet    *ifp;
103         int             tail;
104 };
105
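/*
 * Per-cpu list head for staged ifaltqs (see ifq_stage_heads[] below);
 * marked __cachealign to keep each CPU's list on its own cache line.
 */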
106 struct ifaltq_stage_head {
107         TAILQ_HEAD(, ifaltq_stage)      ifqs_head;
108 } __cachealign;
109
110 /*
111  * System initialization
112  */
113 static void     if_attachdomain(void *);
114 static void     if_attachdomain1(struct ifnet *);
115 static int      ifconf(u_long, caddr_t, struct ucred *);
116 static void     ifinit(void *);
117 static void     ifnetinit(void *);
118 static void     if_slowtimo(void *);
119 static void     link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
120 static int      if_rtdel(struct radix_node *, void *);
121
122 #ifdef INET6
123 /*
124  * XXX: declared here to avoid including many inet6-related files;
125  * should this be more generalized?
126  */
127 extern void     nd6_setmtu(struct ifnet *);
128 #endif
129
130 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
131 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
132
133 static int ifq_stage_cntmax = 4;
134 TUNABLE_INT("net.link.stage_cntmax", &ifq_stage_cntmax);
135 SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
136     &ifq_stage_cntmax, 0, "ifq staging packet count max");
137
138 SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL)
139 /* Must be after netisr_init */
140 SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_SECOND, ifnetinit, NULL)
141
142 static  if_com_alloc_t *if_com_alloc[256];
143 static  if_com_free_t *if_com_free[256];
144
145 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
146 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
147 MALLOC_DEFINE(M_IFNET, "ifnet", "interface structure");
148
149 int                     ifqmaxlen = IFQ_MAXLEN;
150 struct ifnethead        ifnet = TAILQ_HEAD_INITIALIZER(ifnet);
151
152 struct callout          if_slowtimo_timer;
153
154 int                     if_index = 0;
155 struct ifnet            **ifindex2ifnet = NULL;
156 static struct thread    ifnet_threads[MAXCPU];
157
158 static struct ifaltq_stage_head ifq_stage_heads[MAXCPU];
159
160 #define IFQ_KTR_STRING          "ifq=%p"
161 #define IFQ_KTR_ARGS    struct ifaltq *ifq
162 #ifndef KTR_IFQ
163 #define KTR_IFQ                 KTR_ALL
164 #endif
165 KTR_INFO_MASTER(ifq);
166 KTR_INFO(KTR_IFQ, ifq, enqueue, 0, IFQ_KTR_STRING, IFQ_KTR_ARGS);
167 KTR_INFO(KTR_IFQ, ifq, dequeue, 1, IFQ_KTR_STRING, IFQ_KTR_ARGS);
168 #define logifq(name, arg)       KTR_LOG(ifq_ ## name, arg)
169
170 #define IF_START_KTR_STRING     "ifp=%p"
171 #define IF_START_KTR_ARGS       struct ifnet *ifp
172 #ifndef KTR_IF_START
173 #define KTR_IF_START            KTR_ALL
174 #endif
175 KTR_INFO_MASTER(if_start);
176 KTR_INFO(KTR_IF_START, if_start, run, 0,
177          IF_START_KTR_STRING, IF_START_KTR_ARGS);
178 KTR_INFO(KTR_IF_START, if_start, sched, 1,
179          IF_START_KTR_STRING, IF_START_KTR_ARGS);
180 KTR_INFO(KTR_IF_START, if_start, avoid, 2,
181          IF_START_KTR_STRING, IF_START_KTR_ARGS);
182 KTR_INFO(KTR_IF_START, if_start, contend_sched, 3,
183          IF_START_KTR_STRING, IF_START_KTR_ARGS);
184 KTR_INFO(KTR_IF_START, if_start, chase_sched, 4,
185          IF_START_KTR_STRING, IF_START_KTR_ARGS);
186 #define logifstart(name, arg)   KTR_LOG(if_start_ ## name, arg)
187
188 TAILQ_HEAD(, ifg_group) ifg_head = TAILQ_HEAD_INITIALIZER(ifg_head);
189
190 /*
191  * Network interface utility routines.
192  *
193  * Routines with ifa_ifwith* names take sockaddr *'s as
194  * parameters.
195  */
196 /* ARGSUSED*/
197 void
198 ifinit(void *dummy)
199 {
200         struct ifnet *ifp;
201
202         callout_init(&if_slowtimo_timer);
203
204         crit_enter();
205         TAILQ_FOREACH(ifp, &ifnet, if_link) {
206                 if (ifp->if_snd.ifq_maxlen == 0) {
207                         if_printf(ifp, "XXX: driver didn't set ifq_maxlen\n");
208                         ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
209                 }
210         }
211         crit_exit();
212
213         if_slowtimo(0);
214 }
215
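/*
 * Return the CPU on which ifnet.if_start should be dispatched.  The
 * default is the interface's nominal CPU (if_cpuid); the ifpoll
 * variant below prefers the polling CPU when one is assigned.
 */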
216 static int
217 if_start_cpuid(struct ifnet *ifp)
218 {
219         return ifp->if_cpuid;
220 }
221
222 #ifdef IFPOLL_ENABLE
223 static int
224 if_start_cpuid_npoll(struct ifnet *ifp)
225 {
226         int poll_cpuid = ifp->if_npoll_cpuid;
227
228         if (poll_cpuid >= 0)
229                 return poll_cpuid;
230         else
231                 return ifp->if_cpuid;
232 }
233 #endif
234
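/*
 * Send this CPU's if_start netmsg to the local netisr port, unless a
 * previously sent message has not been replied to yet (MSGF_DONE clear).
 */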
235 static void
236 ifq_ifstart_ipifunc(void *arg)
237 {
238         struct ifnet *ifp = arg;
239         struct lwkt_msg *lmsg = &ifp->if_start_nmsg[mycpuid].lmsg;
240
241         crit_enter();
242         if (lmsg->ms_flags & MSGF_DONE)
243                 lwkt_sendmsg(netisr_portfn(mycpuid), lmsg);
244         crit_exit();
245 }
246
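/*
 * Unlink a staged ifaltq from the per-cpu staging list and clear its
 * staging state.
 */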
247 static __inline void
248 ifq_stage_remove(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
249 {
250         KKASSERT(stage->ifqs_flags & IFQ_STAGE_FLAG_QUED);
251         TAILQ_REMOVE(&head->ifqs_head, stage, ifqs_link);
252         stage->ifqs_flags &= ~(IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED);
253         stage->ifqs_cnt = 0;
254         stage->ifqs_len = 0;
255 }
256
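/*
 * Link an unstaged ifaltq onto the tail of the per-cpu staging list.
 */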
257 static __inline void
258 ifq_stage_insert(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
259 {
260         KKASSERT((stage->ifqs_flags &
261             (IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED)) == 0);
262         stage->ifqs_flags |= IFQ_STAGE_FLAG_QUED;
263         TAILQ_INSERT_TAIL(&head->ifqs_head, stage, ifqs_link);
264 }
265
266 /*
267  * Schedule ifnet.if_start on ifnet's CPU
268  */
269 static void
270 ifq_ifstart_schedule(struct ifaltq *ifq, int force)
271 {
272         struct ifnet *ifp = ifq->altq_ifp;
273         int cpu;
274
275         if (!force && curthread->td_type == TD_TYPE_NETISR &&
276             ifq_stage_cntmax > 0) {
277                 struct ifaltq_stage *stage = ifq_get_stage(ifq, mycpuid);
278
279                 stage->ifqs_cnt = 0;
280                 stage->ifqs_len = 0;
281                 if ((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0)
282                         ifq_stage_insert(&ifq_stage_heads[mycpuid], stage);
283                 stage->ifqs_flags |= IFQ_STAGE_FLAG_SCHED;
284                 return;
285         }
286
287         cpu = ifp->if_start_cpuid(ifp);
288         if (cpu != mycpuid)
289                 lwkt_send_ipiq(globaldata_find(cpu), ifq_ifstart_ipifunc, ifp);
290         else
291                 ifq_ifstart_ipifunc(ifp);
292 }
293
294 /*
295  * NOTE:
296  * This function releases the ifnet.if_start interlock
297  * if ifnet.if_start does not need to be scheduled.
298  */
299 static __inline int
300 if_start_need_schedule(struct ifaltq *ifq, int running)
301 {
302         if (!running || ifq_is_empty(ifq)
303 #ifdef ALTQ
304             || ifq->altq_tbr != NULL
305 #endif
306         ) {
307                 ALTQ_LOCK(ifq);
308                 /*
309                  * ifnet.if_start interlock is released if:
310                  *    1) Hardware cannot take any packets, due to
311                  *    o  interface is marked down
312                  *    o  hardware queue is full (ifq_is_oactive)
313                  *    Under the second situation, hardware interrupt
314                  *    or polling(4) will call/schedule ifnet.if_start
315                  *    when hardware queue is ready
316                  * 2) There are no packets in the ifnet.if_snd.
317                  *    Further ifq_dispatch or ifq_handoff will call/
318                  *    schedule ifnet.if_start
319                  * 3) TBR is used and it does not allow further
320                  *    dequeueing.
321                  *    TBR callout will call ifnet.if_start
322                  */
323                 if (!running || !ifq_data_ready(ifq)) {
324                         ifq_clr_started(ifq);
325                         ALTQ_UNLOCK(ifq);
326                         return 0;
327                 }
328                 ALTQ_UNLOCK(ifq);
329         }
330         return 1;
331 }
332
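/*
 * Netmsg handler that runs ifnet.if_start on the interface's CPU.
 * If the interface has migrated to another CPU the request is chased
 * there; if more packets remain afterwards it is rescheduled.
 */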
333 static void
334 if_start_dispatch(netmsg_t msg)
335 {
336         struct lwkt_msg *lmsg = &msg->base.lmsg;
337         struct ifnet *ifp = lmsg->u.ms_resultp;
338         struct ifaltq *ifq = &ifp->if_snd;
339         int running = 0, need_sched;
340
341         crit_enter();
342         lwkt_replymsg(lmsg, 0); /* reply ASAP */
343         crit_exit();
344
345         if (mycpuid != ifp->if_start_cpuid(ifp)) {
346                 /*
347                  * We need to chase the ifnet CPU change.
348                  */
349                 logifstart(chase_sched, ifp);
350                 ifq_ifstart_schedule(ifq, 1);
351                 return;
352         }
353
354         ifnet_serialize_tx(ifp);
355         if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq)) {
356                 logifstart(run, ifp);
357                 ifp->if_start(ifp);
358                 if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
359                         running = 1;
360         }
361         need_sched = if_start_need_schedule(ifq, running);
362         ifnet_deserialize_tx(ifp);
363
364         if (need_sched) {
365                 /*
366                  * More data needs to be transmitted, ifnet.if_start is
367                  * scheduled on ifnet's CPU, and we keep going.
368                  * NOTE: ifnet.if_start interlock is not released.
369                  */
370                 logifstart(sched, ifp);
371                 ifq_ifstart_schedule(ifq, 0);
372         }
373 }
374
375 /* Device driver ifnet.if_start helper function */
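/*
 * Call-pattern sketch (not taken from this file): drivers typically
 * invoke if_devstart(ifp) with the TX serializer held, e.g. from an
 * interrupt handler once transmit descriptors free up, rather than
 * calling ifp->if_start directly.
 */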
376 void
377 if_devstart(struct ifnet *ifp)
378 {
379         struct ifaltq *ifq = &ifp->if_snd;
380         int running = 0;
381
382         ASSERT_IFNET_SERIALIZED_TX(ifp);
383
384         ALTQ_LOCK(ifq);
385         if (ifq_is_started(ifq) || !ifq_data_ready(ifq)) {
386                 logifstart(avoid, ifp);
387                 ALTQ_UNLOCK(ifq);
388                 return;
389         }
390         ifq_set_started(ifq);
391         ALTQ_UNLOCK(ifq);
392
393         logifstart(run, ifp);
394         ifp->if_start(ifp);
395
396         if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
397                 running = 1;
398
399         if (if_start_need_schedule(ifq, running)) {
400                 /*
401                  * More data needs to be transmitted, ifnet.if_start is
402                  * scheduled on ifnet's CPU, and we keep going.
403                  * NOTE: ifnet.if_start interlock is not released.
404                  */
405                 logifstart(sched, ifp);
406                 ifq_ifstart_schedule(ifq, 0);
407         }
408 }
409
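/*
 * Default serializer methods, installed by if_attach() when the driver
 * does not supply its own; they simply wrap the interface's lwkt
 * serializer.
 */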
410 static void
411 if_default_serialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
412 {
413         lwkt_serialize_enter(ifp->if_serializer);
414 }
415
416 static void
417 if_default_deserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
418 {
419         lwkt_serialize_exit(ifp->if_serializer);
420 }
421
422 static int
423 if_default_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
424 {
425         return lwkt_serialize_try(ifp->if_serializer);
426 }
427
428 #ifdef INVARIANTS
429 static void
430 if_default_serialize_assert(struct ifnet *ifp,
431                             enum ifnet_serialize slz __unused,
432                             boolean_t serialized)
433 {
434         if (serialized)
435                 ASSERT_SERIALIZED(ifp->if_serializer);
436         else
437                 ASSERT_NOT_SERIALIZED(ifp->if_serializer);
438 }
439 #endif
440
441 /*
442  * Attach an interface to the list of "active" interfaces.
443  *
444  * The serializer is optional.  If non-NULL, access to the interface
445  * may be MPSAFE.
446  */
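/*
 * Usage sketch (hypothetical, not part of this file): a driver's
 * attach routine fills in if_start, if_ioctl, etc. and then calls
 *      if_attach(ifp, NULL);                   embedded serializer
 * or
 *      if_attach(ifp, &sc->sc_serializer);     driver-owned serializer
 * where sc_serializer is a hypothetical lwkt serializer shared with
 * the driver's interrupt path.
 */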
447 void
448 if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
449 {
450         unsigned socksize, ifasize;
451         int namelen, masklen;
452         struct sockaddr_dl *sdl;
453         struct ifaddr *ifa;
454         struct ifaltq *ifq;
455         int i;
456
457         static int if_indexlim = 8;
458
459         if (ifp->if_serialize != NULL) {
460                 KASSERT(ifp->if_deserialize != NULL &&
461                         ifp->if_tryserialize != NULL &&
462                         ifp->if_serialize_assert != NULL,
463                         ("serialize functions are partially setup"));
464
465                 /*
466                  * If the device supplies serialize functions,
467                  * then clear if_serializer to catch any invalid
468                  * usage of this field.
469                  */
470                 KASSERT(serializer == NULL,
471                         ("both serialize functions and default serializer "
472                          "are supplied"));
473                 ifp->if_serializer = NULL;
474         } else {
475                 KASSERT(ifp->if_deserialize == NULL &&
476                         ifp->if_tryserialize == NULL &&
477                         ifp->if_serialize_assert == NULL,
478                         ("serialize functions are partially setup"));
479                 ifp->if_serialize = if_default_serialize;
480                 ifp->if_deserialize = if_default_deserialize;
481                 ifp->if_tryserialize = if_default_tryserialize;
482 #ifdef INVARIANTS
483                 ifp->if_serialize_assert = if_default_serialize_assert;
484 #endif
485
486                 /*
487                  * The serializer can be passed in from the device,
488                  * allowing the same serializer to be used for both
489                  * the interrupt interlock and the device queue.
490                  * If not specified, the netif structure will use an
491                  * embedded serializer.
492                  */
493                 if (serializer == NULL) {
494                         serializer = &ifp->if_default_serializer;
495                         lwkt_serialize_init(serializer);
496                 }
497                 ifp->if_serializer = serializer;
498         }
499
500         ifp->if_start_cpuid = if_start_cpuid;
501         ifp->if_cpuid = 0;
502
503 #ifdef IFPOLL_ENABLE
504         /* Device is not in polling mode by default */
505         ifp->if_npoll_cpuid = -1;
506         if (ifp->if_npoll != NULL)
507                 ifp->if_start_cpuid = if_start_cpuid_npoll;
508 #endif
509
510         ifp->if_start_nmsg = kmalloc(ncpus * sizeof(*ifp->if_start_nmsg),
511                                      M_LWKTMSG, M_WAITOK);
512         for (i = 0; i < ncpus; ++i) {
513                 netmsg_init(&ifp->if_start_nmsg[i], NULL, &netisr_adone_rport,
514                             0, if_start_dispatch);
515                 ifp->if_start_nmsg[i].lmsg.u.ms_resultp = ifp;
516         }
517
518         mtx_init(&ifp->if_ioctl_mtx);
519         mtx_lock(&ifp->if_ioctl_mtx);
520
521         TAILQ_INSERT_TAIL(&ifnet, ifp, if_link);
522         ifp->if_index = ++if_index;
523
524         /*
525          * XXX -
526          * The old code would work if the interface passed a pre-existing
527          * chain of ifaddrs to this code.  We don't trust our callers to
528          * properly initialize the tailq, however, so we no longer allow
529          * this unlikely case.
530          */
531         ifp->if_addrheads = kmalloc(ncpus * sizeof(struct ifaddrhead),
532                                     M_IFADDR, M_WAITOK | M_ZERO);
533         for (i = 0; i < ncpus; ++i)
534                 TAILQ_INIT(&ifp->if_addrheads[i]);
535
536         TAILQ_INIT(&ifp->if_prefixhead);
537         TAILQ_INIT(&ifp->if_multiaddrs);
538         TAILQ_INIT(&ifp->if_groups);
539         getmicrotime(&ifp->if_lastchange);
540         if (ifindex2ifnet == NULL || if_index >= if_indexlim) {
541                 unsigned int n;
542                 struct ifnet **q;
543
544                 if_indexlim <<= 1;
545
546                 /* grow ifindex2ifnet */
547                 n = if_indexlim * sizeof(*q);
548                 q = kmalloc(n, M_IFADDR, M_WAITOK | M_ZERO);
549                 if (ifindex2ifnet) {
550                         bcopy(ifindex2ifnet, q, n/2);
551                         kfree(ifindex2ifnet, M_IFADDR);
552                 }
553                 ifindex2ifnet = q;
554         }
555
556         ifindex2ifnet[if_index] = ifp;
557
558         /*
559          * create a Link Level name for this device
560          */
561         namelen = strlen(ifp->if_xname);
562         masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
563         socksize = masklen + ifp->if_addrlen;
564 #define ROUNDUP(a) (1 + (((a) - 1) | (sizeof(long) - 1)))
565         if (socksize < sizeof(*sdl))
566                 socksize = sizeof(*sdl);
567         socksize = ROUNDUP(socksize);
568 #undef ROUNDUP
569         ifasize = sizeof(struct ifaddr) + 2 * socksize;
570         ifa = ifa_create(ifasize, M_WAITOK);
571         sdl = (struct sockaddr_dl *)(ifa + 1);
572         sdl->sdl_len = socksize;
573         sdl->sdl_family = AF_LINK;
574         bcopy(ifp->if_xname, sdl->sdl_data, namelen);
575         sdl->sdl_nlen = namelen;
576         sdl->sdl_index = ifp->if_index;
577         sdl->sdl_type = ifp->if_type;
578         ifp->if_lladdr = ifa;
579         ifa->ifa_ifp = ifp;
580         ifa->ifa_rtrequest = link_rtrequest;
581         ifa->ifa_addr = (struct sockaddr *)sdl;
582         sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
583         ifa->ifa_netmask = (struct sockaddr *)sdl;
584         sdl->sdl_len = masklen;
585         while (namelen != 0)
586                 sdl->sdl_data[--namelen] = 0xff;
587         ifa_iflink(ifa, ifp, 0 /* Insert head */);
588
589         EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
590         devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
591
592         ifq = &ifp->if_snd;
593         ifq->altq_type = 0;
594         ifq->altq_disc = NULL;
595         ifq->altq_flags &= ALTQF_CANTCHANGE;
596         ifq->altq_tbr = NULL;
597         ifq->altq_ifp = ifp;
598         ifq->altq_started = 0;
599         ifq->altq_prepended = NULL;
600         ALTQ_LOCK_INIT(ifq);
601         ifq_set_classic(ifq);
602
603         ifq->altq_stage =
604             kmalloc_cachealign(ncpus * sizeof(struct ifaltq_stage),
605             M_DEVBUF, M_WAITOK | M_ZERO);
606         for (i = 0; i < ncpus; ++i)
607                 ifq->altq_stage[i].ifqs_altq = ifq;
608
609         if (!SLIST_EMPTY(&domains))
610                 if_attachdomain1(ifp);
611
612         /* Announce the interface. */
613         rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
614
615         mtx_unlock(&ifp->if_ioctl_mtx);
616 }
617
618 static void
619 if_attachdomain(void *dummy)
620 {
621         struct ifnet *ifp;
622
623         crit_enter();
624         TAILQ_FOREACH(ifp, &ifnet, if_list)
625                 if_attachdomain1(ifp);
626         crit_exit();
627 }
628 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST,
629         if_attachdomain, NULL);
630
631 static void
632 if_attachdomain1(struct ifnet *ifp)
633 {
634         struct domain *dp;
635
636         crit_enter();
637
638         /* address family dependent data region */
639         bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
640         SLIST_FOREACH(dp, &domains, dom_next)
641                 if (dp->dom_ifattach)
642                         ifp->if_afdata[dp->dom_family] =
643                                 (*dp->dom_ifattach)(ifp);
644         crit_exit();
645 }
646
647 /*
648  * Purge all addresses whose type is _not_ AF_LINK
649  */
650 void
651 if_purgeaddrs_nolink(struct ifnet *ifp)
652 {
653         struct ifaddr_container *ifac, *next;
654
655         TAILQ_FOREACH_MUTABLE(ifac, &ifp->if_addrheads[mycpuid],
656                               ifa_link, next) {
657                 struct ifaddr *ifa = ifac->ifa;
658
659                 /* Leave link ifaddr as it is */
660                 if (ifa->ifa_addr->sa_family == AF_LINK)
661                         continue;
662 #ifdef INET
663                 /* XXX: Ugly!! ad hoc just for INET */
664                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
665                         struct ifaliasreq ifr;
666 #ifdef IFADDR_DEBUG_VERBOSE
667                         int i;
668
669                         kprintf("purge in4 addr %p: ", ifa);
670                         for (i = 0; i < ncpus; ++i)
671                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
672                         kprintf("\n");
673 #endif
674
675                         bzero(&ifr, sizeof ifr);
676                         ifr.ifra_addr = *ifa->ifa_addr;
677                         if (ifa->ifa_dstaddr)
678                                 ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
679                         if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
680                                        NULL) == 0)
681                                 continue;
682                 }
683 #endif /* INET */
684 #ifdef INET6
685                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6) {
686 #ifdef IFADDR_DEBUG_VERBOSE
687                         int i;
688
689                         kprintf("purge in6 addr %p: ", ifa);
690                         for (i = 0; i < ncpus; ++i)
691                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
692                         kprintf("\n");
693 #endif
694
695                         in6_purgeaddr(ifa);
696                         /* ifp_addrhead is already updated */
697                         continue;
698                 }
699 #endif /* INET6 */
700                 ifa_ifunlink(ifa, ifp);
701                 ifa_destroy(ifa);
702         }
703 }
704
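/*
 * Per-cpu handler used by ifq_stage_detach() to strip any staging
 * state for the ifaltq before the interface is detached.
 */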
705 static void
706 ifq_stage_detach_handler(netmsg_t nmsg)
707 {
708         struct ifaltq *ifq = nmsg->lmsg.u.ms_resultp;
709         struct ifaltq_stage *stage = ifq_get_stage(ifq, mycpuid);
710
711         if (stage->ifqs_flags & IFQ_STAGE_FLAG_QUED)
712                 ifq_stage_remove(&ifq_stage_heads[mycpuid], stage);
713         lwkt_replymsg(&nmsg->lmsg, 0);
714 }
715
716 static void
717 ifq_stage_detach(struct ifaltq *ifq)
718 {
719         struct netmsg_base base;
720         int cpu;
721
722         netmsg_init(&base, NULL, &curthread->td_msgport, 0,
723             ifq_stage_detach_handler);
724         base.lmsg.u.ms_resultp = ifq;
725
726         for (cpu = 0; cpu < ncpus; ++cpu)
727                 lwkt_domsg(netisr_portfn(cpu), &base.lmsg, 0);
728 }
729
730 /*
731  * Detach an interface, removing it from the
732  * list of "active" interfaces.
733  */
734 void
735 if_detach(struct ifnet *ifp)
736 {
737         struct radix_node_head  *rnh;
738         int i;
739         int cpu, origcpu;
740         struct domain *dp;
741
742         EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
743
744         /*
745          * Remove routes and flush queues.
746          */
747         crit_enter();
748 #ifdef IFPOLL_ENABLE
749         if (ifp->if_flags & IFF_NPOLLING)
750                 ifpoll_deregister(ifp);
751 #endif
752         if_down(ifp);
753
754 #ifdef ALTQ
755         if (ifq_is_enabled(&ifp->if_snd))
756                 altq_disable(&ifp->if_snd);
757         if (ifq_is_attached(&ifp->if_snd))
758                 altq_detach(&ifp->if_snd);
759 #endif
760
761         /*
762          * Clean up all addresses.
763          */
764         ifp->if_lladdr = NULL;
765
766         if_purgeaddrs_nolink(ifp);
767         if (!TAILQ_EMPTY(&ifp->if_addrheads[mycpuid])) {
768                 struct ifaddr *ifa;
769
770                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
771                 KASSERT(ifa->ifa_addr->sa_family == AF_LINK,
772                         ("non-link ifaddr is left on if_addrheads"));
773
774                 ifa_ifunlink(ifa, ifp);
775                 ifa_destroy(ifa);
776                 KASSERT(TAILQ_EMPTY(&ifp->if_addrheads[mycpuid]),
777                         ("there are still ifaddrs left on if_addrheads"));
778         }
779
780 #ifdef INET
781         /*
782          * Remove all IPv4 kernel structures related to ifp.
783          */
784         in_ifdetach(ifp);
785 #endif
786
787 #ifdef INET6
788         /*
789          * Remove all IPv6 kernel structs related to ifp.  This should be done
790          * before removing routing entries below, since IPv6 interface direct
791          * routes are expected to be removed by the IPv6-specific kernel API.
792          * Otherwise, the kernel will detect some inconsistency and complain.
793          */
794         in6_ifdetach(ifp);
795 #endif
796
797         /*
798          * Delete all remaining routes using this interface.
799          * Unfortunately the only way to do this is to slog through
800          * the entire routing table looking for routes which point
801          * to this interface...oh well...
802          */
803         origcpu = mycpuid;
804         for (cpu = 0; cpu < ncpus; cpu++) {
805                 lwkt_migratecpu(cpu);
806                 for (i = 1; i <= AF_MAX; i++) {
807                         if ((rnh = rt_tables[cpu][i]) == NULL)
808                                 continue;
809                         rnh->rnh_walktree(rnh, if_rtdel, ifp);
810                 }
811         }
812         lwkt_migratecpu(origcpu);
813
814         /* Announce that the interface is gone. */
815         rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
816         devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
817
818         SLIST_FOREACH(dp, &domains, dom_next)
819                 if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
820                         (*dp->dom_ifdetach)(ifp,
821                                 ifp->if_afdata[dp->dom_family]);
822
823         /*
824          * Remove interface from ifindex2ifp[] and maybe decrement if_index.
825          */
826         ifindex2ifnet[ifp->if_index] = NULL;
827         while (if_index > 0 && ifindex2ifnet[if_index] == NULL)
828                 if_index--;
829
830         TAILQ_REMOVE(&ifnet, ifp, if_link);
831         kfree(ifp->if_addrheads, M_IFADDR);
832
833         lwkt_synchronize_ipiqs("if_detach");
834         ifq_stage_detach(&ifp->if_snd);
835
836         kfree(ifp->if_start_nmsg, M_LWKTMSG);
837         kfree(ifp->if_snd.altq_stage, M_DEVBUF);
838         crit_exit();
839 }
840
841 /*
842  * Create interface group without members
843  */
844 struct ifg_group *
845 if_creategroup(const char *groupname)
846 {
847         struct ifg_group        *ifg = NULL;
848
849         if ((ifg = (struct ifg_group *)kmalloc(sizeof(struct ifg_group),
850             M_TEMP, M_NOWAIT)) == NULL)
851                 return (NULL);
852
853         strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
854         ifg->ifg_refcnt = 0;
855         ifg->ifg_carp_demoted = 0;
856         TAILQ_INIT(&ifg->ifg_members);
857 #if NPF > 0
858         pfi_attach_ifgroup(ifg);
859 #endif
860         TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next);
861
862         return (ifg);
863 }
864
865 /*
866  * Add a group to an interface
867  */
868 int
869 if_addgroup(struct ifnet *ifp, const char *groupname)
870 {
871         struct ifg_list         *ifgl;
872         struct ifg_group        *ifg = NULL;
873         struct ifg_member       *ifgm;
874
875         if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
876             groupname[strlen(groupname) - 1] <= '9')
877                 return (EINVAL);
878
879         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
880                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
881                         return (EEXIST);
882
883         if ((ifgl = kmalloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
884                 return (ENOMEM);
885
886         if ((ifgm = kmalloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
887                 kfree(ifgl, M_TEMP);
888                 return (ENOMEM);
889         }
890
891         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
892                 if (!strcmp(ifg->ifg_group, groupname))
893                         break;
894
895         if (ifg == NULL && (ifg = if_creategroup(groupname)) == NULL) {
896                 kfree(ifgl, M_TEMP);
897                 kfree(ifgm, M_TEMP);
898                 return (ENOMEM);
899         }
900
901         ifg->ifg_refcnt++;
902         ifgl->ifgl_group = ifg;
903         ifgm->ifgm_ifp = ifp;
904
905         TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
906         TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
907
908 #if NPF > 0
909         pfi_group_change(groupname);
910 #endif
911
912         return (0);
913 }
914
915 /*
916  * Remove a group from an interface
917  */
918 int
919 if_delgroup(struct ifnet *ifp, const char *groupname)
920 {
921         struct ifg_list         *ifgl;
922         struct ifg_member       *ifgm;
923
924         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
925                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
926                         break;
927         if (ifgl == NULL)
928                 return (ENOENT);
929
930         TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
931
932         TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
933                 if (ifgm->ifgm_ifp == ifp)
934                         break;
935
936         if (ifgm != NULL) {
937                 TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
938                 kfree(ifgm, M_TEMP);
939         }
940
941         if (--ifgl->ifgl_group->ifg_refcnt == 0) {
942                 TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next);
943 #if NPF > 0
944                 pfi_detach_ifgroup(ifgl->ifgl_group);
945 #endif
946                 kfree(ifgl->ifgl_group, M_TEMP);
947         }
948
949         kfree(ifgl, M_TEMP);
950
951 #if NPF > 0
952         pfi_group_change(groupname);
953 #endif
954
955         return (0);
956 }
957
958 /*
959  * Stores all groups from an interface in memory pointed
960  * to by data
961  */
962 int
963 if_getgroup(caddr_t data, struct ifnet *ifp)
964 {
965         int                      len, error;
966         struct ifg_list         *ifgl;
967         struct ifg_req           ifgrq, *ifgp;
968         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
969
970         if (ifgr->ifgr_len == 0) {
971                 TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
972                         ifgr->ifgr_len += sizeof(struct ifg_req);
973                 return (0);
974         }
975
976         len = ifgr->ifgr_len;
977         ifgp = ifgr->ifgr_groups;
978         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
979                 if (len < sizeof(ifgrq))
980                         return (EINVAL);
981                 bzero(&ifgrq, sizeof ifgrq);
982                 strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
983                     sizeof(ifgrq.ifgrq_group));
984                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
985                     sizeof(struct ifg_req))))
986                         return (error);
987                 len -= sizeof(ifgrq);
988                 ifgp++;
989         }
990
991         return (0);
992 }
993
994 /*
995  * Stores all members of a group in memory pointed to by data
996  */
997 int
998 if_getgroupmembers(caddr_t data)
999 {
1000         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
1001         struct ifg_group        *ifg;
1002         struct ifg_member       *ifgm;
1003         struct ifg_req           ifgrq, *ifgp;
1004         int                      len, error;
1005
1006         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1007                 if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
1008                         break;
1009         if (ifg == NULL)
1010                 return (ENOENT);
1011
1012         if (ifgr->ifgr_len == 0) {
1013                 TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
1014                         ifgr->ifgr_len += sizeof(ifgrq);
1015                 return (0);
1016         }
1017
1018         len = ifgr->ifgr_len;
1019         ifgp = ifgr->ifgr_groups;
1020         TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
1021                 if (len < sizeof(ifgrq))
1022                         return (EINVAL);
1023                 bzero(&ifgrq, sizeof ifgrq);
1024                 strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
1025                     sizeof(ifgrq.ifgrq_member));
1026                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1027                     sizeof(struct ifg_req))))
1028                         return (error);
1029                 len -= sizeof(ifgrq);
1030                 ifgp++;
1031         }
1032
1033         return (0);
1034 }
1035
1036 /*
1037  * Delete Routes for a Network Interface
1038  *
1039  * Called for each routing entry via the rnh->rnh_walktree() call above
1040  * to delete all route entries referencing a detaching network interface.
1041  *
1042  * Arguments:
1043  *      rn      pointer to node in the routing table
1044  *      arg     argument passed to rnh->rnh_walktree() - detaching interface
1045  *
1046  * Returns:
1047  *      0       successful
1048  *      errno   failed - reason indicated
1049  *
1050  */
1051 static int
1052 if_rtdel(struct radix_node *rn, void *arg)
1053 {
1054         struct rtentry  *rt = (struct rtentry *)rn;
1055         struct ifnet    *ifp = arg;
1056         int             err;
1057
1058         if (rt->rt_ifp == ifp) {
1059
1060                 /*
1061                  * Protect (sorta) against walktree recursion problems
1062                  * with cloned routes
1063                  */
1064                 if (!(rt->rt_flags & RTF_UP))
1065                         return (0);
1066
1067                 err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1068                                 rt_mask(rt), rt->rt_flags,
1069                                 NULL);
1070                 if (err) {
1071                         log(LOG_WARNING, "if_rtdel: error %d\n", err);
1072                 }
1073         }
1074
1075         return (0);
1076 }
1077
1078 /*
1079  * Locate an interface based on a complete address.
1080  */
1081 struct ifaddr *
1082 ifa_ifwithaddr(struct sockaddr *addr)
1083 {
1084         struct ifnet *ifp;
1085
1086         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1087                 struct ifaddr_container *ifac;
1088
1089                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1090                         struct ifaddr *ifa = ifac->ifa;
1091
1092                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1093                                 continue;
1094                         if (sa_equal(addr, ifa->ifa_addr))
1095                                 return (ifa);
1096                         if ((ifp->if_flags & IFF_BROADCAST) &&
1097                             ifa->ifa_broadaddr &&
1098                             /* IPv6 doesn't have broadcast */
1099                             ifa->ifa_broadaddr->sa_len != 0 &&
1100                             sa_equal(ifa->ifa_broadaddr, addr))
1101                                 return (ifa);
1102                 }
1103         }
1104         return (NULL);
1105 }
1106 /*
1107  * Locate the point to point interface with a given destination address.
1108  */
1109 struct ifaddr *
1110 ifa_ifwithdstaddr(struct sockaddr *addr)
1111 {
1112         struct ifnet *ifp;
1113
1114         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1115                 struct ifaddr_container *ifac;
1116
1117                 if (!(ifp->if_flags & IFF_POINTOPOINT))
1118                         continue;
1119
1120                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1121                         struct ifaddr *ifa = ifac->ifa;
1122
1123                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1124                                 continue;
1125                         if (ifa->ifa_dstaddr &&
1126                             sa_equal(addr, ifa->ifa_dstaddr))
1127                                 return (ifa);
1128                 }
1129         }
1130         return (NULL);
1131 }
1132
1133 /*
1134  * Find an interface on a specific network.  If many match, the
1135  * most specific one is chosen.
1136  */
1137 struct ifaddr *
1138 ifa_ifwithnet(struct sockaddr *addr)
1139 {
1140         struct ifnet *ifp;
1141         struct ifaddr *ifa_maybe = NULL;
1142         u_int af = addr->sa_family;
1143         char *addr_data = addr->sa_data, *cplim;
1144
1145         /*
1146          * AF_LINK addresses can be looked up directly by their index number,
1147          * so do that if we can.
1148          */
1149         if (af == AF_LINK) {
1150                 struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
1151
1152                 if (sdl->sdl_index && sdl->sdl_index <= if_index)
1153                         return (ifindex2ifnet[sdl->sdl_index]->if_lladdr);
1154         }
1155
1156         /*
1157          * Scan through each interface, looking for ones that have
1158          * addresses in this address family.
1159          */
1160         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1161                 struct ifaddr_container *ifac;
1162
1163                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1164                         struct ifaddr *ifa = ifac->ifa;
1165                         char *cp, *cp2, *cp3;
1166
1167                         if (ifa->ifa_addr->sa_family != af)
1168 next:                           continue;
1169                         if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
1170                                 /*
1171                                  * This is a bit broken as it doesn't
1172                                  * take into account that the remote end may
1173                                  * be a single node in the network we are
1174                                  * looking for.
1175                                  * The trouble is that we don't know the
1176                                  * netmask for the remote end.
1177                                  */
1178                                 if (ifa->ifa_dstaddr != NULL &&
1179                                     sa_equal(addr, ifa->ifa_dstaddr))
1180                                         return (ifa);
1181                         } else {
1182                                 /*
1183                                  * if we have a special address handler,
1184                                  * then use it instead of the generic one.
1185                                  */
1186                                 if (ifa->ifa_claim_addr) {
1187                                         if ((*ifa->ifa_claim_addr)(ifa, addr)) {
1188                                                 return (ifa);
1189                                         } else {
1190                                                 continue;
1191                                         }
1192                                 }
1193
1194                                 /*
1195                                  * Scan all the bits in the ifa's address.
1196                                  * If a bit disagrees with what we are
1197                                  * looking for, mask it with the netmask
1198                                  * to see if it really matters.
1199                                  * (A byte at a time)
1200                                  */
1201                                 if (ifa->ifa_netmask == 0)
1202                                         continue;
1203                                 cp = addr_data;
1204                                 cp2 = ifa->ifa_addr->sa_data;
1205                                 cp3 = ifa->ifa_netmask->sa_data;
1206                                 cplim = ifa->ifa_netmask->sa_len +
1207                                         (char *)ifa->ifa_netmask;
1208                                 while (cp3 < cplim)
1209                                         if ((*cp++ ^ *cp2++) & *cp3++)
1210                                                 goto next; /* next address! */
1211                                 /*
1212                                  * If the netmask of what we just found
1213                                  * is more specific than what we had before
1214                                  * (if we had one) then remember the new one
1215                                  * before continuing to search
1216                                  * for an even better one.
1217                                  */
1218                                 if (ifa_maybe == NULL ||
1219                                     rn_refines((char *)ifa->ifa_netmask,
1220                                                (char *)ifa_maybe->ifa_netmask))
1221                                         ifa_maybe = ifa;
1222                         }
1223                 }
1224         }
1225         return (ifa_maybe);
1226 }
1227
1228 /*
1229  * Find an interface address specific to an interface best matching
1230  * a given address.
1231  */
1232 struct ifaddr *
1233 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
1234 {
1235         struct ifaddr_container *ifac;
1236         char *cp, *cp2, *cp3;
1237         char *cplim;
1238         struct ifaddr *ifa_maybe = NULL;
1239         u_int af = addr->sa_family;
1240
1241         if (af >= AF_MAX)
1242                 return (0);
1243         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1244                 struct ifaddr *ifa = ifac->ifa;
1245
1246                 if (ifa->ifa_addr->sa_family != af)
1247                         continue;
1248                 if (ifa_maybe == NULL)
1249                         ifa_maybe = ifa;
1250                 if (ifa->ifa_netmask == NULL) {
1251                         if (sa_equal(addr, ifa->ifa_addr) ||
1252                             (ifa->ifa_dstaddr != NULL &&
1253                              sa_equal(addr, ifa->ifa_dstaddr)))
1254                                 return (ifa);
1255                         continue;
1256                 }
1257                 if (ifp->if_flags & IFF_POINTOPOINT) {
1258                         if (sa_equal(addr, ifa->ifa_dstaddr))
1259                                 return (ifa);
1260                 } else {
1261                         cp = addr->sa_data;
1262                         cp2 = ifa->ifa_addr->sa_data;
1263                         cp3 = ifa->ifa_netmask->sa_data;
1264                         cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
1265                         for (; cp3 < cplim; cp3++)
1266                                 if ((*cp++ ^ *cp2++) & *cp3)
1267                                         break;
1268                         if (cp3 == cplim)
1269                                 return (ifa);
1270                 }
1271         }
1272         return (ifa_maybe);
1273 }
1274
1275 /*
1276  * Default action when installing a route with a Link Level gateway.
1277  * Look up an appropriate real ifa to point to.
1278  * This should be moved to /sys/net/link.c eventually.
1279  */
1280 static void
1281 link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
1282 {
1283         struct ifaddr *ifa;
1284         struct sockaddr *dst;
1285         struct ifnet *ifp;
1286
1287         if (cmd != RTM_ADD || (ifa = rt->rt_ifa) == NULL ||
1288             (ifp = ifa->ifa_ifp) == NULL || (dst = rt_key(rt)) == NULL)
1289                 return;
1290         ifa = ifaof_ifpforaddr(dst, ifp);
1291         if (ifa != NULL) {
1292                 IFAFREE(rt->rt_ifa);
1293                 IFAREF(ifa);
1294                 rt->rt_ifa = ifa;
1295                 if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
1296                         ifa->ifa_rtrequest(cmd, rt, info);
1297         }
1298 }
1299
1300 /*
1301  * Mark an interface down and notify protocols of
1302  * the transition.
1303  * NOTE: must be called at splnet or equivalent.
1304  */
1305 void
1306 if_unroute(struct ifnet *ifp, int flag, int fam)
1307 {
1308         struct ifaddr_container *ifac;
1309
1310         ifp->if_flags &= ~flag;
1311         getmicrotime(&ifp->if_lastchange);
1312         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1313                 struct ifaddr *ifa = ifac->ifa;
1314
1315                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1316                         kpfctlinput(PRC_IFDOWN, ifa->ifa_addr);
1317         }
1318         ifq_purge_all(&ifp->if_snd);
1319         rt_ifmsg(ifp);
1320 }
1321
1322 /*
1323  * Mark an interface up and notify protocols of
1324  * the transition.
1325  * NOTE: must be called at splnet or equivalent.
1326  */
1327 void
1328 if_route(struct ifnet *ifp, int flag, int fam)
1329 {
1330         struct ifaddr_container *ifac;
1331
1332         ifq_purge_all(&ifp->if_snd);
1333         ifp->if_flags |= flag;
1334         getmicrotime(&ifp->if_lastchange);
1335         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1336                 struct ifaddr *ifa = ifac->ifa;
1337
1338                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1339                         kpfctlinput(PRC_IFUP, ifa->ifa_addr);
1340         }
1341         rt_ifmsg(ifp);
1342 #ifdef INET6
1343         in6_if_up(ifp);
1344 #endif
1345 }
1346
1347 /*
1348  * Mark an interface down and notify protocols of the transition.  An
1349  * interface going down is also considered to be a synchronizing event.
1350  * We must ensure that all packet processing related to the interface
1351  * has completed before we return so e.g. the caller can free the ifnet
1352  * structure that the mbufs may be referencing.
1353  *
1354  * NOTE: must be called at splnet or eqivalent.
1355  */
1356 void
1357 if_down(struct ifnet *ifp)
1358 {
1359         if_unroute(ifp, IFF_UP, AF_UNSPEC);
1360         netmsg_service_sync();
1361 }
1362
1363 /*
1364  * Mark an interface up and notify protocols of
1365  * the transition.
1366  * NOTE: must be called at splnet or eqivalent.
1367  */
1368 void
1369 if_up(struct ifnet *ifp)
1370 {
1371         if_route(ifp, IFF_UP, AF_UNSPEC);
1372 }
1373
1374 /*
1375  * Process a link state change.
1376  * NOTE: must be called at splsoftnet or equivalent.
1377  */
1378 void
1379 if_link_state_change(struct ifnet *ifp)
1380 {
1381         int link_state = ifp->if_link_state;
1382
1383         rt_ifmsg(ifp);
1384         devctl_notify("IFNET", ifp->if_xname,
1385             (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
1386 }
1387
1388 /*
1389  * Handle interface watchdog timer routines.  Called
1390  * from softclock, we decrement timers (if set) and
1391  * call the appropriate interface routine on expiration.
1392  */
1393 static void
1394 if_slowtimo(void *arg)
1395 {
1396         struct ifnet *ifp;
1397
1398         crit_enter();
1399
1400         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1401                 if (ifp->if_timer == 0 || --ifp->if_timer)
1402                         continue;
1403                 if (ifp->if_watchdog) {
1404                         if (ifnet_tryserialize_all(ifp)) {
1405                                 (*ifp->if_watchdog)(ifp);
1406                                 ifnet_deserialize_all(ifp);
1407                         } else {
1408                                 /* try again next timeout */
1409                                 ++ifp->if_timer;
1410                         }
1411                 }
1412         }
1413
1414         crit_exit();
1415
1416         callout_reset(&if_slowtimo_timer, hz / IFNET_SLOWHZ, if_slowtimo, NULL);
1417 }
1418
1419 /*
1420  * Map interface name to
1421  * interface structure pointer.
1422  */
1423 struct ifnet *
1424 ifunit(const char *name)
1425 {
1426         struct ifnet *ifp;
1427
1428         /*
1429          * Search all the interfaces for this name/number
1430          */
1431
1432         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1433                 if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1434                         break;
1435         }
1436         return (ifp);
1437 }
1438
1439
1440 /*
1441  * Map interface name in a sockaddr_dl to
1442  * interface structure pointer.
1443  */
1444 struct ifnet *
1445 if_withname(struct sockaddr *sa)
1446 {
1447         char ifname[IFNAMSIZ+1];
1448         struct sockaddr_dl *sdl = (struct sockaddr_dl *)sa;
1449
1450         if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) ||
1451              (sdl->sdl_nlen > IFNAMSIZ) )
1452                 return NULL;
1453
1454         /*
1455          * ifunit wants a null-terminated name.  It may not be null-terminated
1456          * in the sockaddr.  We don't want to change the caller's sockaddr,
1457          * and there might not be room to put the trailing null anyway, so we
1458          * make a local copy that we know we can null terminate safely.
1459          */
1460
1461         bcopy(sdl->sdl_data, ifname, sdl->sdl_nlen);
1462         ifname[sdl->sdl_nlen] = '\0';
1463         return ifunit(ifname);
1464 }
1465
1466
1467 /*
1468  * Interface ioctls.
1469  */
1470 int
1471 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct ucred *cred)
1472 {
1473         struct ifnet *ifp;
1474         struct ifreq *ifr;
1475         struct ifstat *ifs;
1476         int error;
1477         short oif_flags;
1478         int new_flags;
1479 #ifdef COMPAT_43
1480         int ocmd;
1481 #endif
1482         size_t namelen, onamelen;
1483         char new_name[IFNAMSIZ];
1484         struct ifaddr *ifa;
1485         struct sockaddr_dl *sdl;
1486
1487         switch (cmd) {
1488         case SIOCGIFCONF:
1489         case OSIOCGIFCONF:
1490                 return (ifconf(cmd, data, cred));
1491         default:
1492                 break;
1493         }
1494
1495         ifr = (struct ifreq *)data;
1496
1497         switch (cmd) {
1498         case SIOCIFCREATE:
1499         case SIOCIFCREATE2:
1500                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1501                         return (error);
1502                 return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
1503                         cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
1504         case SIOCIFDESTROY:
1505                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1506                         return (error);
1507                 return (if_clone_destroy(ifr->ifr_name));
1508         case SIOCIFGCLONERS:
1509                 return (if_clone_list((struct if_clonereq *)data));
1510         default:
1511                 break;
1512         }
1513
1514         /*
1515          * Nominal ioctl through interface, lookup the ifp and obtain a
1516          * Nominal ioctl through interface, look up the ifp and obtain a
1517          */
1518         ifp = ifunit(ifr->ifr_name);
1519         if (ifp == NULL)
1520                 return (ENXIO);
1521         error = 0;
1522         mtx_lock(&ifp->if_ioctl_mtx);
1523
1524         switch (cmd) {
1525         case SIOCGIFINDEX:
1526                 ifr->ifr_index = ifp->if_index;
1527                 break;
1528
1529         case SIOCGIFFLAGS:
1530                 ifr->ifr_flags = ifp->if_flags;
1531                 ifr->ifr_flagshigh = ifp->if_flags >> 16;
1532                 break;
1533
1534         case SIOCGIFCAP:
1535                 ifr->ifr_reqcap = ifp->if_capabilities;
1536                 ifr->ifr_curcap = ifp->if_capenable;
1537                 break;
1538
1539         case SIOCGIFMETRIC:
1540                 ifr->ifr_metric = ifp->if_metric;
1541                 break;
1542
1543         case SIOCGIFMTU:
1544                 ifr->ifr_mtu = ifp->if_mtu;
1545                 break;
1546
1547         case SIOCGIFDATA:
1548                 error = copyout((caddr_t)&ifp->if_data, ifr->ifr_data,
1549                                 sizeof(ifp->if_data));
1550                 break;
1551
1552         case SIOCGIFPHYS:
1553                 ifr->ifr_phys = ifp->if_physical;
1554                 break;
1555
1556         case SIOCGIFPOLLCPU:
1557                 ifr->ifr_pollcpu = -1;
1558                 break;
1559
1560         case SIOCSIFPOLLCPU:
1561                 break;
1562
1563         case SIOCSIFFLAGS:
1564                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1565                 if (error)
1566                         break;
1567                 new_flags = (ifr->ifr_flags & 0xffff) |
1568                     (ifr->ifr_flagshigh << 16);
1569                 if (ifp->if_flags & IFF_SMART) {
1570                         /* Smart drivers twiddle their own routes */
1571                 } else if (ifp->if_flags & IFF_UP &&
1572                     (new_flags & IFF_UP) == 0) {
1573                         crit_enter();
1574                         if_down(ifp);
1575                         crit_exit();
1576                 } else if (new_flags & IFF_UP &&
1577                     (ifp->if_flags & IFF_UP) == 0) {
1578                         crit_enter();
1579                         if_up(ifp);
1580                         crit_exit();
1581                 }
1582
1583 #ifdef IFPOLL_ENABLE
1584                 if ((new_flags ^ ifp->if_flags) & IFF_NPOLLING) {
1585                         if (new_flags & IFF_NPOLLING)
1586                                 ifpoll_register(ifp);
1587                         else
1588                                 ifpoll_deregister(ifp);
1589                 }
1590 #endif
1591
1592                 ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
1593                         (new_flags &~ IFF_CANTCHANGE);
1594                 if (new_flags & IFF_PPROMISC) {
1595                         /* Permanently promiscuous mode requested */
1596                         ifp->if_flags |= IFF_PROMISC;
1597                 } else if (ifp->if_pcount == 0) {
1598                         ifp->if_flags &= ~IFF_PROMISC;
1599                 }
1600                 if (ifp->if_ioctl) {
1601                         ifnet_serialize_all(ifp);
1602                         ifp->if_ioctl(ifp, cmd, data, cred);
1603                         ifnet_deserialize_all(ifp);
1604                 }
1605                 getmicrotime(&ifp->if_lastchange);
1606                 break;
1607
1608         case SIOCSIFCAP:
1609                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1610                 if (error)
1611                         break;
1612                 if (ifr->ifr_reqcap & ~ifp->if_capabilities) {
1613                         error = EINVAL;
1614                         break;
1615                 }
1616                 ifnet_serialize_all(ifp);
1617                 ifp->if_ioctl(ifp, cmd, data, cred);
1618                 ifnet_deserialize_all(ifp);
1619                 break;
1620
1621         case SIOCSIFNAME:
1622                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1623                 if (error)
1624                         break;
1625                 error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
1626                 if (error)
1627                         break;
1628                 if (new_name[0] == '\0') {
1629                         error = EINVAL;
1630                         break;
1631                 }
1632                 if (ifunit(new_name) != NULL) {
1633                         error = EEXIST;
1634                         break;
1635                 }
1636
1637                 EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
1638
1639                 /* Announce the departure of the interface. */
1640                 rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
1641
1642                 strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
1643                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
1644                 /* XXX IFA_LOCK(ifa); */
1645                 sdl = (struct sockaddr_dl *)ifa->ifa_addr;
1646                 namelen = strlen(new_name);
1647                 onamelen = sdl->sdl_nlen;
1648                 /*
1649                  * Move the address if needed.  This is safe because we
1650                  * allocate space for a name of length IFNAMSIZ when we
1651                  * create this in if_attach().
1652                  */
1653                 if (namelen != onamelen) {
1654                         bcopy(sdl->sdl_data + onamelen,
1655                             sdl->sdl_data + namelen, sdl->sdl_alen);
1656                 }
1657                 bcopy(new_name, sdl->sdl_data, namelen);
1658                 sdl->sdl_nlen = namelen;
1659                 sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
1660                 bzero(sdl->sdl_data, onamelen);
1661                 while (namelen != 0)
1662                         sdl->sdl_data[--namelen] = 0xff;
1663                 /* XXX IFA_UNLOCK(ifa) */
1664
1665                 EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
1666
1667                 /* Announce the return of the interface. */
1668                 rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
1669                 break;
1670
1671         case SIOCSIFMETRIC:
1672                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1673                 if (error)
1674                         break;
1675                 ifp->if_metric = ifr->ifr_metric;
1676                 getmicrotime(&ifp->if_lastchange);
1677                 break;
1678
1679         case SIOCSIFPHYS:
1680                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1681                 if (error)
1682                         break;
1683                 if (ifp->if_ioctl == NULL) {
1684                         error = EOPNOTSUPP;
1685                         break;
1686                 }
1687                 ifnet_serialize_all(ifp);
1688                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1689                 ifnet_deserialize_all(ifp);
1690                 if (error == 0)
1691                         getmicrotime(&ifp->if_lastchange);
1692                 break;
1693
1694         case SIOCSIFMTU:
1695         {
1696                 u_long oldmtu = ifp->if_mtu;
1697
1698                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1699                 if (error)
1700                         break;
1701                 if (ifp->if_ioctl == NULL) {
1702                         error = EOPNOTSUPP;
1703                         break;
1704                 }
1705                 if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) {
1706                         error = EINVAL;
1707                         break;
1708                 }
1709                 ifnet_serialize_all(ifp);
1710                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1711                 ifnet_deserialize_all(ifp);
1712                 if (error == 0) {
1713                         getmicrotime(&ifp->if_lastchange);
1714                         rt_ifmsg(ifp);
1715                 }
1716                 /*
1717                  * If the link MTU changed, run the network-layer-specific handling.
1718                  */
1719                 if (ifp->if_mtu != oldmtu) {
1720 #ifdef INET6
1721                         nd6_setmtu(ifp);
1722 #endif
1723                 }
1724                 break;
1725         }
1726
1727         case SIOCADDMULTI:
1728         case SIOCDELMULTI:
1729                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1730                 if (error)
1731                         break;
1732
1733                 /* Don't allow group membership on non-multicast interfaces. */
1734                 if ((ifp->if_flags & IFF_MULTICAST) == 0) {
1735                         error = EOPNOTSUPP;
1736                         break;
1737                 }
1738
1739                 /* Don't let users screw up protocols' entries. */
1740                 if (ifr->ifr_addr.sa_family != AF_LINK) {
1741                         error = EINVAL;
1742                         break;
1743                 }
1744
1745                 if (cmd == SIOCADDMULTI) {
1746                         struct ifmultiaddr *ifma;
1747                         error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
1748                 } else {
1749                         error = if_delmulti(ifp, &ifr->ifr_addr);
1750                 }
1751                 if (error == 0)
1752                         getmicrotime(&ifp->if_lastchange);
1753                 break;
1754
1755         case SIOCSIFPHYADDR:
1756         case SIOCDIFPHYADDR:
1757 #ifdef INET6
1758         case SIOCSIFPHYADDR_IN6:
1759 #endif
1760         case SIOCSLIFPHYADDR:
1761         case SIOCSIFMEDIA:
1762         case SIOCSIFGENERIC:
1763                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1764                 if (error)
1765                         break;
1766                 if (ifp->if_ioctl == NULL) {
1767                         error = EOPNOTSUPP;
1768                         break;
1769                 }
1770                 ifnet_serialize_all(ifp);
1771                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1772                 ifnet_deserialize_all(ifp);
1773                 if (error == 0)
1774                         getmicrotime(&ifp->if_lastchange);
1775                 break;
1776
1777         case SIOCGIFSTATUS:
1778                 ifs = (struct ifstat *)data;
1779                 ifs->ascii[0] = '\0';
1780                 /* fall through */
1781         case SIOCGIFPSRCADDR:
1782         case SIOCGIFPDSTADDR:
1783         case SIOCGLIFPHYADDR:
1784         case SIOCGIFMEDIA:
1785         case SIOCGIFGENERIC:
1786                 if (ifp->if_ioctl == NULL) {
1787                         error = EOPNOTSUPP;
1788                         break;
1789                 }
1790                 ifnet_serialize_all(ifp);
1791                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1792                 ifnet_deserialize_all(ifp);
1793                 break;
1794
1795         case SIOCSIFLLADDR:
1796                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1797                 if (error)
1798                         break;
1799                 error = if_setlladdr(ifp, ifr->ifr_addr.sa_data,
1800                                      ifr->ifr_addr.sa_len);
1801                 EVENTHANDLER_INVOKE(iflladdr_event, ifp);
1802                 break;
1803
1804         default:
1805                 oif_flags = ifp->if_flags;
1806                 if (so->so_proto == 0) {
1807                         error = EOPNOTSUPP;
1808                         break;
1809                 }
1810 #ifndef COMPAT_43
1811                 error = so_pru_control_direct(so, cmd, data, ifp);
1812 #else
1813                 ocmd = cmd;
1814
1815                 switch (cmd) {
1816                 case SIOCSIFDSTADDR:
1817                 case SIOCSIFADDR:
1818                 case SIOCSIFBRDADDR:
1819                 case SIOCSIFNETMASK:
1820 #if BYTE_ORDER != BIG_ENDIAN
1821                         if (ifr->ifr_addr.sa_family == 0 &&
1822                             ifr->ifr_addr.sa_len < 16) {
1823                                 ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
1824                                 ifr->ifr_addr.sa_len = 16;
1825                         }
1826 #else
1827                         if (ifr->ifr_addr.sa_len == 0)
1828                                 ifr->ifr_addr.sa_len = 16;
1829 #endif
1830                         break;
1831                 case OSIOCGIFADDR:
1832                         cmd = SIOCGIFADDR;
1833                         break;
1834                 case OSIOCGIFDSTADDR:
1835                         cmd = SIOCGIFDSTADDR;
1836                         break;
1837                 case OSIOCGIFBRDADDR:
1838                         cmd = SIOCGIFBRDADDR;
1839                         break;
1840                 case OSIOCGIFNETMASK:
1841                         cmd = SIOCGIFNETMASK;
1842                         break;
1843                 default:
1844                         break;
1845                 }
1846
1847                 error = so_pru_control_direct(so, cmd, data, ifp);
1848
1849                 switch (ocmd) {
1850                 case OSIOCGIFADDR:
1851                 case OSIOCGIFDSTADDR:
1852                 case OSIOCGIFBRDADDR:
1853                 case OSIOCGIFNETMASK:
1854                         *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
1855                         break;
1856                 }
1857 #endif /* COMPAT_43 */
1858
1859                 if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
1860 #ifdef INET6
1861                         DELAY(100);/* XXX: temporary workaround for fxp issue*/
1862                         if (ifp->if_flags & IFF_UP) {
1863                                 crit_enter();
1864                                 in6_if_up(ifp);
1865                                 crit_exit();
1866                         }
1867 #endif
1868                 }
1869                 break;
1870         }
1871
1872         mtx_unlock(&ifp->if_ioctl_mtx);
1873         return (error);
1874 }
1875
1876 /*
1877  * Set/clear promiscuous mode on interface ifp based on the truth value
1878  * of pswitch.  The calls are reference counted so that only the first
1879  * "on" request actually has an effect, as does the final "off" request.
1880  * Results are undefined if the "off" and "on" requests are not matched.
1881  */
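/*
 * Usage sketch (illustrative only, not part of this file): a packet-tap
 * style consumer brackets its capture session with matched calls, e.g.
 *
 *	error = ifpromisc(ifp, 1);	-- first "on" sets IFF_PROMISC
 *	...
 *	ifpromisc(ifp, 0);		-- matching "off" clears it again
 *
 * Because the requests are reference counted, several such consumers can
 * overlap safely as long as each balances its "on" and "off" calls.
 */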
1882 int
1883 ifpromisc(struct ifnet *ifp, int pswitch)
1884 {
1885         struct ifreq ifr;
1886         int error;
1887         int oldflags;
1888
1889         oldflags = ifp->if_flags;
1890         if (ifp->if_flags & IFF_PPROMISC) {
1891                 /* Do nothing if device is in permanently promiscuous mode */
1892                 ifp->if_pcount += pswitch ? 1 : -1;
1893                 return (0);
1894         }
1895         if (pswitch) {
1896                 /*
1897                  * If the device is not configured up, we cannot put it in
1898                  * promiscuous mode.
1899                  */
1900                 if ((ifp->if_flags & IFF_UP) == 0)
1901                         return (ENETDOWN);
1902                 if (ifp->if_pcount++ != 0)
1903                         return (0);
1904                 ifp->if_flags |= IFF_PROMISC;
1905                 log(LOG_INFO, "%s: promiscuous mode enabled\n",
1906                     ifp->if_xname);
1907         } else {
1908                 if (--ifp->if_pcount > 0)
1909                         return (0);
1910                 ifp->if_flags &= ~IFF_PROMISC;
1911                 log(LOG_INFO, "%s: promiscuous mode disabled\n",
1912                     ifp->if_xname);
1913         }
1914         ifr.ifr_flags = ifp->if_flags;
1915         ifr.ifr_flagshigh = ifp->if_flags >> 16;
1916         ifnet_serialize_all(ifp);
1917         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, NULL);
1918         ifnet_deserialize_all(ifp);
1919         if (error == 0)
1920                 rt_ifmsg(ifp);
1921         else
1922                 ifp->if_flags = oldflags;
1923         return error;
1924 }
1925
1926 /*
1927  * Return the interface configuration
1928  * of the system.  The list may be used
1929  * in later ioctls (above) to get
1930  * other information.
1931  */
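/*
 * Usage sketch (illustrative only): from userland the list is normally
 * fetched with the SIOCGIFCONF ioctl on any socket, e.g.
 *
 *	struct ifconf ifc;
 *	char buf[8192];
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	ifc.ifc_len = sizeof(buf);
 *	ifc.ifc_buf = buf;
 *	ioctl(s, SIOCGIFCONF, &ifc);
 *
 * On return ifc_len reflects the amount of buffer space actually used.
 */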
1932 static int
1933 ifconf(u_long cmd, caddr_t data, struct ucred *cred)
1934 {
1935         struct ifconf *ifc = (struct ifconf *)data;
1936         struct ifnet *ifp;
1937         struct sockaddr *sa;
1938         struct ifreq ifr, *ifrp;
1939         int space = ifc->ifc_len, error = 0;
1940
1941         ifrp = ifc->ifc_req;
1942         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1943                 struct ifaddr_container *ifac;
1944                 int addrs;
1945
1946                 if (space <= sizeof ifr)
1947                         break;
1948
1949                 /*
1950                  * Zero the stack-declared structure first to prevent
1951                  * memory disclosure.
1952                  */
1953                 bzero(&ifr, sizeof(ifr));
1954                 if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
1955                     >= sizeof(ifr.ifr_name)) {
1956                         error = ENAMETOOLONG;
1957                         break;
1958                 }
1959
1960                 addrs = 0;
1961                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1962                         struct ifaddr *ifa = ifac->ifa;
1963
1964                         if (space <= sizeof ifr)
1965                                 break;
1966                         sa = ifa->ifa_addr;
1967                         if (cred->cr_prison &&
1968                             prison_if(cred, sa))
1969                                 continue;
1970                         addrs++;
1971 #ifdef COMPAT_43
1972                         if (cmd == OSIOCGIFCONF) {
1973                                 struct osockaddr *osa =
1974                                          (struct osockaddr *)&ifr.ifr_addr;
1975                                 ifr.ifr_addr = *sa;
1976                                 osa->sa_family = sa->sa_family;
1977                                 error = copyout(&ifr, ifrp, sizeof ifr);
1978                                 ifrp++;
1979                         } else
1980 #endif
1981                         if (sa->sa_len <= sizeof(*sa)) {
1982                                 ifr.ifr_addr = *sa;
1983                                 error = copyout(&ifr, ifrp, sizeof ifr);
1984                                 ifrp++;
1985                         } else {
1986                                 if (space < (sizeof ifr) + sa->sa_len -
1987                                             sizeof(*sa))
1988                                         break;
1989                                 space -= sa->sa_len - sizeof(*sa);
1990                                 error = copyout(&ifr, ifrp,
1991                                                 sizeof ifr.ifr_name);
1992                                 if (error == 0)
1993                                         error = copyout(sa, &ifrp->ifr_addr,
1994                                                         sa->sa_len);
1995                                 ifrp = (struct ifreq *)
1996                                         (sa->sa_len + (caddr_t)&ifrp->ifr_addr);
1997                         }
1998                         if (error)
1999                                 break;
2000                         space -= sizeof ifr;
2001                 }
2002                 if (error)
2003                         break;
2004                 if (!addrs) {
2005                         bzero(&ifr.ifr_addr, sizeof ifr.ifr_addr);
2006                         error = copyout(&ifr, ifrp, sizeof ifr);
2007                         if (error)
2008                                 break;
2009                         space -= sizeof ifr;
2010                         ifrp++;
2011                 }
2012         }
2013         ifc->ifc_len -= space;
2014         return (error);
2015 }
2016
2017 /*
2018  * Just like ifpromisc(), but for all-multicast-reception mode.
2019  */
2020 int
2021 if_allmulti(struct ifnet *ifp, int onswitch)
2022 {
2023         int error = 0;
2024         struct ifreq ifr;
2025
2026         crit_enter();
2027
2028         if (onswitch) {
2029                 if (ifp->if_amcount++ == 0) {
2030                         ifp->if_flags |= IFF_ALLMULTI;
2031                         ifr.ifr_flags = ifp->if_flags;
2032                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2033                         ifnet_serialize_all(ifp);
2034                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2035                                               NULL);
2036                         ifnet_deserialize_all(ifp);
2037                 }
2038         } else {
2039                 if (ifp->if_amcount > 1) {
2040                         ifp->if_amcount--;
2041                 } else {
2042                         ifp->if_amcount = 0;
2043                         ifp->if_flags &= ~IFF_ALLMULTI;
2044                         ifr.ifr_flags = ifp->if_flags;
2045                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2046                         ifnet_serialize_all(ifp);
2047                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2048                                               NULL);
2049                         ifnet_deserialize_all(ifp);
2050                 }
2051         }
2052
2053         crit_exit();
2054
2055         if (error == 0)
2056                 rt_ifmsg(ifp);
2057         return error;
2058 }
2059
2060 /*
2061  * Add a multicast listenership to the interface in question.  The link
2062  * layer's if_resolvemulti routine maps it to a link-layer address, if any.
2063  */
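/*
 * Usage sketch (illustrative only): a protocol joining a group passes its
 * own protocol-level sockaddr; the AF_LINK companion entry is added here
 * automatically when if_resolvemulti() provides one, e.g.
 *
 *	struct ifmultiaddr *ifma;
 *	error = if_addmulti(ifp, (struct sockaddr *)&group_sin, &ifma);
 *
 * where group_sin is assumed to hold the multicast group address.
 */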
2064 int
2065 if_addmulti(
2066         struct ifnet *ifp,      /* interface to manipulate */
2067         struct sockaddr *sa,    /* address to add */
2068         struct ifmultiaddr **retifma)
2069 {
2070         struct sockaddr *llsa, *dupsa;
2071         int error;
2072         struct ifmultiaddr *ifma;
2073
2074         /*
2075          * If a matching multicast address already exists,
2076          * don't add a new one; just add a reference.
2077          */
2078         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2079                 if (sa_equal(sa, ifma->ifma_addr)) {
2080                         ifma->ifma_refcount++;
2081                         if (retifma)
2082                                 *retifma = ifma;
2083                         return 0;
2084                 }
2085         }
2086
2087         /*
2088          * Give the link layer a chance to accept/reject it, and also
2089          * find out which AF_LINK address this maps to, if it isn't one
2090          * already.
2091          */
2092         if (ifp->if_resolvemulti) {
2093                 ifnet_serialize_all(ifp);
2094                 error = ifp->if_resolvemulti(ifp, &llsa, sa);
2095                 ifnet_deserialize_all(ifp);
2096                 if (error) 
2097                         return error;
2098         } else {
2099                 llsa = NULL;
2100         }
2101
2102         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2103         dupsa = kmalloc(sa->sa_len, M_IFMADDR, M_WAITOK);
2104         bcopy(sa, dupsa, sa->sa_len);
2105
2106         ifma->ifma_addr = dupsa;
2107         ifma->ifma_lladdr = llsa;
2108         ifma->ifma_ifp = ifp;
2109         ifma->ifma_refcount = 1;
2110         ifma->ifma_protospec = 0;
2111         rt_newmaddrmsg(RTM_NEWMADDR, ifma);
2112
2113         /*
2114          * Some network interfaces can scan the address list at
2115          * interrupt time; lock them out.
2116          */
2117         crit_enter();
2118         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2119         crit_exit();
2120         if (retifma)
2121                 *retifma = ifma;
2122
2123         if (llsa != NULL) {
2124                 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2125                         if (sa_equal(ifma->ifma_addr, llsa))
2126                                 break;
2127                 }
2128                 if (ifma) {
2129                         ifma->ifma_refcount++;
2130                 } else {
2131                         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2132                         dupsa = kmalloc(llsa->sa_len, M_IFMADDR, M_WAITOK);
2133                         bcopy(llsa, dupsa, llsa->sa_len);
2134                         ifma->ifma_addr = dupsa;
2135                         ifma->ifma_ifp = ifp;
2136                         ifma->ifma_refcount = 1;
2137                         crit_enter();
2138                         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2139                         crit_exit();
2140                 }
2141         }
2142         /*
2143          * We are certain we have added something, so call down to the
2144          * interface to let it know about it.
2145          */
2146         crit_enter();
2147         ifnet_serialize_all(ifp);
2148         if (ifp->if_ioctl)
2149                 ifp->if_ioctl(ifp, SIOCADDMULTI, 0, NULL);
2150         ifnet_deserialize_all(ifp);
2151         crit_exit();
2152
2153         return 0;
2154 }
2155
2156 /*
2157  * Remove a reference to a multicast address on this interface.  Yell
2158  * if the request does not match an existing membership.
2159  */
2160 int
2161 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
2162 {
2163         struct ifmultiaddr *ifma;
2164
2165         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2166                 if (sa_equal(sa, ifma->ifma_addr))
2167                         break;
2168         if (ifma == NULL)
2169                 return ENOENT;
2170
2171         if (ifma->ifma_refcount > 1) {
2172                 ifma->ifma_refcount--;
2173                 return 0;
2174         }
2175
2176         rt_newmaddrmsg(RTM_DELMADDR, ifma);
2177         sa = ifma->ifma_lladdr;
2178         crit_enter();
2179         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2180         /*
2181          * Make sure the interface driver is notified
2182          * when a link-layer multicast group is left.
2183          */
2184         if (ifma->ifma_addr->sa_family == AF_LINK && sa == NULL) {
2185                 ifnet_serialize_all(ifp);
2186                 ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2187                 ifnet_deserialize_all(ifp);
2188         }
2189         crit_exit();
2190         kfree(ifma->ifma_addr, M_IFMADDR);
2191         kfree(ifma, M_IFMADDR);
2192         if (sa == NULL)
2193                 return 0;
2194
2195         /*
2196          * Now look for the link-layer address which corresponds to
2197          * this network address.  It had been squirreled away in
2198          * ifma->ifma_lladdr for this purpose (so we don't have
2199          * to call ifp->if_resolvemulti() again), and we saved that
2200          * value in sa above.  If something nasty deleted the
2201          * link-layer address out from underneath us, we can cope, because
2202          * the address we stored is not the same as the one which is
2203          * in the record for the link-layer address.  (So we don't complain
2204          * in that case.)
2205          */
2206         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2207                 if (sa_equal(sa, ifma->ifma_addr))
2208                         break;
2209         if (ifma == NULL)
2210                 return 0;
2211
2212         if (ifma->ifma_refcount > 1) {
2213                 ifma->ifma_refcount--;
2214                 return 0;
2215         }
2216
2217         crit_enter();
2218         ifnet_serialize_all(ifp);
2219         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2220         ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2221         ifnet_deserialize_all(ifp);
2222         crit_exit();
2223         kfree(ifma->ifma_addr, M_IFMADDR);
2224         kfree(sa, M_IFMADDR);
2225         kfree(ifma, M_IFMADDR);
2226
2227         return 0;
2228 }
2229
2230 /*
2231  * Delete all multicast group memberships for an interface.
2232  * Should be used to quickly flush all multicast filters.
2233  */
2234 void
2235 if_delallmulti(struct ifnet *ifp)
2236 {
2237         struct ifmultiaddr *ifma;
2238         struct ifmultiaddr *next;
2239
2240         TAILQ_FOREACH_MUTABLE(ifma, &ifp->if_multiaddrs, ifma_link, next)
2241                 if_delmulti(ifp, ifma->ifma_addr);
2242 }
2243
2244
2245 /*
2246  * Set the link layer address on an interface.
2247  *
2248  * At this time we only support certain types of interfaces,
2249  * and we don't allow the length of the address to change.
2250  */
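/*
 * Usage sketch (illustrative only): changing the MAC of an Ethernet-type
 * interface passes the new address and its length, e.g.
 *
 *	u_char new_mac[ETHER_ADDR_LEN];		-- filled in by the caller
 *	...
 *	error = if_setlladdr(ifp, new_mac, sizeof(new_mac));
 *
 * The length must match the current sdl_alen, otherwise EINVAL is returned.
 */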
2251 int
2252 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
2253 {
2254         struct sockaddr_dl *sdl;
2255         struct ifreq ifr;
2256
2257         sdl = IF_LLSOCKADDR(ifp);
2258         if (sdl == NULL)
2259                 return (EINVAL);
2260         if (len != sdl->sdl_alen)       /* don't allow length to change */
2261                 return (EINVAL);
2262         switch (ifp->if_type) {
2263         case IFT_ETHER:                 /* these types use struct arpcom */
2264         case IFT_XETHER:
2265         case IFT_L2VLAN:
2266                 bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
2267                 bcopy(lladdr, LLADDR(sdl), len);
2268                 break;
2269         default:
2270                 return (ENODEV);
2271         }
2272         /*
2273          * If the interface is already up, we need
2274          * to re-init it in order to reprogram its
2275          * address filter.
2276          */
2277         ifnet_serialize_all(ifp);
2278         if ((ifp->if_flags & IFF_UP) != 0) {
2279 #ifdef INET
2280                 struct ifaddr_container *ifac;
2281 #endif
2282
2283                 ifp->if_flags &= ~IFF_UP;
2284                 ifr.ifr_flags = ifp->if_flags;
2285                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2286                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2287                               NULL);
2288                 ifp->if_flags |= IFF_UP;
2289                 ifr.ifr_flags = ifp->if_flags;
2290                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2291                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2292                                  NULL);
2293 #ifdef INET
2294                 /*
2295                  * Also send gratuitous ARPs to notify other nodes about
2296                  * the address change.
2297                  */
2298                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2299                         struct ifaddr *ifa = ifac->ifa;
2300
2301                         if (ifa->ifa_addr != NULL &&
2302                             ifa->ifa_addr->sa_family == AF_INET)
2303                                 arp_gratuitous(ifp, ifa);
2304                 }
2305 #endif
2306         }
2307         ifnet_deserialize_all(ifp);
2308         return (0);
2309 }
2310
2311 struct ifmultiaddr *
2312 ifmaof_ifpforaddr(struct sockaddr *sa, struct ifnet *ifp)
2313 {
2314         struct ifmultiaddr *ifma;
2315
2316         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2317                 if (sa_equal(ifma->ifma_addr, sa))
2318                         break;
2319
2320         return ifma;
2321 }
2322
2323 /*
2324  * This function locates the first real ethernet MAC from a network
2325  * card and loads it into node, returning 0 on success or ENOENT if
2326  * no suitable interfaces were found.  It is used by the uuid code to
2327  * generate a unique 6-byte number.
2328  */
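/*
 * Usage sketch (illustrative only): the caller supplies a buffer of at
 * least minlen bytes, e.g. for a UUID node field
 *
 *	uint16_t node[3];			-- 6 bytes total
 *
 *	if (if_getanyethermac(node, sizeof(node)) != 0)
 *		-- fall back to a randomly generated node value
 */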
2329 int
2330 if_getanyethermac(uint16_t *node, int minlen)
2331 {
2332         struct ifnet *ifp;
2333         struct sockaddr_dl *sdl;
2334
2335         TAILQ_FOREACH(ifp, &ifnet, if_link) {
2336                 if (ifp->if_type != IFT_ETHER)
2337                         continue;
2338                 sdl = IF_LLSOCKADDR(ifp);
2339                 if (sdl->sdl_alen < minlen)
2340                         continue;
2341                 bcopy(((struct arpcom *)ifp->if_softc)->ac_enaddr, node,
2342                       minlen);
2343                 return(0);
2344         }
2345         return (ENOENT);
2346 }
2347
2348 /*
2349  * The name argument must be a pointer to storage which will last as
2350  * long as the interface does.  For physical devices, the result of
2351  * device_get_name(dev) is a good choice, and for pseudo-devices a
2352  * static string works well.
2353  */
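/*
 * Usage sketch (illustrative only): a typical device attach routine does
 *
 *	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 *
 * which yields an if_xname such as "em0", while a pseudo device without a
 * unit would pass IF_DUNIT_NONE and a static name string.
 */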
2354 void
2355 if_initname(struct ifnet *ifp, const char *name, int unit)
2356 {
2357         ifp->if_dname = name;
2358         ifp->if_dunit = unit;
2359         if (unit != IF_DUNIT_NONE)
2360                 ksnprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
2361         else
2362                 strlcpy(ifp->if_xname, name, IFNAMSIZ);
2363 }
2364
2365 int
2366 if_printf(struct ifnet *ifp, const char *fmt, ...)
2367 {
2368         __va_list ap;
2369         int retval;
2370
2371         retval = kprintf("%s: ", ifp->if_xname);
2372         __va_start(ap, fmt);
2373         retval += kvprintf(fmt, ap);
2374         __va_end(ap);
2375         return (retval);
2376 }
2377
2378 struct ifnet *
2379 if_alloc(uint8_t type)
2380 {
2381         struct ifnet *ifp;
2382         size_t size;
2383
2384         /*
2385          * XXX temporary hack until arpcom is set up in if_l2com
2386          */
2387         if (type == IFT_ETHER)
2388                 size = sizeof(struct arpcom);
2389         else
2390                 size = sizeof(struct ifnet);
2391
2392         ifp = kmalloc(size, M_IFNET, M_WAITOK|M_ZERO);
2393
2394         ifp->if_type = type;
2395
2396         if (if_com_alloc[type] != NULL) {
2397                 ifp->if_l2com = if_com_alloc[type](type, ifp);
2398                 if (ifp->if_l2com == NULL) {
2399                         kfree(ifp, M_IFNET);
2400                         return (NULL);
2401                 }
2402         }
2403         return (ifp);
2404 }
2405
2406 void
2407 if_free(struct ifnet *ifp)
2408 {
2409         kfree(ifp, M_IFNET);
2410 }
2411
2412 void
2413 ifq_set_classic(struct ifaltq *ifq)
2414 {
2415         ifq->altq_enqueue = ifq_classic_enqueue;
2416         ifq->altq_dequeue = ifq_classic_dequeue;
2417         ifq->altq_request = ifq_classic_request;
2418 }
2419
2420 int
2421 ifq_classic_enqueue(struct ifaltq *ifq, struct mbuf *m,
2422                     struct altq_pktattr *pa __unused)
2423 {
2424         logifq(enqueue, ifq);
2425         if (IF_QFULL(ifq)) {
2426                 m_freem(m);
2427                 return(ENOBUFS);
2428         } else {
2429                 IF_ENQUEUE(ifq, m);
2430                 return(0);
2431         }       
2432 }
2433
2434 struct mbuf *
2435 ifq_classic_dequeue(struct ifaltq *ifq, struct mbuf *mpolled, int op)
2436 {
2437         struct mbuf *m;
2438
2439         switch (op) {
2440         case ALTDQ_POLL:
2441                 IF_POLL(ifq, m);
2442                 break;
2443         case ALTDQ_REMOVE:
2444                 logifq(dequeue, ifq);
2445                 IF_DEQUEUE(ifq, m);
2446                 break;
2447         default:
2448                 panic("unsupported ALTQ dequeue op: %d", op);
2449         }
2450         KKASSERT(mpolled == NULL || mpolled == m);
2451         return(m);
2452 }
2453
2454 int
2455 ifq_classic_request(struct ifaltq *ifq, int req, void *arg)
2456 {
2457         switch (req) {
2458         case ALTRQ_PURGE:
2459                 IF_DRAIN(ifq);
2460                 break;
2461         default:
2462                 panic("unsupported ALTQ request: %d", req);
2463         }
2464         return(0);
2465 }
2466
2467 static void
2468 ifq_try_ifstart(struct ifaltq *ifq, int force_sched)
2469 {
2470         struct ifnet *ifp = ifq->altq_ifp;
2471         int running = 0, need_sched;
2472
2473         /*
2474          * Try a direct ifnet.if_start first.  If there is contention
2475          * on the ifnet's serializer, ifnet.if_start will instead be
2476          * scheduled to run on the ifnet's CPU.
2477          */
2478         if (!ifnet_tryserialize_tx(ifp)) {
2479                 /*
2480                  * ifnet serializer contention happened;
2481                  * ifnet.if_start is scheduled on the ifnet's
2482                  * CPU and we keep going.
2483                  */
2484                 logifstart(contend_sched, ifp);
2485                 ifq_ifstart_schedule(ifq, 1);
2486                 return;
2487         }
2488
2489         if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq)) {
2490                 logifstart(run, ifp);
2491                 ifp->if_start(ifp);
2492                 if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
2493                         running = 1;
2494         }
2495         need_sched = if_start_need_schedule(ifq, running);
2496
2497         ifnet_deserialize_tx(ifp);
2498
2499         if (need_sched) {
2500                 /*
2501                  * More data needs to be transmitted; ifnet.if_start is
2502                  * scheduled on the ifnet's CPU and we keep going.
2503                  * NOTE: ifnet.if_start interlock is not released.
2504                  */
2505                 logifstart(sched, ifp);
2506                 ifq_ifstart_schedule(ifq, force_sched);
2507         }
2508 }
2509
2510 /*
2511  * IFQ packets staging mechanism:
2512  *
2513  * The packets enqueued into the IFQ are staged up to a certain amount before
2514  * the ifnet's if_start is called.  In this way, the driver can avoid writing
2515  * to hardware registers for every packet; instead, the hardware registers are
2516  * written once a certain number of packets has been put onto the hardware
2517  * TX ring.  Measurements on several modern NICs (emx(4), igb(4), bnx(4),
2518  * bge(4), jme(4)) show that this hardware register write aggregation
2519  * can save ~20% CPU time when 18-byte UDP datagrams are transmitted at
2520  * 1.48Mpps.  The performance improvement from hardware register write
2521  * aggregation is also mentioned in Luigi Rizzo's netmap paper
2522  * (http://info.iet.unipi.it/~luigi/netmap/).
2523  *
2524  * IFQ packets staging is performed for two entry points into the driver's
2525  * transmission function:
2526  * - Direct ifnet's if_start calling, i.e. ifq_try_ifstart()
2527  * - ifnet's if_start scheduling, i.e. ifq_ifstart_schedule()
2528  *
2529  * IFQ packets staging will be stopped upon any of the following conditions:
2530  * - The count of packets enqueued on the current CPU is greater than or
2531  *   equal to ifq_stage_cntmax. (XXX this should be per-interface)
2532  * - The total length of packets enqueued on the current CPU is greater
2533  *   than or equal to the hardware's MTU - max_protohdr.  max_protohdr is
2534  *   subtracted from the hardware's MTU mainly because a full TCP segment's
2535  *   size is usually less than the hardware's MTU.
2536  * - ifq_ifstart_schedule() is not pending on the current CPU and if_start
2537  *   interlock (if_snd.altq_started) is not released.
2538  * - The if_start_rollup(), which is registered as a low-priority netisr
2539  *   rollup function, is called; probably because no more work is pending
2540  *   for netisr.
2541  *
2542  * NOTE:
2543  * Currently IFQ packet staging is only performed in netisr threads.
2544  */
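/*
 * Worked example (illustrative numbers only): assume ifq_stage_cntmax is 4.
 * A netisr thread enqueueing small packets will then defer calling if_start
 * for the first three; the fourth enqueue no longer satisfies
 * "ifqs_cnt < ifq_stage_cntmax", staging stops and if_start is invoked once
 * for all four packets.  Likewise, as soon as the staged bytes reach
 * if_mtu - max_protohdr the staging is cut short, so large packets are
 * pushed to the hardware without additional batching delay.
 */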
2545 int
2546 ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
2547 {
2548         struct ifaltq *ifq = &ifp->if_snd;
2549         int error, start = 0, len, mcast = 0, avoid_start = 0;
2550         struct ifaltq_stage_head *head = NULL;
2551         struct ifaltq_stage *stage = NULL;
2552
2553         ASSERT_IFNET_NOT_SERIALIZED_TX(ifp);
2554
2555         len = m->m_pkthdr.len;
2556         if (m->m_flags & M_MCAST)
2557                 mcast = 1;
2558
2559         if (curthread->td_type == TD_TYPE_NETISR) {
2560                 head = &ifq_stage_heads[mycpuid];
2561                 stage = ifq_get_stage(ifq, mycpuid);
2562
2563                 stage->ifqs_cnt++;
2564                 stage->ifqs_len += len;
2565                 if (stage->ifqs_cnt < ifq_stage_cntmax &&
2566                     stage->ifqs_len < (ifp->if_mtu - max_protohdr))
2567                         avoid_start = 1;
2568         }
2569
2570         ALTQ_LOCK(ifq);
2571         error = ifq_enqueue_locked(ifq, m, pa);
2572         if (error) {
2573                 if (!ifq_data_ready(ifq)) {
2574                         ALTQ_UNLOCK(ifq);
2575                         return error;
2576                 }
2577                 avoid_start = 0;
2578         }
2579         if (!ifq_is_started(ifq)) {
2580                 if (avoid_start) {
2581                         ALTQ_UNLOCK(ifq);
2582
2583                         KKASSERT(!error);
2584                         if ((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0)
2585                                 ifq_stage_insert(head, stage);
2586
2587                         ifp->if_obytes += len;
2588                         if (mcast)
2589                                 ifp->if_omcasts++;
2590                         return error;
2591                 }
2592
2593                 /*
2594                  * Hold the interlock of ifnet.if_start
2595                  */
2596                 ifq_set_started(ifq);
2597                 start = 1;
2598         }
2599         ALTQ_UNLOCK(ifq);
2600
2601         if (!error) {
2602                 ifp->if_obytes += len;
2603                 if (mcast)
2604                         ifp->if_omcasts++;
2605         }
2606
2607         if (stage != NULL) {
2608                 if (!start && (stage->ifqs_flags & IFQ_STAGE_FLAG_SCHED)) {
2609                         KKASSERT(stage->ifqs_flags & IFQ_STAGE_FLAG_QUED);
2610                         if (!avoid_start) {
2611                                 ifq_stage_remove(head, stage);
2612                                 ifq_ifstart_schedule(ifq, 1);
2613                         }
2614                         return error;
2615                 }
2616
2617                 if (stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) {
2618                         ifq_stage_remove(head, stage);
2619                 } else {
2620                         stage->ifqs_cnt = 0;
2621                         stage->ifqs_len = 0;
2622                 }
2623         }
2624
2625         if (!start) {
2626                 logifstart(avoid, ifp);
2627                 return error;
2628         }
2629
2630         ifq_try_ifstart(ifq, 0);
2631         return error;
2632 }
2633
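/*
 * Allocate an ifaddr of the requested size together with one reference-
 * counting ifaddr_container per CPU.  Each container starts with a
 * reference count of 1; the per-CPU containers are what the per-CPU
 * if_addrheads lists link, so the ifaddr itself is only freed once every
 * CPU has dropped its last reference (see ifac_free() below).
 */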
2634 void *
2635 ifa_create(int size, int flags)
2636 {
2637         struct ifaddr *ifa;
2638         int i;
2639
2640         KASSERT(size >= sizeof(*ifa), ("ifaddr size too small"));
2641
2642         ifa = kmalloc(size, M_IFADDR, flags | M_ZERO);
2643         if (ifa == NULL)
2644                 return NULL;
2645
2646         ifa->ifa_containers = kmalloc(ncpus * sizeof(struct ifaddr_container),
2647                                       M_IFADDR, M_WAITOK | M_ZERO);
2648         ifa->ifa_ncnt = ncpus;
2649         for (i = 0; i < ncpus; ++i) {
2650                 struct ifaddr_container *ifac = &ifa->ifa_containers[i];
2651
2652                 ifac->ifa_magic = IFA_CONTAINER_MAGIC;
2653                 ifac->ifa = ifa;
2654                 ifac->ifa_refcnt = 1;
2655         }
2656 #ifdef IFADDR_DEBUG
2657         kprintf("alloc ifa %p %d\n", ifa, size);
2658 #endif
2659         return ifa;
2660 }
2661
2662 void
2663 ifac_free(struct ifaddr_container *ifac, int cpu_id)
2664 {
2665         struct ifaddr *ifa = ifac->ifa;
2666
2667         KKASSERT(ifac->ifa_magic == IFA_CONTAINER_MAGIC);
2668         KKASSERT(ifac->ifa_refcnt == 0);
2669         KASSERT(ifac->ifa_listmask == 0,
2670                 ("ifa is still on %#x lists", ifac->ifa_listmask));
2671
2672         ifac->ifa_magic = IFA_CONTAINER_DEAD;
2673
2674 #ifdef IFADDR_DEBUG_VERBOSE
2675         kprintf("try free ifa %p cpu_id %d\n", ifac->ifa, cpu_id);
2676 #endif
2677
2678         KASSERT(ifa->ifa_ncnt > 0 && ifa->ifa_ncnt <= ncpus,
2679                 ("invalid # of ifac, %d", ifa->ifa_ncnt));
2680         if (atomic_fetchadd_int(&ifa->ifa_ncnt, -1) == 1) {
2681 #ifdef IFADDR_DEBUG
2682                 kprintf("free ifa %p\n", ifa);
2683 #endif
2684                 kfree(ifa->ifa_containers, M_IFADDR);
2685                 kfree(ifa, M_IFADDR);
2686         }
2687 }
2688
2689 static void
2690 ifa_iflink_dispatch(netmsg_t nmsg)
2691 {
2692         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2693         struct ifaddr *ifa = msg->ifa;
2694         struct ifnet *ifp = msg->ifp;
2695         int cpu = mycpuid;
2696         struct ifaddr_container *ifac;
2697
2698         crit_enter();
2699
2700         ifac = &ifa->ifa_containers[cpu];
2701         ASSERT_IFAC_VALID(ifac);
2702         KASSERT((ifac->ifa_listmask & IFA_LIST_IFADDRHEAD) == 0,
2703                 ("ifaddr is on if_addrheads"));
2704
2705         ifac->ifa_listmask |= IFA_LIST_IFADDRHEAD;
2706         if (msg->tail)
2707                 TAILQ_INSERT_TAIL(&ifp->if_addrheads[cpu], ifac, ifa_link);
2708         else
2709                 TAILQ_INSERT_HEAD(&ifp->if_addrheads[cpu], ifac, ifa_link);
2710
2711         crit_exit();
2712
2713         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2714 }
2715
2716 void
2717 ifa_iflink(struct ifaddr *ifa, struct ifnet *ifp, int tail)
2718 {
2719         struct netmsg_ifaddr msg;
2720
2721         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2722                     0, ifa_iflink_dispatch);
2723         msg.ifa = ifa;
2724         msg.ifp = ifp;
2725         msg.tail = tail;
2726
2727         ifa_domsg(&msg.base.lmsg, 0);
2728 }
2729
2730 static void
2731 ifa_ifunlink_dispatch(netmsg_t nmsg)
2732 {
2733         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2734         struct ifaddr *ifa = msg->ifa;
2735         struct ifnet *ifp = msg->ifp;
2736         int cpu = mycpuid;
2737         struct ifaddr_container *ifac;
2738
2739         crit_enter();
2740
2741         ifac = &ifa->ifa_containers[cpu];
2742         ASSERT_IFAC_VALID(ifac);
2743         KASSERT(ifac->ifa_listmask & IFA_LIST_IFADDRHEAD,
2744                 ("ifaddr is not on if_addrhead"));
2745
2746         TAILQ_REMOVE(&ifp->if_addrheads[cpu], ifac, ifa_link);
2747         ifac->ifa_listmask &= ~IFA_LIST_IFADDRHEAD;
2748
2749         crit_exit();
2750
2751         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2752 }
2753
2754 void
2755 ifa_ifunlink(struct ifaddr *ifa, struct ifnet *ifp)
2756 {
2757         struct netmsg_ifaddr msg;
2758
2759         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2760                     0, ifa_ifunlink_dispatch);
2761         msg.ifa = ifa;
2762         msg.ifp = ifp;
2763
2764         ifa_domsg(&msg.base.lmsg, 0);
2765 }
2766
2767 static void
2768 ifa_destroy_dispatch(netmsg_t nmsg)
2769 {
2770         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2771
2772         IFAFREE(msg->ifa);
2773         ifa_forwardmsg(&nmsg->lmsg, mycpuid + 1);
2774 }
2775
2776 void
2777 ifa_destroy(struct ifaddr *ifa)
2778 {
2779         struct netmsg_ifaddr msg;
2780
2781         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2782                     0, ifa_destroy_dispatch);
2783         msg.ifa = ifa;
2784
2785         ifa_domsg(&msg.base.lmsg, 0);
2786 }
2787
2788 struct lwkt_port *
2789 ifnet_portfn(int cpu)
2790 {
2791         return &ifnet_threads[cpu].td_msgport;
2792 }
2793
2794 void
2795 ifnet_forwardmsg(struct lwkt_msg *lmsg, int next_cpu)
2796 {
2797         KKASSERT(next_cpu > mycpuid && next_cpu <= ncpus);
2798
2799         if (next_cpu < ncpus)
2800                 lwkt_forwardmsg(ifnet_portfn(next_cpu), lmsg);
2801         else
2802                 lwkt_replymsg(lmsg, 0);
2803 }
2804
2805 int
2806 ifnet_domsg(struct lwkt_msg *lmsg, int cpu)
2807 {
2808         KKASSERT(cpu < ncpus);
2809         return lwkt_domsg(ifnet_portfn(cpu), lmsg, 0);
2810 }
2811
2812 void
2813 ifnet_sendmsg(struct lwkt_msg *lmsg, int cpu)
2814 {
2815         KKASSERT(cpu < ncpus);
2816         lwkt_sendmsg(ifnet_portfn(cpu), lmsg);
2817 }
2818
2819 /*
2820  * Generic netmsg service loop.  Some protocols may roll their own but all
2821  * must do the basic command dispatch function call done here.
2822  */
2823 static void
2824 ifnet_service_loop(void *arg __unused)
2825 {
2826         netmsg_t msg;
2827
2828         while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) {
2829                 KASSERT(msg->base.nm_dispatch, ("ifnet_service: badmsg"));
2830                 msg->base.nm_dispatch(msg);
2831         }
2832 }
2833
2834 static void
2835 if_start_rollup(void)
2836 {
2837         struct ifaltq_stage_head *head = &ifq_stage_heads[mycpuid];
2838         struct ifaltq_stage *stage;
2839
2840         while ((stage = TAILQ_FIRST(&head->ifqs_head)) != NULL) {
2841                 struct ifaltq *ifq = stage->ifqs_altq;
2842                 int is_sched = 0;
2843
2844                 if (stage->ifqs_flags & IFQ_STAGE_FLAG_SCHED)
2845                         is_sched = 1;
2846                 ifq_stage_remove(head, stage);
2847
2848                 if (is_sched) {
2849                         ifq_ifstart_schedule(ifq, 1);
2850                 } else {
2851                         int start = 0;
2852
2853                         ALTQ_LOCK(ifq);
2854                         if (!ifq_is_started(ifq)) {
2855                                 /*
2856                                  * Hold the interlock of ifnet.if_start
2857                                  */
2858                                 ifq_set_started(ifq);
2859                                 start = 1;
2860                         }
2861                         ALTQ_UNLOCK(ifq);
2862
2863                         if (start)
2864                                 ifq_try_ifstart(ifq, 1);
2865                 }
2866                 KKASSERT((stage->ifqs_flags &
2867                     (IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED)) == 0);
2868         }
2869 }
2870
2871 static void
2872 ifnetinit(void *dummy __unused)
2873 {
2874         int i;
2875
2876         for (i = 0; i < ncpus; ++i) {
2877                 struct thread *thr = &ifnet_threads[i];
2878
2879                 lwkt_create(ifnet_service_loop, NULL, NULL,
2880                             thr, TDF_NOSTART|TDF_FORCE_SPINPORT,
2881                             i, "ifnet %d", i);
2882                 netmsg_service_port_init(&thr->td_msgport);
2883                 lwkt_schedule(thr);
2884         }
2885
2886         for (i = 0; i < ncpus; ++i)
2887                 TAILQ_INIT(&ifq_stage_heads[i].ifqs_head);
2888         netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
2889 }
2890
2891 struct ifnet *
2892 ifnet_byindex(unsigned short idx)
2893 {
2894         if (idx > if_index)
2895                 return NULL;
2896         return ifindex2ifnet[idx];
2897 }
2898
2899 struct ifaddr *
2900 ifaddr_byindex(unsigned short idx)
2901 {
2902         struct ifnet *ifp;
2903
2904         ifp = ifnet_byindex(idx);
2905         if (!ifp)
2906                 return NULL;
2907         return TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
2908 }
2909
2910 void
2911 if_register_com_alloc(u_char type,
2912     if_com_alloc_t *a, if_com_free_t *f)
2913 {
2914
2915         KASSERT(if_com_alloc[type] == NULL,
2916             ("if_register_com_alloc: %d already registered", type));
2917         KASSERT(if_com_free[type] == NULL,
2918             ("if_register_com_alloc: %d free already registered", type));
2919
2920         if_com_alloc[type] = a;
2921         if_com_free[type] = f;
2922 }
2923
2924 void
2925 if_deregister_com_alloc(u_char type)
2926 {
2927
2928         KASSERT(if_com_alloc[type] != NULL,
2929             ("if_deregister_com_alloc: %d not registered", type));
2930         KASSERT(if_com_free[type] != NULL,
2931             ("if_deregister_com_alloc: %d free not registered", type));
2932         if_com_alloc[type] = NULL;
2933         if_com_free[type] = NULL;
2934 }
2935
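/*
 * Clamp a requested ring count to a power of 2 that is no larger than
 * ncpus2 and cnt_max.  A cnt of zero or less selects cnt_max.  For
 * example, with cnt = 6, cnt_max = 8 and ncpus2 = 4 the result is 4;
 * with cnt = 3 the result is rounded down to 2.
 */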
2936 int
2937 if_ring_count2(int cnt, int cnt_max)
2938 {
2939         int shift = 0;
2940
2941         KASSERT(cnt_max >= 1 && powerof2(cnt_max),
2942             ("invalid ring count max %d", cnt_max));
2943
2944         if (cnt <= 0)
2945                 cnt = cnt_max;
2946         if (cnt > ncpus2)
2947                 cnt = ncpus2;
2948         if (cnt > cnt_max)
2949                 cnt = cnt_max;
2950
2951         while ((1 << (shift + 1)) <= cnt)
2952                 ++shift;
2953         cnt = 1 << shift;
2954
2955         KASSERT(cnt >= 1 && cnt <= ncpus2 && cnt <= cnt_max,
2956             ("calculate cnt %d, ncpus2 %d, cnt max %d",
2957              cnt, ncpus2, cnt_max));
2958         return cnt;
2959 }
2960
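/*
 * Set the send queue length limit.  Room for the packets that may still be
 * sitting in the per-CPU staging areas (up to ifq_stage_cntmax on each of
 * the ncpus CPUs) is added on top of the caller-supplied length.
 */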
2961 void
2962 ifq_set_maxlen(struct ifaltq *ifq, int len)
2963 {
2964         ifq->ifq_maxlen = len + (ncpus * ifq_stage_cntmax);
2965 }