if: Add if_devstart_sched to schedule ifnet.if_start call
[dragonfly.git] / sys / net / if.c
1 /*
2  * Copyright (c) 1980, 1986, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the University of
16  *      California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *      @(#)if.c        8.3 (Berkeley) 1/4/94
34  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
35  */
36
37 #include "opt_compat.h"
38 #include "opt_inet6.h"
39 #include "opt_inet.h"
40 #include "opt_ifpoll.h"
41
42 #include <sys/param.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/systm.h>
46 #include <sys/proc.h>
47 #include <sys/priv.h>
48 #include <sys/protosw.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/socketops.h>
52 #include <sys/protosw.h>
53 #include <sys/kernel.h>
54 #include <sys/ktr.h>
55 #include <sys/mutex.h>
56 #include <sys/sockio.h>
57 #include <sys/syslog.h>
58 #include <sys/sysctl.h>
59 #include <sys/domain.h>
60 #include <sys/thread.h>
61 #include <sys/serialize.h>
62 #include <sys/bus.h>
63
64 #include <sys/thread2.h>
65 #include <sys/msgport2.h>
66 #include <sys/mutex2.h>
67
68 #include <net/if.h>
69 #include <net/if_arp.h>
70 #include <net/if_dl.h>
71 #include <net/if_types.h>
72 #include <net/if_var.h>
73 #include <net/ifq_var.h>
74 #include <net/radix.h>
75 #include <net/route.h>
76 #include <net/if_clone.h>
77 #include <net/netisr.h>
78 #include <net/netmsg2.h>
79
80 #include <machine/atomic.h>
81 #include <machine/stdarg.h>
82 #include <machine/smp.h>
83
84 #if defined(INET) || defined(INET6)
85 /*XXX*/
86 #include <netinet/in.h>
87 #include <netinet/in_var.h>
88 #include <netinet/if_ether.h>
89 #ifdef INET6
90 #include <netinet6/in6_var.h>
91 #include <netinet6/in6_ifattach.h>
92 #endif
93 #endif
94
95 #if defined(COMPAT_43)
96 #include <emulation/43bsd/43bsd_socket.h>
97 #endif /* COMPAT_43 */
98
99 struct netmsg_ifaddr {
100         struct netmsg_base base;
101         struct ifaddr   *ifa;
102         struct ifnet    *ifp;
103         int             tail;
104 };
105
106 struct ifaltq_stage_head {
107         TAILQ_HEAD(, ifaltq_stage)      ifqs_head;
108 } __cachealign;
109
110 /*
111  * System initialization
112  */
113 static void     if_attachdomain(void *);
114 static void     if_attachdomain1(struct ifnet *);
115 static int      ifconf(u_long, caddr_t, struct ucred *);
116 static void     ifinit(void *);
117 static void     ifnetinit(void *);
118 static void     if_slowtimo(void *);
119 static void     link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
120 static int      if_rtdel(struct radix_node *, void *);
121
122 #ifdef INET6
123 /*
124  * XXX: declared here to avoid including many inet6 related files;
125  * should this be more generalized?
126  */
127 extern void     nd6_setmtu(struct ifnet *);
128 #endif
129
130 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
131 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
132
133 static int ifq_stage_cntmax = 4;
134 TUNABLE_INT("net.link.stage_cntmax", &ifq_stage_cntmax);
135 SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
136     &ifq_stage_cntmax, 0, "ifq staging packet count max");
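/*
 * The staging limit is read at boot from the loader tunable
 * net.link.stage_cntmax and may be changed at run time, e.g.
 * (illustrative value only):
 *
 *	sysctl net.link.stage_cntmax=8
 *
 * Setting it to 0 disables packet staging in ifq_ifstart_schedule().
 */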
137
138 SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL)
139 /* Must be after netisr_init */
140 SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_SECOND, ifnetinit, NULL)
141
142 static  if_com_alloc_t *if_com_alloc[256];
143 static  if_com_free_t *if_com_free[256];
144
145 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
146 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
147 MALLOC_DEFINE(M_IFNET, "ifnet", "interface structure");
148
149 int                     ifqmaxlen = IFQ_MAXLEN;
150 struct ifnethead        ifnet = TAILQ_HEAD_INITIALIZER(ifnet);
151
152 struct callout          if_slowtimo_timer;
153
154 int                     if_index = 0;
155 struct ifnet            **ifindex2ifnet = NULL;
156 static struct thread    ifnet_threads[MAXCPU];
157
158 static struct ifaltq_stage_head ifq_stage_heads[MAXCPU];
159
160 #define IFQ_KTR_STRING          "ifq=%p"
161 #define IFQ_KTR_ARGS    struct ifaltq *ifq
162 #ifndef KTR_IFQ
163 #define KTR_IFQ                 KTR_ALL
164 #endif
165 KTR_INFO_MASTER(ifq);
166 KTR_INFO(KTR_IFQ, ifq, enqueue, 0, IFQ_KTR_STRING, IFQ_KTR_ARGS);
167 KTR_INFO(KTR_IFQ, ifq, dequeue, 1, IFQ_KTR_STRING, IFQ_KTR_ARGS);
168 #define logifq(name, arg)       KTR_LOG(ifq_ ## name, arg)
169
170 #define IF_START_KTR_STRING     "ifp=%p"
171 #define IF_START_KTR_ARGS       struct ifnet *ifp
172 #ifndef KTR_IF_START
173 #define KTR_IF_START            KTR_ALL
174 #endif
175 KTR_INFO_MASTER(if_start);
176 KTR_INFO(KTR_IF_START, if_start, run, 0,
177          IF_START_KTR_STRING, IF_START_KTR_ARGS);
178 KTR_INFO(KTR_IF_START, if_start, sched, 1,
179          IF_START_KTR_STRING, IF_START_KTR_ARGS);
180 KTR_INFO(KTR_IF_START, if_start, avoid, 2,
181          IF_START_KTR_STRING, IF_START_KTR_ARGS);
182 KTR_INFO(KTR_IF_START, if_start, contend_sched, 3,
183          IF_START_KTR_STRING, IF_START_KTR_ARGS);
184 KTR_INFO(KTR_IF_START, if_start, chase_sched, 4,
185          IF_START_KTR_STRING, IF_START_KTR_ARGS);
186 #define logifstart(name, arg)   KTR_LOG(if_start_ ## name, arg)
187
188 TAILQ_HEAD(, ifg_group) ifg_head = TAILQ_HEAD_INITIALIZER(ifg_head);
189
190 /*
191  * Network interface utility routines.
192  *
193  * Routines with ifa_ifwith* names take sockaddr *'s as
194  * parameters.
195  */
196 /* ARGSUSED*/
197 void
198 ifinit(void *dummy)
199 {
200         struct ifnet *ifp;
201
202         callout_init(&if_slowtimo_timer);
203
204         crit_enter();
205         TAILQ_FOREACH(ifp, &ifnet, if_link) {
206                 if (ifp->if_snd.ifq_maxlen == 0) {
207                         if_printf(ifp, "XXX: driver didn't set ifq_maxlen\n");
208                         ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
209                 }
210         }
211         crit_exit();
212
213         if_slowtimo(0);
214 }
215
216 static int
217 if_start_cpuid(struct ifnet *ifp)
218 {
219         return ifp->if_cpuid;
220 }
221
222 #ifdef IFPOLL_ENABLE
223 static int
224 if_start_cpuid_npoll(struct ifnet *ifp)
225 {
226         int poll_cpuid = ifp->if_npoll_cpuid;
227
228         if (poll_cpuid >= 0)
229                 return poll_cpuid;
230         else
231                 return ifp->if_cpuid;
232 }
233 #endif
234
235 static void
236 ifq_ifstart_ipifunc(void *arg)
237 {
238         struct ifnet *ifp = arg;
239         struct lwkt_msg *lmsg = &ifp->if_start_nmsg[mycpuid].lmsg;
240
241         crit_enter();
242         if (lmsg->ms_flags & MSGF_DONE)
243                 lwkt_sendmsg(netisr_portfn(mycpuid), lmsg);
244         crit_exit();
245 }
246
247 static __inline void
248 ifq_stage_remove(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
249 {
250         KKASSERT(stage->ifqs_flags & IFQ_STAGE_FLAG_QUED);
251         TAILQ_REMOVE(&head->ifqs_head, stage, ifqs_link);
252         stage->ifqs_flags &= ~(IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED);
253         stage->ifqs_cnt = 0;
254         stage->ifqs_len = 0;
255 }
256
257 static __inline void
258 ifq_stage_insert(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
259 {
260         KKASSERT((stage->ifqs_flags &
261             (IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED)) == 0);
262         stage->ifqs_flags |= IFQ_STAGE_FLAG_QUED;
263         TAILQ_INSERT_TAIL(&head->ifqs_head, stage, ifqs_link);
264 }
265
266 /*
267  * Schedule ifnet.if_start on ifnet's CPU
268  */
269 static void
270 ifq_ifstart_schedule(struct ifaltq *ifq, int force)
271 {
272         struct ifnet *ifp = ifq->altq_ifp;
273         int cpu;
274
275         if (!force && curthread->td_type == TD_TYPE_NETISR &&
276             ifq_stage_cntmax > 0) {
277                 struct ifaltq_stage *stage = ifq_get_stage(ifq, mycpuid);
278
279                 stage->ifqs_cnt = 0;
280                 stage->ifqs_len = 0;
281                 if ((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0)
282                         ifq_stage_insert(&ifq_stage_heads[mycpuid], stage);
283                 stage->ifqs_flags |= IFQ_STAGE_FLAG_SCHED;
284                 return;
285         }
286
287         cpu = ifp->if_start_cpuid(ifp);
288         if (cpu != mycpuid)
289                 lwkt_send_ipiq(globaldata_find(cpu), ifq_ifstart_ipifunc, ifp);
290         else
291                 ifq_ifstart_ipifunc(ifp);
292 }
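/*
 * Summary of the scheduling path above: when called from a netisr thread
 * with force=0 and staging enabled (ifq_stage_cntmax > 0), the request is
 * only parked on this CPU's staging list and marked IFQ_STAGE_FLAG_SCHED,
 * deferring the dispatch.  Otherwise ifq_ifstart_ipifunc() is run on the
 * ifnet's if_start CPU -- directly if we are already on it, else via an
 * IPI -- and it sends that CPU's if_start_nmsg to its netisr port, which
 * ends up in if_start_dispatch() below.
 */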
293
294 /*
295  * NOTE:
296  * This function releases the ifnet.if_start interlock
297  * if ifnet.if_start does not need to be scheduled.
298  */
299 static __inline int
300 if_start_need_schedule(struct ifaltq *ifq, int running)
301 {
302         if (!running || ifq_is_empty(ifq)
303 #ifdef ALTQ
304             || ifq->altq_tbr != NULL
305 #endif
306         ) {
307                 ALTQ_LOCK(ifq);
308                 /*
309                  * ifnet.if_start interlock is released, if:
310                  * 1) Hardware cannot take any packets, due to
311                  *    o  interface is marked down
312                  *    o  hardware queue is full (ifq_is_oactive)
313                  *    Under the second situation, hardware interrupt
314                  *    or polling(4) will call/schedule ifnet.if_start
315                  *    when hardware queue is ready
316                  * 2) There is no packet in ifnet.if_snd.
317                  *    Further ifq_dispatch or ifq_handoff will call/
318                  *    schedule ifnet.if_start
319                  * 3) TBR is used and it does not allow further
320                  *    dequeueing.
321                  *    TBR callout will call ifnet.if_start
322                  */
323                 if (!running || !ifq_data_ready(ifq)) {
324                         ifq_clr_started(ifq);
325                         ALTQ_UNLOCK(ifq);
326                         return 0;
327                 }
328                 ALTQ_UNLOCK(ifq);
329         }
330         return 1;
331 }
332
333 static void
334 if_start_dispatch(netmsg_t msg)
335 {
336         struct lwkt_msg *lmsg = &msg->base.lmsg;
337         struct ifnet *ifp = lmsg->u.ms_resultp;
338         struct ifaltq *ifq = &ifp->if_snd;
339         int running = 0, need_sched;
340
341         crit_enter();
342         lwkt_replymsg(lmsg, 0); /* reply ASAP */
343         crit_exit();
344
345         if (mycpuid != ifp->if_start_cpuid(ifp)) {
346                 /*
347                  * We need to chase the ifnet CPU change.
348                  */
349                 logifstart(chase_sched, ifp);
350                 ifq_ifstart_schedule(ifq, 1);
351                 return;
352         }
353
354         ifnet_serialize_tx(ifp);
355         if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq)) {
356                 logifstart(run, ifp);
357                 ifp->if_start(ifp);
358                 if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
359                         running = 1;
360         }
361         need_sched = if_start_need_schedule(ifq, running);
362         ifnet_deserialize_tx(ifp);
363
364         if (need_sched) {
365                 /*
366                  * More data needs to be transmitted, ifnet.if_start is
367                  * scheduled on ifnet's CPU, and we keep going.
368                  * NOTE: ifnet.if_start interlock is not released.
369                  */
370                 logifstart(sched, ifp);
371                 ifq_ifstart_schedule(ifq, 0);
372         }
373 }
374
375 /* Device driver ifnet.if_start helper function */
376 void
377 if_devstart(struct ifnet *ifp)
378 {
379         struct ifaltq *ifq = &ifp->if_snd;
380         int running = 0;
381
382         ASSERT_IFNET_SERIALIZED_TX(ifp);
383
384         ALTQ_LOCK(ifq);
385         if (ifq_is_started(ifq) || !ifq_data_ready(ifq)) {
386                 logifstart(avoid, ifp);
387                 ALTQ_UNLOCK(ifq);
388                 return;
389         }
390         ifq_set_started(ifq);
391         ALTQ_UNLOCK(ifq);
392
393         logifstart(run, ifp);
394         ifp->if_start(ifp);
395
396         if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
397                 running = 1;
398
399         if (if_start_need_schedule(ifq, running)) {
400                 /*
401                  * More data needs to be transmitted, ifnet.if_start is
402                  * scheduled on ifnet's CPU, and we keep going.
403                  * NOTE: ifnet.if_start interlock is not released.
404                  */
405                 logifstart(sched, ifp);
406                 ifq_ifstart_schedule(ifq, 0);
407         }
408 }
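/*
 * Usage sketch (hypothetical driver code, not part of this file; the
 * mydrv_* names are illustrative): a driver typically kicks the transmit
 * path from its TX-completion handler, while already holding the TX
 * serializer that ASSERT_IFNET_SERIALIZED_TX() above requires:
 *
 *	static void
 *	mydrv_txeof(struct mydrv_softc *sc)
 *	{
 *		struct ifnet *ifp = &sc->arpcom.ac_if;
 *
 *		... reclaim completed TX descriptors ...
 *
 *		if (!ifq_is_empty(&ifp->if_snd))
 *			if_devstart(ifp);
 *	}
 */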
409
410 /* Device driver ifnet.if_start schedule helper function */
411 void
412 if_devstart_sched(struct ifnet *ifp)
413 {
414         ifq_ifstart_schedule(&ifp->if_snd, 1);
415 }
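/*
 * Usage sketch (hypothetical, not part of this file): unlike if_devstart(),
 * if_devstart_sched() does not run if_start inline; it forces
 * ifq_ifstart_schedule() so that if_start_dispatch() runs on the ifnet's
 * if_start CPU.  A driver may use it from a context where running the
 * transmit routine directly is undesirable, e.g. a link-state or timer
 * handler:
 *
 *	if (sc->link_up && !ifq_is_empty(&ifp->if_snd))
 *		if_devstart_sched(ifp);
 *
 * (sc->link_up is an illustrative driver field.)
 */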
416
417 static void
418 if_default_serialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
419 {
420         lwkt_serialize_enter(ifp->if_serializer);
421 }
422
423 static void
424 if_default_deserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
425 {
426         lwkt_serialize_exit(ifp->if_serializer);
427 }
428
429 static int
430 if_default_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
431 {
432         return lwkt_serialize_try(ifp->if_serializer);
433 }
434
435 #ifdef INVARIANTS
436 static void
437 if_default_serialize_assert(struct ifnet *ifp,
438                             enum ifnet_serialize slz __unused,
439                             boolean_t serialized)
440 {
441         if (serialized)
442                 ASSERT_SERIALIZED(ifp->if_serializer);
443         else
444                 ASSERT_NOT_SERIALIZED(ifp->if_serializer);
445 }
446 #endif
447
448 /*
449  * Attach an interface to the list of "active" interfaces.
450  *
451  * The serializer is optional.  If non-NULL, access to the interface
452  * may be MPSAFE.
453  */
454 void
455 if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
456 {
457         unsigned socksize, ifasize;
458         int namelen, masklen;
459         struct sockaddr_dl *sdl;
460         struct ifaddr *ifa;
461         struct ifaltq *ifq;
462         int i;
463
464         static int if_indexlim = 8;
465
466         if (ifp->if_serialize != NULL) {
467                 KASSERT(ifp->if_deserialize != NULL &&
468                         ifp->if_tryserialize != NULL &&
469                         ifp->if_serialize_assert != NULL,
470                         ("serialize functions are partially setup"));
471
472                 /*
473                  * If the device supplies serialize functions,
474                  * then clear if_serializer to catch any invalid
475                  * usage of this field.
476                  */
477                 KASSERT(serializer == NULL,
478                         ("both serialize functions and default serializer "
479                          "are supplied"));
480                 ifp->if_serializer = NULL;
481         } else {
482                 KASSERT(ifp->if_deserialize == NULL &&
483                         ifp->if_tryserialize == NULL &&
484                         ifp->if_serialize_assert == NULL,
485                         ("serialize functions are partially setup"));
486                 ifp->if_serialize = if_default_serialize;
487                 ifp->if_deserialize = if_default_deserialize;
488                 ifp->if_tryserialize = if_default_tryserialize;
489 #ifdef INVARIANTS
490                 ifp->if_serialize_assert = if_default_serialize_assert;
491 #endif
492
493                 /*
494                  * The serializer can be passed in from the device,
495                  * allowing the same serializer to be used for both
496                  * the interrupt interlock and the device queue.
497                  * If not specified, the netif structure will use an
498                  * embedded serializer.
499                  */
500                 if (serializer == NULL) {
501                         serializer = &ifp->if_default_serializer;
502                         lwkt_serialize_init(serializer);
503                 }
504                 ifp->if_serializer = serializer;
505         }
506
507         ifp->if_start_cpuid = if_start_cpuid;
508         ifp->if_cpuid = 0;
509
510 #ifdef IFPOLL_ENABLE
511         /* Device is not in polling mode by default */
512         ifp->if_npoll_cpuid = -1;
513         if (ifp->if_npoll != NULL)
514                 ifp->if_start_cpuid = if_start_cpuid_npoll;
515 #endif
516
517         ifp->if_start_nmsg = kmalloc(ncpus * sizeof(*ifp->if_start_nmsg),
518                                      M_LWKTMSG, M_WAITOK);
519         for (i = 0; i < ncpus; ++i) {
520                 netmsg_init(&ifp->if_start_nmsg[i], NULL, &netisr_adone_rport,
521                             0, if_start_dispatch);
522                 ifp->if_start_nmsg[i].lmsg.u.ms_resultp = ifp;
523         }
524
525         mtx_init(&ifp->if_ioctl_mtx);
526         mtx_lock(&ifp->if_ioctl_mtx);
527
528         TAILQ_INSERT_TAIL(&ifnet, ifp, if_link);
529         ifp->if_index = ++if_index;
530
531         /*
532          * XXX -
533          * The old code would work if the interface passed a pre-existing
534          * chain of ifaddrs to this code.  We don't trust our callers to
535          * properly initialize the tailq, however, so we no longer allow
536          * this unlikely case.
537          */
538         ifp->if_addrheads = kmalloc(ncpus * sizeof(struct ifaddrhead),
539                                     M_IFADDR, M_WAITOK | M_ZERO);
540         for (i = 0; i < ncpus; ++i)
541                 TAILQ_INIT(&ifp->if_addrheads[i]);
542
543         TAILQ_INIT(&ifp->if_prefixhead);
544         TAILQ_INIT(&ifp->if_multiaddrs);
545         TAILQ_INIT(&ifp->if_groups);
546         getmicrotime(&ifp->if_lastchange);
547         if (ifindex2ifnet == NULL || if_index >= if_indexlim) {
548                 unsigned int n;
549                 struct ifnet **q;
550
551                 if_indexlim <<= 1;
552
553                 /* grow ifindex2ifnet */
554                 n = if_indexlim * sizeof(*q);
555                 q = kmalloc(n, M_IFADDR, M_WAITOK | M_ZERO);
556                 if (ifindex2ifnet) {
557                         bcopy(ifindex2ifnet, q, n/2);
558                         kfree(ifindex2ifnet, M_IFADDR);
559                 }
560                 ifindex2ifnet = q;
561         }
562
563         ifindex2ifnet[if_index] = ifp;
564
565         /*
566          * create a Link Level name for this device
567          */
568         namelen = strlen(ifp->if_xname);
569         masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
570         socksize = masklen + ifp->if_addrlen;
571 #define ROUNDUP(a) (1 + (((a) - 1) | (sizeof(long) - 1)))
572         if (socksize < sizeof(*sdl))
573                 socksize = sizeof(*sdl);
574         socksize = ROUNDUP(socksize);
575 #undef ROUNDUP
576         ifasize = sizeof(struct ifaddr) + 2 * socksize;
577         ifa = ifa_create(ifasize, M_WAITOK);
578         sdl = (struct sockaddr_dl *)(ifa + 1);
579         sdl->sdl_len = socksize;
580         sdl->sdl_family = AF_LINK;
581         bcopy(ifp->if_xname, sdl->sdl_data, namelen);
582         sdl->sdl_nlen = namelen;
583         sdl->sdl_index = ifp->if_index;
584         sdl->sdl_type = ifp->if_type;
585         ifp->if_lladdr = ifa;
586         ifa->ifa_ifp = ifp;
587         ifa->ifa_rtrequest = link_rtrequest;
588         ifa->ifa_addr = (struct sockaddr *)sdl;
589         sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
590         ifa->ifa_netmask = (struct sockaddr *)sdl;
591         sdl->sdl_len = masklen;
592         while (namelen != 0)
593                 sdl->sdl_data[--namelen] = 0xff;
594         ifa_iflink(ifa, ifp, 0 /* Insert head */);
595
596         EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
597         devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
598
599         ifq = &ifp->if_snd;
600         ifq->altq_type = 0;
601         ifq->altq_disc = NULL;
602         ifq->altq_flags &= ALTQF_CANTCHANGE;
603         ifq->altq_tbr = NULL;
604         ifq->altq_ifp = ifp;
605         ifq->altq_started = 0;
606         ifq->altq_prepended = NULL;
607         ALTQ_LOCK_INIT(ifq);
608         ifq_set_classic(ifq);
609
610         ifq->altq_stage =
611             kmalloc_cachealign(ncpus * sizeof(struct ifaltq_stage),
612             M_DEVBUF, M_WAITOK | M_ZERO);
613         for (i = 0; i < ncpus; ++i)
614                 ifq->altq_stage[i].ifqs_altq = ifq;
615
616         if (!SLIST_EMPTY(&domains))
617                 if_attachdomain1(ifp);
618
619         /* Announce the interface. */
620         rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
621
622         mtx_unlock(&ifp->if_ioctl_mtx);
623 }
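/*
 * Attachment sketch (hypothetical driver code; the mydrv_* names are
 * illustrative): if_attach() accepts either of the two styles enforced
 * by the KASSERTs above.
 *
 * 1) Use the default serialize ops backed by a single lwkt serializer
 *    (or pass NULL to use the embedded if_default_serializer):
 *
 *	if_attach(ifp, &sc->sc_serializer);
 *
 * 2) Supply all of the per-ifnet serialize methods and pass a NULL
 *    serializer:
 *
 *	ifp->if_serialize = mydrv_serialize;
 *	ifp->if_deserialize = mydrv_deserialize;
 *	ifp->if_tryserialize = mydrv_tryserialize;
 *	ifp->if_serialize_assert = mydrv_serialize_assert;  (INVARIANTS)
 *	if_attach(ifp, NULL);
 */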
624
625 static void
626 if_attachdomain(void *dummy)
627 {
628         struct ifnet *ifp;
629
630         crit_enter();
631         TAILQ_FOREACH(ifp, &ifnet, if_list)
632                 if_attachdomain1(ifp);
633         crit_exit();
634 }
635 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST,
636         if_attachdomain, NULL);
637
638 static void
639 if_attachdomain1(struct ifnet *ifp)
640 {
641         struct domain *dp;
642
643         crit_enter();
644
645         /* address family dependent data region */
646         bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
647         SLIST_FOREACH(dp, &domains, dom_next)
648                 if (dp->dom_ifattach)
649                         ifp->if_afdata[dp->dom_family] =
650                                 (*dp->dom_ifattach)(ifp);
651         crit_exit();
652 }
653
654 /*
655  * Purge all addresses whose type is _not_ AF_LINK
656  */
657 void
658 if_purgeaddrs_nolink(struct ifnet *ifp)
659 {
660         struct ifaddr_container *ifac, *next;
661
662         TAILQ_FOREACH_MUTABLE(ifac, &ifp->if_addrheads[mycpuid],
663                               ifa_link, next) {
664                 struct ifaddr *ifa = ifac->ifa;
665
666                 /* Leave link ifaddr as it is */
667                 if (ifa->ifa_addr->sa_family == AF_LINK)
668                         continue;
669 #ifdef INET
670                 /* XXX: Ugly!! ad hoc just for INET */
671                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
672                         struct ifaliasreq ifr;
673 #ifdef IFADDR_DEBUG_VERBOSE
674                         int i;
675
676                         kprintf("purge in4 addr %p: ", ifa);
677                         for (i = 0; i < ncpus; ++i)
678                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
679                         kprintf("\n");
680 #endif
681
682                         bzero(&ifr, sizeof ifr);
683                         ifr.ifra_addr = *ifa->ifa_addr;
684                         if (ifa->ifa_dstaddr)
685                                 ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
686                         if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
687                                        NULL) == 0)
688                                 continue;
689                 }
690 #endif /* INET */
691 #ifdef INET6
692                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6) {
693 #ifdef IFADDR_DEBUG_VERBOSE
694                         int i;
695
696                         kprintf("purge in6 addr %p: ", ifa);
697                         for (i = 0; i < ncpus; ++i)
698                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
699                         kprintf("\n");
700 #endif
701
702                         in6_purgeaddr(ifa);
703                         /* ifp_addrhead is already updated */
704                         continue;
705                 }
706 #endif /* INET6 */
707                 ifa_ifunlink(ifa, ifp);
708                 ifa_destroy(ifa);
709         }
710 }
711
712 static void
713 ifq_stage_detach_handler(netmsg_t nmsg)
714 {
715         struct ifaltq *ifq = nmsg->lmsg.u.ms_resultp;
716         struct ifaltq_stage *stage = ifq_get_stage(ifq, mycpuid);
717
718         if (stage->ifqs_flags & IFQ_STAGE_FLAG_QUED)
719                 ifq_stage_remove(&ifq_stage_heads[mycpuid], stage);
720         lwkt_replymsg(&nmsg->lmsg, 0);
721 }
722
723 static void
724 ifq_stage_detach(struct ifaltq *ifq)
725 {
726         struct netmsg_base base;
727         int cpu;
728
729         netmsg_init(&base, NULL, &curthread->td_msgport, 0,
730             ifq_stage_detach_handler);
731         base.lmsg.u.ms_resultp = ifq;
732
733         for (cpu = 0; cpu < ncpus; ++cpu)
734                 lwkt_domsg(netisr_portfn(cpu), &base.lmsg, 0);
735 }
736
737 /*
738  * Detach an interface, removing it from the
739  * list of "active" interfaces.
740  */
741 void
742 if_detach(struct ifnet *ifp)
743 {
744         struct radix_node_head  *rnh;
745         int i;
746         int cpu, origcpu;
747         struct domain *dp;
748
749         EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
750
751         /*
752          * Remove routes and flush queues.
753          */
754         crit_enter();
755 #ifdef IFPOLL_ENABLE
756         if (ifp->if_flags & IFF_NPOLLING)
757                 ifpoll_deregister(ifp);
758 #endif
759         if_down(ifp);
760
761 #ifdef ALTQ
762         if (ifq_is_enabled(&ifp->if_snd))
763                 altq_disable(&ifp->if_snd);
764         if (ifq_is_attached(&ifp->if_snd))
765                 altq_detach(&ifp->if_snd);
766 #endif
767
768         /*
769          * Clean up all addresses.
770          */
771         ifp->if_lladdr = NULL;
772
773         if_purgeaddrs_nolink(ifp);
774         if (!TAILQ_EMPTY(&ifp->if_addrheads[mycpuid])) {
775                 struct ifaddr *ifa;
776
777                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
778                 KASSERT(ifa->ifa_addr->sa_family == AF_LINK,
779                         ("non-link ifaddr is left on if_addrheads"));
780
781                 ifa_ifunlink(ifa, ifp);
782                 ifa_destroy(ifa);
783                 KASSERT(TAILQ_EMPTY(&ifp->if_addrheads[mycpuid]),
784                         ("there are still ifaddrs left on if_addrheads"));
785         }
786
787 #ifdef INET
788         /*
789          * Remove all IPv4 kernel structures related to ifp.
790          */
791         in_ifdetach(ifp);
792 #endif
793
794 #ifdef INET6
795         /*
796          * Remove all IPv6 kernel structs related to ifp.  This should be done
797          * before removing routing entries below, since IPv6 interface direct
798          * routes are expected to be removed by the IPv6-specific kernel API.
799          * Otherwise, the kernel will detect some inconsistency and complain.
800          */
801         in6_ifdetach(ifp);
802 #endif
803
804         /*
805          * Delete all remaining routes using this interface.
806          * Unfortunately the only way to do this is to slog through
807          * the entire routing table looking for routes which point
808          * to this interface...oh well...
809          */
810         origcpu = mycpuid;
811         for (cpu = 0; cpu < ncpus; cpu++) {
812                 lwkt_migratecpu(cpu);
813                 for (i = 1; i <= AF_MAX; i++) {
814                         if ((rnh = rt_tables[cpu][i]) == NULL)
815                                 continue;
816                         rnh->rnh_walktree(rnh, if_rtdel, ifp);
817                 }
818         }
819         lwkt_migratecpu(origcpu);
820
821         /* Announce that the interface is gone. */
822         rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
823         devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
824
825         SLIST_FOREACH(dp, &domains, dom_next)
826                 if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
827                         (*dp->dom_ifdetach)(ifp,
828                                 ifp->if_afdata[dp->dom_family]);
829
830         /*
831          * Remove interface from ifindex2ifnet[] and maybe decrement if_index.
832          */
833         ifindex2ifnet[ifp->if_index] = NULL;
834         while (if_index > 0 && ifindex2ifnet[if_index] == NULL)
835                 if_index--;
836
837         TAILQ_REMOVE(&ifnet, ifp, if_link);
838         kfree(ifp->if_addrheads, M_IFADDR);
839
840         lwkt_synchronize_ipiqs("if_detach");
841         ifq_stage_detach(&ifp->if_snd);
842
843         kfree(ifp->if_start_nmsg, M_LWKTMSG);
844         kfree(ifp->if_snd.altq_stage, M_DEVBUF);
845         crit_exit();
846 }
847
848 /*
849  * Create interface group without members
850  */
851 struct ifg_group *
852 if_creategroup(const char *groupname)
853 {
854         struct ifg_group        *ifg = NULL;
855
856         if ((ifg = (struct ifg_group *)kmalloc(sizeof(struct ifg_group),
857             M_TEMP, M_NOWAIT)) == NULL)
858                 return (NULL);
859
860         strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
861         ifg->ifg_refcnt = 0;
862         ifg->ifg_carp_demoted = 0;
863         TAILQ_INIT(&ifg->ifg_members);
864 #if NPF > 0
865         pfi_attach_ifgroup(ifg);
866 #endif
867         TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next);
868
869         return (ifg);
870 }
871
872 /*
873  * Add a group to an interface
874  */
875 int
876 if_addgroup(struct ifnet *ifp, const char *groupname)
877 {
878         struct ifg_list         *ifgl;
879         struct ifg_group        *ifg = NULL;
880         struct ifg_member       *ifgm;
881
882         if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
883             groupname[strlen(groupname) - 1] <= '9')
884                 return (EINVAL);
885
886         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
887                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
888                         return (EEXIST);
889
890         if ((ifgl = kmalloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
891                 return (ENOMEM);
892
893         if ((ifgm = kmalloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
894                 kfree(ifgl, M_TEMP);
895                 return (ENOMEM);
896         }
897
898         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
899                 if (!strcmp(ifg->ifg_group, groupname))
900                         break;
901
902         if (ifg == NULL && (ifg = if_creategroup(groupname)) == NULL) {
903                 kfree(ifgl, M_TEMP);
904                 kfree(ifgm, M_TEMP);
905                 return (ENOMEM);
906         }
907
908         ifg->ifg_refcnt++;
909         ifgl->ifgl_group = ifg;
910         ifgm->ifgm_ifp = ifp;
911
912         TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
913         TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
914
915 #if NPF > 0
916         pfi_group_change(groupname);
917 #endif
918
919         return (0);
920 }
921
922 /*
923  * Remove a group from an interface
924  */
925 int
926 if_delgroup(struct ifnet *ifp, const char *groupname)
927 {
928         struct ifg_list         *ifgl;
929         struct ifg_member       *ifgm;
930
931         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
932                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
933                         break;
934         if (ifgl == NULL)
935                 return (ENOENT);
936
937         TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
938
939         TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
940                 if (ifgm->ifgm_ifp == ifp)
941                         break;
942
943         if (ifgm != NULL) {
944                 TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
945                 kfree(ifgm, M_TEMP);
946         }
947
948         if (--ifgl->ifgl_group->ifg_refcnt == 0) {
949                 TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next);
950 #if NPF > 0
951                 pfi_detach_ifgroup(ifgl->ifgl_group);
952 #endif
953                 kfree(ifgl->ifgl_group, M_TEMP);
954         }
955
956         kfree(ifgl, M_TEMP);
957
958 #if NPF > 0
959         pfi_group_change(groupname);
960 #endif
961
962         return (0);
963 }
964
965 /*
966  * Stores all groups from an interface in memory pointed
967  * to by data
968  */
969 int
970 if_getgroup(caddr_t data, struct ifnet *ifp)
971 {
972         int                      len, error;
973         struct ifg_list         *ifgl;
974         struct ifg_req           ifgrq, *ifgp;
975         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
976
977         if (ifgr->ifgr_len == 0) {
978                 TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
979                         ifgr->ifgr_len += sizeof(struct ifg_req);
980                 return (0);
981         }
982
983         len = ifgr->ifgr_len;
984         ifgp = ifgr->ifgr_groups;
985         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
986                 if (len < sizeof(ifgrq))
987                         return (EINVAL);
988                 bzero(&ifgrq, sizeof ifgrq);
989                 strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
990                     sizeof(ifgrq.ifgrq_group));
991                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
992                     sizeof(struct ifg_req))))
993                         return (error);
994                 len -= sizeof(ifgrq);
995                 ifgp++;
996         }
997
998         return (0);
999 }
1000
1001 /*
1002  * Stores all members of a group in memory pointed to by data
1003  */
1004 int
1005 if_getgroupmembers(caddr_t data)
1006 {
1007         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
1008         struct ifg_group        *ifg;
1009         struct ifg_member       *ifgm;
1010         struct ifg_req           ifgrq, *ifgp;
1011         int                      len, error;
1012
1013         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1014                 if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
1015                         break;
1016         if (ifg == NULL)
1017                 return (ENOENT);
1018
1019         if (ifgr->ifgr_len == 0) {
1020                 TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
1021                         ifgr->ifgr_len += sizeof(ifgrq);
1022                 return (0);
1023         }
1024
1025         len = ifgr->ifgr_len;
1026         ifgp = ifgr->ifgr_groups;
1027         TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
1028                 if (len < sizeof(ifgrq))
1029                         return (EINVAL);
1030                 bzero(&ifgrq, sizeof ifgrq);
1031                 strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
1032                     sizeof(ifgrq.ifgrq_member));
1033                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1034                     sizeof(struct ifg_req))))
1035                         return (error);
1036                 len -= sizeof(ifgrq);
1037                 ifgp++;
1038         }
1039
1040         return (0);
1041 }
1042
1043 /*
1044  * Delete Routes for a Network Interface
1045  *
1046  * Called for each routing entry via the rnh->rnh_walktree() call above
1047  * to delete all route entries referencing a detaching network interface.
1048  *
1049  * Arguments:
1050  *      rn      pointer to node in the routing table
1051  *      arg     argument passed to rnh->rnh_walktree() - detaching interface
1052  *
1053  * Returns:
1054  *      0       successful
1055  *      errno   failed - reason indicated
1056  *
1057  */
1058 static int
1059 if_rtdel(struct radix_node *rn, void *arg)
1060 {
1061         struct rtentry  *rt = (struct rtentry *)rn;
1062         struct ifnet    *ifp = arg;
1063         int             err;
1064
1065         if (rt->rt_ifp == ifp) {
1066
1067                 /*
1068                  * Protect (sorta) against walktree recursion problems
1069                  * with cloned routes
1070                  */
1071                 if (!(rt->rt_flags & RTF_UP))
1072                         return (0);
1073
1074                 err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1075                                 rt_mask(rt), rt->rt_flags,
1076                                 NULL);
1077                 if (err) {
1078                         log(LOG_WARNING, "if_rtdel: error %d\n", err);
1079                 }
1080         }
1081
1082         return (0);
1083 }
1084
1085 /*
1086  * Locate an interface based on a complete address.
1087  */
1088 struct ifaddr *
1089 ifa_ifwithaddr(struct sockaddr *addr)
1090 {
1091         struct ifnet *ifp;
1092
1093         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1094                 struct ifaddr_container *ifac;
1095
1096                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1097                         struct ifaddr *ifa = ifac->ifa;
1098
1099                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1100                                 continue;
1101                         if (sa_equal(addr, ifa->ifa_addr))
1102                                 return (ifa);
1103                         if ((ifp->if_flags & IFF_BROADCAST) &&
1104                             ifa->ifa_broadaddr &&
1105                             /* IPv6 doesn't have broadcast */
1106                             ifa->ifa_broadaddr->sa_len != 0 &&
1107                             sa_equal(ifa->ifa_broadaddr, addr))
1108                                 return (ifa);
1109                 }
1110         }
1111         return (NULL);
1112 }
1113 /*
1114  * Locate the point to point interface with a given destination address.
1115  */
1116 struct ifaddr *
1117 ifa_ifwithdstaddr(struct sockaddr *addr)
1118 {
1119         struct ifnet *ifp;
1120
1121         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1122                 struct ifaddr_container *ifac;
1123
1124                 if (!(ifp->if_flags & IFF_POINTOPOINT))
1125                         continue;
1126
1127                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1128                         struct ifaddr *ifa = ifac->ifa;
1129
1130                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1131                                 continue;
1132                         if (ifa->ifa_dstaddr &&
1133                             sa_equal(addr, ifa->ifa_dstaddr))
1134                                 return (ifa);
1135                 }
1136         }
1137         return (NULL);
1138 }
1139
1140 /*
1141  * Find an interface on a specific network.  If more than one matches,
1142  * the most specific one found is chosen.
1143  */
1144 struct ifaddr *
1145 ifa_ifwithnet(struct sockaddr *addr)
1146 {
1147         struct ifnet *ifp;
1148         struct ifaddr *ifa_maybe = NULL;
1149         u_int af = addr->sa_family;
1150         char *addr_data = addr->sa_data, *cplim;
1151
1152         /*
1153          * AF_LINK addresses can be looked up directly by their index number,
1154          * so do that if we can.
1155          */
1156         if (af == AF_LINK) {
1157                 struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
1158
1159                 if (sdl->sdl_index && sdl->sdl_index <= if_index)
1160                         return (ifindex2ifnet[sdl->sdl_index]->if_lladdr);
1161         }
1162
1163         /*
1164          * Scan through each interface, looking for ones that have
1165          * addresses in this address family.
1166          */
1167         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1168                 struct ifaddr_container *ifac;
1169
1170                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1171                         struct ifaddr *ifa = ifac->ifa;
1172                         char *cp, *cp2, *cp3;
1173
1174                         if (ifa->ifa_addr->sa_family != af)
1175 next:                           continue;
1176                         if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
1177                                 /*
1178                                  * This is a bit broken as it doesn't
1179                                  * take into account that the remote end may
1180                                  * be a single node in the network we are
1181                                  * looking for.
1182                                  * The trouble is that we don't know the
1183                                  * netmask for the remote end.
1184                                  */
1185                                 if (ifa->ifa_dstaddr != NULL &&
1186                                     sa_equal(addr, ifa->ifa_dstaddr))
1187                                         return (ifa);
1188                         } else {
1189                                 /*
1190                                  * if we have a special address handler,
1191                                  * then use it instead of the generic one.
1192                                  */
1193                                 if (ifa->ifa_claim_addr) {
1194                                         if ((*ifa->ifa_claim_addr)(ifa, addr)) {
1195                                                 return (ifa);
1196                                         } else {
1197                                                 continue;
1198                                         }
1199                                 }
1200
1201                                 /*
1202                                  * Scan all the bits in the ifa's address.
1203                                  * If a bit disagrees with what we are
1204                                  * looking for, mask it with the netmask
1205                                  * to see if it really matters.
1206                                  * (A byte at a time)
1207                                  */
1208                                 if (ifa->ifa_netmask == 0)
1209                                         continue;
1210                                 cp = addr_data;
1211                                 cp2 = ifa->ifa_addr->sa_data;
1212                                 cp3 = ifa->ifa_netmask->sa_data;
1213                                 cplim = ifa->ifa_netmask->sa_len +
1214                                         (char *)ifa->ifa_netmask;
1215                                 while (cp3 < cplim)
1216                                         if ((*cp++ ^ *cp2++) & *cp3++)
1217                                                 goto next; /* next address! */
1218                                 /*
1219                                  * If the netmask of what we just found
1220                                  * is more specific than what we had before
1221                                  * (if we had one) then remember the new one
1222                                  * before continuing to search
1223                                  * for an even better one.
1224                                  */
1225                                 if (ifa_maybe == NULL ||
1226                                     rn_refines((char *)ifa->ifa_netmask,
1227                                                (char *)ifa_maybe->ifa_netmask))
1228                                         ifa_maybe = ifa;
1229                         }
1230                 }
1231         }
1232         return (ifa_maybe);
1233 }
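/*
 * Illustration of the masked comparison above (AF_INET, addresses shown
 * symbolically): looking up 192.168.1.200 against an ifa configured as
 * 192.168.1.1 with netmask 255.255.255.0 matches, because every bit in
 * which the two addresses differ is cleared by the netmask; an ifa
 * configured as 192.168.2.1/24 does not match, since the third octet
 * differs under a 0xff mask byte.
 */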
1234
1235 /*
1236  * Find an interface address specific to an interface best matching
1237  * a given address.
1238  */
1239 struct ifaddr *
1240 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
1241 {
1242         struct ifaddr_container *ifac;
1243         char *cp, *cp2, *cp3;
1244         char *cplim;
1245         struct ifaddr *ifa_maybe = NULL;
1246         u_int af = addr->sa_family;
1247
1248         if (af >= AF_MAX)
1249                 return (0);
1250         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1251                 struct ifaddr *ifa = ifac->ifa;
1252
1253                 if (ifa->ifa_addr->sa_family != af)
1254                         continue;
1255                 if (ifa_maybe == NULL)
1256                         ifa_maybe = ifa;
1257                 if (ifa->ifa_netmask == NULL) {
1258                         if (sa_equal(addr, ifa->ifa_addr) ||
1259                             (ifa->ifa_dstaddr != NULL &&
1260                              sa_equal(addr, ifa->ifa_dstaddr)))
1261                                 return (ifa);
1262                         continue;
1263                 }
1264                 if (ifp->if_flags & IFF_POINTOPOINT) {
1265                         if (sa_equal(addr, ifa->ifa_dstaddr))
1266                                 return (ifa);
1267                 } else {
1268                         cp = addr->sa_data;
1269                         cp2 = ifa->ifa_addr->sa_data;
1270                         cp3 = ifa->ifa_netmask->sa_data;
1271                         cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
1272                         for (; cp3 < cplim; cp3++)
1273                                 if ((*cp++ ^ *cp2++) & *cp3)
1274                                         break;
1275                         if (cp3 == cplim)
1276                                 return (ifa);
1277                 }
1278         }
1279         return (ifa_maybe);
1280 }
1281
1282 /*
1283  * Default action when installing a route with a Link Level gateway.
1284  * Lookup an appropriate real ifa to point to.
1285  * This should be moved to /sys/net/link.c eventually.
1286  */
1287 static void
1288 link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
1289 {
1290         struct ifaddr *ifa;
1291         struct sockaddr *dst;
1292         struct ifnet *ifp;
1293
1294         if (cmd != RTM_ADD || (ifa = rt->rt_ifa) == NULL ||
1295             (ifp = ifa->ifa_ifp) == NULL || (dst = rt_key(rt)) == NULL)
1296                 return;
1297         ifa = ifaof_ifpforaddr(dst, ifp);
1298         if (ifa != NULL) {
1299                 IFAFREE(rt->rt_ifa);
1300                 IFAREF(ifa);
1301                 rt->rt_ifa = ifa;
1302                 if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
1303                         ifa->ifa_rtrequest(cmd, rt, info);
1304         }
1305 }
1306
1307 /*
1308  * Mark an interface down and notify protocols of
1309  * the transition.
1310  * NOTE: must be called at splnet or equivalent.
1311  */
1312 void
1313 if_unroute(struct ifnet *ifp, int flag, int fam)
1314 {
1315         struct ifaddr_container *ifac;
1316
1317         ifp->if_flags &= ~flag;
1318         getmicrotime(&ifp->if_lastchange);
1319         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1320                 struct ifaddr *ifa = ifac->ifa;
1321
1322                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1323                         kpfctlinput(PRC_IFDOWN, ifa->ifa_addr);
1324         }
1325         ifq_purge_all(&ifp->if_snd);
1326         rt_ifmsg(ifp);
1327 }
1328
1329 /*
1330  * Mark an interface up and notify protocols of
1331  * the transition.
1332  * NOTE: must be called at splnet or equivalent.
1333  */
1334 void
1335 if_route(struct ifnet *ifp, int flag, int fam)
1336 {
1337         struct ifaddr_container *ifac;
1338
1339         ifq_purge_all(&ifp->if_snd);
1340         ifp->if_flags |= flag;
1341         getmicrotime(&ifp->if_lastchange);
1342         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1343                 struct ifaddr *ifa = ifac->ifa;
1344
1345                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1346                         kpfctlinput(PRC_IFUP, ifa->ifa_addr);
1347         }
1348         rt_ifmsg(ifp);
1349 #ifdef INET6
1350         in6_if_up(ifp);
1351 #endif
1352 }
1353
1354 /*
1355  * Mark an interface down and notify protocols of the transition.  An
1356  * interface going down is also considered to be a synchronizing event.
1357  * We must ensure that all packet processing related to the interface
1358  * has completed before we return so e.g. the caller can free the ifnet
1359  * structure that the mbufs may be referencing.
1360  *
1361  * NOTE: must be called at splnet or equivalent.
1362  */
1363 void
1364 if_down(struct ifnet *ifp)
1365 {
1366         if_unroute(ifp, IFF_UP, AF_UNSPEC);
1367         netmsg_service_sync();
1368 }
1369
1370 /*
1371  * Mark an interface up and notify protocols of
1372  * the transition.
1373  * NOTE: must be called at splnet or equivalent.
1374  */
1375 void
1376 if_up(struct ifnet *ifp)
1377 {
1378         if_route(ifp, IFF_UP, AF_UNSPEC);
1379 }
1380
1381 /*
1382  * Process a link state change.
1383  * NOTE: must be called at splsoftnet or equivalent.
1384  */
1385 void
1386 if_link_state_change(struct ifnet *ifp)
1387 {
1388         int link_state = ifp->if_link_state;
1389
1390         rt_ifmsg(ifp);
1391         devctl_notify("IFNET", ifp->if_xname,
1392             (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
1393 }
1394
1395 /*
1396  * Handle interface watchdog timer routines.  Called
1397  * from softclock, we decrement timers (if set) and
1398  * call the appropriate interface routine on expiration.
1399  */
1400 static void
1401 if_slowtimo(void *arg)
1402 {
1403         struct ifnet *ifp;
1404
1405         crit_enter();
1406
1407         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1408                 if (ifp->if_timer == 0 || --ifp->if_timer)
1409                         continue;
1410                 if (ifp->if_watchdog) {
1411                         if (ifnet_tryserialize_all(ifp)) {
1412                                 (*ifp->if_watchdog)(ifp);
1413                                 ifnet_deserialize_all(ifp);
1414                         } else {
1415                                 /* try again next timeout */
1416                                 ++ifp->if_timer;
1417                         }
1418                 }
1419         }
1420
1421         crit_exit();
1422
1423         callout_reset(&if_slowtimo_timer, hz / IFNET_SLOWHZ, if_slowtimo, NULL);
1424 }
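/*
 * Usage sketch (hypothetical driver code; the mydrv_* names are
 * illustrative): a driver arms the watchdog serviced by if_slowtimo()
 * above by installing if_watchdog at attach time and counting down
 * if_timer while transmits are pending:
 *
 *	ifp->if_watchdog = mydrv_watchdog;	(at attach)
 *	...
 *	ifp->if_timer = 5;			(when a packet is queued)
 *	...
 *	ifp->if_timer = 0;			(when the TX ring drains)
 *
 * mydrv_watchdog() is then called with the ifnet serialized once the
 * counter reaches zero.
 */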
1425
1426 /*
1427  * Map interface name to
1428  * interface structure pointer.
1429  */
1430 struct ifnet *
1431 ifunit(const char *name)
1432 {
1433         struct ifnet *ifp;
1434
1435         /*
1436          * Search all the interfaces for this name/number
1437          */
1438
1439         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1440                 if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1441                         break;
1442         }
1443         return (ifp);
1444 }
1445
1446
1447 /*
1448  * Map interface name in a sockaddr_dl to
1449  * interface structure pointer.
1450  */
1451 struct ifnet *
1452 if_withname(struct sockaddr *sa)
1453 {
1454         char ifname[IFNAMSIZ+1];
1455         struct sockaddr_dl *sdl = (struct sockaddr_dl *)sa;
1456
1457         if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) ||
1458              (sdl->sdl_nlen > IFNAMSIZ) )
1459                 return NULL;
1460
1461         /*
1462          * ifunit wants a null-terminated name.  It may not be null-terminated
1463          * in the sockaddr.  We don't want to change the caller's sockaddr,
1464          * and there might not be room to put the trailing null anyway, so we
1465          * make a local copy that we know we can null terminate safely.
1466          */
1467
1468         bcopy(sdl->sdl_data, ifname, sdl->sdl_nlen);
1469         ifname[sdl->sdl_nlen] = '\0';
1470         return ifunit(ifname);
1471 }
1472
1473
1474 /*
1475  * Interface ioctls.
1476  */
1477 int
1478 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct ucred *cred)
1479 {
1480         struct ifnet *ifp;
1481         struct ifreq *ifr;
1482         struct ifstat *ifs;
1483         int error;
1484         short oif_flags;
1485         int new_flags;
1486 #ifdef COMPAT_43
1487         int ocmd;
1488 #endif
1489         size_t namelen, onamelen;
1490         char new_name[IFNAMSIZ];
1491         struct ifaddr *ifa;
1492         struct sockaddr_dl *sdl;
1493
1494         switch (cmd) {
1495         case SIOCGIFCONF:
1496         case OSIOCGIFCONF:
1497                 return (ifconf(cmd, data, cred));
1498         default:
1499                 break;
1500         }
1501
1502         ifr = (struct ifreq *)data;
1503
1504         switch (cmd) {
1505         case SIOCIFCREATE:
1506         case SIOCIFCREATE2:
1507                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1508                         return (error);
1509                 return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
1510                         cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
1511         case SIOCIFDESTROY:
1512                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1513                         return (error);
1514                 return (if_clone_destroy(ifr->ifr_name));
1515         case SIOCIFGCLONERS:
1516                 return (if_clone_list((struct if_clonereq *)data));
1517         default:
1518                 break;
1519         }
1520
1521         /*
1522          * Nominal ioctl through interface, lookup the ifp and obtain a
1523          * lock to serialize the ifconfig ioctl operation.
1524          */
1525         ifp = ifunit(ifr->ifr_name);
1526         if (ifp == NULL)
1527                 return (ENXIO);
1528         error = 0;
1529         mtx_lock(&ifp->if_ioctl_mtx);
1530
1531         switch (cmd) {
1532         case SIOCGIFINDEX:
1533                 ifr->ifr_index = ifp->if_index;
1534                 break;
1535
1536         case SIOCGIFFLAGS:
1537                 ifr->ifr_flags = ifp->if_flags;
1538                 ifr->ifr_flagshigh = ifp->if_flags >> 16;
1539                 break;
1540
1541         case SIOCGIFCAP:
1542                 ifr->ifr_reqcap = ifp->if_capabilities;
1543                 ifr->ifr_curcap = ifp->if_capenable;
1544                 break;
1545
1546         case SIOCGIFMETRIC:
1547                 ifr->ifr_metric = ifp->if_metric;
1548                 break;
1549
1550         case SIOCGIFMTU:
1551                 ifr->ifr_mtu = ifp->if_mtu;
1552                 break;
1553
1554         case SIOCGIFDATA:
1555                 error = copyout((caddr_t)&ifp->if_data, ifr->ifr_data,
1556                                 sizeof(ifp->if_data));
1557                 break;
1558
1559         case SIOCGIFPHYS:
1560                 ifr->ifr_phys = ifp->if_physical;
1561                 break;
1562
1563         case SIOCGIFPOLLCPU:
1564                 ifr->ifr_pollcpu = -1;
1565                 break;
1566
1567         case SIOCSIFPOLLCPU:
1568                 break;
1569
1570         case SIOCSIFFLAGS:
1571                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1572                 if (error)
1573                         break;
1574                 new_flags = (ifr->ifr_flags & 0xffff) |
1575                     (ifr->ifr_flagshigh << 16);
1576                 if (ifp->if_flags & IFF_SMART) {
1577                         /* Smart drivers twiddle their own routes */
1578                 } else if (ifp->if_flags & IFF_UP &&
1579                     (new_flags & IFF_UP) == 0) {
1580                         crit_enter();
1581                         if_down(ifp);
1582                         crit_exit();
1583                 } else if (new_flags & IFF_UP &&
1584                     (ifp->if_flags & IFF_UP) == 0) {
1585                         crit_enter();
1586                         if_up(ifp);
1587                         crit_exit();
1588                 }
1589
1590 #ifdef IFPOLL_ENABLE
1591                 if ((new_flags ^ ifp->if_flags) & IFF_NPOLLING) {
1592                         if (new_flags & IFF_NPOLLING)
1593                                 ifpoll_register(ifp);
1594                         else
1595                                 ifpoll_deregister(ifp);
1596                 }
1597 #endif
1598
1599                 ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
1600                         (new_flags &~ IFF_CANTCHANGE);
1601                 if (new_flags & IFF_PPROMISC) {
1602                         /* Permanently promiscuous mode requested */
1603                         ifp->if_flags |= IFF_PROMISC;
1604                 } else if (ifp->if_pcount == 0) {
1605                         ifp->if_flags &= ~IFF_PROMISC;
1606                 }
1607                 if (ifp->if_ioctl) {
1608                         ifnet_serialize_all(ifp);
1609                         ifp->if_ioctl(ifp, cmd, data, cred);
1610                         ifnet_deserialize_all(ifp);
1611                 }
1612                 getmicrotime(&ifp->if_lastchange);
1613                 break;
1614
1615         case SIOCSIFCAP:
1616                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1617                 if (error)
1618                         break;
1619                 if (ifr->ifr_reqcap & ~ifp->if_capabilities) {
1620                         error = EINVAL;
1621                         break;
1622                 }
1623                 ifnet_serialize_all(ifp);
1624                 ifp->if_ioctl(ifp, cmd, data, cred);
1625                 ifnet_deserialize_all(ifp);
1626                 break;
1627
1628         case SIOCSIFNAME:
1629                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1630                 if (error)
1631                         break;
1632                 error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
1633                 if (error)
1634                         break;
1635                 if (new_name[0] == '\0') {
1636                         error = EINVAL;
1637                         break;
1638                 }
1639                 if (ifunit(new_name) != NULL) {
1640                         error = EEXIST;
1641                         break;
1642                 }
1643
1644                 EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
1645
1646                 /* Announce the departure of the interface. */
1647                 rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
1648
1649                 strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
1650                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
1651                 /* XXX IFA_LOCK(ifa); */
1652                 sdl = (struct sockaddr_dl *)ifa->ifa_addr;
1653                 namelen = strlen(new_name);
1654                 onamelen = sdl->sdl_nlen;
1655                 /*
1656                  * Move the address if needed.  This is safe because we
1657                  * allocate space for a name of length IFNAMSIZ when we
1658                  * create this in if_attach().
1659                  */
1660                 if (namelen != onamelen) {
1661                         bcopy(sdl->sdl_data + onamelen,
1662                             sdl->sdl_data + namelen, sdl->sdl_alen);
1663                 }
1664                 bcopy(new_name, sdl->sdl_data, namelen);
1665                 sdl->sdl_nlen = namelen;
1666                 sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
1667                 bzero(sdl->sdl_data, onamelen);
1668                 while (namelen != 0)
1669                         sdl->sdl_data[--namelen] = 0xff;
1670                 /* XXX IFA_UNLOCK(ifa) */
1671
1672                 EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
1673
1674                 /* Announce the return of the interface. */
1675                 rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
1676                 break;
1677
1678         case SIOCSIFMETRIC:
1679                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1680                 if (error)
1681                         break;
1682                 ifp->if_metric = ifr->ifr_metric;
1683                 getmicrotime(&ifp->if_lastchange);
1684                 break;
1685
1686         case SIOCSIFPHYS:
1687                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1688                 if (error)
1689                         break;
1690                 if (ifp->if_ioctl == NULL) {
1691                         error = EOPNOTSUPP;
1692                         break;
1693                 }
1694                 ifnet_serialize_all(ifp);
1695                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1696                 ifnet_deserialize_all(ifp);
1697                 if (error == 0)
1698                         getmicrotime(&ifp->if_lastchange);
1699                 break;
1700
1701         case SIOCSIFMTU:
1702         {
1703                 u_long oldmtu = ifp->if_mtu;
1704
1705                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1706                 if (error)
1707                         break;
1708                 if (ifp->if_ioctl == NULL) {
1709                         error = EOPNOTSUPP;
1710                         break;
1711                 }
1712                 if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) {
1713                         error = EINVAL;
1714                         break;
1715                 }
1716                 ifnet_serialize_all(ifp);
1717                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1718                 ifnet_deserialize_all(ifp);
1719                 if (error == 0) {
1720                         getmicrotime(&ifp->if_lastchange);
1721                         rt_ifmsg(ifp);
1722                 }
1723                 /*
1724                  * If the link MTU changed, do network layer specific procedure.
1725                  */
1726                 if (ifp->if_mtu != oldmtu) {
1727 #ifdef INET6
1728                         nd6_setmtu(ifp);
1729 #endif
1730                 }
1731                 break;
1732         }
1733
1734         case SIOCADDMULTI:
1735         case SIOCDELMULTI:
1736                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1737                 if (error)
1738                         break;
1739
1740                 /* Don't allow group membership on non-multicast interfaces. */
1741                 if ((ifp->if_flags & IFF_MULTICAST) == 0) {
1742                         error = EOPNOTSUPP;
1743                         break;
1744                 }
1745
1746                 /* Don't let users screw up protocols' entries. */
1747                 if (ifr->ifr_addr.sa_family != AF_LINK) {
1748                         error = EINVAL;
1749                         break;
1750                 }
1751
1752                 if (cmd == SIOCADDMULTI) {
1753                         struct ifmultiaddr *ifma;
1754                         error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
1755                 } else {
1756                         error = if_delmulti(ifp, &ifr->ifr_addr);
1757                 }
1758                 if (error == 0)
1759                         getmicrotime(&ifp->if_lastchange);
1760                 break;
1761
1762         case SIOCSIFPHYADDR:
1763         case SIOCDIFPHYADDR:
1764 #ifdef INET6
1765         case SIOCSIFPHYADDR_IN6:
1766 #endif
1767         case SIOCSLIFPHYADDR:
1768         case SIOCSIFMEDIA:
1769         case SIOCSIFGENERIC:
1770                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1771                 if (error)
1772                         break;
1773                 if (ifp->if_ioctl == NULL) {
1774                         error = EOPNOTSUPP;
1775                         break;
1776                 }
1777                 ifnet_serialize_all(ifp);
1778                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1779                 ifnet_deserialize_all(ifp);
1780                 if (error == 0)
1781                         getmicrotime(&ifp->if_lastchange);
1782                 break;
1783
1784         case SIOCGIFSTATUS:
1785                 ifs = (struct ifstat *)data;
1786                 ifs->ascii[0] = '\0';
1787                 /* fall through */
1788         case SIOCGIFPSRCADDR:
1789         case SIOCGIFPDSTADDR:
1790         case SIOCGLIFPHYADDR:
1791         case SIOCGIFMEDIA:
1792         case SIOCGIFGENERIC:
1793                 if (ifp->if_ioctl == NULL) {
1794                         error = EOPNOTSUPP;
1795                         break;
1796                 }
1797                 ifnet_serialize_all(ifp);
1798                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1799                 ifnet_deserialize_all(ifp);
1800                 break;
1801
1802         case SIOCSIFLLADDR:
1803                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1804                 if (error)
1805                         break;
1806                 error = if_setlladdr(ifp, ifr->ifr_addr.sa_data,
1807                                      ifr->ifr_addr.sa_len);
1808                 EVENTHANDLER_INVOKE(iflladdr_event, ifp);
1809                 break;
1810
1811         default:
1812                 oif_flags = ifp->if_flags;
1813                 if (so->so_proto == NULL) {
1814                         error = EOPNOTSUPP;
1815                         break;
1816                 }
1817 #ifndef COMPAT_43
1818                 error = so_pru_control_direct(so, cmd, data, ifp);
1819 #else
1820                 ocmd = cmd;
1821
1822                 switch (cmd) {
1823                 case SIOCSIFDSTADDR:
1824                 case SIOCSIFADDR:
1825                 case SIOCSIFBRDADDR:
1826                 case SIOCSIFNETMASK:
1827 #if BYTE_ORDER != BIG_ENDIAN
1828                         if (ifr->ifr_addr.sa_family == 0 &&
1829                             ifr->ifr_addr.sa_len < 16) {
1830                                 ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
1831                                 ifr->ifr_addr.sa_len = 16;
1832                         }
1833 #else
1834                         if (ifr->ifr_addr.sa_len == 0)
1835                                 ifr->ifr_addr.sa_len = 16;
1836 #endif
1837                         break;
1838                 case OSIOCGIFADDR:
1839                         cmd = SIOCGIFADDR;
1840                         break;
1841                 case OSIOCGIFDSTADDR:
1842                         cmd = SIOCGIFDSTADDR;
1843                         break;
1844                 case OSIOCGIFBRDADDR:
1845                         cmd = SIOCGIFBRDADDR;
1846                         break;
1847                 case OSIOCGIFNETMASK:
1848                         cmd = SIOCGIFNETMASK;
1849                         break;
1850                 default:
1851                         break;
1852                 }
1853
1854                 error = so_pru_control_direct(so, cmd, data, ifp);
1855
1856                 switch (ocmd) {
1857                 case OSIOCGIFADDR:
1858                 case OSIOCGIFDSTADDR:
1859                 case OSIOCGIFBRDADDR:
1860                 case OSIOCGIFNETMASK:
1861                         *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
1862                         break;
1863                 }
1864 #endif /* COMPAT_43 */
1865
1866                 if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
1867 #ifdef INET6
1868                         DELAY(100);     /* XXX: temporary workaround for fxp issue */
1869                         if (ifp->if_flags & IFF_UP) {
1870                                 crit_enter();
1871                                 in6_if_up(ifp);
1872                                 crit_exit();
1873                         }
1874 #endif
1875                 }
1876                 break;
1877         }
1878
1879         mtx_unlock(&ifp->if_ioctl_mtx);
1880         return (error);
1881 }
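
/*
 * Illustrative sketch (userland, not part of this file): the requests
 * handled above arrive via ioctl(2) on an ordinary socket.  For example,
 * querying an interface's MTU; "em0" is just an example name:
 *
 *	struct ifreq ifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	if (ioctl(s, SIOCGIFMTU, &ifr) == 0)
 *		printf("%s mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);
 *	close(s);
 */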
1882
1883 /*
1884  * Set/clear promiscuous mode on interface ifp based on the truth value
1885  * of pswitch.  The calls are reference counted so that only the first
1886  * "on" request actually has an effect, as does the final "off" request.
1887  * Results are undefined if the "off" and "on" requests are not matched.
1888  */
1889 int
1890 ifpromisc(struct ifnet *ifp, int pswitch)
1891 {
1892         struct ifreq ifr;
1893         int error;
1894         int oldflags;
1895
1896         oldflags = ifp->if_flags;
1897         if (ifp->if_flags & IFF_PPROMISC) {
1898                 /* Do nothing if device is in permanently promiscuous mode */
1899                 ifp->if_pcount += pswitch ? 1 : -1;
1900                 return (0);
1901         }
1902         if (pswitch) {
1903                 /*
1904                  * If the device is not configured up, we cannot put it in
1905                  * promiscuous mode.
1906                  */
1907                 if ((ifp->if_flags & IFF_UP) == 0)
1908                         return (ENETDOWN);
1909                 if (ifp->if_pcount++ != 0)
1910                         return (0);
1911                 ifp->if_flags |= IFF_PROMISC;
1912                 log(LOG_INFO, "%s: promiscuous mode enabled\n",
1913                     ifp->if_xname);
1914         } else {
1915                 if (--ifp->if_pcount > 0)
1916                         return (0);
1917                 ifp->if_flags &= ~IFF_PROMISC;
1918                 log(LOG_INFO, "%s: promiscuous mode disabled\n",
1919                     ifp->if_xname);
1920         }
1921         ifr.ifr_flags = ifp->if_flags;
1922         ifr.ifr_flagshigh = ifp->if_flags >> 16;
1923         ifnet_serialize_all(ifp);
1924         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, NULL);
1925         ifnet_deserialize_all(ifp);
1926         if (error == 0)
1927                 rt_ifmsg(ifp);
1928         else
1929                 ifp->if_flags = oldflags;
1930         return error;
1931 }
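
/*
 * Illustrative sketch: an in-kernel consumer (a packet tap, for instance)
 * is expected to bracket its use of the interface with matched calls so
 * the if_pcount reference count stays balanced:
 *
 *	error = ifpromisc(ifp, 1);
 *	if (error == 0) {
 *		... capture traffic ...
 *		ifpromisc(ifp, 0);
 *	}
 */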
1932
1933 /*
1934  * Return the interface configuration of the
1935  * system.  The list may be used in later
1936  * ioctls (above) to get other
1937  * information.
1938  */
1939 static int
1940 ifconf(u_long cmd, caddr_t data, struct ucred *cred)
1941 {
1942         struct ifconf *ifc = (struct ifconf *)data;
1943         struct ifnet *ifp;
1944         struct sockaddr *sa;
1945         struct ifreq ifr, *ifrp;
1946         int space = ifc->ifc_len, error = 0;
1947
1948         ifrp = ifc->ifc_req;
1949         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1950                 struct ifaddr_container *ifac;
1951                 int addrs;
1952
1953                 if (space <= sizeof ifr)
1954                         break;
1955
1956                 /*
1957                  * Zero the stack declared structure first to prevent
1958                  * memory disclosure.
1959                  */
1960                 bzero(&ifr, sizeof(ifr));
1961                 if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
1962                     >= sizeof(ifr.ifr_name)) {
1963                         error = ENAMETOOLONG;
1964                         break;
1965                 }
1966
1967                 addrs = 0;
1968                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1969                         struct ifaddr *ifa = ifac->ifa;
1970
1971                         if (space <= sizeof ifr)
1972                                 break;
1973                         sa = ifa->ifa_addr;
1974                         if (cred->cr_prison &&
1975                             prison_if(cred, sa))
1976                                 continue;
1977                         addrs++;
1978 #ifdef COMPAT_43
1979                         if (cmd == OSIOCGIFCONF) {
1980                                 struct osockaddr *osa =
1981                                          (struct osockaddr *)&ifr.ifr_addr;
1982                                 ifr.ifr_addr = *sa;
1983                                 osa->sa_family = sa->sa_family;
1984                                 error = copyout(&ifr, ifrp, sizeof ifr);
1985                                 ifrp++;
1986                         } else
1987 #endif
1988                         if (sa->sa_len <= sizeof(*sa)) {
1989                                 ifr.ifr_addr = *sa;
1990                                 error = copyout(&ifr, ifrp, sizeof ifr);
1991                                 ifrp++;
1992                         } else {
1993                                 if (space < (sizeof ifr) + sa->sa_len -
1994                                             sizeof(*sa))
1995                                         break;
1996                                 space -= sa->sa_len - sizeof(*sa);
1997                                 error = copyout(&ifr, ifrp,
1998                                                 sizeof ifr.ifr_name);
1999                                 if (error == 0)
2000                                         error = copyout(sa, &ifrp->ifr_addr,
2001                                                         sa->sa_len);
2002                                 ifrp = (struct ifreq *)
2003                                         (sa->sa_len + (caddr_t)&ifrp->ifr_addr);
2004                         }
2005                         if (error)
2006                                 break;
2007                         space -= sizeof ifr;
2008                 }
2009                 if (error)
2010                         break;
2011                 if (!addrs) {
2012                         bzero(&ifr.ifr_addr, sizeof ifr.ifr_addr);
2013                         error = copyout(&ifr, ifrp, sizeof ifr);
2014                         if (error)
2015                                 break;
2016                         space -= sizeof ifr;
2017                         ifrp++;
2018                 }
2019         }
2020         ifc->ifc_len -= space;
2021         return (error);
2022 }
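
/*
 * Illustrative sketch (userland): SIOCGIFCONF is issued with a caller
 * supplied buffer; ifc_len holds the buffer size on input and the number
 * of bytes actually filled in on return:
 *
 *	char buf[8192];
 *	struct ifconf ifc;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	ifc.ifc_len = sizeof(buf);
 *	ifc.ifc_buf = buf;
 *	if (ioctl(s, SIOCGIFCONF, &ifc) == 0)
 *		printf("%d bytes of ifreq records\n", ifc.ifc_len);
 *	close(s);
 */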
2023
2024 /*
2025  * Just like ifpromisc(), but for all-multicast-reception mode.
2026  */
2027 int
2028 if_allmulti(struct ifnet *ifp, int onswitch)
2029 {
2030         int error = 0;
2031         struct ifreq ifr;
2032
2033         crit_enter();
2034
2035         if (onswitch) {
2036                 if (ifp->if_amcount++ == 0) {
2037                         ifp->if_flags |= IFF_ALLMULTI;
2038                         ifr.ifr_flags = ifp->if_flags;
2039                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2040                         ifnet_serialize_all(ifp);
2041                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2042                                               NULL);
2043                         ifnet_deserialize_all(ifp);
2044                 }
2045         } else {
2046                 if (ifp->if_amcount > 1) {
2047                         ifp->if_amcount--;
2048                 } else {
2049                         ifp->if_amcount = 0;
2050                         ifp->if_flags &= ~IFF_ALLMULTI;
2051                         ifr.ifr_flags = ifp->if_flags;
2052                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2053                         ifnet_serialize_all(ifp);
2054                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2055                                               NULL);
2056                         ifnet_deserialize_all(ifp);
2057                 }
2058         }
2059
2060         crit_exit();
2061
2062         if (error == 0)
2063                 rt_ifmsg(ifp);
2064         return error;
2065 }
2066
2067 /*
2068  * Add a multicast listenership to the interface in question.
2069  * The link layer provides a routine which converts the address to AF_LINK form.
2070  */
2071 int
2072 if_addmulti(
2073         struct ifnet *ifp,      /* interface to manipulate */
2074         struct sockaddr *sa,    /* address to add */
2075         struct ifmultiaddr **retifma)
2076 {
2077         struct sockaddr *llsa, *dupsa;
2078         int error;
2079         struct ifmultiaddr *ifma;
2080
2081         /*
2082          * If the matching multicast address already exists
2083          * then don't add a new one, just add a reference
2084          */
2085         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2086                 if (sa_equal(sa, ifma->ifma_addr)) {
2087                         ifma->ifma_refcount++;
2088                         if (retifma)
2089                                 *retifma = ifma;
2090                         return 0;
2091                 }
2092         }
2093
2094         /*
2095          * Give the link layer a chance to accept/reject it, and also
2096          * find out which AF_LINK address this maps to, if it isn't one
2097          * already.
2098          */
2099         if (ifp->if_resolvemulti) {
2100                 ifnet_serialize_all(ifp);
2101                 error = ifp->if_resolvemulti(ifp, &llsa, sa);
2102                 ifnet_deserialize_all(ifp);
2103                 if (error) 
2104                         return error;
2105         } else {
2106                 llsa = NULL;
2107         }
2108
2109         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2110         dupsa = kmalloc(sa->sa_len, M_IFMADDR, M_WAITOK);
2111         bcopy(sa, dupsa, sa->sa_len);
2112
2113         ifma->ifma_addr = dupsa;
2114         ifma->ifma_lladdr = llsa;
2115         ifma->ifma_ifp = ifp;
2116         ifma->ifma_refcount = 1;
2117         ifma->ifma_protospec = 0;
2118         rt_newmaddrmsg(RTM_NEWMADDR, ifma);
2119
2120         /*
2121          * Some network interfaces can scan the address list at
2122          * interrupt time; lock them out.
2123          */
2124         crit_enter();
2125         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2126         crit_exit();
2127         if (retifma)
2128                 *retifma = ifma;
2129
2130         if (llsa != NULL) {
2131                 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2132                         if (sa_equal(ifma->ifma_addr, llsa))
2133                                 break;
2134                 }
2135                 if (ifma) {
2136                         ifma->ifma_refcount++;
2137                 } else {
2138                         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2139                         dupsa = kmalloc(llsa->sa_len, M_IFMADDR, M_WAITOK);
2140                         bcopy(llsa, dupsa, llsa->sa_len);
2141                         ifma->ifma_addr = dupsa;
2142                         ifma->ifma_ifp = ifp;
2143                         ifma->ifma_refcount = 1;
2144                         crit_enter();
2145                         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2146                         crit_exit();
2147                 }
2148         }
2149         /*
2150          * We are certain we have added something, so call down to the
2151          * interface to let the driver know about it.
2152          */
2153         crit_enter();
2154         ifnet_serialize_all(ifp);
2155         if (ifp->if_ioctl)
2156                 ifp->if_ioctl(ifp, SIOCADDMULTI, 0, NULL);
2157         ifnet_deserialize_all(ifp);
2158         crit_exit();
2159
2160         return 0;
2161 }
2162
2163 /*
2164  * Remove a reference to a multicast address on this interface.  Yell
2165  * if the request does not match an existing membership.
2166  */
2167 int
2168 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
2169 {
2170         struct ifmultiaddr *ifma;
2171
2172         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2173                 if (sa_equal(sa, ifma->ifma_addr))
2174                         break;
2175         if (ifma == NULL)
2176                 return ENOENT;
2177
2178         if (ifma->ifma_refcount > 1) {
2179                 ifma->ifma_refcount--;
2180                 return 0;
2181         }
2182
2183         rt_newmaddrmsg(RTM_DELMADDR, ifma);
2184         sa = ifma->ifma_lladdr;
2185         crit_enter();
2186         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2187         /*
2188          * Make sure the interface driver is notified
2189          * in the case of a link layer mcast group being left.
2190          */
2191         if (ifma->ifma_addr->sa_family == AF_LINK && sa == NULL) {
2192                 ifnet_serialize_all(ifp);
2193                 ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2194                 ifnet_deserialize_all(ifp);
2195         }
2196         crit_exit();
2197         kfree(ifma->ifma_addr, M_IFMADDR);
2198         kfree(ifma, M_IFMADDR);
2199         if (sa == NULL)
2200                 return 0;
2201
2202         /*
2203          * Now look for the link-layer address which corresponds to
2204          * this network address.  It had been squirreled away in
2205          * ifma->ifma_lladdr for this purpose (so we don't have
2206          * to call ifp->if_resolvemulti() again), and we saved that
2207          * value in sa above.  If some nasty deleted the
2208          * link-layer address out from underneath us, we can deal because
2209          * the address we stored is not the same as the one which was
2210          * in the record for the link-layer address.  (So we don't complain
2211          * in that case.)
2212          */
2213         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2214                 if (sa_equal(sa, ifma->ifma_addr))
2215                         break;
2216         if (ifma == NULL)
2217                 return 0;
2218
2219         if (ifma->ifma_refcount > 1) {
2220                 ifma->ifma_refcount--;
2221                 return 0;
2222         }
2223
2224         crit_enter();
2225         ifnet_serialize_all(ifp);
2226         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2227         ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2228         ifnet_deserialize_all(ifp);
2229         crit_exit();
2230         kfree(ifma->ifma_addr, M_IFMADDR);
2231         kfree(sa, M_IFMADDR);
2232         kfree(ifma, M_IFMADDR);
2233
2234         return 0;
2235 }
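
/*
 * Illustrative sketch: a network protocol would typically pair the two
 * calls when joining and leaving a group.  Here with an AF_INET group
 * address "group"; the exact per-protocol wrapper differs:
 *
 *	struct sockaddr_in sin;
 *	struct ifmultiaddr *ifma;
 *
 *	bzero(&sin, sizeof(sin));
 *	sin.sin_len = sizeof(sin);
 *	sin.sin_family = AF_INET;
 *	sin.sin_addr = group;
 *	error = if_addmulti(ifp, (struct sockaddr *)&sin, &ifma);
 *	...
 *	if_delmulti(ifp, (struct sockaddr *)&sin);
 */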
2236
2237 /*
2238  * Delete all multicast group memberships for an interface.
2239  * Should be used to quickly flush all multicast filters.
2240  */
2241 void
2242 if_delallmulti(struct ifnet *ifp)
2243 {
2244         struct ifmultiaddr *ifma;
2245         struct ifmultiaddr *next;
2246
2247         TAILQ_FOREACH_MUTABLE(ifma, &ifp->if_multiaddrs, ifma_link, next)
2248                 if_delmulti(ifp, ifma->ifma_addr);
2249 }
2250
2251
2252 /*
2253  * Set the link layer address on an interface.
2254  *
2255  * At this time we only support certain types of interfaces,
2256  * and we don't allow the length of the address to change.
2257  */
2258 int
2259 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
2260 {
2261         struct sockaddr_dl *sdl;
2262         struct ifreq ifr;
2263
2264         sdl = IF_LLSOCKADDR(ifp);
2265         if (sdl == NULL)
2266                 return (EINVAL);
2267         if (len != sdl->sdl_alen)       /* don't allow length to change */
2268                 return (EINVAL);
2269         switch (ifp->if_type) {
2270         case IFT_ETHER:                 /* these types use struct arpcom */
2271         case IFT_XETHER:
2272         case IFT_L2VLAN:
2273                 bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
2274                 bcopy(lladdr, LLADDR(sdl), len);
2275                 break;
2276         default:
2277                 return (ENODEV);
2278         }
2279         /*
2280          * If the interface is already up, we need
2281          * to re-init it in order to reprogram its
2282          * address filter.
2283          */
2284         ifnet_serialize_all(ifp);
2285         if ((ifp->if_flags & IFF_UP) != 0) {
2286 #ifdef INET
2287                 struct ifaddr_container *ifac;
2288 #endif
2289
2290                 ifp->if_flags &= ~IFF_UP;
2291                 ifr.ifr_flags = ifp->if_flags;
2292                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2293                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2294                               NULL);
2295                 ifp->if_flags |= IFF_UP;
2296                 ifr.ifr_flags = ifp->if_flags;
2297                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2298                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2299                                  NULL);
2300 #ifdef INET
2301                 /*
2302                  * Also send gratuitous ARPs to notify other nodes about
2303                  * the address change.
2304                  */
2305                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2306                         struct ifaddr *ifa = ifac->ifa;
2307
2308                         if (ifa->ifa_addr != NULL &&
2309                             ifa->ifa_addr->sa_family == AF_INET)
2310                                 arp_gratuitous(ifp, ifa);
2311                 }
2312 #endif
2313         }
2314         ifnet_deserialize_all(ifp);
2315         return (0);
2316 }
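
/*
 * Illustrative sketch (userland): the SIOCSIFLLADDR case in ifioctl()
 * above reaches this function with the new address carried in ifr_addr.
 * "em0" and the locally administered MAC are example values only:
 *
 *	struct ifreq ifr;
 *	static const u_char mac[6] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ifr.ifr_addr.sa_len = sizeof(mac);
 *	ifr.ifr_addr.sa_family = AF_LINK;
 *	memcpy(ifr.ifr_addr.sa_data, mac, sizeof(mac));
 *	ioctl(s, SIOCSIFLLADDR, &ifr);
 */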
2317
2318 struct ifmultiaddr *
2319 ifmaof_ifpforaddr(struct sockaddr *sa, struct ifnet *ifp)
2320 {
2321         struct ifmultiaddr *ifma;
2322
2323         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2324                 if (sa_equal(ifma->ifma_addr, sa))
2325                         break;
2326
2327         return ifma;
2328 }
2329
2330 /*
2331  * This function locates the first real ethernet MAC from a network
2332  * card and loads it into node, returning 0 on success or ENOENT if
2333  * no suitable interfaces were found.  It is used by the uuid code to
2334  * generate a unique 6-byte number.
2335  */
2336 int
2337 if_getanyethermac(uint16_t *node, int minlen)
2338 {
2339         struct ifnet *ifp;
2340         struct sockaddr_dl *sdl;
2341
2342         TAILQ_FOREACH(ifp, &ifnet, if_link) {
2343                 if (ifp->if_type != IFT_ETHER)
2344                         continue;
2345                 sdl = IF_LLSOCKADDR(ifp);
2346                 if (sdl->sdl_alen < minlen)
2347                         continue;
2348                 bcopy(((struct arpcom *)ifp->if_softc)->ac_enaddr, node,
2349                       minlen);
2350                 return(0);
2351         }
2352         return (ENOENT);
2353 }
2354
2355 /*
2356  * The name argument must be a pointer to storage which will last as
2357  * long as the interface does.  For physical devices, the result of
2358  * device_get_name(dev) is a good choice and for pseudo-devices a
2359  * static string works well.
2360  */
2361 void
2362 if_initname(struct ifnet *ifp, const char *name, int unit)
2363 {
2364         ifp->if_dname = name;
2365         ifp->if_dunit = unit;
2366         if (unit != IF_DUNIT_NONE)
2367                 ksnprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
2368         else
2369                 strlcpy(ifp->if_xname, name, IFNAMSIZ);
2370 }
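
/*
 * Illustrative sketch: a typical driver attach routine names its ifnet
 * from the device, as the comment above suggests ("sc" is the driver's
 * hypothetical softc):
 *
 *	struct ifnet *ifp = &sc->arpcom.ac_if;
 *
 *	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 *
 * which produces an if_xname such as "em0".
 */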
2371
2372 int
2373 if_printf(struct ifnet *ifp, const char *fmt, ...)
2374 {
2375         __va_list ap;
2376         int retval;
2377
2378         retval = kprintf("%s: ", ifp->if_xname);
2379         __va_start(ap, fmt);
2380         retval += kvprintf(fmt, ap);
2381         __va_end(ap);
2382         return (retval);
2383 }
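
/*
 * Illustrative sketch: drivers use this to prefix diagnostics with the
 * interface name, e.g.
 *
 *	if_printf(ifp, "watchdog timeout\n");
 *
 * prints something like "em0: watchdog timeout".
 */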
2384
2385 struct ifnet *
2386 if_alloc(uint8_t type)
2387 {
2388         struct ifnet *ifp;
2389         size_t size;
2390
2391         /*
2392          * XXX temporary hack until arpcom is set up in if_l2com
2393          */
2394         if (type == IFT_ETHER)
2395                 size = sizeof(struct arpcom);
2396         else
2397                 size = sizeof(struct ifnet);
2398
2399         ifp = kmalloc(size, M_IFNET, M_WAITOK|M_ZERO);
2400
2401         ifp->if_type = type;
2402
2403         if (if_com_alloc[type] != NULL) {
2404                 ifp->if_l2com = if_com_alloc[type](type, ifp);
2405                 if (ifp->if_l2com == NULL) {
2406                         kfree(ifp, M_IFNET);
2407                         return (NULL);
2408                 }
2409         }
2410         return (ifp);
2411 }
2412
2413 void
2414 if_free(struct ifnet *ifp)
2415 {
2416         kfree(ifp, M_IFNET);
2417 }
2418
2419 void
2420 ifq_set_classic(struct ifaltq *ifq)
2421 {
2422         ifq->altq_enqueue = ifq_classic_enqueue;
2423         ifq->altq_dequeue = ifq_classic_dequeue;
2424         ifq->altq_request = ifq_classic_request;
2425 }
2426
2427 int
2428 ifq_classic_enqueue(struct ifaltq *ifq, struct mbuf *m,
2429                     struct altq_pktattr *pa __unused)
2430 {
2431         logifq(enqueue, ifq);
2432         if (IF_QFULL(ifq)) {
2433                 m_freem(m);
2434                 return(ENOBUFS);
2435         } else {
2436                 IF_ENQUEUE(ifq, m);
2437                 return(0);
2438         }       
2439 }
2440
2441 struct mbuf *
2442 ifq_classic_dequeue(struct ifaltq *ifq, struct mbuf *mpolled, int op)
2443 {
2444         struct mbuf *m;
2445
2446         switch (op) {
2447         case ALTDQ_POLL:
2448                 IF_POLL(ifq, m);
2449                 break;
2450         case ALTDQ_REMOVE:
2451                 logifq(dequeue, ifq);
2452                 IF_DEQUEUE(ifq, m);
2453                 break;
2454         default:
2455                 panic("unsupported ALTQ dequeue op: %d", op);
2456         }
2457         KKASSERT(mpolled == NULL || mpolled == m);
2458         return(m);
2459 }
2460
2461 int
2462 ifq_classic_request(struct ifaltq *ifq, int req, void *arg)
2463 {
2464         switch (req) {
2465         case ALTRQ_PURGE:
2466                 IF_DRAIN(ifq);
2467                 break;
2468         default:
2469                 panic("unsupported ALTQ request: %d", req);
2470         }
2471         return(0);
2472 }
2473
2474 static void
2475 ifq_try_ifstart(struct ifaltq *ifq, int force_sched)
2476 {
2477         struct ifnet *ifp = ifq->altq_ifp;
2478         int running = 0, need_sched;
2479
2480         /*
2481          * Try a direct ifnet.if_start call first.  If there is
2482          * contention on the ifnet's serializer, ifnet.if_start
2483          * will be scheduled on the ifnet's CPU instead.
2484          */
2485         if (!ifnet_tryserialize_tx(ifp)) {
2486                 /*
2487                  * ifnet serializer contention happened;
2488                  * ifnet.if_start is scheduled on ifnet's
2489                  * CPU, and we keep going.
2490                  */
2491                 logifstart(contend_sched, ifp);
2492                 ifq_ifstart_schedule(ifq, 1);
2493                 return;
2494         }
2495
2496         if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq)) {
2497                 logifstart(run, ifp);
2498                 ifp->if_start(ifp);
2499                 if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
2500                         running = 1;
2501         }
2502         need_sched = if_start_need_schedule(ifq, running);
2503
2504         ifnet_deserialize_tx(ifp);
2505
2506         if (need_sched) {
2507                 /*
2508                  * More data needs to be transmitted; ifnet.if_start is
2509                  * scheduled on ifnet's CPU, and we keep going.
2510                  * NOTE: ifnet.if_start interlock is not released.
2511                  */
2512                 logifstart(sched, ifp);
2513                 ifq_ifstart_schedule(ifq, force_sched);
2514         }
2515 }
2516
2517 /*
2518  * IFQ packet staging mechanism:
2519  *
2520  * The packets enqueued into the IFQ are staged up to a certain amount before
2521  * the ifnet's if_start is called.  In this way, the driver can avoid writing
2522  * to hardware registers for every packet; instead, the hardware registers
2523  * are written once a certain number of packets have been put onto the
2524  * hardware TX ring.  Measurements on several modern NICs (emx(4), igb(4),
2525  * bnx(4), bge(4), jme(4)) show that this hardware register write aggregation
2526  * can save ~20% CPU time when 18-byte UDP datagrams are transmitted at
2527  * 1.48Mpps.  The performance improvement from hardware register write
2528  * aggregation is also mentioned in Luigi Rizzo's netmap paper
2529  * (http://info.iet.unipi.it/~luigi/netmap/).
2530  *
2531  * IFQ packet staging is performed for two entry points into the driver's
2532  * transmission function:
2533  * - Direct calls to the ifnet's if_start, i.e. ifq_try_ifstart()
2534  * - Scheduling of the ifnet's if_start, i.e. ifq_ifstart_schedule()
2535  *
2536  * IFQ packet staging will be stopped upon any of the following conditions:
2537  * - The count of packets enqueued on the current CPU is greater than or
2538  *   equal to ifq_stage_cntmax.  (XXX this should be per-interface)
2539  * - The total length of packets enqueued on the current CPU is greater
2540  *   than or equal to the hardware's MTU - max_protohdr.  max_protohdr is
2541  *   subtracted from the hardware's MTU mainly because a full TCP segment's
2542  *   size is usually less than the hardware's MTU.
2543  * - ifq_ifstart_schedule() is not pending on the current CPU and if_start
2544  *   interlock (if_snd.altq_started) is not released.
2545  * - The if_start_rollup(), which is registered as low priority netisr
2546  *   rollup function, is called; probably because no more work is pending
2547  *   for netisr.
2548  *
2549  * NOTE:
2550  * Currently IFQ packet staging is only performed in netisr threads.
2551  */
2552 int
2553 ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
2554 {
2555         struct ifaltq *ifq = &ifp->if_snd;
2556         int error, start = 0, len, mcast = 0, avoid_start = 0;
2557         struct ifaltq_stage_head *head = NULL;
2558         struct ifaltq_stage *stage = NULL;
2559
2560         ASSERT_IFNET_NOT_SERIALIZED_TX(ifp);
2561
2562         len = m->m_pkthdr.len;
2563         if (m->m_flags & M_MCAST)
2564                 mcast = 1;
2565
2566         if (curthread->td_type == TD_TYPE_NETISR) {
2567                 head = &ifq_stage_heads[mycpuid];
2568                 stage = ifq_get_stage(ifq, mycpuid);
2569
2570                 stage->ifqs_cnt++;
2571                 stage->ifqs_len += len;
2572                 if (stage->ifqs_cnt < ifq_stage_cntmax &&
2573                     stage->ifqs_len < (ifp->if_mtu - max_protohdr))
2574                         avoid_start = 1;
2575         }
2576
2577         ALTQ_LOCK(ifq);
2578         error = ifq_enqueue_locked(ifq, m, pa);
2579         if (error) {
2580                 if (!ifq_data_ready(ifq)) {
2581                         ALTQ_UNLOCK(ifq);
2582                         return error;
2583                 }
2584                 avoid_start = 0;
2585         }
2586         if (!ifq_is_started(ifq)) {
2587                 if (avoid_start) {
2588                         ALTQ_UNLOCK(ifq);
2589
2590                         KKASSERT(!error);
2591                         if ((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0)
2592                                 ifq_stage_insert(head, stage);
2593
2594                         ifp->if_obytes += len;
2595                         if (mcast)
2596                                 ifp->if_omcasts++;
2597                         return error;
2598                 }
2599
2600                 /*
2601                  * Hold the interlock of ifnet.if_start
2602                  */
2603                 ifq_set_started(ifq);
2604                 start = 1;
2605         }
2606         ALTQ_UNLOCK(ifq);
2607
2608         if (!error) {
2609                 ifp->if_obytes += len;
2610                 if (mcast)
2611                         ifp->if_omcasts++;
2612         }
2613
2614         if (stage != NULL) {
2615                 if (!start && (stage->ifqs_flags & IFQ_STAGE_FLAG_SCHED)) {
2616                         KKASSERT(stage->ifqs_flags & IFQ_STAGE_FLAG_QUED);
2617                         if (!avoid_start) {
2618                                 ifq_stage_remove(head, stage);
2619                                 ifq_ifstart_schedule(ifq, 1);
2620                         }
2621                         return error;
2622                 }
2623
2624                 if (stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) {
2625                         ifq_stage_remove(head, stage);
2626                 } else {
2627                         stage->ifqs_cnt = 0;
2628                         stage->ifqs_len = 0;
2629                 }
2630         }
2631
2632         if (!start) {
2633                 logifstart(avoid, ifp);
2634                 return error;
2635         }
2636
2637         ifq_try_ifstart(ifq, 0);
2638         return error;
2639 }
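
/*
 * Illustrative sketch: a link-layer output routine hands the fully framed
 * mbuf to ifq_dispatch() instead of manipulating if_snd directly; real
 * callers also classify the packet for ALTQ beforehand, filling in the
 * packet attributes (omitted here):
 *
 *	struct altq_pktattr pktattr;
 *
 *	error = ifq_dispatch(ifp, m, &pktattr);
 *	return error;
 */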
2640
2641 void *
2642 ifa_create(int size, int flags)
2643 {
2644         struct ifaddr *ifa;
2645         int i;
2646
2647         KASSERT(size >= sizeof(*ifa), ("ifaddr size too small"));
2648
2649         ifa = kmalloc(size, M_IFADDR, flags | M_ZERO);
2650         if (ifa == NULL)
2651                 return NULL;
2652
2653         ifa->ifa_containers = kmalloc(ncpus * sizeof(struct ifaddr_container),
2654                                       M_IFADDR, M_WAITOK | M_ZERO);
2655         ifa->ifa_ncnt = ncpus;
2656         for (i = 0; i < ncpus; ++i) {
2657                 struct ifaddr_container *ifac = &ifa->ifa_containers[i];
2658
2659                 ifac->ifa_magic = IFA_CONTAINER_MAGIC;
2660                 ifac->ifa = ifa;
2661                 ifac->ifa_refcnt = 1;
2662         }
2663 #ifdef IFADDR_DEBUG
2664         kprintf("alloc ifa %p %d\n", ifa, size);
2665 #endif
2666         return ifa;
2667 }
2668
2669 void
2670 ifac_free(struct ifaddr_container *ifac, int cpu_id)
2671 {
2672         struct ifaddr *ifa = ifac->ifa;
2673
2674         KKASSERT(ifac->ifa_magic == IFA_CONTAINER_MAGIC);
2675         KKASSERT(ifac->ifa_refcnt == 0);
2676         KASSERT(ifac->ifa_listmask == 0,
2677                 ("ifa is still on %#x lists", ifac->ifa_listmask));
2678
2679         ifac->ifa_magic = IFA_CONTAINER_DEAD;
2680
2681 #ifdef IFADDR_DEBUG_VERBOSE
2682         kprintf("try free ifa %p cpu_id %d\n", ifac->ifa, cpu_id);
2683 #endif
2684
2685         KASSERT(ifa->ifa_ncnt > 0 && ifa->ifa_ncnt <= ncpus,
2686                 ("invalid # of ifac, %d", ifa->ifa_ncnt));
2687         if (atomic_fetchadd_int(&ifa->ifa_ncnt, -1) == 1) {
2688 #ifdef IFADDR_DEBUG
2689                 kprintf("free ifa %p\n", ifa);
2690 #endif
2691                 kfree(ifa->ifa_containers, M_IFADDR);
2692                 kfree(ifa, M_IFADDR);
2693         }
2694 }
2695
2696 static void
2697 ifa_iflink_dispatch(netmsg_t nmsg)
2698 {
2699         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2700         struct ifaddr *ifa = msg->ifa;
2701         struct ifnet *ifp = msg->ifp;
2702         int cpu = mycpuid;
2703         struct ifaddr_container *ifac;
2704
2705         crit_enter();
2706
2707         ifac = &ifa->ifa_containers[cpu];
2708         ASSERT_IFAC_VALID(ifac);
2709         KASSERT((ifac->ifa_listmask & IFA_LIST_IFADDRHEAD) == 0,
2710                 ("ifaddr is on if_addrheads"));
2711
2712         ifac->ifa_listmask |= IFA_LIST_IFADDRHEAD;
2713         if (msg->tail)
2714                 TAILQ_INSERT_TAIL(&ifp->if_addrheads[cpu], ifac, ifa_link);
2715         else
2716                 TAILQ_INSERT_HEAD(&ifp->if_addrheads[cpu], ifac, ifa_link);
2717
2718         crit_exit();
2719
2720         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2721 }
2722
2723 void
2724 ifa_iflink(struct ifaddr *ifa, struct ifnet *ifp, int tail)
2725 {
2726         struct netmsg_ifaddr msg;
2727
2728         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2729                     0, ifa_iflink_dispatch);
2730         msg.ifa = ifa;
2731         msg.ifp = ifp;
2732         msg.tail = tail;
2733
2734         ifa_domsg(&msg.base.lmsg, 0);
2735 }
2736
2737 static void
2738 ifa_ifunlink_dispatch(netmsg_t nmsg)
2739 {
2740         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2741         struct ifaddr *ifa = msg->ifa;
2742         struct ifnet *ifp = msg->ifp;
2743         int cpu = mycpuid;
2744         struct ifaddr_container *ifac;
2745
2746         crit_enter();
2747
2748         ifac = &ifa->ifa_containers[cpu];
2749         ASSERT_IFAC_VALID(ifac);
2750         KASSERT(ifac->ifa_listmask & IFA_LIST_IFADDRHEAD,
2751                 ("ifaddr is not on if_addrhead"));
2752
2753         TAILQ_REMOVE(&ifp->if_addrheads[cpu], ifac, ifa_link);
2754         ifac->ifa_listmask &= ~IFA_LIST_IFADDRHEAD;
2755
2756         crit_exit();
2757
2758         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2759 }
2760
2761 void
2762 ifa_ifunlink(struct ifaddr *ifa, struct ifnet *ifp)
2763 {
2764         struct netmsg_ifaddr msg;
2765
2766         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2767                     0, ifa_ifunlink_dispatch);
2768         msg.ifa = ifa;
2769         msg.ifp = ifp;
2770
2771         ifa_domsg(&msg.base.lmsg, 0);
2772 }
2773
2774 static void
2775 ifa_destroy_dispatch(netmsg_t nmsg)
2776 {
2777         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2778
2779         IFAFREE(msg->ifa);
2780         ifa_forwardmsg(&nmsg->lmsg, mycpuid + 1);
2781 }
2782
2783 void
2784 ifa_destroy(struct ifaddr *ifa)
2785 {
2786         struct netmsg_ifaddr msg;
2787
2788         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2789                     0, ifa_destroy_dispatch);
2790         msg.ifa = ifa;
2791
2792         ifa_domsg(&msg.base.lmsg, 0);
2793 }
2794
2795 struct lwkt_port *
2796 ifnet_portfn(int cpu)
2797 {
2798         return &ifnet_threads[cpu].td_msgport;
2799 }
2800
2801 void
2802 ifnet_forwardmsg(struct lwkt_msg *lmsg, int next_cpu)
2803 {
2804         KKASSERT(next_cpu > mycpuid && next_cpu <= ncpus);
2805
2806         if (next_cpu < ncpus)
2807                 lwkt_forwardmsg(ifnet_portfn(next_cpu), lmsg);
2808         else
2809                 lwkt_replymsg(lmsg, 0);
2810 }
2811
2812 int
2813 ifnet_domsg(struct lwkt_msg *lmsg, int cpu)
2814 {
2815         KKASSERT(cpu < ncpus);
2816         return lwkt_domsg(ifnet_portfn(cpu), lmsg, 0);
2817 }
2818
2819 void
2820 ifnet_sendmsg(struct lwkt_msg *lmsg, int cpu)
2821 {
2822         KKASSERT(cpu < ncpus);
2823         lwkt_sendmsg(ifnet_portfn(cpu), lmsg);
2824 }
2825
2826 /*
2827  * Generic netmsg service loop.  Some protocols may roll their own but all
2828  * must do the basic command dispatch function call done here.
2829  */
2830 static void
2831 ifnet_service_loop(void *arg __unused)
2832 {
2833         netmsg_t msg;
2834
2835         while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) {
2836                 KASSERT(msg->base.nm_dispatch, ("ifnet_service: badmsg"));
2837                 msg->base.nm_dispatch(msg);
2838         }
2839 }
2840
2841 static void
2842 if_start_rollup(void)
2843 {
2844         struct ifaltq_stage_head *head = &ifq_stage_heads[mycpuid];
2845         struct ifaltq_stage *stage;
2846
2847         while ((stage = TAILQ_FIRST(&head->ifqs_head)) != NULL) {
2848                 struct ifaltq *ifq = stage->ifqs_altq;
2849                 int is_sched = 0;
2850
2851                 if (stage->ifqs_flags & IFQ_STAGE_FLAG_SCHED)
2852                         is_sched = 1;
2853                 ifq_stage_remove(head, stage);
2854
2855                 if (is_sched) {
2856                         ifq_ifstart_schedule(ifq, 1);
2857                 } else {
2858                         int start = 0;
2859
2860                         ALTQ_LOCK(ifq);
2861                         if (!ifq_is_started(ifq)) {
2862                                 /*
2863                                  * Hold the interlock of ifnet.if_start
2864                                  */
2865                                 ifq_set_started(ifq);
2866                                 start = 1;
2867                         }
2868                         ALTQ_UNLOCK(ifq);
2869
2870                         if (start)
2871                                 ifq_try_ifstart(ifq, 1);
2872                 }
2873                 KKASSERT((stage->ifqs_flags &
2874                     (IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED)) == 0);
2875         }
2876 }
2877
2878 static void
2879 ifnetinit(void *dummy __unused)
2880 {
2881         int i;
2882
2883         for (i = 0; i < ncpus; ++i) {
2884                 struct thread *thr = &ifnet_threads[i];
2885
2886                 lwkt_create(ifnet_service_loop, NULL, NULL,
2887                             thr, TDF_NOSTART|TDF_FORCE_SPINPORT,
2888                             i, "ifnet %d", i);
2889                 netmsg_service_port_init(&thr->td_msgport);
2890                 lwkt_schedule(thr);
2891         }
2892
2893         for (i = 0; i < ncpus; ++i)
2894                 TAILQ_INIT(&ifq_stage_heads[i].ifqs_head);
2895         netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
2896 }
2897
2898 struct ifnet *
2899 ifnet_byindex(unsigned short idx)
2900 {
2901         if (idx > if_index)
2902                 return NULL;
2903         return ifindex2ifnet[idx];
2904 }
2905
2906 struct ifaddr *
2907 ifaddr_byindex(unsigned short idx)
2908 {
2909         struct ifnet *ifp;
2910
2911         ifp = ifnet_byindex(idx);
2912         if (!ifp)
2913                 return NULL;
2914         return TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
2915 }
2916
2917 void
2918 if_register_com_alloc(u_char type,
2919     if_com_alloc_t *a, if_com_free_t *f)
2920 {
2921
2922         KASSERT(if_com_alloc[type] == NULL,
2923             ("if_register_com_alloc: %d already registered", type));
2924         KASSERT(if_com_free[type] == NULL,
2925             ("if_register_com_alloc: %d free already registered", type));
2926
2927         if_com_alloc[type] = a;
2928         if_com_free[type] = f;
2929 }
2930
2931 void
2932 if_deregister_com_alloc(u_char type)
2933 {
2934
2935         KASSERT(if_com_alloc[type] != NULL,
2936             ("if_deregister_com_alloc: %d not registered", type));
2937         KASSERT(if_com_free[type] != NULL,
2938             ("if_deregister_com_alloc: %d free not registered", type));
2939         if_com_alloc[type] = NULL;
2940         if_com_free[type] = NULL;
2941 }
2942
2943 int
2944 if_ring_count2(int cnt, int cnt_max)
2945 {
2946         int shift = 0;
2947
2948         KASSERT(cnt_max >= 1 && powerof2(cnt_max),
2949             ("invalid ring count max %d", cnt_max));
2950
2951         if (cnt <= 0)
2952                 cnt = cnt_max;
2953         if (cnt > ncpus2)
2954                 cnt = ncpus2;
2955         if (cnt > cnt_max)
2956                 cnt = cnt_max;
2957
2958         while ((1 << (shift + 1)) <= cnt)
2959                 ++shift;
2960         cnt = 1 << shift;
2961
2962         KASSERT(cnt >= 1 && cnt <= ncpus2 && cnt <= cnt_max,
2963             ("calculate cnt %d, ncpus2 %d, cnt max %d",
2964              cnt, ncpus2, cnt_max));
2965         return cnt;
2966 }
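
/*
 * Illustrative sketch: a multi-ring NIC driver clamps a tunable ring count
 * with this helper (the names below are hypothetical):
 *
 *	sc->rx_ring_cnt = if_ring_count2(drv_rx_rings_tunable, DRV_NRX_RING_MAX);
 *
 * The result is a power of 2 that is at most ncpus2 and at most the given
 * maximum.
 */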
2967
2968 void
2969 ifq_set_maxlen(struct ifaltq *ifq, int len)
2970 {
2971         ifq->ifq_maxlen = len + (ncpus * ifq_stage_cntmax);
2972 }
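
/*
 * Illustrative sketch: a driver typically sizes its send queue from its TX
 * ring during attach (the names are hypothetical):
 *
 *	ifq_set_maxlen(&ifp->if_snd, sc->tx_ring_cnt - 1);
 *	ifq_set_ready(&ifp->if_snd);
 */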