if: Move IFF_OACTIVE bit into ifaltq; prepare multiple TX queues support
[dragonfly.git] / sys/net/if.c
1 /*
2  * Copyright (c) 1980, 1986, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the University of
16  *      California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *      @(#)if.c        8.3 (Berkeley) 1/4/94
34  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
35  */
36
37 #include "opt_compat.h"
38 #include "opt_inet6.h"
39 #include "opt_inet.h"
40 #include "opt_ifpoll.h"
41
42 #include <sys/param.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/systm.h>
46 #include <sys/proc.h>
47 #include <sys/priv.h>
48 #include <sys/protosw.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/socketops.h>
52 #include <sys/protosw.h>
53 #include <sys/kernel.h>
54 #include <sys/ktr.h>
55 #include <sys/mutex.h>
56 #include <sys/sockio.h>
57 #include <sys/syslog.h>
58 #include <sys/sysctl.h>
59 #include <sys/domain.h>
60 #include <sys/thread.h>
61 #include <sys/serialize.h>
62 #include <sys/bus.h>
63
64 #include <sys/thread2.h>
65 #include <sys/msgport2.h>
66 #include <sys/mutex2.h>
67
68 #include <net/if.h>
69 #include <net/if_arp.h>
70 #include <net/if_dl.h>
71 #include <net/if_types.h>
72 #include <net/if_var.h>
73 #include <net/ifq_var.h>
74 #include <net/radix.h>
75 #include <net/route.h>
76 #include <net/if_clone.h>
77 #include <net/netisr.h>
78 #include <net/netmsg2.h>
79
80 #include <machine/atomic.h>
81 #include <machine/stdarg.h>
82 #include <machine/smp.h>
83
84 #if defined(INET) || defined(INET6)
85 /*XXX*/
86 #include <netinet/in.h>
87 #include <netinet/in_var.h>
88 #include <netinet/if_ether.h>
89 #ifdef INET6
90 #include <netinet6/in6_var.h>
91 #include <netinet6/in6_ifattach.h>
92 #endif
93 #endif
94
95 #if defined(COMPAT_43)
96 #include <emulation/43bsd/43bsd_socket.h>
97 #endif /* COMPAT_43 */
98
99 struct netmsg_ifaddr {
100         struct netmsg_base base;
101         struct ifaddr   *ifa;
102         struct ifnet    *ifp;
103         int             tail;
104 };
105
106 struct ifaltq_stage_head {
107         TAILQ_HEAD(, ifaltq_stage)      ifqs_head;
108 } __cachealign;
109
110 /*
111  * System initialization
112  */
113 static void     if_attachdomain(void *);
114 static void     if_attachdomain1(struct ifnet *);
115 static int      ifconf(u_long, caddr_t, struct ucred *);
116 static void     ifinit(void *);
117 static void     ifnetinit(void *);
118 static void     if_slowtimo(void *);
119 static void     link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
120 static int      if_rtdel(struct radix_node *, void *);
121
122 #ifdef INET6
123 /*
124  * XXX: declared here to avoid including many inet6-related files;
125  * should this be more generalized?
126  */
127 extern void     nd6_setmtu(struct ifnet *);
128 #endif
129
130 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
131 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
132
133 static int ifq_stage_cntmax = 4;
134 TUNABLE_INT("net.link.stage_cntmax", &ifq_stage_cntmax);
135 SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
136     &ifq_stage_cntmax, 0, "ifq staging packet count max");
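
/*
 * Example (illustrative): ifq_stage_cntmax is both a loader tunable and a
 * read-write sysctl, so it can be preset with net.link.stage_cntmax="8" in
 * /boot/loader.conf or changed at runtime with
 * "sysctl net.link.stage_cntmax=8".  A value of 0 disables the staging
 * fast path taken in if_start_schedule() below.
 */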
137
138 SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL)
139 /* Must be after netisr_init */
140 SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_SECOND, ifnetinit, NULL)
141
142 static  if_com_alloc_t *if_com_alloc[256];
143 static  if_com_free_t *if_com_free[256];
144
145 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
146 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
147 MALLOC_DEFINE(M_IFNET, "ifnet", "interface structure");
148
149 int                     ifqmaxlen = IFQ_MAXLEN;
150 struct ifnethead        ifnet = TAILQ_HEAD_INITIALIZER(ifnet);
151
152 struct callout          if_slowtimo_timer;
153
154 int                     if_index = 0;
155 struct ifnet            **ifindex2ifnet = NULL;
156 static struct thread    ifnet_threads[MAXCPU];
157
158 static struct ifaltq_stage_head ifq_stage_heads[MAXCPU];
159
160 #define IFQ_KTR_STRING          "ifq=%p"
161 #define IFQ_KTR_ARGS    struct ifaltq *ifq
162 #ifndef KTR_IFQ
163 #define KTR_IFQ                 KTR_ALL
164 #endif
165 KTR_INFO_MASTER(ifq);
166 KTR_INFO(KTR_IFQ, ifq, enqueue, 0, IFQ_KTR_STRING, IFQ_KTR_ARGS);
167 KTR_INFO(KTR_IFQ, ifq, dequeue, 1, IFQ_KTR_STRING, IFQ_KTR_ARGS);
168 #define logifq(name, arg)       KTR_LOG(ifq_ ## name, arg)
169
170 #define IF_START_KTR_STRING     "ifp=%p"
171 #define IF_START_KTR_ARGS       struct ifnet *ifp
172 #ifndef KTR_IF_START
173 #define KTR_IF_START            KTR_ALL
174 #endif
175 KTR_INFO_MASTER(if_start);
176 KTR_INFO(KTR_IF_START, if_start, run, 0,
177          IF_START_KTR_STRING, IF_START_KTR_ARGS);
178 KTR_INFO(KTR_IF_START, if_start, sched, 1,
179          IF_START_KTR_STRING, IF_START_KTR_ARGS);
180 KTR_INFO(KTR_IF_START, if_start, avoid, 2,
181          IF_START_KTR_STRING, IF_START_KTR_ARGS);
182 KTR_INFO(KTR_IF_START, if_start, contend_sched, 3,
183          IF_START_KTR_STRING, IF_START_KTR_ARGS);
184 KTR_INFO(KTR_IF_START, if_start, chase_sched, 4,
185          IF_START_KTR_STRING, IF_START_KTR_ARGS);
186 #define logifstart(name, arg)   KTR_LOG(if_start_ ## name, arg)
187
188 TAILQ_HEAD(, ifg_group) ifg_head = TAILQ_HEAD_INITIALIZER(ifg_head);
189
190 /*
191  * Network interface utility routines.
192  *
193  * Routines with ifa_ifwith* names take sockaddr *'s as
194  * parameters.
195  */
196 /* ARGSUSED*/
197 void
198 ifinit(void *dummy)
199 {
200         struct ifnet *ifp;
201
202         callout_init(&if_slowtimo_timer);
203
204         crit_enter();
205         TAILQ_FOREACH(ifp, &ifnet, if_link) {
206                 if (ifp->if_snd.ifq_maxlen == 0) {
207                         if_printf(ifp, "XXX: driver didn't set ifq_maxlen\n");
208                         ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
209                 }
210         }
211         crit_exit();
212
213         if_slowtimo(0);
214 }
215
216 static int
217 if_start_cpuid(struct ifnet *ifp)
218 {
219         return ifp->if_cpuid;
220 }
221
222 #ifdef IFPOLL_ENABLE
223 static int
224 if_start_cpuid_npoll(struct ifnet *ifp)
225 {
226         int poll_cpuid = ifp->if_npoll_cpuid;
227
228         if (poll_cpuid >= 0)
229                 return poll_cpuid;
230         else
231                 return ifp->if_cpuid;
232 }
233 #endif
234
235 static void
236 if_start_ipifunc(void *arg)
237 {
238         struct ifnet *ifp = arg;
239         struct lwkt_msg *lmsg = &ifp->if_start_nmsg[mycpuid].lmsg;
240
241         crit_enter();
242         if (lmsg->ms_flags & MSGF_DONE)
243                 lwkt_sendmsg(netisr_portfn(mycpuid), lmsg);
244         crit_exit();
245 }
246
247 static __inline void
248 ifq_stage_remove(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
249 {
250         KKASSERT(stage->ifqs_flags & IFQ_STAGE_FLAG_QUED);
251         TAILQ_REMOVE(&head->ifqs_head, stage, ifqs_link);
252         stage->ifqs_flags &= ~(IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED);
253         stage->ifqs_cnt = 0;
254         stage->ifqs_len = 0;
255 }
256
257 static __inline void
258 ifq_stage_insert(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
259 {
260         KKASSERT((stage->ifqs_flags &
261             (IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED)) == 0);
262         stage->ifqs_flags |= IFQ_STAGE_FLAG_QUED;
263         TAILQ_INSERT_TAIL(&head->ifqs_head, stage, ifqs_link);
264 }
265
266 /*
267  * Schedule ifnet.if_start on ifnet's CPU
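 *
 * If called from a netisr thread while staging is enabled
 * (ifq_stage_cntmax > 0) and 'force' is not set, the request is only
 * recorded in the per-CPU staging queue and marked IFQ_STAGE_FLAG_SCHED;
 * the real dispatch happens when the staged entries are flushed.
 * Otherwise ifnet.if_start is scheduled immediately, either through a
 * local netmsg or through an IPI to the ifnet's CPU.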
268  */
269 static void
270 if_start_schedule(struct ifnet *ifp, int force)
271 {
272         int cpu;
273
274         if (!force && curthread->td_type == TD_TYPE_NETISR &&
275             ifq_stage_cntmax > 0) {
276                 struct ifaltq_stage *stage = &ifp->if_snd.altq_stage[mycpuid];
277
278                 stage->ifqs_cnt = 0;
279                 stage->ifqs_len = 0;
280                 if ((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0)
281                         ifq_stage_insert(&ifq_stage_heads[mycpuid], stage);
282                 stage->ifqs_flags |= IFQ_STAGE_FLAG_SCHED;
283                 return;
284         }
285
286         cpu = ifp->if_start_cpuid(ifp);
287         if (cpu != mycpuid)
288                 lwkt_send_ipiq(globaldata_find(cpu), if_start_ipifunc, ifp);
289         else
290                 if_start_ipifunc(ifp);
291 }
292
293 /*
294  * NOTE:
295  * This function releases the ifnet.if_start interlock
296  * if ifnet.if_start does not need to be scheduled.
297  */
298 static __inline int
299 if_start_need_schedule(struct ifaltq *ifq, int running)
300 {
301         if (!running || ifq_is_empty(ifq)
302 #ifdef ALTQ
303             || ifq->altq_tbr != NULL
304 #endif
305         ) {
306                 ALTQ_LOCK(ifq);
307                 /*
308                  * The ifnet.if_start interlock is released if:
309                  * 1) The hardware cannot take any packets, because
310                  *    o  the interface is marked down, or
311                  *    o  the hardware queue is full (ifq_is_oactive).
312                  *    In the second case, a hardware interrupt or
313                  *    polling(4) will call/schedule ifnet.if_start
314                  *    once the hardware queue is ready again.
315                  * 2) There are no packets in ifnet.if_snd.
316                  *    A later ifq_dispatch or ifq_handoff will call/
317                  *    schedule ifnet.if_start.
318                  * 3) TBR is used and it does not allow further
319                  *    dequeueing.
320                  *    The TBR callout will call ifnet.if_start.
321                  */
322                 if (!running || !ifq_data_ready(ifq)) {
323                         ifq->altq_started = 0;
324                         ALTQ_UNLOCK(ifq);
325                         return 0;
326                 }
327                 ALTQ_UNLOCK(ifq);
328         }
329         return 1;
330 }
331
332 static void
333 if_start_dispatch(netmsg_t msg)
334 {
335         struct lwkt_msg *lmsg = &msg->base.lmsg;
336         struct ifnet *ifp = lmsg->u.ms_resultp;
337         struct ifaltq *ifq = &ifp->if_snd;
338         int running = 0, need_sched;
339
340         crit_enter();
341         lwkt_replymsg(lmsg, 0); /* reply ASAP */
342         crit_exit();
343
344         if (mycpuid != ifp->if_start_cpuid(ifp)) {
345                 /*
346                  * We need to chase the ifnet CPU change.
347                  */
348                 logifstart(chase_sched, ifp);
349                 if_start_schedule(ifp, 1);
350                 return;
351         }
352
353         ifnet_serialize_tx(ifp);
354         if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq)) {
355                 logifstart(run, ifp);
356                 ifp->if_start(ifp);
357                 if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
358                         running = 1;
359         }
360         need_sched = if_start_need_schedule(ifq, running);
361         ifnet_deserialize_tx(ifp);
362
363         if (need_sched) {
364                 /*
365                  * More data needs to be transmitted; ifnet.if_start is
366                  * scheduled on the ifnet's CPU, and we keep going.
367                  * NOTE: ifnet.if_start interlock is not released.
368                  */
369                 logifstart(sched, ifp);
370                 if_start_schedule(ifp, 0);
371         }
372 }
373
374 /* Device driver ifnet.if_start helper function */
375 void
376 if_devstart(struct ifnet *ifp)
377 {
378         struct ifaltq *ifq = &ifp->if_snd;
379         int running = 0;
380
381         ASSERT_IFNET_SERIALIZED_TX(ifp);
382
383         ALTQ_LOCK(ifq);
384         if (ifq->altq_started || !ifq_data_ready(ifq)) {
385                 logifstart(avoid, ifp);
386                 ALTQ_UNLOCK(ifq);
387                 return;
388         }
389         ifq->altq_started = 1;
390         ALTQ_UNLOCK(ifq);
391
392         logifstart(run, ifp);
393         ifp->if_start(ifp);
394
395         if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
396                 running = 1;
397
398         if (if_start_need_schedule(ifq, running)) {
399                 /*
400                  * More data needs to be transmitted; ifnet.if_start is
401                  * scheduled on the ifnet's CPU, and we keep going.
402                  * NOTE: ifnet.if_start interlock is not released.
403                  */
404                 logifstart(sched, ifp);
405                 if_start_schedule(ifp, 0);
406         }
407 }
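
/*
 * Illustrative sketch of how a NIC driver typically uses the helper above
 * (xx_txeof() and the ifq_clr_oactive() call are assumptions, not taken
 * from this file): the driver invokes if_devstart() whenever the hardware
 * queue may have drained, while holding the TX serializer:
 *
 *      static void
 *      xx_txeof(struct xx_softc *sc)
 *      {
 *              reclaim completed TX descriptors;
 *              if (enough descriptors are now free)
 *                      ifq_clr_oactive(&sc->xx_ifp->if_snd);
 *              if (!ifq_is_empty(&sc->xx_ifp->if_snd))
 *                      if_devstart(sc->xx_ifp);
 *      }
 *
 * if_devstart() honors the if_start interlock and reschedules
 * ifnet.if_start on the ifnet's CPU if packets remain queued.
 */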
408
409 static void
410 if_default_serialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
411 {
412         lwkt_serialize_enter(ifp->if_serializer);
413 }
414
415 static void
416 if_default_deserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
417 {
418         lwkt_serialize_exit(ifp->if_serializer);
419 }
420
421 static int
422 if_default_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
423 {
424         return lwkt_serialize_try(ifp->if_serializer);
425 }
426
427 #ifdef INVARIANTS
428 static void
429 if_default_serialize_assert(struct ifnet *ifp,
430                             enum ifnet_serialize slz __unused,
431                             boolean_t serialized)
432 {
433         if (serialized)
434                 ASSERT_SERIALIZED(ifp->if_serializer);
435         else
436                 ASSERT_NOT_SERIALIZED(ifp->if_serializer);
437 }
438 #endif
439
440 /*
441  * Attach an interface to the list of "active" interfaces.
442  *
443  * The serializer is optional.  If non-NULL, access to the interface
444  * may be MPSAFE.
445  */
446 void
447 if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
448 {
449         unsigned socksize, ifasize;
450         int namelen, masklen;
451         struct sockaddr_dl *sdl;
452         struct ifaddr *ifa;
453         struct ifaltq *ifq;
454         int i;
455
456         static int if_indexlim = 8;
457
458         if (ifp->if_serialize != NULL) {
459                 KASSERT(ifp->if_deserialize != NULL &&
460                         ifp->if_tryserialize != NULL &&
461                         ifp->if_serialize_assert != NULL,
462                         ("serialize functions are partially setup"));
463
464                 /*
465                  * If the device supplies serialize functions,
466                  * then clear if_serializer to catch any invalid
467                  * usage of this field.
468                  */
469                 KASSERT(serializer == NULL,
470                         ("both serialize functions and default serializer "
471                          "are supplied"));
472                 ifp->if_serializer = NULL;
473         } else {
474                 KASSERT(ifp->if_deserialize == NULL &&
475                         ifp->if_tryserialize == NULL &&
476                         ifp->if_serialize_assert == NULL,
477                         ("serialize functions are partially setup"));
478                 ifp->if_serialize = if_default_serialize;
479                 ifp->if_deserialize = if_default_deserialize;
480                 ifp->if_tryserialize = if_default_tryserialize;
481 #ifdef INVARIANTS
482                 ifp->if_serialize_assert = if_default_serialize_assert;
483 #endif
484
485                 /*
486                  * The serializer can be passed in from the device,
487                  * allowing the same serializer to be used for both
488                  * the interrupt interlock and the device queue.
489                  * If not specified, the netif structure will use an
490                  * embedded serializer.
491                  */
492                 if (serializer == NULL) {
493                         serializer = &ifp->if_default_serializer;
494                         lwkt_serialize_init(serializer);
495                 }
496                 ifp->if_serializer = serializer;
497         }
498
499         ifp->if_start_cpuid = if_start_cpuid;
500         ifp->if_cpuid = 0;
501
502 #ifdef IFPOLL_ENABLE
503         /* Device is not in polling mode by default */
504         ifp->if_npoll_cpuid = -1;
505         if (ifp->if_npoll != NULL)
506                 ifp->if_start_cpuid = if_start_cpuid_npoll;
507 #endif
508
509         ifp->if_start_nmsg = kmalloc(ncpus * sizeof(*ifp->if_start_nmsg),
510                                      M_LWKTMSG, M_WAITOK);
511         for (i = 0; i < ncpus; ++i) {
512                 netmsg_init(&ifp->if_start_nmsg[i], NULL, &netisr_adone_rport,
513                             0, if_start_dispatch);
514                 ifp->if_start_nmsg[i].lmsg.u.ms_resultp = ifp;
515         }
516
517         mtx_init(&ifp->if_ioctl_mtx);
518         mtx_lock(&ifp->if_ioctl_mtx);
519
520         TAILQ_INSERT_TAIL(&ifnet, ifp, if_link);
521         ifp->if_index = ++if_index;
522
523         /*
524          * XXX -
525          * The old code would work if the interface passed a pre-existing
526          * chain of ifaddrs to this code.  We don't trust our callers to
527          * properly initialize the tailq, however, so we no longer allow
528          * this unlikely case.
529          */
530         ifp->if_addrheads = kmalloc(ncpus * sizeof(struct ifaddrhead),
531                                     M_IFADDR, M_WAITOK | M_ZERO);
532         for (i = 0; i < ncpus; ++i)
533                 TAILQ_INIT(&ifp->if_addrheads[i]);
534
535         TAILQ_INIT(&ifp->if_prefixhead);
536         TAILQ_INIT(&ifp->if_multiaddrs);
537         TAILQ_INIT(&ifp->if_groups);
538         getmicrotime(&ifp->if_lastchange);
539         if (ifindex2ifnet == NULL || if_index >= if_indexlim) {
540                 unsigned int n;
541                 struct ifnet **q;
542
543                 if_indexlim <<= 1;
544
545                 /* grow ifindex2ifnet */
546                 n = if_indexlim * sizeof(*q);
547                 q = kmalloc(n, M_IFADDR, M_WAITOK | M_ZERO);
548                 if (ifindex2ifnet) {
549                         bcopy(ifindex2ifnet, q, n/2);
550                         kfree(ifindex2ifnet, M_IFADDR);
551                 }
552                 ifindex2ifnet = q;
553         }
554
555         ifindex2ifnet[if_index] = ifp;
556
557         /*
558          * Create a link-level name for this device.
559          */
560         namelen = strlen(ifp->if_xname);
561         masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
562         socksize = masklen + ifp->if_addrlen;
563 #define ROUNDUP(a) (1 + (((a) - 1) | (sizeof(long) - 1)))
564         if (socksize < sizeof(*sdl))
565                 socksize = sizeof(*sdl);
566         socksize = ROUNDUP(socksize);
567 #undef ROUNDUP
568         ifasize = sizeof(struct ifaddr) + 2 * socksize;
569         ifa = ifa_create(ifasize, M_WAITOK);
570         sdl = (struct sockaddr_dl *)(ifa + 1);
571         sdl->sdl_len = socksize;
572         sdl->sdl_family = AF_LINK;
573         bcopy(ifp->if_xname, sdl->sdl_data, namelen);
574         sdl->sdl_nlen = namelen;
575         sdl->sdl_index = ifp->if_index;
576         sdl->sdl_type = ifp->if_type;
577         ifp->if_lladdr = ifa;
578         ifa->ifa_ifp = ifp;
579         ifa->ifa_rtrequest = link_rtrequest;
580         ifa->ifa_addr = (struct sockaddr *)sdl;
581         sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
582         ifa->ifa_netmask = (struct sockaddr *)sdl;
583         sdl->sdl_len = masklen;
584         while (namelen != 0)
585                 sdl->sdl_data[--namelen] = 0xff;
586         ifa_iflink(ifa, ifp, 0 /* Insert head */);
587
588         EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
589         devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
590
591         ifq = &ifp->if_snd;
592         ifq->altq_type = 0;
593         ifq->altq_disc = NULL;
594         ifq->altq_flags &= ALTQF_CANTCHANGE;
595         ifq->altq_tbr = NULL;
596         ifq->altq_ifp = ifp;
597         ifq->altq_started = 0;
598         ifq->altq_prepended = NULL;
599         ALTQ_LOCK_INIT(ifq);
600         ifq_set_classic(ifq);
601
602         ifq->altq_stage =
603             kmalloc_cachealign(ncpus * sizeof(struct ifaltq_stage),
604             M_DEVBUF, M_WAITOK | M_ZERO);
605         for (i = 0; i < ncpus; ++i)
606                 ifq->altq_stage[i].ifqs_altq = ifq;
607
608         if (!SLIST_EMPTY(&domains))
609                 if_attachdomain1(ifp);
610
611         /* Announce the interface. */
612         rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
613
614         mtx_unlock(&ifp->if_ioctl_mtx);
615 }
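
/*
 * Illustrative sketch of the attach-time contract (the xx_* names and
 * XX_TX_RING_CNT are hypothetical): before calling if_attach() a driver
 * fills in its ifnet callbacks and send queue length, and may pass its
 * own serializer or NULL to use the embedded default one:
 *
 *      ifp->if_start = xx_start;
 *      ifp->if_ioctl = xx_ioctl;
 *      ifp->if_watchdog = xx_watchdog;
 *      ifq_set_maxlen(&ifp->if_snd, XX_TX_RING_CNT - 1);
 *      if_attach(ifp, NULL);
 *
 * Ethernet drivers normally reach this function through ether_ifattach().
 */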
616
617 static void
618 if_attachdomain(void *dummy)
619 {
620         struct ifnet *ifp;
621
622         crit_enter();
623         TAILQ_FOREACH(ifp, &ifnet, if_list)
624                 if_attachdomain1(ifp);
625         crit_exit();
626 }
627 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST,
628         if_attachdomain, NULL);
629
630 static void
631 if_attachdomain1(struct ifnet *ifp)
632 {
633         struct domain *dp;
634
635         crit_enter();
636
637         /* address family dependent data region */
638         bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
639         SLIST_FOREACH(dp, &domains, dom_next)
640                 if (dp->dom_ifattach)
641                         ifp->if_afdata[dp->dom_family] =
642                                 (*dp->dom_ifattach)(ifp);
643         crit_exit();
644 }
645
646 /*
647  * Purge all addresses whose type is _not_ AF_LINK
648  */
649 void
650 if_purgeaddrs_nolink(struct ifnet *ifp)
651 {
652         struct ifaddr_container *ifac, *next;
653
654         TAILQ_FOREACH_MUTABLE(ifac, &ifp->if_addrheads[mycpuid],
655                               ifa_link, next) {
656                 struct ifaddr *ifa = ifac->ifa;
657
658                 /* Leave link ifaddr as it is */
659                 if (ifa->ifa_addr->sa_family == AF_LINK)
660                         continue;
661 #ifdef INET
662                 /* XXX: Ugly!! ad hoc just for INET */
663                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
664                         struct ifaliasreq ifr;
665 #ifdef IFADDR_DEBUG_VERBOSE
666                         int i;
667
668                         kprintf("purge in4 addr %p: ", ifa);
669                         for (i = 0; i < ncpus; ++i)
670                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
671                         kprintf("\n");
672 #endif
673
674                         bzero(&ifr, sizeof ifr);
675                         ifr.ifra_addr = *ifa->ifa_addr;
676                         if (ifa->ifa_dstaddr)
677                                 ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
678                         if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
679                                        NULL) == 0)
680                                 continue;
681                 }
682 #endif /* INET */
683 #ifdef INET6
684                 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6) {
685 #ifdef IFADDR_DEBUG_VERBOSE
686                         int i;
687
688                         kprintf("purge in6 addr %p: ", ifa);
689                         for (i = 0; i < ncpus; ++i)
690                                 kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
691                         kprintf("\n");
692 #endif
693
694                         in6_purgeaddr(ifa);
695                         /* ifp->if_addrheads is already updated */
696                         continue;
697                 }
698 #endif /* INET6 */
699                 ifa_ifunlink(ifa, ifp);
700                 ifa_destroy(ifa);
701         }
702 }
703
704 static void
705 ifq_stage_detach_handler(netmsg_t nmsg)
706 {
707         struct ifaltq *ifq = nmsg->lmsg.u.ms_resultp;
708         struct ifaltq_stage *stage = &ifq->altq_stage[mycpuid];
709
710         if (stage->ifqs_flags & IFQ_STAGE_FLAG_QUED)
711                 ifq_stage_remove(&ifq_stage_heads[mycpuid], stage);
712         lwkt_replymsg(&nmsg->lmsg, 0);
713 }
714
715 static void
716 ifq_stage_detach(struct ifaltq *ifq)
717 {
718         struct netmsg_base base;
719         int cpu;
720
721         netmsg_init(&base, NULL, &curthread->td_msgport, 0,
722             ifq_stage_detach_handler);
723         base.lmsg.u.ms_resultp = ifq;
724
725         for (cpu = 0; cpu < ncpus; ++cpu)
726                 lwkt_domsg(netisr_portfn(cpu), &base.lmsg, 0);
727 }
728
729 /*
730  * Detach an interface, removing it from the
731  * list of "active" interfaces.
732  */
733 void
734 if_detach(struct ifnet *ifp)
735 {
736         struct radix_node_head  *rnh;
737         int i;
738         int cpu, origcpu;
739         struct domain *dp;
740
741         EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
742
743         /*
744          * Remove routes and flush queues.
745          */
746         crit_enter();
747 #ifdef IFPOLL_ENABLE
748         if (ifp->if_flags & IFF_NPOLLING)
749                 ifpoll_deregister(ifp);
750 #endif
751         if_down(ifp);
752
753 #ifdef ALTQ
754         if (ifq_is_enabled(&ifp->if_snd))
755                 altq_disable(&ifp->if_snd);
756         if (ifq_is_attached(&ifp->if_snd))
757                 altq_detach(&ifp->if_snd);
758 #endif
759
760         /*
761          * Clean up all addresses.
762          */
763         ifp->if_lladdr = NULL;
764
765         if_purgeaddrs_nolink(ifp);
766         if (!TAILQ_EMPTY(&ifp->if_addrheads[mycpuid])) {
767                 struct ifaddr *ifa;
768
769                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
770                 KASSERT(ifa->ifa_addr->sa_family == AF_LINK,
771                         ("non-link ifaddr is left on if_addrheads"));
772
773                 ifa_ifunlink(ifa, ifp);
774                 ifa_destroy(ifa);
775                 KASSERT(TAILQ_EMPTY(&ifp->if_addrheads[mycpuid]),
776                         ("there are still ifaddrs left on if_addrheads"));
777         }
778
779 #ifdef INET
780         /*
781          * Remove all IPv4 kernel structures related to ifp.
782          */
783         in_ifdetach(ifp);
784 #endif
785
786 #ifdef INET6
787         /*
788          * Remove all IPv6 kernel structs related to ifp.  This should be done
789          * before removing routing entries below, since IPv6 interface direct
790          * routes are expected to be removed by the IPv6-specific kernel API.
791          * Otherwise, the kernel will detect the inconsistency and complain.
792          */
793         in6_ifdetach(ifp);
794 #endif
795
796         /*
797          * Delete all remaining routes using this interface.
798          * Unfortunately the only way to do this is to slog through
799          * the entire routing table looking for routes which point
800          * to this interface...oh well...
801          */
802         origcpu = mycpuid;
803         for (cpu = 0; cpu < ncpus; cpu++) {
804                 lwkt_migratecpu(cpu);
805                 for (i = 1; i <= AF_MAX; i++) {
806                         if ((rnh = rt_tables[cpu][i]) == NULL)
807                                 continue;
808                         rnh->rnh_walktree(rnh, if_rtdel, ifp);
809                 }
810         }
811         lwkt_migratecpu(origcpu);
812
813         /* Announce that the interface is gone. */
814         rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
815         devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
816
817         SLIST_FOREACH(dp, &domains, dom_next)
818                 if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
819                         (*dp->dom_ifdetach)(ifp,
820                                 ifp->if_afdata[dp->dom_family]);
821
822         /*
823          * Remove interface from ifindex2ifnet[] and maybe decrement if_index.
824          */
825         ifindex2ifnet[ifp->if_index] = NULL;
826         while (if_index > 0 && ifindex2ifnet[if_index] == NULL)
827                 if_index--;
828
829         TAILQ_REMOVE(&ifnet, ifp, if_link);
830         kfree(ifp->if_addrheads, M_IFADDR);
831
832         lwkt_synchronize_ipiqs("if_detach");
833         ifq_stage_detach(&ifp->if_snd);
834
835         kfree(ifp->if_start_nmsg, M_LWKTMSG);
836         kfree(ifp->if_snd.altq_stage, M_DEVBUF);
837         crit_exit();
838 }
839
840 /*
841  * Create interface group without members
842  */
843 struct ifg_group *
844 if_creategroup(const char *groupname)
845 {
846         struct ifg_group        *ifg = NULL;
847
848         if ((ifg = (struct ifg_group *)kmalloc(sizeof(struct ifg_group),
849             M_TEMP, M_NOWAIT)) == NULL)
850                 return (NULL);
851
852         strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
853         ifg->ifg_refcnt = 0;
854         ifg->ifg_carp_demoted = 0;
855         TAILQ_INIT(&ifg->ifg_members);
856 #if NPF > 0
857         pfi_attach_ifgroup(ifg);
858 #endif
859         TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next);
860
861         return (ifg);
862 }
863
864 /*
865  * Add a group to an interface
866  */
867 int
868 if_addgroup(struct ifnet *ifp, const char *groupname)
869 {
870         struct ifg_list         *ifgl;
871         struct ifg_group        *ifg = NULL;
872         struct ifg_member       *ifgm;
873
874         if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
875             groupname[strlen(groupname) - 1] <= '9')
876                 return (EINVAL);
877
878         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
879                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
880                         return (EEXIST);
881
882         if ((ifgl = kmalloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
883                 return (ENOMEM);
884
885         if ((ifgm = kmalloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
886                 kfree(ifgl, M_TEMP);
887                 return (ENOMEM);
888         }
889
890         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
891                 if (!strcmp(ifg->ifg_group, groupname))
892                         break;
893
894         if (ifg == NULL && (ifg = if_creategroup(groupname)) == NULL) {
895                 kfree(ifgl, M_TEMP);
896                 kfree(ifgm, M_TEMP);
897                 return (ENOMEM);
898         }
899
900         ifg->ifg_refcnt++;
901         ifgl->ifgl_group = ifg;
902         ifgm->ifgm_ifp = ifp;
903
904         TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
905         TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
906
907 #if NPF > 0
908         pfi_group_change(groupname);
909 #endif
910
911         return (0);
912 }
913
914 /*
915  * Remove a group from an interface
916  */
917 int
918 if_delgroup(struct ifnet *ifp, const char *groupname)
919 {
920         struct ifg_list         *ifgl;
921         struct ifg_member       *ifgm;
922
923         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
924                 if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
925                         break;
926         if (ifgl == NULL)
927                 return (ENOENT);
928
929         TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
930
931         TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
932                 if (ifgm->ifgm_ifp == ifp)
933                         break;
934
935         if (ifgm != NULL) {
936                 TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
937                 kfree(ifgm, M_TEMP);
938         }
939
940         if (--ifgl->ifgl_group->ifg_refcnt == 0) {
941                 TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next);
942 #if NPF > 0
943                 pfi_detach_ifgroup(ifgl->ifgl_group);
944 #endif
945                 kfree(ifgl->ifgl_group, M_TEMP);
946         }
947
948         kfree(ifgl, M_TEMP);
949
950 #if NPF > 0
951         pfi_group_change(groupname);
952 #endif
953
954         return (0);
955 }
956
957 /*
958  * Stores all groups from an interface in memory pointed
959  * to by data
960  */
961 int
962 if_getgroup(caddr_t data, struct ifnet *ifp)
963 {
964         int                      len, error;
965         struct ifg_list         *ifgl;
966         struct ifg_req           ifgrq, *ifgp;
967         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
968
969         if (ifgr->ifgr_len == 0) {
970                 TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
971                         ifgr->ifgr_len += sizeof(struct ifg_req);
972                 return (0);
973         }
974
975         len = ifgr->ifgr_len;
976         ifgp = ifgr->ifgr_groups;
977         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
978                 if (len < sizeof(ifgrq))
979                         return (EINVAL);
980                 bzero(&ifgrq, sizeof ifgrq);
981                 strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
982                     sizeof(ifgrq.ifgrq_group));
983                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
984                     sizeof(struct ifg_req))))
985                         return (error);
986                 len -= sizeof(ifgrq);
987                 ifgp++;
988         }
989
990         return (0);
991 }
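
/*
 * The above implements the usual two-pass ioctl pattern: a caller first
 * issues the request with ifgr_len set to 0 to learn the required size,
 * allocates ifgr_len bytes for ifgr_groups, and then repeats the request
 * to have the ifg_req records copied out.  if_getgroupmembers() below
 * follows the same convention.
 */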
992
993 /*
994  * Stores all members of a group in memory pointed to by data
995  */
996 int
997 if_getgroupmembers(caddr_t data)
998 {
999         struct ifgroupreq       *ifgr = (struct ifgroupreq *)data;
1000         struct ifg_group        *ifg;
1001         struct ifg_member       *ifgm;
1002         struct ifg_req           ifgrq, *ifgp;
1003         int                      len, error;
1004
1005         TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1006                 if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
1007                         break;
1008         if (ifg == NULL)
1009                 return (ENOENT);
1010
1011         if (ifgr->ifgr_len == 0) {
1012                 TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
1013                         ifgr->ifgr_len += sizeof(ifgrq);
1014                 return (0);
1015         }
1016
1017         len = ifgr->ifgr_len;
1018         ifgp = ifgr->ifgr_groups;
1019         TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
1020                 if (len < sizeof(ifgrq))
1021                         return (EINVAL);
1022                 bzero(&ifgrq, sizeof ifgrq);
1023                 strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
1024                     sizeof(ifgrq.ifgrq_member));
1025                 if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1026                     sizeof(struct ifg_req))))
1027                         return (error);
1028                 len -= sizeof(ifgrq);
1029                 ifgp++;
1030         }
1031
1032         return (0);
1033 }
1034
1035 /*
1036  * Delete Routes for a Network Interface
1037  *
1038  * Called for each routing entry via the rnh->rnh_walktree() call above
1039  * to delete all route entries referencing a detaching network interface.
1040  *
1041  * Arguments:
1042  *      rn      pointer to node in the routing table
1043  *      arg     argument passed to rnh->rnh_walktree() - detaching interface
1044  *
1045  * Returns:
1046  *      0       successful
1047  *      errno   failed - reason indicated
1048  *
1049  */
1050 static int
1051 if_rtdel(struct radix_node *rn, void *arg)
1052 {
1053         struct rtentry  *rt = (struct rtentry *)rn;
1054         struct ifnet    *ifp = arg;
1055         int             err;
1056
1057         if (rt->rt_ifp == ifp) {
1058
1059                 /*
1060                  * Protect (sorta) against walktree recursion problems
1061                  * with cloned routes
1062                  */
1063                 if (!(rt->rt_flags & RTF_UP))
1064                         return (0);
1065
1066                 err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1067                                 rt_mask(rt), rt->rt_flags,
1068                                 NULL);
1069                 if (err) {
1070                         log(LOG_WARNING, "if_rtdel: error %d\n", err);
1071                 }
1072         }
1073
1074         return (0);
1075 }
1076
1077 /*
1078  * Locate an interface based on a complete address.
1079  */
1080 struct ifaddr *
1081 ifa_ifwithaddr(struct sockaddr *addr)
1082 {
1083         struct ifnet *ifp;
1084
1085         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1086                 struct ifaddr_container *ifac;
1087
1088                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1089                         struct ifaddr *ifa = ifac->ifa;
1090
1091                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1092                                 continue;
1093                         if (sa_equal(addr, ifa->ifa_addr))
1094                                 return (ifa);
1095                         if ((ifp->if_flags & IFF_BROADCAST) &&
1096                             ifa->ifa_broadaddr &&
1097                             /* IPv6 doesn't have broadcast */
1098                             ifa->ifa_broadaddr->sa_len != 0 &&
1099                             sa_equal(ifa->ifa_broadaddr, addr))
1100                                 return (ifa);
1101                 }
1102         }
1103         return (NULL);
1104 }
1105 /*
1106  * Locate the point to point interface with a given destination address.
1107  */
1108 struct ifaddr *
1109 ifa_ifwithdstaddr(struct sockaddr *addr)
1110 {
1111         struct ifnet *ifp;
1112
1113         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1114                 struct ifaddr_container *ifac;
1115
1116                 if (!(ifp->if_flags & IFF_POINTOPOINT))
1117                         continue;
1118
1119                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1120                         struct ifaddr *ifa = ifac->ifa;
1121
1122                         if (ifa->ifa_addr->sa_family != addr->sa_family)
1123                                 continue;
1124                         if (ifa->ifa_dstaddr &&
1125                             sa_equal(addr, ifa->ifa_dstaddr))
1126                                 return (ifa);
1127                 }
1128         }
1129         return (NULL);
1130 }
1131
1132 /*
1133  * Find an interface on a specific network.  If more than one
1134  * matches, the most specific one is chosen.
1135  */
1136 struct ifaddr *
1137 ifa_ifwithnet(struct sockaddr *addr)
1138 {
1139         struct ifnet *ifp;
1140         struct ifaddr *ifa_maybe = NULL;
1141         u_int af = addr->sa_family;
1142         char *addr_data = addr->sa_data, *cplim;
1143
1144         /*
1145          * AF_LINK addresses can be looked up directly by their index number,
1146          * so do that if we can.
1147          */
1148         if (af == AF_LINK) {
1149                 struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
1150
1151                 if (sdl->sdl_index && sdl->sdl_index <= if_index)
1152                         return (ifindex2ifnet[sdl->sdl_index]->if_lladdr);
1153         }
1154
1155         /*
1156          * Scan through each interface, looking for ones that have
1157          * addresses in this address family.
1158          */
1159         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1160                 struct ifaddr_container *ifac;
1161
1162                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1163                         struct ifaddr *ifa = ifac->ifa;
1164                         char *cp, *cp2, *cp3;
1165
1166                         if (ifa->ifa_addr->sa_family != af)
1167 next:                           continue;
1168                         if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
1169                                 /*
1170                                  * This is a bit broken as it doesn't
1171                                  * take into account that the remote end may
1172                                  * be a single node in the network we are
1173                                  * looking for.
1174                                  * The trouble is that we don't know the
1175                                  * netmask for the remote end.
1176                                  */
1177                                 if (ifa->ifa_dstaddr != NULL &&
1178                                     sa_equal(addr, ifa->ifa_dstaddr))
1179                                         return (ifa);
1180                         } else {
1181                                 /*
1182                                  * if we have a special address handler,
1183                                  * then use it instead of the generic one.
1184                                  */
1185                                 if (ifa->ifa_claim_addr) {
1186                                         if ((*ifa->ifa_claim_addr)(ifa, addr)) {
1187                                                 return (ifa);
1188                                         } else {
1189                                                 continue;
1190                                         }
1191                                 }
1192
1193                                 /*
1194                                  * Scan all the bits in the ifa's address.
1195                                  * If a bit disagrees with what we are
1196                                  * looking for, mask it with the netmask
1197                                  * to see if it really matters.
1198                                  * (A byte at a time)
1199                                  */
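                                /*
                                 * Worked example (illustrative): with
                                 * addr 192.0.2.7, an ifa addr of
                                 * 192.0.2.1 and a netmask of
                                 * 255.255.255.0, every XORed byte is
                                 * masked to zero, so the ifa matches;
                                 * with a /32 netmask the last byte
                                 * survives the mask and we move on to
                                 * the next address.
                                 */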
1200                                 if (ifa->ifa_netmask == 0)
1201                                         continue;
1202                                 cp = addr_data;
1203                                 cp2 = ifa->ifa_addr->sa_data;
1204                                 cp3 = ifa->ifa_netmask->sa_data;
1205                                 cplim = ifa->ifa_netmask->sa_len +
1206                                         (char *)ifa->ifa_netmask;
1207                                 while (cp3 < cplim)
1208                                         if ((*cp++ ^ *cp2++) & *cp3++)
1209                                                 goto next; /* next address! */
1210                                 /*
1211                                  * If the netmask of what we just found
1212                                  * is more specific than what we had before
1213                                  * (if we had one) then remember the new one
1214                                  * before continuing to search
1215                                  * for an even better one.
1216                                  */
1217                                 if (ifa_maybe == NULL ||
1218                                     rn_refines((char *)ifa->ifa_netmask,
1219                                                (char *)ifa_maybe->ifa_netmask))
1220                                         ifa_maybe = ifa;
1221                         }
1222                 }
1223         }
1224         return (ifa_maybe);
1225 }
1226
1227 /*
1228  * Find an interface address specific to an interface best matching
1229  * a given address.
1230  */
1231 struct ifaddr *
1232 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
1233 {
1234         struct ifaddr_container *ifac;
1235         char *cp, *cp2, *cp3;
1236         char *cplim;
1237         struct ifaddr *ifa_maybe = NULL;
1238         u_int af = addr->sa_family;
1239
1240         if (af >= AF_MAX)
1241                 return (0);
1242         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1243                 struct ifaddr *ifa = ifac->ifa;
1244
1245                 if (ifa->ifa_addr->sa_family != af)
1246                         continue;
1247                 if (ifa_maybe == NULL)
1248                         ifa_maybe = ifa;
1249                 if (ifa->ifa_netmask == NULL) {
1250                         if (sa_equal(addr, ifa->ifa_addr) ||
1251                             (ifa->ifa_dstaddr != NULL &&
1252                              sa_equal(addr, ifa->ifa_dstaddr)))
1253                                 return (ifa);
1254                         continue;
1255                 }
1256                 if (ifp->if_flags & IFF_POINTOPOINT) {
1257                         if (sa_equal(addr, ifa->ifa_dstaddr))
1258                                 return (ifa);
1259                 } else {
1260                         cp = addr->sa_data;
1261                         cp2 = ifa->ifa_addr->sa_data;
1262                         cp3 = ifa->ifa_netmask->sa_data;
1263                         cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
1264                         for (; cp3 < cplim; cp3++)
1265                                 if ((*cp++ ^ *cp2++) & *cp3)
1266                                         break;
1267                         if (cp3 == cplim)
1268                                 return (ifa);
1269                 }
1270         }
1271         return (ifa_maybe);
1272 }
1273
1274 /*
1275  * Default action when installing a route with a Link Level gateway.
1276  * Look up an appropriate real ifa to point to.
1277  * This should be moved to /sys/net/link.c eventually.
1278  */
1279 static void
1280 link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
1281 {
1282         struct ifaddr *ifa;
1283         struct sockaddr *dst;
1284         struct ifnet *ifp;
1285
1286         if (cmd != RTM_ADD || (ifa = rt->rt_ifa) == NULL ||
1287             (ifp = ifa->ifa_ifp) == NULL || (dst = rt_key(rt)) == NULL)
1288                 return;
1289         ifa = ifaof_ifpforaddr(dst, ifp);
1290         if (ifa != NULL) {
1291                 IFAFREE(rt->rt_ifa);
1292                 IFAREF(ifa);
1293                 rt->rt_ifa = ifa;
1294                 if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
1295                         ifa->ifa_rtrequest(cmd, rt, info);
1296         }
1297 }
1298
1299 /*
1300  * Mark an interface down and notify protocols of
1301  * the transition.
1302  * NOTE: must be called at splnet or equivalent.
1303  */
1304 void
1305 if_unroute(struct ifnet *ifp, int flag, int fam)
1306 {
1307         struct ifaddr_container *ifac;
1308
1309         ifp->if_flags &= ~flag;
1310         getmicrotime(&ifp->if_lastchange);
1311         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1312                 struct ifaddr *ifa = ifac->ifa;
1313
1314                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1315                         kpfctlinput(PRC_IFDOWN, ifa->ifa_addr);
1316         }
1317         ifq_purge_all(&ifp->if_snd);
1318         rt_ifmsg(ifp);
1319 }
1320
1321 /*
1322  * Mark an interface up and notify protocols of
1323  * the transition.
1324  * NOTE: must be called at splnet or equivalent.
1325  */
1326 void
1327 if_route(struct ifnet *ifp, int flag, int fam)
1328 {
1329         struct ifaddr_container *ifac;
1330
1331         ifq_purge_all(&ifp->if_snd);
1332         ifp->if_flags |= flag;
1333         getmicrotime(&ifp->if_lastchange);
1334         TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1335                 struct ifaddr *ifa = ifac->ifa;
1336
1337                 if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1338                         kpfctlinput(PRC_IFUP, ifa->ifa_addr);
1339         }
1340         rt_ifmsg(ifp);
1341 #ifdef INET6
1342         in6_if_up(ifp);
1343 #endif
1344 }
1345
1346 /*
1347  * Mark an interface down and notify protocols of the transition.  An
1348  * interface going down is also considered to be a synchronizing event.
1349  * We must ensure that all packet processing related to the interface
1350  * has completed before we return so e.g. the caller can free the ifnet
1351  * structure that the mbufs may be referencing.
1352  *
1353  * NOTE: must be called at splnet or equivalent.
1354  */
1355 void
1356 if_down(struct ifnet *ifp)
1357 {
1358         if_unroute(ifp, IFF_UP, AF_UNSPEC);
1359         netmsg_service_sync();
1360 }
1361
1362 /*
1363  * Mark an interface up and notify protocols of
1364  * the transition.
1365  * NOTE: must be called at splnet or equivalent.
1366  */
1367 void
1368 if_up(struct ifnet *ifp)
1369 {
1370         if_route(ifp, IFF_UP, AF_UNSPEC);
1371 }
1372
1373 /*
1374  * Process a link state change.
1375  * NOTE: must be called at splsoftnet or equivalent.
1376  */
1377 void
1378 if_link_state_change(struct ifnet *ifp)
1379 {
1380         int link_state = ifp->if_link_state;
1381
1382         rt_ifmsg(ifp);
1383         devctl_notify("IFNET", ifp->if_xname,
1384             (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
1385 }
1386
1387 /*
1388  * Handle interface watchdog timer routines.  Called from
1389  * softclock; we decrement timers (if set) and call the
1390  * appropriate interface routine on expiration.
1391  */
1392 static void
1393 if_slowtimo(void *arg)
1394 {
1395         struct ifnet *ifp;
1396
1397         crit_enter();
1398
1399         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1400                 if (ifp->if_timer == 0 || --ifp->if_timer)
1401                         continue;
1402                 if (ifp->if_watchdog) {
1403                         if (ifnet_tryserialize_all(ifp)) {
1404                                 (*ifp->if_watchdog)(ifp);
1405                                 ifnet_deserialize_all(ifp);
1406                         } else {
1407                                 /* try again next timeout */
1408                                 ++ifp->if_timer;
1409                         }
1410                 }
1411         }
1412
1413         crit_exit();
1414
1415         callout_reset(&if_slowtimo_timer, hz / IFNET_SLOWHZ, if_slowtimo, NULL);
1416 }
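
/*
 * Illustrative driver-side sketch of the watchdog contract (the xx_*
 * routines are hypothetical): a driver arms ifp->if_timer when it hands
 * packets to the hardware and clears it once they complete; if the
 * countdown above hits zero, if_watchdog runs with all serializers held.
 *
 *      in xx_start():          ifp->if_timer = 5;
 *      in xx_txeof():          ifp->if_timer = 0;
 *      in xx_watchdog():       report the timeout and reset the chip
 *
 * One decrement happens per if_slowtimo() pass, i.e. every
 * hz / IFNET_SLOWHZ ticks.
 */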
1417
1418 /*
1419  * Map interface name to
1420  * interface structure pointer.
1421  */
1422 struct ifnet *
1423 ifunit(const char *name)
1424 {
1425         struct ifnet *ifp;
1426
1427         /*
1428          * Search all the interfaces for this name/number
1429          */
1430
1431         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1432                 if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1433                         break;
1434         }
1435         return (ifp);
1436 }
1437
1438
1439 /*
1440  * Map interface name in a sockaddr_dl to
1441  * interface structure pointer.
1442  */
1443 struct ifnet *
1444 if_withname(struct sockaddr *sa)
1445 {
1446         char ifname[IFNAMSIZ+1];
1447         struct sockaddr_dl *sdl = (struct sockaddr_dl *)sa;
1448
1449         if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) ||
1450              (sdl->sdl_nlen > IFNAMSIZ) )
1451                 return NULL;
1452
1453         /*
1454          * ifunit wants a null-terminated name.  It may not be null-terminated
1455          * in the sockaddr.  We don't want to change the caller's sockaddr,
1456          * and there might not be room to put the trailing null anyway, so we
1457          * make a local copy that we know we can null terminate safely.
1458          */
1459
1460         bcopy(sdl->sdl_data, ifname, sdl->sdl_nlen);
1461         ifname[sdl->sdl_nlen] = '\0';
1462         return ifunit(ifname);
1463 }
1464
1465
1466 /*
1467  * Interface ioctls.
1468  */
1469 int
1470 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct ucred *cred)
1471 {
1472         struct ifnet *ifp;
1473         struct ifreq *ifr;
1474         struct ifstat *ifs;
1475         int error;
1476         short oif_flags;
1477         int new_flags;
1478 #ifdef COMPAT_43
1479         int ocmd;
1480 #endif
1481         size_t namelen, onamelen;
1482         char new_name[IFNAMSIZ];
1483         struct ifaddr *ifa;
1484         struct sockaddr_dl *sdl;
1485
1486         switch (cmd) {
1487         case SIOCGIFCONF:
1488         case OSIOCGIFCONF:
1489                 return (ifconf(cmd, data, cred));
1490         default:
1491                 break;
1492         }
1493
1494         ifr = (struct ifreq *)data;
1495
1496         switch (cmd) {
1497         case SIOCIFCREATE:
1498         case SIOCIFCREATE2:
1499                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1500                         return (error);
1501                 return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
1502                         cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
1503         case SIOCIFDESTROY:
1504                 if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1505                         return (error);
1506                 return (if_clone_destroy(ifr->ifr_name));
1507         case SIOCIFGCLONERS:
1508                 return (if_clone_list((struct if_clonereq *)data));
1509         default:
1510                 break;
1511         }
1512
1513         /*
1514          * Nominal ioctl through interface; look up the ifp and obtain a
1515          * lock to serialize the ifconfig ioctl operation.
1516          */
1517         ifp = ifunit(ifr->ifr_name);
1518         if (ifp == NULL)
1519                 return (ENXIO);
1520         error = 0;
1521         mtx_lock(&ifp->if_ioctl_mtx);
1522
1523         switch (cmd) {
1524         case SIOCGIFINDEX:
1525                 ifr->ifr_index = ifp->if_index;
1526                 break;
1527
1528         case SIOCGIFFLAGS:
1529                 ifr->ifr_flags = ifp->if_flags;
1530                 ifr->ifr_flagshigh = ifp->if_flags >> 16;
1531                 break;
1532
1533         case SIOCGIFCAP:
1534                 ifr->ifr_reqcap = ifp->if_capabilities;
1535                 ifr->ifr_curcap = ifp->if_capenable;
1536                 break;
1537
1538         case SIOCGIFMETRIC:
1539                 ifr->ifr_metric = ifp->if_metric;
1540                 break;
1541
1542         case SIOCGIFMTU:
1543                 ifr->ifr_mtu = ifp->if_mtu;
1544                 break;
1545
1546         case SIOCGIFDATA:
1547                 error = copyout((caddr_t)&ifp->if_data, ifr->ifr_data,
1548                                 sizeof(ifp->if_data));
1549                 break;
1550
1551         case SIOCGIFPHYS:
1552                 ifr->ifr_phys = ifp->if_physical;
1553                 break;
1554
1555         case SIOCGIFPOLLCPU:
1556                 ifr->ifr_pollcpu = -1;
1557                 break;
1558
1559         case SIOCSIFPOLLCPU:
1560                 break;
1561
1562         case SIOCSIFFLAGS:
1563                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1564                 if (error)
1565                         break;
1566                 new_flags = (ifr->ifr_flags & 0xffff) |
1567                     (ifr->ifr_flagshigh << 16);
1568                 if (ifp->if_flags & IFF_SMART) {
1569                         /* Smart drivers twiddle their own routes */
1570                 } else if (ifp->if_flags & IFF_UP &&
1571                     (new_flags & IFF_UP) == 0) {
1572                         crit_enter();
1573                         if_down(ifp);
1574                         crit_exit();
1575                 } else if (new_flags & IFF_UP &&
1576                     (ifp->if_flags & IFF_UP) == 0) {
1577                         crit_enter();
1578                         if_up(ifp);
1579                         crit_exit();
1580                 }
1581
1582 #ifdef IFPOLL_ENABLE
1583                 if ((new_flags ^ ifp->if_flags) & IFF_NPOLLING) {
1584                         if (new_flags & IFF_NPOLLING)
1585                                 ifpoll_register(ifp);
1586                         else
1587                                 ifpoll_deregister(ifp);
1588                 }
1589 #endif
1590
1591                 ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
1592                         (new_flags &~ IFF_CANTCHANGE);
1593                 if (new_flags & IFF_PPROMISC) {
1594                         /* Permanently promiscuous mode requested */
1595                         ifp->if_flags |= IFF_PROMISC;
1596                 } else if (ifp->if_pcount == 0) {
1597                         ifp->if_flags &= ~IFF_PROMISC;
1598                 }
1599                 if (ifp->if_ioctl) {
1600                         ifnet_serialize_all(ifp);
1601                         ifp->if_ioctl(ifp, cmd, data, cred);
1602                         ifnet_deserialize_all(ifp);
1603                 }
1604                 getmicrotime(&ifp->if_lastchange);
1605                 break;
1606
1607         case SIOCSIFCAP:
1608                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1609                 if (error)
1610                         break;
1611                 if (ifr->ifr_reqcap & ~ifp->if_capabilities) {
1612                         error = EINVAL;
1613                         break;
1614                 }
1615                 ifnet_serialize_all(ifp);
1616                 ifp->if_ioctl(ifp, cmd, data, cred);
1617                 ifnet_deserialize_all(ifp);
1618                 break;
1619
1620         case SIOCSIFNAME:
1621                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1622                 if (error)
1623                         break;
1624                 error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
1625                 if (error)
1626                         break;
1627                 if (new_name[0] == '\0') {
1628                         error = EINVAL;
1629                         break;
1630                 }
1631                 if (ifunit(new_name) != NULL) {
1632                         error = EEXIST;
1633                         break;
1634                 }
1635
1636                 EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
1637
1638                 /* Announce the departure of the interface. */
1639                 rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
1640
1641                 strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
1642                 ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
1643                 /* XXX IFA_LOCK(ifa); */
1644                 sdl = (struct sockaddr_dl *)ifa->ifa_addr;
1645                 namelen = strlen(new_name);
1646                 onamelen = sdl->sdl_nlen;
1647                 /*
1648                  * Move the address if needed.  This is safe because we
1649                  * allocate space for a name of length IFNAMSIZ when we
1650                  * create this in if_attach().
1651                  */
1652                 if (namelen != onamelen) {
1653                         bcopy(sdl->sdl_data + onamelen,
1654                             sdl->sdl_data + namelen, sdl->sdl_alen);
1655                 }
1656                 bcopy(new_name, sdl->sdl_data, namelen);
1657                 sdl->sdl_nlen = namelen;
1658                 sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
1659                 bzero(sdl->sdl_data, onamelen);
1660                 while (namelen != 0)
1661                         sdl->sdl_data[--namelen] = 0xff;
1662                 /* XXX IFA_UNLOCK(ifa) */
1663
1664                 EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
1665
1666                 /* Announce the return of the interface. */
1667                 rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
1668                 break;
1669
1670         case SIOCSIFMETRIC:
1671                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1672                 if (error)
1673                         break;
1674                 ifp->if_metric = ifr->ifr_metric;
1675                 getmicrotime(&ifp->if_lastchange);
1676                 break;
1677
1678         case SIOCSIFPHYS:
1679                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1680                 if (error)
1681                         break;
1682                 if (ifp->if_ioctl == NULL) {
1683                         error = EOPNOTSUPP;
1684                         break;
1685                 }
1686                 ifnet_serialize_all(ifp);
1687                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1688                 ifnet_deserialize_all(ifp);
1689                 if (error == 0)
1690                         getmicrotime(&ifp->if_lastchange);
1691                 break;
1692
1693         case SIOCSIFMTU:
1694         {
1695                 u_long oldmtu = ifp->if_mtu;
1696
1697                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1698                 if (error)
1699                         break;
1700                 if (ifp->if_ioctl == NULL) {
1701                         error = EOPNOTSUPP;
1702                         break;
1703                 }
1704                 if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) {
1705                         error = EINVAL;
1706                         break;
1707                 }
1708                 ifnet_serialize_all(ifp);
1709                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1710                 ifnet_deserialize_all(ifp);
1711                 if (error == 0) {
1712                         getmicrotime(&ifp->if_lastchange);
1713                         rt_ifmsg(ifp);
1714                 }
1715                 /*
1716                  * If the link MTU changed, do network layer specific procedure.
1717                  */
1718                 if (ifp->if_mtu != oldmtu) {
1719 #ifdef INET6
1720                         nd6_setmtu(ifp);
1721 #endif
1722                 }
1723                 break;
1724         }
1725
1726         case SIOCADDMULTI:
1727         case SIOCDELMULTI:
1728                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1729                 if (error)
1730                         break;
1731
1732                 /* Don't allow group membership on non-multicast interfaces. */
1733                 if ((ifp->if_flags & IFF_MULTICAST) == 0) {
1734                         error = EOPNOTSUPP;
1735                         break;
1736                 }
1737
1738                 /* Don't let users screw up protocols' entries. */
1739                 if (ifr->ifr_addr.sa_family != AF_LINK) {
1740                         error = EINVAL;
1741                         break;
1742                 }
1743
1744                 if (cmd == SIOCADDMULTI) {
1745                         struct ifmultiaddr *ifma;
1746                         error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
1747                 } else {
1748                         error = if_delmulti(ifp, &ifr->ifr_addr);
1749                 }
1750                 if (error == 0)
1751                         getmicrotime(&ifp->if_lastchange);
1752                 break;
1753
1754         case SIOCSIFPHYADDR:
1755         case SIOCDIFPHYADDR:
1756 #ifdef INET6
1757         case SIOCSIFPHYADDR_IN6:
1758 #endif
1759         case SIOCSLIFPHYADDR:
1760         case SIOCSIFMEDIA:
1761         case SIOCSIFGENERIC:
1762                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1763                 if (error)
1764                         break;
1765                 if (ifp->if_ioctl == NULL) {
1766                         error = EOPNOTSUPP;
1767                         break;
1768                 }
1769                 ifnet_serialize_all(ifp);
1770                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1771                 ifnet_deserialize_all(ifp);
1772                 if (error == 0)
1773                         getmicrotime(&ifp->if_lastchange);
1774                 break;
1775
1776         case SIOCGIFSTATUS:
1777                 ifs = (struct ifstat *)data;
1778                 ifs->ascii[0] = '\0';
1779                 /* fall through */
1780         case SIOCGIFPSRCADDR:
1781         case SIOCGIFPDSTADDR:
1782         case SIOCGLIFPHYADDR:
1783         case SIOCGIFMEDIA:
1784         case SIOCGIFGENERIC:
1785                 if (ifp->if_ioctl == NULL) {
1786                         error = EOPNOTSUPP;
1787                         break;
1788                 }
1789                 ifnet_serialize_all(ifp);
1790                 error = ifp->if_ioctl(ifp, cmd, data, cred);
1791                 ifnet_deserialize_all(ifp);
1792                 break;
1793
1794         case SIOCSIFLLADDR:
1795                 error = priv_check_cred(cred, PRIV_ROOT, 0);
1796                 if (error)
1797                         break;
1798                 error = if_setlladdr(ifp, ifr->ifr_addr.sa_data,
1799                                      ifr->ifr_addr.sa_len);
1800                 EVENTHANDLER_INVOKE(iflladdr_event, ifp);
1801                 break;
1802
1803         default:
1804                 oif_flags = ifp->if_flags;
1805                 if (so->so_proto == NULL) {
1806                         error = EOPNOTSUPP;
1807                         break;
1808                 }
1809 #ifndef COMPAT_43
1810                 error = so_pru_control_direct(so, cmd, data, ifp);
1811 #else
1812                 ocmd = cmd;
1813
1814                 switch (cmd) {
1815                 case SIOCSIFDSTADDR:
1816                 case SIOCSIFADDR:
1817                 case SIOCSIFBRDADDR:
1818                 case SIOCSIFNETMASK:
1819 #if BYTE_ORDER != BIG_ENDIAN
1820                         if (ifr->ifr_addr.sa_family == 0 &&
1821                             ifr->ifr_addr.sa_len < 16) {
1822                                 ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
1823                                 ifr->ifr_addr.sa_len = 16;
1824                         }
1825 #else
1826                         if (ifr->ifr_addr.sa_len == 0)
1827                                 ifr->ifr_addr.sa_len = 16;
1828 #endif
1829                         break;
1830                 case OSIOCGIFADDR:
1831                         cmd = SIOCGIFADDR;
1832                         break;
1833                 case OSIOCGIFDSTADDR:
1834                         cmd = SIOCGIFDSTADDR;
1835                         break;
1836                 case OSIOCGIFBRDADDR:
1837                         cmd = SIOCGIFBRDADDR;
1838                         break;
1839                 case OSIOCGIFNETMASK:
1840                         cmd = SIOCGIFNETMASK;
1841                         break;
1842                 default:
1843                         break;
1844                 }
1845
1846                 error = so_pru_control_direct(so, cmd, data, ifp);
1847
1848                 switch (ocmd) {
1849                 case OSIOCGIFADDR:
1850                 case OSIOCGIFDSTADDR:
1851                 case OSIOCGIFBRDADDR:
1852                 case OSIOCGIFNETMASK:
1853                         *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
1854                         break;
1855                 }
1856 #endif /* COMPAT_43 */
1857
1858                 if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
1859 #ifdef INET6
1860                         DELAY(100); /* XXX: temporary workaround for fxp issue */
1861                         if (ifp->if_flags & IFF_UP) {
1862                                 crit_enter();
1863                                 in6_if_up(ifp);
1864                                 crit_exit();
1865                         }
1866 #endif
1867                 }
1868                 break;
1869         }
1870
1871         mtx_unlock(&ifp->if_ioctl_mtx);
1872         return (error);
1873 }
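/*
 * Example (illustrative only): a minimal userland sketch of driving the
 * SIOCGIFFLAGS case serviced above.  The interface name passed in is just a
 * placeholder and error handling is abbreviated.
 *
 *      #include <sys/types.h>
 *      #include <sys/ioctl.h>
 *      #include <sys/socket.h>
 *      #include <net/if.h>
 *      #include <string.h>
 *      #include <unistd.h>
 *
 *      int
 *      example_get_flags(const char *name)
 *      {
 *              struct ifreq ifr;
 *              int s, flags;
 *
 *              s = socket(AF_INET, SOCK_DGRAM, 0);
 *              if (s < 0)
 *                      return (-1);
 *              memset(&ifr, 0, sizeof(ifr));
 *              strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
 *              if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0)
 *                      flags = -1;
 *              else
 *                      flags = (ifr.ifr_flags & 0xffff) |
 *                          (ifr.ifr_flagshigh << 16);
 *              close(s);
 *              return (flags);
 *      }
 *
 * The 16/16 reassembly mirrors how the SIOCGIFFLAGS case above splits
 * if_flags into ifr_flags and ifr_flagshigh.
 */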
1874
1875 /*
1876  * Set/clear promiscuous mode on interface ifp based on the truth value
1877  * of pswitch.  The calls are reference counted so that only the first
1878  * "on" request actually has an effect, as does the final "off" request.
1879  * Results are undefined if the "off" and "on" requests are not matched.
1880  */
1881 int
1882 ifpromisc(struct ifnet *ifp, int pswitch)
1883 {
1884         struct ifreq ifr;
1885         int error;
1886         int oldflags;
1887
1888         oldflags = ifp->if_flags;
1889         if (ifp->if_flags & IFF_PPROMISC) {
1890                 /* Do nothing if device is in permanently promiscuous mode */
1891                 ifp->if_pcount += pswitch ? 1 : -1;
1892                 return (0);
1893         }
1894         if (pswitch) {
1895                 /*
1896                  * If the device is not configured up, we cannot put it in
1897                  * promiscuous mode.
1898                  */
1899                 if ((ifp->if_flags & IFF_UP) == 0)
1900                         return (ENETDOWN);
1901                 if (ifp->if_pcount++ != 0)
1902                         return (0);
1903                 ifp->if_flags |= IFF_PROMISC;
1904                 log(LOG_INFO, "%s: promiscuous mode enabled\n",
1905                     ifp->if_xname);
1906         } else {
1907                 if (--ifp->if_pcount > 0)
1908                         return (0);
1909                 ifp->if_flags &= ~IFF_PROMISC;
1910                 log(LOG_INFO, "%s: promiscuous mode disabled\n",
1911                     ifp->if_xname);
1912         }
1913         ifr.ifr_flags = ifp->if_flags;
1914         ifr.ifr_flagshigh = ifp->if_flags >> 16;
1915         ifnet_serialize_all(ifp);
1916         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, NULL);
1917         ifnet_deserialize_all(ifp);
1918         if (error == 0)
1919                 rt_ifmsg(ifp);
1920         else
1921                 ifp->if_flags = oldflags;
1922         return error;
1923 }
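/*
 * Example (illustrative only): ifpromisc() requests must be paired, e.g. by
 * a hypothetical packet-tap consumer:
 *
 *      error = ifpromisc(ifp, 1);      first "on" sets IFF_PROMISC
 *      ...
 *      error = ifpromisc(ifp, 0);      matching "off" clears it again
 *
 * Intermediate nested requests only adjust if_pcount.
 */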
1924
1925 /*
1926  * Return the interface configuration
1927  * of the system.  The list may be used
1928  * in later ioctls (above) to get
1929  * other information.
1930  */
1931 static int
1932 ifconf(u_long cmd, caddr_t data, struct ucred *cred)
1933 {
1934         struct ifconf *ifc = (struct ifconf *)data;
1935         struct ifnet *ifp;
1936         struct sockaddr *sa;
1937         struct ifreq ifr, *ifrp;
1938         int space = ifc->ifc_len, error = 0;
1939
1940         ifrp = ifc->ifc_req;
1941         TAILQ_FOREACH(ifp, &ifnet, if_link) {
1942                 struct ifaddr_container *ifac;
1943                 int addrs;
1944
1945                 if (space <= sizeof ifr)
1946                         break;
1947
1948                 /*
1949                  * Zero the stack declared structure first to prevent
1950                  * memory disclosure.
1951                  */
1952                 bzero(&ifr, sizeof(ifr));
1953                 if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
1954                     >= sizeof(ifr.ifr_name)) {
1955                         error = ENAMETOOLONG;
1956                         break;
1957                 }
1958
1959                 addrs = 0;
1960                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1961                         struct ifaddr *ifa = ifac->ifa;
1962
1963                         if (space <= sizeof ifr)
1964                                 break;
1965                         sa = ifa->ifa_addr;
1966                         if (cred->cr_prison &&
1967                             prison_if(cred, sa))
1968                                 continue;
1969                         addrs++;
1970 #ifdef COMPAT_43
1971                         if (cmd == OSIOCGIFCONF) {
1972                                 struct osockaddr *osa =
1973                                          (struct osockaddr *)&ifr.ifr_addr;
1974                                 ifr.ifr_addr = *sa;
1975                                 osa->sa_family = sa->sa_family;
1976                                 error = copyout(&ifr, ifrp, sizeof ifr);
1977                                 ifrp++;
1978                         } else
1979 #endif
1980                         if (sa->sa_len <= sizeof(*sa)) {
1981                                 ifr.ifr_addr = *sa;
1982                                 error = copyout(&ifr, ifrp, sizeof ifr);
1983                                 ifrp++;
1984                         } else {
1985                                 if (space < (sizeof ifr) + sa->sa_len -
1986                                             sizeof(*sa))
1987                                         break;
1988                                 space -= sa->sa_len - sizeof(*sa);
1989                                 error = copyout(&ifr, ifrp,
1990                                                 sizeof ifr.ifr_name);
1991                                 if (error == 0)
1992                                         error = copyout(sa, &ifrp->ifr_addr,
1993                                                         sa->sa_len);
1994                                 ifrp = (struct ifreq *)
1995                                         (sa->sa_len + (caddr_t)&ifrp->ifr_addr);
1996                         }
1997                         if (error)
1998                                 break;
1999                         space -= sizeof ifr;
2000                 }
2001                 if (error)
2002                         break;
2003                 if (!addrs) {
2004                         bzero(&ifr.ifr_addr, sizeof ifr.ifr_addr);
2005                         error = copyout(&ifr, ifrp, sizeof ifr);
2006                         if (error)
2007                                 break;
2008                         space -= sizeof ifr;
2009                         ifrp++;
2010                 }
2011         }
2012         ifc->ifc_len -= space;
2013         return (error);
2014 }
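/*
 * Example (illustrative only): a userland sketch of walking the buffer that
 * ifconf() fills in.  The record stride accounts for variable-length
 * sockaddrs, matching the copyout logic above; buffer sizing and error
 * handling are simplified.
 *
 *      char buf[8192], *p;
 *      struct ifconf ifc;
 *      struct ifreq *ifr;
 *
 *      ifc.ifc_len = sizeof(buf);
 *      ifc.ifc_buf = buf;
 *      if (ioctl(s, SIOCGIFCONF, &ifc) == 0) {
 *              for (p = buf; p < buf + ifc.ifc_len; ) {
 *                      size_t salen;
 *
 *                      ifr = (struct ifreq *)p;
 *                      salen = ifr->ifr_addr.sa_len;
 *                      if (salen < sizeof(struct sockaddr))
 *                              salen = sizeof(struct sockaddr);
 *                      printf("%s\n", ifr->ifr_name);
 *                      p += sizeof(ifr->ifr_name) + salen;
 *              }
 *      }
 */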
2015
2016 /*
2017  * Just like if_promisc(), but for all-multicast-reception mode.
2018  * Just like ifpromisc(), but for all-multicast-reception mode.
2019 int
2020 if_allmulti(struct ifnet *ifp, int onswitch)
2021 {
2022         int error = 0;
2023         struct ifreq ifr;
2024
2025         crit_enter();
2026
2027         if (onswitch) {
2028                 if (ifp->if_amcount++ == 0) {
2029                         ifp->if_flags |= IFF_ALLMULTI;
2030                         ifr.ifr_flags = ifp->if_flags;
2031                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2032                         ifnet_serialize_all(ifp);
2033                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2034                                               NULL);
2035                         ifnet_deserialize_all(ifp);
2036                 }
2037         } else {
2038                 if (ifp->if_amcount > 1) {
2039                         ifp->if_amcount--;
2040                 } else {
2041                         ifp->if_amcount = 0;
2042                         ifp->if_flags &= ~IFF_ALLMULTI;
2043                         ifr.ifr_flags = ifp->if_flags;
2044                         ifr.ifr_flagshigh = ifp->if_flags >> 16;
2045                         ifnet_serialize_all(ifp);
2046                         error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2047                                               NULL);
2048                         ifnet_deserialize_all(ifp);
2049                 }
2050         }
2051
2052         crit_exit();
2053
2054         if (error == 0)
2055                 rt_ifmsg(ifp);
2056         return error;
2057 }
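/*
 * Example (illustrative only): like ifpromisc(), if_allmulti() is reference
 * counted and its calls should be paired, e.g. by a hypothetical multicast
 * routing consumer:
 *
 *      if_allmulti(ifp, 1);    start receiving all multicast frames
 *      ...
 *      if_allmulti(ifp, 0);    return to the normal multicast filter
 */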
2058
2059 /*
2060  * Add a multicast listenership to the interface in question.
2061  * The link layer provides a routine which maps the address to an AF_LINK address.
2062  */
2063 int
2064 if_addmulti(
2065         struct ifnet *ifp,      /* interface to manipulate */
2066         struct sockaddr *sa,    /* address to add */
2067         struct ifmultiaddr **retifma)
2068 {
2069         struct sockaddr *llsa, *dupsa;
2070         int error;
2071         struct ifmultiaddr *ifma;
2072
2073         /*
2074          * If the matching multicast address already exists
2075          * then don't add a new one; just add a reference
2076          */
2077         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2078                 if (sa_equal(sa, ifma->ifma_addr)) {
2079                         ifma->ifma_refcount++;
2080                         if (retifma)
2081                                 *retifma = ifma;
2082                         return 0;
2083                 }
2084         }
2085
2086         /*
2087          * Give the link layer a chance to accept/reject it, and also
2088          * find out which AF_LINK address this maps to, if it isn't one
2089          * already.
2090          */
2091         if (ifp->if_resolvemulti) {
2092                 ifnet_serialize_all(ifp);
2093                 error = ifp->if_resolvemulti(ifp, &llsa, sa);
2094                 ifnet_deserialize_all(ifp);
2095                 if (error) 
2096                         return error;
2097         } else {
2098                 llsa = NULL;
2099         }
2100
2101         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2102         dupsa = kmalloc(sa->sa_len, M_IFMADDR, M_WAITOK);
2103         bcopy(sa, dupsa, sa->sa_len);
2104
2105         ifma->ifma_addr = dupsa;
2106         ifma->ifma_lladdr = llsa;
2107         ifma->ifma_ifp = ifp;
2108         ifma->ifma_refcount = 1;
2109         ifma->ifma_protospec = 0;
2110         rt_newmaddrmsg(RTM_NEWMADDR, ifma);
2111
2112         /*
2113          * Some network interfaces can scan the address list at
2114          * interrupt time; lock them out.
2115          */
2116         crit_enter();
2117         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2118         crit_exit();
2119         if (retifma)
2120                 *retifma = ifma;
2121
2122         if (llsa != NULL) {
2123                 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2124                         if (sa_equal(ifma->ifma_addr, llsa))
2125                                 break;
2126                 }
2127                 if (ifma) {
2128                         ifma->ifma_refcount++;
2129                 } else {
2130                         ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2131                         dupsa = kmalloc(llsa->sa_len, M_IFMADDR, M_WAITOK);
2132                         bcopy(llsa, dupsa, llsa->sa_len);
2133                         ifma->ifma_addr = dupsa;
2134                         ifma->ifma_ifp = ifp;
2135                         ifma->ifma_refcount = 1;
2136                         crit_enter();
2137                         TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2138                         crit_exit();
2139                 }
2140         }
2141         /*
2142          * We are certain we have added something, so call down to the
2143          * interface to let the driver know about it.
2144          */
2145         crit_enter();
2146         ifnet_serialize_all(ifp);
2147         if (ifp->if_ioctl)
2148                 ifp->if_ioctl(ifp, SIOCADDMULTI, 0, NULL);
2149         ifnet_deserialize_all(ifp);
2150         crit_exit();
2151
2152         return 0;
2153 }
2154
2155 /*
2156  * Remove a reference to a multicast address on this interface.  Yell
2157  * if the request does not match an existing membership.
2158  */
2159 int
2160 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
2161 {
2162         struct ifmultiaddr *ifma;
2163
2164         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2165                 if (sa_equal(sa, ifma->ifma_addr))
2166                         break;
2167         if (ifma == NULL)
2168                 return ENOENT;
2169
2170         if (ifma->ifma_refcount > 1) {
2171                 ifma->ifma_refcount--;
2172                 return 0;
2173         }
2174
2175         rt_newmaddrmsg(RTM_DELMADDR, ifma);
2176         sa = ifma->ifma_lladdr;
2177         crit_enter();
2178         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2179         /*
2180          * Make sure the interface driver is notified
2181          * in the case of a link layer mcast group being left.
2182          */
2183         if (ifma->ifma_addr->sa_family == AF_LINK && sa == NULL) {
2184                 ifnet_serialize_all(ifp);
2185                 ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2186                 ifnet_deserialize_all(ifp);
2187         }
2188         crit_exit();
2189         kfree(ifma->ifma_addr, M_IFMADDR);
2190         kfree(ifma, M_IFMADDR);
2191         if (sa == NULL)
2192                 return 0;
2193
2194         /*
2195          * Now look for the link-layer address which corresponds to
2196          * this network address.  It had been squirreled away in
2197          * ifma->ifma_lladdr for this purpose (so we don't have
2198          * to call ifp->if_resolvemulti() again), and we saved that
2199          * value in sa above.  If someone nasty deleted the
2200          * link-layer address out from underneath us, we can deal because
2201          * the address we stored is not the same as the one which was
2202          * in the record for the link-layer address.  (So we don't complain
2203          * in that case.)
2204          */
2205         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2206                 if (sa_equal(sa, ifma->ifma_addr))
2207                         break;
2208         if (ifma == NULL)
2209                 return 0;
2210
2211         if (ifma->ifma_refcount > 1) {
2212                 ifma->ifma_refcount--;
2213                 return 0;
2214         }
2215
2216         crit_enter();
2217         ifnet_serialize_all(ifp);
2218         TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2219         ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2220         ifnet_deserialize_all(ifp);
2221         crit_exit();
2222         kfree(ifma->ifma_addr, M_IFMADDR);
2223         kfree(sa, M_IFMADDR);
2224         kfree(ifma, M_IFMADDR);
2225
2226         return 0;
2227 }
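/*
 * Example (illustrative only): if_addmulti()/if_delmulti() are reference
 * counted per address, so joining the same group twice requires leaving it
 * twice before the entry is freed and the driver sees SIOCDELMULTI.  Sketch,
 * assuming "sa" already holds the group address:
 *
 *      struct ifmultiaddr *ifma;
 *
 *      error = if_addmulti(ifp, sa, &ifma);    refcount 1, driver notified
 *      error = if_addmulti(ifp, sa, &ifma);    refcount 2, no driver call
 *      error = if_delmulti(ifp, sa);           refcount back to 1
 *      error = if_delmulti(ifp, sa);           entry freed, driver notified
 */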
2228
2229 /*
2230  * Delete all multicast group memberships for an interface.
2231  * Should be used to quickly flush all multicast filters.
2232  */
2233 void
2234 if_delallmulti(struct ifnet *ifp)
2235 {
2236         struct ifmultiaddr *ifma;
2237         struct ifmultiaddr *next;
2238
2239         TAILQ_FOREACH_MUTABLE(ifma, &ifp->if_multiaddrs, ifma_link, next)
2240                 if_delmulti(ifp, ifma->ifma_addr);
2241 }
2242
2243
2244 /*
2245  * Set the link layer address on an interface.
2246  *
2247  * At this time we only support certain types of interfaces,
2248  * and we don't allow the length of the address to change.
2249  */
2250 int
2251 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
2252 {
2253         struct sockaddr_dl *sdl;
2254         struct ifreq ifr;
2255
2256         sdl = IF_LLSOCKADDR(ifp);
2257         if (sdl == NULL)
2258                 return (EINVAL);
2259         if (len != sdl->sdl_alen)       /* don't allow length to change */
2260                 return (EINVAL);
2261         switch (ifp->if_type) {
2262         case IFT_ETHER:                 /* these types use struct arpcom */
2263         case IFT_XETHER:
2264         case IFT_L2VLAN:
2265                 bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
2266                 bcopy(lladdr, LLADDR(sdl), len);
2267                 break;
2268         default:
2269                 return (ENODEV);
2270         }
2271         /*
2272          * If the interface is already up, we need
2273          * to re-init it in order to reprogram its
2274          * address filter.
2275          */
2276         ifnet_serialize_all(ifp);
2277         if ((ifp->if_flags & IFF_UP) != 0) {
2278 #ifdef INET
2279                 struct ifaddr_container *ifac;
2280 #endif
2281
2282                 ifp->if_flags &= ~IFF_UP;
2283                 ifr.ifr_flags = ifp->if_flags;
2284                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2285                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2286                               NULL);
2287                 ifp->if_flags |= IFF_UP;
2288                 ifr.ifr_flags = ifp->if_flags;
2289                 ifr.ifr_flagshigh = ifp->if_flags >> 16;
2290                 ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2291                                  NULL);
2292 #ifdef INET
2293                 /*
2294                  * Also send gratuitous ARPs to notify other nodes about
2295                  * the address change.
2296                  */
2297                 TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2298                         struct ifaddr *ifa = ifac->ifa;
2299
2300                         if (ifa->ifa_addr != NULL &&
2301                             ifa->ifa_addr->sa_family == AF_INET)
2302                                 arp_gratuitous(ifp, ifa);
2303                 }
2304 #endif
2305         }
2306         ifnet_deserialize_all(ifp);
2307         return (0);
2308 }
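/*
 * Example (illustrative only): if_setlladdr() is normally reached via the
 * SIOCSIFLLADDR case in ifioctl() above.  A userland sketch, assuming a
 * 6-byte Ethernet address and a placeholder interface name:
 *
 *      static const u_char mac[6] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *      struct ifreq ifr;
 *
 *      memset(&ifr, 0, sizeof(ifr));
 *      strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *      ifr.ifr_addr.sa_len = sizeof(mac);      must equal sdl_alen
 *      ifr.ifr_addr.sa_family = AF_LINK;
 *      memcpy(ifr.ifr_addr.sa_data, mac, sizeof(mac));
 *      ioctl(s, SIOCSIFLLADDR, &ifr);
 */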
2309
2310 struct ifmultiaddr *
2311 ifmaof_ifpforaddr(struct sockaddr *sa, struct ifnet *ifp)
2312 {
2313         struct ifmultiaddr *ifma;
2314
2315         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2316                 if (sa_equal(ifma->ifma_addr, sa))
2317                         break;
2318
2319         return ifma;
2320 }
2321
2322 /*
2323  * This function locates the first real ethernet MAC from a network
2324  * card and loads it into node, returning 0 on success or ENOENT if
2325  * no suitable interfaces were found.  It is used by the uuid code to
2326  * generate a unique 6-byte number.
2327  */
2328 int
2329 if_getanyethermac(uint16_t *node, int minlen)
2330 {
2331         struct ifnet *ifp;
2332         struct sockaddr_dl *sdl;
2333
2334         TAILQ_FOREACH(ifp, &ifnet, if_link) {
2335                 if (ifp->if_type != IFT_ETHER)
2336                         continue;
2337                 sdl = IF_LLSOCKADDR(ifp);
2338                 if (sdl->sdl_alen < minlen)
2339                         continue;
2340                 bcopy(((struct arpcom *)ifp->if_softc)->ac_enaddr, node,
2341                       minlen);
2342                 return(0);
2343         }
2344         return (ENOENT);
2345 }
2346
2347 /*
2348  * The name argument must be a pointer to storage which will last as
2349  * long as the interface does.  For physical devices, the result of
2350  * device_get_name(dev) is a good choice and for pseudo-devices a
2351  * static string works well.
2352  */
2353 void
2354 if_initname(struct ifnet *ifp, const char *name, int unit)
2355 {
2356         ifp->if_dname = name;
2357         ifp->if_dunit = unit;
2358         if (unit != IF_DUNIT_NONE)
2359                 ksnprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
2360         else
2361                 strlcpy(ifp->if_xname, name, IFNAMSIZ);
2362 }
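/*
 * Example (illustrative only): a typical physical-device attach would call
 *
 *      if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 *
 * yielding an if_xname such as "em0", while a pseudo-device passes a static
 * name and IF_DUNIT_NONE.
 */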
2363
2364 int
2365 if_printf(struct ifnet *ifp, const char *fmt, ...)
2366 {
2367         __va_list ap;
2368         int retval;
2369
2370         retval = kprintf("%s: ", ifp->if_xname);
2371         __va_start(ap, fmt);
2372         retval += kvprintf(fmt, ap);
2373         __va_end(ap);
2374         return (retval);
2375 }
2376
2377 struct ifnet *
2378 if_alloc(uint8_t type)
2379 {
2380         struct ifnet *ifp;
2381         size_t size;
2382
2383         /*
2384          * XXX temporary hack until arpcom is set up in if_l2com
2385          */
2386         if (type == IFT_ETHER)
2387                 size = sizeof(struct arpcom);
2388         else
2389                 size = sizeof(struct ifnet);
2390
2391         ifp = kmalloc(size, M_IFNET, M_WAITOK|M_ZERO);
2392
2393         ifp->if_type = type;
2394
2395         if (if_com_alloc[type] != NULL) {
2396                 ifp->if_l2com = if_com_alloc[type](type, ifp);
2397                 if (ifp->if_l2com == NULL) {
2398                         kfree(ifp, M_IFNET);
2399                         return (NULL);
2400                 }
2401         }
2402         return (ifp);
2403 }
2404
2405 void
2406 if_free(struct ifnet *ifp)
2407 {
2408         kfree(ifp, M_IFNET);
2409 }
2410
2411 void
2412 ifq_set_classic(struct ifaltq *ifq)
2413 {
2414         ifq->altq_enqueue = ifq_classic_enqueue;
2415         ifq->altq_dequeue = ifq_classic_dequeue;
2416         ifq->altq_request = ifq_classic_request;
2417 }
2418
2419 int
2420 ifq_classic_enqueue(struct ifaltq *ifq, struct mbuf *m,
2421                     struct altq_pktattr *pa __unused)
2422 {
2423         logifq(enqueue, ifq);
2424         if (IF_QFULL(ifq)) {
2425                 m_freem(m);
2426                 return(ENOBUFS);
2427         } else {
2428                 IF_ENQUEUE(ifq, m);
2429                 return(0);
2430         }       
2431 }
2432
2433 struct mbuf *
2434 ifq_classic_dequeue(struct ifaltq *ifq, struct mbuf *mpolled, int op)
2435 {
2436         struct mbuf *m;
2437
2438         switch (op) {
2439         case ALTDQ_POLL:
2440                 IF_POLL(ifq, m);
2441                 break;
2442         case ALTDQ_REMOVE:
2443                 logifq(dequeue, ifq);
2444                 IF_DEQUEUE(ifq, m);
2445                 break;
2446         default:
2447                 panic("unsupported ALTQ dequeue op: %d", op);
2448         }
2449         KKASSERT(mpolled == NULL || mpolled == m);
2450         return(m);
2451 }
2452
2453 int
2454 ifq_classic_request(struct ifaltq *ifq, int req, void *arg)
2455 {
2456         switch (req) {
2457         case ALTRQ_PURGE:
2458                 IF_DRAIN(ifq);
2459                 break;
2460         default:
2461                 panic("unsupported ALTQ request: %d", req);
2462         }
2463         return(0);
2464 }
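/*
 * Example (illustrative only): the ALTDQ_POLL/ALTDQ_REMOVE pair above lets
 * a driver peek at the next packet before committing TX resources to it.
 * Sketch of that idiom through the installed dequeue hook; the resource
 * check and encapsulation helpers are hypothetical:
 *
 *      m = ifq->altq_dequeue(ifq, NULL, ALTDQ_POLL);
 *      if (m != NULL && example_tx_has_room(sc, m)) {
 *              m = ifq->altq_dequeue(ifq, m, ALTDQ_REMOVE);
 *              example_tx_encap(sc, m);
 *      }
 */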
2465
2466 static void
2467 ifq_try_ifstart(struct ifaltq *ifq, int force_sched)
2468 {
2469         struct ifnet *ifp = ifq->altq_ifp;
2470         int running = 0, need_sched;
2471
2472         /*
2473          * Try a direct ifnet.if_start call first.  If there is
2474          * contention on the ifnet's serializer, ifnet.if_start will
2475          * be scheduled on the ifnet's CPU instead.
2476          */
2477         if (!ifnet_tryserialize_tx(ifp)) {
2478                 /*
2479                  * ifnet serializer contention happened,
2480                  * ifnet.if_start is scheduled on ifnet's
2481                  * CPU, and we keep going.
2482                  */
2483                 logifstart(contend_sched, ifp);
2484                 if_start_schedule(ifp, 1);
2485                 return;
2486         }
2487
2488         if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq)) {
2489                 logifstart(run, ifp);
2490                 ifp->if_start(ifp);
2491                 if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
2492                         running = 1;
2493         }
2494         need_sched = if_start_need_schedule(ifq, running);
2495
2496         ifnet_deserialize_tx(ifp);
2497
2498         if (need_sched) {
2499                 /*
2500                  * More data needs to be transmitted; ifnet.if_start is
2501                  * scheduled on the ifnet's CPU, and we keep going.
2502                  * NOTE: ifnet.if_start interlock is not released.
2503                  */
2504                 logifstart(sched, ifp);
2505                 if_start_schedule(ifp, force_sched);
2506         }
2507 }
2508
2509 /*
2510  * IFQ packets staging mechanism:
2511  *
2512  * Packets enqueued into an IFQ are staged up to a certain amount before the
2513  * ifnet's if_start is called.  In this way, the driver can avoid writing
2514  * to hardware registers for every packet; instead, the hardware registers
2515  * can be written once a certain number of packets have been put onto the
2516  * hardware TX ring.  Measurements on several modern NICs (emx(4), igb(4),
2517  * bnx(4), bge(4), jme(4)) show that aggregating hardware register writes
2518  * can save ~20% CPU time when 18-byte UDP datagrams are transmitted at
2519  * 1.48Mpps.  The performance improvement from aggregating hardware
2520  * register writes is also mentioned in Luigi Rizzo's netmap paper
2521  * (http://info.iet.unipi.it/~luigi/netmap/).
2522  *
2523  * IFQ packet staging is performed for two entry points into the driver's
2524  * transmission function:
2525  * - Direct ifnet.if_start calling, i.e. ifq_try_ifstart()
2526  * - ifnet.if_start scheduling, i.e. if_start_schedule()
2527  *
2528  * IFQ packet staging will be stopped upon any of the following conditions:
2529  * - The count of packets enqueued on the current CPU is greater than or
2530  *   equal to ifq_stage_cntmax.  (XXX this should be per-interface)
2531  * - The total length of packets enqueued on the current CPU is greater
2532  *   than or equal to the hardware's MTU - max_protohdr.  max_protohdr is
2533  *   subtracted from the hardware's MTU mainly because a full TCP segment's
2534  *   size is usually less than the hardware's MTU.
2535  * - if_start_schedule() is not pending on the current CPU and the if_start
2536  *   interlock (if_snd.altq_started) has not been released.
2537  * - if_start_rollup(), which is registered as a low priority netisr
2538  *   rollup function, is called; probably because no more work is pending
2539  *   for the netisr.
2540  *
2541  * NOTE:
2542  * Currently IFQ packet staging is only performed in netisr threads.
2543  */
2544 int
2545 ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
2546 {
2547         struct ifaltq *ifq = &ifp->if_snd;
2548         int error, start = 0, len, mcast = 0, avoid_start = 0;
2549         struct ifaltq_stage_head *head = NULL;
2550         struct ifaltq_stage *stage = NULL;
2551
2552         ASSERT_IFNET_NOT_SERIALIZED_TX(ifp);
2553
2554         len = m->m_pkthdr.len;
2555         if (m->m_flags & M_MCAST)
2556                 mcast = 1;
2557
2558         if (curthread->td_type == TD_TYPE_NETISR) {
2559                 head = &ifq_stage_heads[mycpuid];
2560                 stage = &ifq->altq_stage[mycpuid];
2561
2562                 stage->ifqs_cnt++;
2563                 stage->ifqs_len += len;
2564                 if (stage->ifqs_cnt < ifq_stage_cntmax &&
2565                     stage->ifqs_len < (ifp->if_mtu - max_protohdr))
2566                         avoid_start = 1;
2567         }
2568
2569         ALTQ_LOCK(ifq);
2570         error = ifq_enqueue_locked(ifq, m, pa);
2571         if (error) {
2572                 if (!ifq_data_ready(ifq)) {
2573                         ALTQ_UNLOCK(ifq);
2574                         return error;
2575                 }
2576                 avoid_start = 0;
2577         }
2578         if (!ifq->altq_started) {
2579                 if (avoid_start) {
2580                         ALTQ_UNLOCK(ifq);
2581
2582                         KKASSERT(!error);
2583                         if ((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0)
2584                                 ifq_stage_insert(head, stage);
2585
2586                         ifp->if_obytes += len;
2587                         if (mcast)
2588                                 ifp->if_omcasts++;
2589                         return error;
2590                 }
2591
2592                 /*
2593                  * Hold the interlock of ifnet.if_start
2594                  */
2595                 ifq->altq_started = 1;
2596                 start = 1;
2597         }
2598         ALTQ_UNLOCK(ifq);
2599
2600         if (!error) {
2601                 ifp->if_obytes += len;
2602                 if (mcast)
2603                         ifp->if_omcasts++;
2604         }
2605
2606         if (stage != NULL) {
2607                 if (!start && (stage->ifqs_flags & IFQ_STAGE_FLAG_SCHED)) {
2608                         KKASSERT(stage->ifqs_flags & IFQ_STAGE_FLAG_QUED);
2609                         if (!avoid_start) {
2610                                 ifq_stage_remove(head, stage);
2611                                 if_start_schedule(ifp, 1);
2612                         }
2613                         return error;
2614                 }
2615
2616                 if (stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) {
2617                         ifq_stage_remove(head, stage);
2618                 } else {
2619                         stage->ifqs_cnt = 0;
2620                         stage->ifqs_len = 0;
2621                 }
2622         }
2623
2624         if (!start) {
2625                 logifstart(avoid, ifp);
2626                 return error;
2627         }
2628
2629         ifq_try_ifstart(ifq, 0);
2630         return error;
2631 }
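/*
 * Example (illustrative only): link-layer output code is expected to hand a
 * fully framed packet to ifq_dispatch() instead of calling if_start itself,
 * along the lines of (names hypothetical):
 *
 *      struct altq_pktattr pktattr;
 *
 *      ... classify m into &pktattr when ALTQ is active ...
 *      error = ifq_dispatch(ifp, m, &pktattr);
 */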
2632
2633 void *
2634 ifa_create(int size, int flags)
2635 {
2636         struct ifaddr *ifa;
2637         int i;
2638
2639         KASSERT(size >= sizeof(*ifa), ("ifaddr size too small"));
2640
2641         ifa = kmalloc(size, M_IFADDR, flags | M_ZERO);
2642         if (ifa == NULL)
2643                 return NULL;
2644
2645         ifa->ifa_containers = kmalloc(ncpus * sizeof(struct ifaddr_container),
2646                                       M_IFADDR, M_WAITOK | M_ZERO);
2647         ifa->ifa_ncnt = ncpus;
2648         for (i = 0; i < ncpus; ++i) {
2649                 struct ifaddr_container *ifac = &ifa->ifa_containers[i];
2650
2651                 ifac->ifa_magic = IFA_CONTAINER_MAGIC;
2652                 ifac->ifa = ifa;
2653                 ifac->ifa_refcnt = 1;
2654         }
2655 #ifdef IFADDR_DEBUG
2656         kprintf("alloc ifa %p %d\n", ifa, size);
2657 #endif
2658         return ifa;
2659 }
2660
2661 void
2662 ifac_free(struct ifaddr_container *ifac, int cpu_id)
2663 {
2664         struct ifaddr *ifa = ifac->ifa;
2665
2666         KKASSERT(ifac->ifa_magic == IFA_CONTAINER_MAGIC);
2667         KKASSERT(ifac->ifa_refcnt == 0);
2668         KASSERT(ifac->ifa_listmask == 0,
2669                 ("ifa is still on %#x lists", ifac->ifa_listmask));
2670
2671         ifac->ifa_magic = IFA_CONTAINER_DEAD;
2672
2673 #ifdef IFADDR_DEBUG_VERBOSE
2674         kprintf("try free ifa %p cpu_id %d\n", ifac->ifa, cpu_id);
2675 #endif
2676
2677         KASSERT(ifa->ifa_ncnt > 0 && ifa->ifa_ncnt <= ncpus,
2678                 ("invalid # of ifac, %d", ifa->ifa_ncnt));
2679         if (atomic_fetchadd_int(&ifa->ifa_ncnt, -1) == 1) {
2680 #ifdef IFADDR_DEBUG
2681                 kprintf("free ifa %p\n", ifa);
2682 #endif
2683                 kfree(ifa->ifa_containers, M_IFADDR);
2684                 kfree(ifa, M_IFADDR);
2685         }
2686 }
2687
2688 static void
2689 ifa_iflink_dispatch(netmsg_t nmsg)
2690 {
2691         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2692         struct ifaddr *ifa = msg->ifa;
2693         struct ifnet *ifp = msg->ifp;
2694         int cpu = mycpuid;
2695         struct ifaddr_container *ifac;
2696
2697         crit_enter();
2698
2699         ifac = &ifa->ifa_containers[cpu];
2700         ASSERT_IFAC_VALID(ifac);
2701         KASSERT((ifac->ifa_listmask & IFA_LIST_IFADDRHEAD) == 0,
2702                 ("ifaddr is on if_addrheads"));
2703
2704         ifac->ifa_listmask |= IFA_LIST_IFADDRHEAD;
2705         if (msg->tail)
2706                 TAILQ_INSERT_TAIL(&ifp->if_addrheads[cpu], ifac, ifa_link);
2707         else
2708                 TAILQ_INSERT_HEAD(&ifp->if_addrheads[cpu], ifac, ifa_link);
2709
2710         crit_exit();
2711
2712         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2713 }
2714
2715 void
2716 ifa_iflink(struct ifaddr *ifa, struct ifnet *ifp, int tail)
2717 {
2718         struct netmsg_ifaddr msg;
2719
2720         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2721                     0, ifa_iflink_dispatch);
2722         msg.ifa = ifa;
2723         msg.ifp = ifp;
2724         msg.tail = tail;
2725
2726         ifa_domsg(&msg.base.lmsg, 0);
2727 }
2728
2729 static void
2730 ifa_ifunlink_dispatch(netmsg_t nmsg)
2731 {
2732         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2733         struct ifaddr *ifa = msg->ifa;
2734         struct ifnet *ifp = msg->ifp;
2735         int cpu = mycpuid;
2736         struct ifaddr_container *ifac;
2737
2738         crit_enter();
2739
2740         ifac = &ifa->ifa_containers[cpu];
2741         ASSERT_IFAC_VALID(ifac);
2742         KASSERT(ifac->ifa_listmask & IFA_LIST_IFADDRHEAD,
2743                 ("ifaddr is not on if_addrhead"));
2744
2745         TAILQ_REMOVE(&ifp->if_addrheads[cpu], ifac, ifa_link);
2746         ifac->ifa_listmask &= ~IFA_LIST_IFADDRHEAD;
2747
2748         crit_exit();
2749
2750         ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2751 }
2752
2753 void
2754 ifa_ifunlink(struct ifaddr *ifa, struct ifnet *ifp)
2755 {
2756         struct netmsg_ifaddr msg;
2757
2758         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2759                     0, ifa_ifunlink_dispatch);
2760         msg.ifa = ifa;
2761         msg.ifp = ifp;
2762
2763         ifa_domsg(&msg.base.lmsg, 0);
2764 }
2765
2766 static void
2767 ifa_destroy_dispatch(netmsg_t nmsg)
2768 {
2769         struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2770
2771         IFAFREE(msg->ifa);
2772         ifa_forwardmsg(&nmsg->lmsg, mycpuid + 1);
2773 }
2774
2775 void
2776 ifa_destroy(struct ifaddr *ifa)
2777 {
2778         struct netmsg_ifaddr msg;
2779
2780         netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2781                     0, ifa_destroy_dispatch);
2782         msg.ifa = ifa;
2783
2784         ifa_domsg(&msg.base.lmsg, 0);
2785 }
2786
2787 struct lwkt_port *
2788 ifnet_portfn(int cpu)
2789 {
2790         return &ifnet_threads[cpu].td_msgport;
2791 }
2792
2793 void
2794 ifnet_forwardmsg(struct lwkt_msg *lmsg, int next_cpu)
2795 {
2796         KKASSERT(next_cpu > mycpuid && next_cpu <= ncpus);
2797
2798         if (next_cpu < ncpus)
2799                 lwkt_forwardmsg(ifnet_portfn(next_cpu), lmsg);
2800         else
2801                 lwkt_replymsg(lmsg, 0);
2802 }
2803
2804 int
2805 ifnet_domsg(struct lwkt_msg *lmsg, int cpu)
2806 {
2807         KKASSERT(cpu < ncpus);
2808         return lwkt_domsg(ifnet_portfn(cpu), lmsg, 0);
2809 }
2810
2811 void
2812 ifnet_sendmsg(struct lwkt_msg *lmsg, int cpu)
2813 {
2814         KKASSERT(cpu < ncpus);
2815         lwkt_sendmsg(ifnet_portfn(cpu), lmsg);
2816 }
2817
2818 /*
2819  * Generic netmsg service loop.  Some protocols may roll their own but all
2820  * must perform the basic command dispatch done here.
2821  */
2822 static void
2823 ifnet_service_loop(void *arg __unused)
2824 {
2825         netmsg_t msg;
2826
2827         while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) {
2828                 KASSERT(msg->base.nm_dispatch, ("ifnet_service: badmsg"));
2829                 msg->base.nm_dispatch(msg);
2830         }
2831 }
2832
2833 static void
2834 if_start_rollup(void)
2835 {
2836         struct ifaltq_stage_head *head = &ifq_stage_heads[mycpuid];
2837         struct ifaltq_stage *stage;
2838
2839         while ((stage = TAILQ_FIRST(&head->ifqs_head)) != NULL) {
2840                 struct ifaltq *ifq = stage->ifqs_altq;
2841                 int is_sched = 0;
2842
2843                 if (stage->ifqs_flags & IFQ_STAGE_FLAG_SCHED)
2844                         is_sched = 1;
2845                 ifq_stage_remove(head, stage);
2846
2847                 if (is_sched) {
2848                         if_start_schedule(ifq->altq_ifp, 1);
2849                 } else {
2850                         int start = 0;
2851
2852                         ALTQ_LOCK(ifq);
2853                         if (!ifq->altq_started) {
2854                                 /*
2855                                  * Hold the interlock of ifnet.if_start
2856                                  */
2857                                 ifq->altq_started = 1;
2858                                 start = 1;
2859                         }
2860                         ALTQ_UNLOCK(ifq);
2861
2862                         if (start)
2863                                 ifq_try_ifstart(ifq, 1);
2864                 }
2865                 KKASSERT((stage->ifqs_flags &
2866                     (IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED)) == 0);
2867         }
2868 }
2869
2870 static void
2871 ifnetinit(void *dummy __unused)
2872 {
2873         int i;
2874
2875         for (i = 0; i < ncpus; ++i) {
2876                 struct thread *thr = &ifnet_threads[i];
2877
2878                 lwkt_create(ifnet_service_loop, NULL, NULL,
2879                             thr, TDF_NOSTART|TDF_FORCE_SPINPORT,
2880                             i, "ifnet %d", i);
2881                 netmsg_service_port_init(&thr->td_msgport);
2882                 lwkt_schedule(thr);
2883         }
2884
2885         for (i = 0; i < ncpus; ++i)
2886                 TAILQ_INIT(&ifq_stage_heads[i].ifqs_head);
2887         netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
2888 }
2889
2890 struct ifnet *
2891 ifnet_byindex(unsigned short idx)
2892 {
2893         if (idx > if_index)
2894                 return NULL;
2895         return ifindex2ifnet[idx];
2896 }
2897
2898 struct ifaddr *
2899 ifaddr_byindex(unsigned short idx)
2900 {
2901         struct ifnet *ifp;
2902
2903         ifp = ifnet_byindex(idx);
2904         if (!ifp)
2905                 return NULL;
2906         return TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
2907 }
2908
2909 void
2910 if_register_com_alloc(u_char type,
2911     if_com_alloc_t *a, if_com_free_t *f)
2912 {
2913
2914         KASSERT(if_com_alloc[type] == NULL,
2915             ("if_register_com_alloc: %d already registered", type));
2916         KASSERT(if_com_free[type] == NULL,
2917             ("if_register_com_alloc: %d free already registered", type));
2918
2919         if_com_alloc[type] = a;
2920         if_com_free[type] = f;
2921 }
2922
2923 void
2924 if_deregister_com_alloc(u_char type)
2925 {
2926
2927         KASSERT(if_com_alloc[type] != NULL,
2928             ("if_deregister_com_alloc: %d not registered", type));
2929         KASSERT(if_com_free[type] != NULL,
2930             ("if_deregister_com_alloc: %d free not registered", type));
2931         if_com_alloc[type] = NULL;
2932         if_com_free[type] = NULL;
2933 }
2934
2935 int
2936 if_ring_count2(int cnt, int cnt_max)
2937 {
2938         int shift = 0;
2939
2940         KASSERT(cnt_max >= 1 && powerof2(cnt_max),
2941             ("invalid ring count max %d", cnt_max));
2942
2943         if (cnt <= 0)
2944                 cnt = cnt_max;
2945         if (cnt > ncpus2)
2946                 cnt = ncpus2;
2947         if (cnt > cnt_max)
2948                 cnt = cnt_max;
2949
2950         while ((1 << (shift + 1)) <= cnt)
2951                 ++shift;
2952         cnt = 1 << shift;
2953
2954         KASSERT(cnt >= 1 && cnt <= ncpus2 && cnt <= cnt_max,
2955             ("calculate cnt %d, ncpus2 %d, cnt max %d",
2956              cnt, ncpus2, cnt_max));
2957         return cnt;
2958 }
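/*
 * Example: with ncpus2 == 4 and cnt_max == 8, a requested cnt of 6 is first
 * clamped to ncpus2 (4) and then rounded down to a power of 2, giving 4; a
 * cnt of 0 starts from cnt_max (8), is clamped to 4, and also ends up as 4.
 */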
2959
2960 void
2961 ifq_set_maxlen(struct ifaltq *ifq, int len)
2962 {
2963         ifq->ifq_maxlen = len + (ncpus * ifq_stage_cntmax);
2964 }
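/*
 * Example: assuming ncpus == 4 and ifq_stage_cntmax == 4, a driver asking
 * for a 512 entry queue gets ifq_maxlen == 528; the extra 16 slots absorb
 * packets that may be staged per-CPU before if_start runs.
 */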