carp: Lockless MPSAFE step 8 of many
[dragonfly.git] / sys / netinet / ip_carp.c
1 /*
2  * Copyright (c) 2002 Michael Shalayeff. All rights reserved.
3  * Copyright (c) 2003 Ryan McBride. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
18  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
22  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
23  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
24  * THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 /*
27  * $FreeBSD: src/sys/netinet/ip_carp.c,v 1.48 2007/02/02 09:39:09 glebius Exp $
28  */
29
30 #include "opt_carp.h"
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/in_cksum.h>
38 #include <sys/limits.h>
39 #include <sys/malloc.h>
40 #include <sys/mbuf.h>
41 #include <sys/msgport2.h>
42 #include <sys/time.h>
43 #include <sys/proc.h>
44 #include <sys/priv.h>
45 #include <sys/sockio.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/thread.h>
50
51 #include <machine/stdarg.h>
52 #include <crypto/sha1.h>
53
54 #include <net/bpf.h>
55 #include <net/ethernet.h>
56 #include <net/if.h>
57 #include <net/if_dl.h>
58 #include <net/if_types.h>
59 #include <net/route.h>
60 #include <net/if_clone.h>
61 #include <net/if_var.h>
62 #include <net/ifq_var.h>
63 #include <net/netmsg2.h>
64
65 #ifdef INET
66 #include <netinet/in.h>
67 #include <netinet/in_var.h>
68 #include <netinet/in_systm.h>
69 #include <netinet/ip.h>
70 #include <netinet/ip_var.h>
71 #include <netinet/if_ether.h>
72 #endif
73
74 #ifdef INET6
75 #include <netinet/icmp6.h>
76 #include <netinet/ip6.h>
77 #include <netinet6/ip6_var.h>
78 #include <netinet6/scope6_var.h>
79 #include <netinet6/nd6.h>
80 #endif
81
82 #include <netinet/ip_carp.h>
83
/* Base name handed to if_initname() and the cloner for every carp interface. */
84 #define CARP_IFNAME             "carp"
/* Non-zero only when BOTH IFF_UP and IFF_RUNNING are set in (ifp)->if_flags. */
85 #define CARP_IS_RUNNING(ifp)    \
86         (((ifp)->if_flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
87
/*
 * One IPv4 virtual host address of a carp interface.  Entries live on
 * carp_softc.sc_vha_list, which carp_insert_vhaddr() keeps sorted by
 * ascending address.
 */
88 struct carp_vhaddr {
89         uint32_t                vha_flags;      /* CARP_VHAF_* flag bits */
90         struct in_ifaddr        *vha_ia;        /* carp address */
91         struct in_ifaddr        *vha_iaback;    /* backing address; may be NULL (see carp_setroute()) */
92         TAILQ_ENTRY(carp_vhaddr) vha_link;      /* linkage on sc_vha_list */
93 };
94 TAILQ_HEAD(carp_vhaddr_list, carp_vhaddr);
95
/*
 * Per-interface state of one cloned carp interface.  Allocated (zeroed) in
 * carp_clone_create() and released in carp_clone_destroy().
 */
96 struct carp_softc {
97         struct arpcom            arpcom;        /* embeds the ifnet, reached through sc_if */
98         struct ifnet            *sc_carpdev;    /* parent interface; NULL when not attached */
99         struct carp_vhaddr_list  sc_vha_list;   /* virtual addr list */
100
101         const struct in_ifaddr  *sc_ia;         /* primary iface address v4 */
102         struct ip_moptions       sc_imo;
103
104 #ifdef INET6
105         struct in6_ifaddr       *sc_ia6;        /* primary iface address v6 */
106         struct ip6_moptions      sc_im6o;
107 #endif /* INET6 */
108         TAILQ_ENTRY(carp_softc)  sc_list;       /* linkage on the parent's carp_if.vhif_vrs */
109
110         enum { INIT = 0, BACKUP, MASTER }
111                                  sc_state;
112         int                      sc_dead;       /* set to 1 by carp_clone_destroy_dispatch() */
113
114         int                      sc_suppress;
115
116         int                      sc_sendad_errors;
117 #define CARP_SENDAD_MAX_ERRORS  3
118         int                      sc_sendad_success;
119 #define CARP_SENDAD_MIN_SUCCESS 3
120
121         int                      sc_vhid;       /* -1 until configured (see carp_clone_create()) */
122         int                      sc_advskew;    /* scaled by 1/256 s in carp_proto_input_c() */
123         int                      sc_naddrs;     /* actually used IPv4 vha */
124         int                      sc_naddrs6;
125         int                      sc_advbase;    /* seconds */
126         int                      sc_init_counter; /* 1 at creation, cleared once a valid ad is received */
127         uint64_t                 sc_counter;    /* counter taken from the last accepted advertisement */
128
129         /* authentication */
130 #define CARP_HMAC_PAD   64
131         unsigned char            sc_key[CARP_KEY_LEN];
132         unsigned char            sc_pad[CARP_HMAC_PAD]; /* holds the opad after carp_hmac_prepare() */
133         SHA1_CTX                 sc_sha1;       /* precomputed prefix of the inner hash */
134
135         struct callout           sc_ad_tmo;     /* advertisement timeout */
136         struct callout           sc_md_tmo;     /* master down timeout */
137         struct callout           sc_md6_tmo;    /* master down timeout (presumably the IPv6 one -- confirm) */
138
139         LIST_ENTRY(carp_softc)   sc_next;       /* linkage on the global carpif_list */
140 };
141
/* Shorthand for the ifnet embedded in the softc's arpcom. */
142 #define sc_if   arpcom.ac_if
143
/*
 * Per-parent-interface container, reached through ifnet.if_carp.  It lists
 * every carp_softc attached to that parent; carp_detach() frees it when the
 * list becomes empty.
 */
144 struct carp_if {
145         TAILQ_HEAD(, carp_softc) vhif_vrs;
146 };
147
/*
 * Message used to hand a carp operation to the port returned by
 * cpu_portfn(0) (see carp_clone_destroy() and carp_ifdetach()).
 */
148 struct netmsg_carp {
149         struct netmsg_base      base;
150         struct ifnet            *nc_carpdev;    /* parent interface (used by carp_ifdetach_dispatch()) */
151         struct carp_softc       *nc_softc;      /* target softc (used by carp_clone_destroy_dispatch()) */
152         void                    *nc_data;       /* NOTE(review): presumably the ioctl payload -- users not in view */
153         size_t                  nc_datalen;     /* NOTE(review): presumably the length of nc_data -- confirm */
154 };
155
/*
 * net.inet.carp sysctl knobs.  carp_opts[] is indexed by the CARPCTL_*
 * identifiers; which initializer belongs to which knob depends on the
 * CARPCTL_* values declared outside this file section.
 */
156 SYSCTL_DECL(_net_inet_carp);
157
158 static int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 }; /* XXX for now */
159 SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW,
160     &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets");
161 SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW,
162     &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode");
163 SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW,
164     &carp_opts[CARPCTL_LOG], 0, "log bad carp packets");
165 SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW,
166     &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses");
167
/* Exported read-only; consulted by carp_proto_input_c() when computing the local skew. */
168 static int carp_suppress_preempt = 0;
169 SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD,
170     &carp_suppress_preempt, 0, "Preemption is suppressed");
171
/* Packet/error counters bumped by the input paths. */
172 static struct carpstats carpstats;
173 SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW,
174     &carpstats, carpstats,
175     "CARP statistics (struct carpstats, netinet/ip_carp.h)");
176
/* Emit a LOG_INFO message when the "log" sysctl (carp_opts[CARPCTL_LOG]) is > 0. */
177 #define CARP_LOG(...)   do {                            \
178         if (carp_opts[CARPCTL_LOG] > 0)                 \
179                 log(LOG_INFO, __VA_ARGS__);             \
180 } while (0)
181
/* Emit a LOG_DEBUG message when the "log" sysctl (carp_opts[CARPCTL_LOG]) is > 1. */
182 #define CARP_DEBUG(...) do {                            \
183         if (carp_opts[CARPCTL_LOG] > 1)                 \
184                 log(LOG_DEBUG, __VA_ARGS__);            \
185 } while (0)
186
/*
 * Token protecting carp state; carp_input() asserts that it is held.
 * NOTE(review): carp_gettok()/carp_reltok() presumably acquire and release
 * this token -- their definitions are not in this part of the file.
 */
187 static struct lwkt_token carp_tok = LWKT_TOKEN_INITIALIZER(carp_token);
188
/* HMAC helpers, route handling, packet input and clone/state management. */
189 static void     carp_hmac_prepare(struct carp_softc *);
190 static void     carp_hmac_generate(struct carp_softc *, uint32_t *,
191                     unsigned char *);
192 static int      carp_hmac_verify(struct carp_softc *, uint32_t *,
193                     unsigned char *);
194 static void     carp_setroute(struct carp_softc *, int);
195 static void     carp_proto_input_c(struct carp_softc *, struct mbuf *,
196                     struct carp_header *, sa_family_t);
197 static int      carp_clone_create(struct if_clone *, int, caddr_t);
198 static int      carp_clone_destroy(struct ifnet *);
199 static void     carp_detach(struct carp_softc *, int, boolean_t);
200 static void     carp_prepare_ad(struct carp_softc *, struct carp_header *);
201 static void     carp_send_ad_all(void);
202 static void     carp_send_ad_timeout(void *);
203 static void     carp_send_ad(struct carp_softc *);
204 static void     carp_send_arp(struct carp_softc *);
205 static void     carp_master_down_timeout(void *);
206 static void     carp_master_down(struct carp_softc *);
207 static void     carp_setrun(struct carp_softc *, sa_family_t);
208 static void     carp_set_state(struct carp_softc *, int);
209 static struct ifnet *carp_forus(struct carp_if *, const uint8_t *);
210
/* ifnet methods installed on every carp interface by carp_clone_create(). */
211 static void     carp_init(void *);
212 static int      carp_ioctl(struct ifnet *, u_long, caddr_t, struct ucred *);
213 static int      carp_output(struct ifnet *, struct mbuf *, struct sockaddr *,
214                     struct rtentry *);
215 static void     carp_start(struct ifnet *);
216 static void     carp_serialize(struct ifnet *, enum ifnet_serialize);
217 static void     carp_deserialize(struct ifnet *, enum ifnet_serialize);
218 static int      carp_tryserialize(struct ifnet *, enum ifnet_serialize);
219 #ifdef INVARIANTS
220 static void     carp_serialize_assert(struct ifnet *, enum ifnet_serialize,
221                     boolean_t);
222 #endif
223
/* Address bookkeeping. */
224 static void     carp_multicast_cleanup(struct carp_softc *);
225 static void     carp_add_addr(struct carp_softc *, struct ifaddr *);
226 static void     carp_del_addr(struct carp_softc *, struct ifaddr *);
227 static void     carp_config_addr(struct carp_softc *, struct ifaddr *);
228 static void     carp_link_addrs(struct carp_softc *, struct ifnet *,
229                     struct ifaddr *);
230 static void     carp_unlink_addrs(struct carp_softc *, struct ifnet *,
231                     struct ifaddr *);
232 static void     carp_update_addrs(struct carp_softc *, struct ifaddr *);
233
/* Per virtual-host-address (struct carp_vhaddr) operations. */
234 static int      carp_config_vhaddr(struct carp_softc *, struct carp_vhaddr *,
235                     struct in_ifaddr *);
236 static int      carp_activate_vhaddr(struct carp_softc *, struct carp_vhaddr *,
237                     struct ifnet *, struct in_ifaddr *, int);
238 static void     carp_deactivate_vhaddr(struct carp_softc *,
239                     struct carp_vhaddr *, boolean_t);
240 static int      carp_addroute_vhaddr(struct carp_softc *, struct carp_vhaddr *);
241 static void     carp_delroute_vhaddr(struct carp_softc *, struct carp_vhaddr *,
242                     boolean_t);
243
244 static void     carp_sc_state(struct carp_softc *);
245 #ifdef INET6
246 static void     carp_send_na(struct carp_softc *);
247 #ifdef notyet
248 static int      carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *);
249 static int      carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *);
250 #endif
251 static void     carp_multicast6_cleanup(struct carp_softc *);
252 #endif
253 static void     carp_stop(struct carp_softc *, int);
254 static void     carp_suspend(struct carp_softc *, int);
255 static void     carp_ioctl_stop(struct carp_softc *);
256 static int      carp_ioctl_setvh(struct carp_softc *, void *, struct ucred *);
257 static int      carp_ioctl_getvh(struct carp_softc *, void *, struct ucred *);
258 static int      carp_ioctl_getdevname(struct carp_softc *, struct ifdrv *);
259 static int      carp_ioctl_getvhaddr(struct carp_softc *, struct ifdrv *);
260
/* NOTE(review): presumably registered as event handlers (tags below) -- registration not in view. */
261 static void     carp_ifaddr(void *, struct ifnet *, enum ifaddr_event,
262                             struct ifaddr *);
263 static void     carp_ifdetach(void *, struct ifnet *);
264
/*
 * netmsg handlers.  carp_ifdetach_dispatch() and
 * carp_clone_destroy_dispatch() are sent to cpu_portfn(0); the others
 * presumably follow the same pattern -- confirm.
 */
265 static void     carp_ifdetach_dispatch(netmsg_t);
266 static void     carp_clone_destroy_dispatch(netmsg_t);
267 static void     carp_init_dispatch(netmsg_t);
268 static void     carp_ioctl_stop_dispatch(netmsg_t);
269 static void     carp_ioctl_setvh_dispatch(netmsg_t);
270 static void     carp_ioctl_getvh_dispatch(netmsg_t);
271 static void     carp_ioctl_getdevname_dispatch(netmsg_t);
272 static void     carp_ioctl_getvhaddr_dispatch(netmsg_t);
273
/* Malloc type used for carp_softc and carp_if allocations. */
274 static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces");
275
/* Every carp_softc, linked through sc_next (see carp_clone_create()/carp_clone_destroy()). */
276 static LIST_HEAD(, carp_softc) carpif_list;
277
/* Cloner for "carp<unit>" interfaces, units 0..IF_MAXUNIT. */
278 static struct if_clone carp_cloner =
279 IF_CLONE_INITIALIZER(CARP_IFNAME, carp_clone_create, carp_clone_destroy,
280                      0, IF_MAXUNIT);
281
/* Link-level address passed to ether_ifattach() for each new carp interface. */
282 static uint8_t  carp_etheraddr[ETHER_ADDR_LEN] = { 0, 0, 0x5e, 0, 1, 0 };
283
/* NOTE(review): presumably the registration tags for carp_ifdetach()/carp_ifaddr() -- confirm. */
284 static eventhandler_tag carp_ifdetach_event;
285 static eventhandler_tag carp_ifaddr_event;
286
287 static __inline void
288 carp_insert_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha_new)
289 {
290         struct carp_vhaddr *vha;
291         u_long new_addr, addr;
292
293         KKASSERT((vha_new->vha_flags & CARP_VHAF_ONLIST) == 0);
294
295         /*
296          * Virtual address list is sorted; smaller one first
297          */
298         new_addr = ntohl(vha_new->vha_ia->ia_addr.sin_addr.s_addr);
299
300         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
301                 addr = ntohl(vha->vha_ia->ia_addr.sin_addr.s_addr);
302
303                 if (addr > new_addr)
304                         break;
305         }
306         if (vha == NULL)
307                 TAILQ_INSERT_TAIL(&sc->sc_vha_list, vha_new, vha_link);
308         else
309                 TAILQ_INSERT_BEFORE(vha, vha_new, vha_link);
310         vha_new->vha_flags |= CARP_VHAF_ONLIST;
311 }
312
313 static __inline void
314 carp_remove_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha)
315 {
316         KKASSERT(vha->vha_flags & CARP_VHAF_ONLIST);
317         vha->vha_flags &= ~CARP_VHAF_ONLIST;
318         TAILQ_REMOVE(&sc->sc_vha_list, vha, vha_link);
319 }
320
321 static void
322 carp_hmac_prepare(struct carp_softc *sc)
323 {
324         uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
325         uint8_t vhid = sc->sc_vhid & 0xff;
326         int i;
327 #ifdef INET6
328         struct ifaddr_container *ifac;
329         struct in6_addr in6;
330 #endif
331 #ifdef INET
332         struct carp_vhaddr *vha;
333 #endif
334
335         /* XXX: possible race here */
336
337         /* compute ipad from key */
338         bzero(sc->sc_pad, sizeof(sc->sc_pad));
339         bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
340         for (i = 0; i < sizeof(sc->sc_pad); i++)
341                 sc->sc_pad[i] ^= 0x36;
342
343         /* precompute first part of inner hash */
344         SHA1Init(&sc->sc_sha1);
345         SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
346         SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
347         SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
348         SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
349 #ifdef INET
350         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
351                 SHA1Update(&sc->sc_sha1,
352                     (const uint8_t *)&vha->vha_ia->ia_addr.sin_addr,
353                     sizeof(struct in_addr));
354         }
355 #endif /* INET */
356 #ifdef INET6
357         TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid], ifa_link) {
358                 struct ifaddr *ifa = ifac->ifa;
359
360                 if (ifa->ifa_addr->sa_family == AF_INET6) {
361                         in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
362                         in6_clearscope(&in6);
363                         SHA1Update(&sc->sc_sha1, (void *)&in6, sizeof(in6));
364                 }
365         }
366 #endif /* INET6 */
367
368         /* convert ipad to opad */
369         for (i = 0; i < sizeof(sc->sc_pad); i++)
370                 sc->sc_pad[i] ^= 0x36 ^ 0x5c;
371 }
372
373 static void
374 carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
375     unsigned char md[20])
376 {
377         SHA1_CTX sha1ctx;
378
379         /* fetch first half of inner hash */
380         bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
381
382         SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
383         SHA1Final(md, &sha1ctx);
384
385         /* outer hash */
386         SHA1Init(&sha1ctx);
387         SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
388         SHA1Update(&sha1ctx, md, 20);
389         SHA1Final(md, &sha1ctx);
390 }
391
/*
 * Check the digest carried by a received advertisement.
 *
 * sc      - interface whose key material is used
 * counter - counter field taken from the packet
 * md      - 20-byte digest taken from the packet
 *
 * Returns 0 when md matches the locally computed digest and non-zero
 * otherwise.
 *
 * md comes straight from the network, so the comparison always walks all
 * 20 bytes instead of stopping at the first mismatch; the time taken then
 * does not reveal how many leading bytes of a forged digest were correct.
 */
static int
carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
    unsigned char md[20])
{
	unsigned char expected[20];
	unsigned char diff = 0;
	size_t i;

	carp_hmac_generate(sc, counter, expected);

	/* Accumulate every differing bit; no data-dependent early exit. */
	for (i = 0; i < sizeof(expected); i++)
		diff |= md[i] ^ expected[i];

	return (diff);
}
401
402 static void
403 carp_setroute(struct carp_softc *sc, int cmd)
404 {
405 #ifdef INET6
406         struct ifaddr_container *ifac;
407 #endif
408         struct carp_vhaddr *vha;
409
410         KKASSERT(cmd == RTM_DELETE || cmd == RTM_ADD);
411
412         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
413                 if (vha->vha_iaback == NULL)
414                         continue;
415                 if (cmd == RTM_DELETE)
416                         carp_delroute_vhaddr(sc, vha, FALSE);
417                 else
418                         carp_addroute_vhaddr(sc, vha);
419         }
420
421 #ifdef INET6
422         TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid], ifa_link) {
423                 struct ifaddr *ifa = ifac->ifa;
424
425                 if (ifa->ifa_addr->sa_family == AF_INET6) {
426                         if (cmd == RTM_ADD)
427                                 in6_ifaddloop(ifa);
428                         else
429                                 in6_ifremloop(ifa);
430                 }
431         }
432 #endif /* INET6 */
433 }
434
435 static int
436 carp_clone_create(struct if_clone *ifc, int unit, caddr_t param __unused)
437 {
438         struct carp_softc *sc;
439         struct ifnet *ifp;
440
441         sc = kmalloc(sizeof(*sc), M_CARP, M_WAITOK | M_ZERO);
442         ifp = &sc->sc_if;
443
444         sc->sc_suppress = 0;
445         sc->sc_advbase = CARP_DFLTINTV;
446         sc->sc_vhid = -1;       /* required setting */
447         sc->sc_advskew = 0;
448         sc->sc_init_counter = 1;
449         sc->sc_naddrs = 0;
450         sc->sc_naddrs6 = 0;
451
452         TAILQ_INIT(&sc->sc_vha_list);
453
454 #ifdef INET6
455         sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL;
456 #endif
457
458         callout_init_mp(&sc->sc_ad_tmo);
459         callout_init_mp(&sc->sc_md_tmo);
460         callout_init_mp(&sc->sc_md6_tmo);
461
462         if_initname(ifp, CARP_IFNAME, unit);
463         ifp->if_softc = sc;
464         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
465         ifp->if_init = carp_init;
466         ifp->if_ioctl = carp_ioctl;
467         ifp->if_start = carp_start;
468         ifp->if_serialize = carp_serialize;
469         ifp->if_deserialize = carp_deserialize;
470         ifp->if_tryserialize = carp_tryserialize;
471 #ifdef INVARIANTS
472         ifp->if_serialize_assert = carp_serialize_assert;
473 #endif
474         ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
475         ifq_set_ready(&ifp->if_snd);
476
477         ether_ifattach(ifp, carp_etheraddr, NULL);
478
479         ifp->if_type = IFT_CARP;
480         ifp->if_output = carp_output;
481
482         carp_gettok();
483         LIST_INSERT_HEAD(&carpif_list, sc, sc_next);
484         carp_reltok();
485
486         return (0);
487 }
488
489 static void
490 carp_clone_destroy_dispatch(netmsg_t msg)
491 {
492         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
493         struct carp_softc *sc = cmsg->nc_softc;
494
495         carp_gettok();
496
497         sc->sc_dead = 1;
498         carp_detach(sc, 1, FALSE);
499
500         carp_reltok();
501
502         lwkt_replymsg(&cmsg->base.lmsg, 0);
503 }
504
505 static int
506 carp_clone_destroy(struct ifnet *ifp)
507 {
508         struct carp_softc *sc = ifp->if_softc;
509         struct netmsg_carp cmsg;
510
511         bzero(&cmsg, sizeof(cmsg));
512         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
513             carp_clone_destroy_dispatch);
514         cmsg.nc_softc = sc;
515
516         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
517
518         carp_gettok();
519         LIST_REMOVE(sc, sc_next);
520         carp_reltok();
521
522         bpfdetach(ifp);
523         if_detach(ifp);
524
525         KASSERT(sc->sc_naddrs == 0, ("certain inet address is still active\n"));
526         kfree(sc, M_CARP);
527
528         return 0;
529 }
530
531 static void
532 carp_detach(struct carp_softc *sc, int detach, boolean_t del_iaback)
533 {
534         struct carp_if *cif;
535
536         carp_suspend(sc, detach);
537
538         carp_multicast_cleanup(sc);
539 #ifdef INET6
540         carp_multicast6_cleanup(sc);
541 #endif
542
543         if (!sc->sc_dead && detach) {
544                 struct carp_vhaddr *vha;
545
546                 TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
547                         carp_deactivate_vhaddr(sc, vha, del_iaback);
548                 KKASSERT(sc->sc_naddrs == 0);
549         }
550
551         if (sc->sc_carpdev != NULL) {
552                 cif = sc->sc_carpdev->if_carp;
553                 TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
554                 if (TAILQ_EMPTY(&cif->vhif_vrs)) {
555                         ifpromisc(sc->sc_carpdev, 0);
556                         sc->sc_carpdev->if_carp = NULL;
557                         kfree(cif, M_CARP);
558                 }
559                 sc->sc_carpdev = NULL;
560                 sc->sc_ia = NULL;
561         }
562 }
563
564 static void
565 carp_ifdetach_dispatch(netmsg_t msg)
566 {
567         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
568         struct ifnet *ifp = cmsg->nc_carpdev;
569         struct carp_if *cif = ifp->if_carp;
570         struct carp_softc *sc;
571
572         carp_gettok();
573
574         while (ifp->if_carp &&
575                (sc = TAILQ_FIRST(&cif->vhif_vrs)) != NULL)
576                 carp_detach(sc, 1, TRUE);
577
578         carp_reltok();
579
580         lwkt_replymsg(&cmsg->base.lmsg, 0);
581 }
582
583 /* Detach an interface from the carp. */
584 static void
585 carp_ifdetach(void *arg __unused, struct ifnet *ifp)
586 {
587         struct netmsg_carp cmsg;
588
589         ASSERT_IFNET_NOT_SERIALIZED_ALL(ifp);
590
591         bzero(&cmsg, sizeof(cmsg));
592         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
593             carp_ifdetach_dispatch);
594         cmsg.nc_carpdev = ifp;
595
596         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
597 }
598
599 /*
600  * process input packet.
601  * we have rearranged checks order compared to the rfc,
602  * but it seems more efficient this way or not possible otherwise.
603  */
604 int
605 carp_proto_input(struct mbuf **mp, int *offp, int proto)
606 {
607         struct mbuf *m = *mp;
608         struct ip *ip = mtod(m, struct ip *);
609         struct ifnet *ifp = m->m_pkthdr.rcvif;
610         struct carp_header *ch;
611         struct carp_softc *sc;
612         int len, iphlen;
613
614         carp_gettok();
615
616         iphlen = *offp;
617         *mp = NULL;
618
619         carpstats.carps_ipackets++;
620
621         if (!carp_opts[CARPCTL_ALLOW]) {
622                 m_freem(m);
623                 goto back;
624         }
625
626         /* Check if received on a valid carp interface */
627         if (ifp->if_type != IFT_CARP) {
628                 carpstats.carps_badif++;
629                 CARP_LOG("carp_proto_input: packet received on non-carp "
630                     "interface: %s\n", ifp->if_xname);
631                 m_freem(m);
632                 goto back;
633         }
634
635         if (!CARP_IS_RUNNING(ifp)) {
636                 carpstats.carps_badif++;
637                 CARP_LOG("carp_proto_input: packet received on stopped carp "
638                     "interface: %s\n", ifp->if_xname);
639                 m_freem(m);
640                 goto back;
641         }
642
643         sc = ifp->if_softc;
644         if (sc->sc_carpdev == NULL) {
645                 carpstats.carps_badif++;
646                 CARP_LOG("carp_proto_input: packet received on defunc carp "
647                     "interface: %s\n", ifp->if_xname);
648                 m_freem(m);
649                 goto back;
650         }
651
652         if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
653                 carpstats.carps_badif++;
654                 CARP_LOG("carp_proto_input: non-mcast packet on "
655                     "interface: %s\n", ifp->if_xname);
656                 m_freem(m);
657                 goto back;
658         }
659
660         /* Verify that the IP TTL is CARP_DFLTTL. */
661         if (ip->ip_ttl != CARP_DFLTTL) {
662                 carpstats.carps_badttl++;
663                 CARP_LOG("carp_proto_input: received ttl %d != %d on %s\n",
664                     ip->ip_ttl, CARP_DFLTTL, ifp->if_xname);
665                 m_freem(m);
666                 goto back;
667         }
668
669         /* Minimal CARP packet size */
670         len = iphlen + sizeof(*ch);
671
672         /*
673          * Verify that the received packet length is
674          * not less than the CARP header
675          */
676         if (m->m_pkthdr.len < len) {
677                 carpstats.carps_badlen++;
678                 CARP_LOG("packet too short %d on %s\n", m->m_pkthdr.len,
679                     ifp->if_xname);
680                 m_freem(m);
681                 goto back;
682         }
683
684         /* Make sure that CARP header is contiguous */
685         if (len > m->m_len) {
686                 m = m_pullup(m, len);
687                 if (m == NULL) {
688                         carpstats.carps_hdrops++;
689                         CARP_LOG("carp_proto_input: m_pullup failed\n");
690                         goto back;
691                 }
692                 ip = mtod(m, struct ip *);
693         }
694         ch = (struct carp_header *)((uint8_t *)ip + iphlen);
695
696         /* Verify the CARP checksum */
697         if (in_cksum_skip(m, len, iphlen)) {
698                 carpstats.carps_badsum++;
699                 CARP_LOG("carp_proto_input: checksum failed on %s\n",
700                     ifp->if_xname);
701                 m_freem(m);
702                 goto back;
703         }
704         carp_proto_input_c(sc, m, ch, AF_INET);
705 back:
706         carp_reltok();
707         return(IPPROTO_DONE);
708 }
709
#ifdef INET6
/*
 * IPv6 protocol input for CARP packets.
 *
 * A packet is accepted only if input is allowed, it arrived on a running
 * carp interface that still has a parent, its hop limit is CARP_DFLTTL, a
 * complete CARP header is present at *offp and the CARP checksum is
 * correct.  Accepted packets are passed to carp_proto_input_c(); everything
 * else is counted, optionally logged and dropped.
 *
 * Always returns IPPROTO_DONE.  Runs with the carp token held.
 */
int
carp6_proto_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
	struct ifnet *ifp = m->m_pkthdr.rcvif;
	struct carp_header *ch;
	struct carp_softc *sc;
	u_int len;

	carp_gettok();

	carpstats.carps_ipackets6++;

	if (!carp_opts[CARPCTL_ALLOW]) {
		m_freem(m);
		goto back;
	}

	/* check if received on a valid carp interface */
	if (ifp->if_type != IFT_CARP) {
		carpstats.carps_badif++;
		CARP_LOG("carp6_proto_input: packet received on non-carp "
		    "interface: %s\n", ifp->if_xname);
		m_freem(m);
		goto back;
	}

	if (!CARP_IS_RUNNING(ifp)) {
		carpstats.carps_badif++;
		/* Prefix names this function, not the IPv4 input routine. */
		CARP_LOG("carp6_proto_input: packet received on stopped carp "
		    "interface: %s\n", ifp->if_xname);
		m_freem(m);
		goto back;
	}

	/* A carp interface without a parent cannot take part in an election. */
	sc = ifp->if_softc;
	if (sc->sc_carpdev == NULL) {
		carpstats.carps_badif++;
		CARP_LOG("carp6_proto_input: packet received on defunc-carp "
		    "interface: %s\n", ifp->if_xname);
		m_freem(m);
		goto back;
	}

	/* verify that the hop limit is CARP_DFLTTL */
	if (ip6->ip6_hlim != CARP_DFLTTL) {
		carpstats.carps_badttl++;
		CARP_LOG("carp6_proto_input: received ttl %d != 255 on %s\n",
		    ip6->ip6_hlim, ifp->if_xname);
		m_freem(m);
		goto back;
	}

	/*
	 * verify that we have a complete carp packet; the length is sampled
	 * first because it is only used for the log message below.  The mbuf
	 * is not freed here when the header cannot be obtained.
	 */
	len = m->m_len;
	IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
	if (ch == NULL) {
		carpstats.carps_badlen++;
		CARP_LOG("carp6_proto_input: packet size %u too small\n", len);
		goto back;
	}

	/* verify the CARP checksum */
	if (in_cksum_range(m, 0, *offp, sizeof(*ch))) {
		carpstats.carps_badsum++;
		CARP_LOG("carp6_proto_input: checksum failed, on %s\n",
		    ifp->if_xname);
		m_freem(m);
		goto back;
	}

	/* The common handler takes over the mbuf from here. */
	carp_proto_input_c(sc, m, ch, AF_INET6);
back:
	carp_reltok();
	return (IPPROTO_DONE);
}
#endif /* INET6 */
789
790 static void
791 carp_proto_input_c(struct carp_softc *sc, struct mbuf *m,
792     struct carp_header *ch, sa_family_t af)
793 {
794         struct ifnet *cifp;
795         uint64_t tmp_counter;
796         struct timeval sc_tv, ch_tv;
797
798         if (sc->sc_vhid != ch->carp_vhid) {
799                 /*
800                  * CARP uses multicast, however, multicast packets
801                  * are tapped to all CARP interfaces on the physical
802                  * interface receiving the CARP packets, so we don't
803                  * update any stats here.
804                  */
805                 m_freem(m);
806                 return;
807         }
808         cifp = &sc->sc_if;
809
810         /* verify the CARP version. */
811         if (ch->carp_version != CARP_VERSION) {
812                 carpstats.carps_badver++;
813                 CARP_LOG("%s; invalid version %d\n", cifp->if_xname,
814                          ch->carp_version);
815                 m_freem(m);
816                 return;
817         }
818
819         /* verify the hash */
820         if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
821                 carpstats.carps_badauth++;
822                 CARP_LOG("%s: incorrect hash\n", cifp->if_xname);
823                 m_freem(m);
824                 return;
825         }
826
827         tmp_counter = ntohl(ch->carp_counter[0]);
828         tmp_counter = tmp_counter<<32;
829         tmp_counter += ntohl(ch->carp_counter[1]);
830
831         /* XXX Replay protection goes here */
832
833         sc->sc_init_counter = 0;
834         sc->sc_counter = tmp_counter;
835
836         sc_tv.tv_sec = sc->sc_advbase;
837         if (carp_suppress_preempt && sc->sc_advskew <  240)
838                 sc_tv.tv_usec = 240 * 1000000 / 256;
839         else
840                 sc_tv.tv_usec = sc->sc_advskew * 1000000 / 256;
841         ch_tv.tv_sec = ch->carp_advbase;
842         ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
843
844         switch (sc->sc_state) {
845         case INIT:
846                 break;
847
848         case MASTER:
849                 /*
850                  * If we receive an advertisement from a master who's going to
851                  * be more frequent than us, go into BACKUP state.
852                  */
853                 if (timevalcmp(&sc_tv, &ch_tv, >) ||
854                     timevalcmp(&sc_tv, &ch_tv, ==)) {
855                         callout_stop(&sc->sc_ad_tmo);
856                         CARP_DEBUG("%s: MASTER -> BACKUP "
857                            "(more frequent advertisement received)\n",
858                            cifp->if_xname);
859                         carp_set_state(sc, BACKUP);
860                         carp_setrun(sc, 0);
861                         carp_setroute(sc, RTM_DELETE);
862                 }
863                 break;
864
865         case BACKUP:
866                 /*
867                  * If we're pre-empting masters who advertise slower than us,
868                  * and this one claims to be slower, treat him as down.
869                  */
870                 if (carp_opts[CARPCTL_PREEMPT] &&
871                     timevalcmp(&sc_tv, &ch_tv, <)) {
872                         CARP_DEBUG("%s: BACKUP -> MASTER "
873                             "(preempting a slower master)\n", cifp->if_xname);
874                         carp_master_down(sc);
875                         break;
876                 }
877
878                 /*
879                  *  If the master is going to advertise at such a low frequency
880                  *  that he's guaranteed to time out, we'd might as well just
881                  *  treat him as timed out now.
882                  */
883                 sc_tv.tv_sec = sc->sc_advbase * 3;
884                 if (timevalcmp(&sc_tv, &ch_tv, <)) {
885                         CARP_DEBUG("%s: BACKUP -> MASTER (master timed out)\n",
886                                    cifp->if_xname);
887                         carp_master_down(sc);
888                         break;
889                 }
890
891                 /*
892                  * Otherwise, we reset the counter and wait for the next
893                  * advertisement.
894                  */
895                 carp_setrun(sc, af);
896                 break;
897         }
898         m_freem(m);
899 }
900
901 struct mbuf *
902 carp_input(void *v, struct mbuf *m)
903 {
904         struct carp_if *cif = v;
905         struct ether_header *eh;
906         struct carp_softc *sc;
907         struct ifnet *ifp;
908
909         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
910
911         eh = mtod(m, struct ether_header *);
912
913         ifp = carp_forus(cif, eh->ether_dhost);
914         if (ifp != NULL) {
915                 ether_reinput_oncpu(ifp, m, REINPUT_RUNBPF);
916                 return NULL;
917         }
918
919         if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
920                 return m;
921
922         /*
923          * XXX Should really check the list of multicast addresses
924          * for each CARP interface _before_ copying.
925          */
926         TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
927                 struct mbuf *m0;
928
929                 if ((sc->sc_if.if_flags & IFF_UP) == 0)
930                         continue;
931
932                 m0 = m_dup(m, MB_DONTWAIT);
933                 if (m0 == NULL)
934                         continue;
935
936                 ether_reinput_oncpu(&sc->sc_if, m0, REINPUT_RUNBPF);
937         }
938         return m;
939 }
940
941 static void
942 carp_prepare_ad(struct carp_softc *sc, struct carp_header *ch)
943 {
944         if (sc->sc_init_counter) {
945                 /* this could also be seconds since unix epoch */
946                 sc->sc_counter = karc4random();
947                 sc->sc_counter = sc->sc_counter << 32;
948                 sc->sc_counter += karc4random();
949         } else {
950                 sc->sc_counter++;
951         }
952
953         ch->carp_counter[0] = htonl((sc->sc_counter >> 32) & 0xffffffff);
954         ch->carp_counter[1] = htonl(sc->sc_counter & 0xffffffff);
955
956         carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
957 }
958
959 static void
960 carp_send_ad_all(void)
961 {
962         struct carp_softc *sc;
963
964         LIST_FOREACH(sc, &carpif_list, sc_next) {
965                 if (sc->sc_carpdev == NULL)
966                         continue;
967
968                 if (CARP_IS_RUNNING(&sc->sc_if) && sc->sc_state == MASTER)
969                         carp_send_ad(sc);
970         }
971 }
972
/*
 * Callout handler for the advertisement timer: sends one advertisement
 * for the softc passed as 'xsc' while holding the carp token.
 */
static void
carp_send_ad_timeout(void *xsc)
{
	struct carp_softc *sc = xsc;

	carp_gettok();
	carp_send_ad(sc);
	carp_reltok();
}
980
981 static void
982 carp_send_ad(struct carp_softc *sc)
983 {
984         struct ifnet *cifp = &sc->sc_if;
985         struct carp_header ch;
986         struct timeval tv;
987         struct carp_header *ch_ptr;
988         struct mbuf *m;
989         int len, advbase, advskew;
990
991         if (!CARP_IS_RUNNING(cifp)) {
992                 /* Bow out */
993                 advbase = 255;
994                 advskew = 255;
995         } else {
996                 advbase = sc->sc_advbase;
997                 if (!carp_suppress_preempt || sc->sc_advskew > 240)
998                         advskew = sc->sc_advskew;
999                 else
1000                         advskew = 240;
1001                 tv.tv_sec = advbase;
1002                 tv.tv_usec = advskew * 1000000 / 256;
1003         }
1004
1005         ch.carp_version = CARP_VERSION;
1006         ch.carp_type = CARP_ADVERTISEMENT;
1007         ch.carp_vhid = sc->sc_vhid;
1008         ch.carp_advbase = advbase;
1009         ch.carp_advskew = advskew;
1010         ch.carp_authlen = 7;    /* XXX DEFINE */
1011         ch.carp_pad1 = 0;       /* must be zero */
1012         ch.carp_cksum = 0;
1013
1014 #ifdef INET
1015         if (sc->sc_ia != NULL) {
1016                 struct ip *ip;
1017
1018                 MGETHDR(m, MB_DONTWAIT, MT_HEADER);
1019                 if (m == NULL) {
1020                         cifp->if_oerrors++;
1021                         carpstats.carps_onomem++;
1022                         /* XXX maybe less ? */
1023                         if (advbase != 255 || advskew != 255)
1024                                 callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
1025                                     carp_send_ad_timeout, sc);
1026                         return;
1027                 }
1028                 len = sizeof(*ip) + sizeof(ch);
1029                 m->m_pkthdr.len = len;
1030                 m->m_pkthdr.rcvif = NULL;
1031                 m->m_len = len;
1032                 MH_ALIGN(m, m->m_len);
1033                 m->m_flags |= M_MCAST;
1034                 ip = mtod(m, struct ip *);
1035                 ip->ip_v = IPVERSION;
1036                 ip->ip_hl = sizeof(*ip) >> 2;
1037                 ip->ip_tos = IPTOS_LOWDELAY;
1038                 ip->ip_len = len;
1039                 ip->ip_id = ip_newid();
1040                 ip->ip_off = IP_DF;
1041                 ip->ip_ttl = CARP_DFLTTL;
1042                 ip->ip_p = IPPROTO_CARP;
1043                 ip->ip_sum = 0;
1044                 ip->ip_src = sc->sc_ia->ia_addr.sin_addr;
1045                 ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
1046
1047                 ch_ptr = (struct carp_header *)(&ip[1]);
1048                 bcopy(&ch, ch_ptr, sizeof(ch));
1049                 carp_prepare_ad(sc, ch_ptr);
1050                 ch_ptr->carp_cksum = in_cksum_skip(m, len, sizeof(*ip));
1051
1052                 getmicrotime(&cifp->if_lastchange);
1053                 cifp->if_opackets++;
1054                 cifp->if_obytes += len;
1055                 carpstats.carps_opackets++;
1056
1057                 if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) {
1058                         cifp->if_oerrors++;
1059                         if (sc->sc_sendad_errors < INT_MAX)
1060                                 sc->sc_sendad_errors++;
1061                         if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
1062                                 carp_suppress_preempt++;
1063                                 if (carp_suppress_preempt == 1) {
1064                                         carp_send_ad_all();
1065                                 }
1066                         }
1067                         sc->sc_sendad_success = 0;
1068                 } else {
1069                         if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
1070                                 if (++sc->sc_sendad_success >=
1071                                     CARP_SENDAD_MIN_SUCCESS) {
1072                                         carp_suppress_preempt--;
1073                                         sc->sc_sendad_errors = 0;
1074                                 }
1075                         } else {
1076                                 sc->sc_sendad_errors = 0;
1077                         }
1078                 }
1079         }
1080 #endif /* INET */
1081 #ifdef INET6
1082         if (sc->sc_ia6) {
1083                 struct ip6_hdr *ip6;
1084
1085                 MGETHDR(m, MB_DONTWAIT, MT_HEADER);
1086                 if (m == NULL) {
1087                         cifp->if_oerrors++;
1088                         carpstats.carps_onomem++;
1089                         /* XXX maybe less ? */
1090                         if (advbase != 255 || advskew != 255)
1091                                 callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
1092                                     carp_send_ad_timeout, sc);
1093                         return;
1094                 }
1095                 len = sizeof(*ip6) + sizeof(ch);
1096                 m->m_pkthdr.len = len;
1097                 m->m_pkthdr.rcvif = NULL;
1098                 m->m_len = len;
1099                 MH_ALIGN(m, m->m_len);
1100                 m->m_flags |= M_MCAST;
1101                 ip6 = mtod(m, struct ip6_hdr *);
1102                 bzero(ip6, sizeof(*ip6));
1103                 ip6->ip6_vfc |= IPV6_VERSION;
1104                 ip6->ip6_hlim = CARP_DFLTTL;
1105                 ip6->ip6_nxt = IPPROTO_CARP;
1106                 bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src,
1107                     sizeof(struct in6_addr));
1108                 /* set the multicast destination */
1109
1110                 ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
1111                 ip6->ip6_dst.s6_addr8[15] = 0x12;
1112                 if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
1113                         cifp->if_oerrors++;
1114                         m_freem(m);
1115                         CARP_LOG("%s: in6_setscope failed\n", __func__);
1116                         return;
1117                 }
1118
1119                 ch_ptr = (struct carp_header *)(&ip6[1]);
1120                 bcopy(&ch, ch_ptr, sizeof(ch));
1121                 carp_prepare_ad(sc, ch_ptr);
1122                 ch_ptr->carp_cksum = in_cksum_skip(m, len, sizeof(*ip6));
1123
1124                 getmicrotime(&cifp->if_lastchange);
1125                 cifp->if_opackets++;
1126                 cifp->if_obytes += len;
1127                 carpstats.carps_opackets6++;
1128
1129                 if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) {
1130                         cifp->if_oerrors++;
1131                         if (sc->sc_sendad_errors < INT_MAX)
1132                                 sc->sc_sendad_errors++;
1133                         if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
1134                                 carp_suppress_preempt++;
1135                                 if (carp_suppress_preempt == 1) {
1136                                         carp_send_ad_all();
1137                                 }
1138                         }
1139                         sc->sc_sendad_success = 0;
1140                 } else {
1141                         if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
1142                                 if (++sc->sc_sendad_success >=
1143                                     CARP_SENDAD_MIN_SUCCESS) {
1144                                         carp_suppress_preempt--;
1145                                         sc->sc_sendad_errors = 0;
1146                                 }
1147                         } else {
1148                                 sc->sc_sendad_errors = 0;
1149                         }
1150                 }
1151         }
1152 #endif /* INET6 */
1153
1154         if (advbase != 255 || advskew != 255)
1155                 callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
1156                     carp_send_ad_timeout, sc);
1157 }
1158
1159 /*
1160  * Broadcast a gratuitous ARP request containing
1161  * the virtual router MAC address for each IP address
1162  * associated with the virtual router.
1163  */
1164 static void
1165 carp_send_arp(struct carp_softc *sc)
1166 {
1167         const struct carp_vhaddr *vha;
1168
1169         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
1170                 if (vha->vha_iaback == NULL)
1171                         continue;
1172                 arp_gratuitous(&sc->sc_if, &vha->vha_ia->ia_ifa);
1173         }
1174 }
1175
1176 #ifdef INET6
1177 static void
1178 carp_send_na(struct carp_softc *sc)
1179 {
1180         struct ifaddr_container *ifac;
1181         struct in6_addr *in6;
1182         static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
1183
1184         TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid], ifa_link) {
1185                 struct ifaddr *ifa = ifac->ifa;
1186
1187                 if (ifa->ifa_addr->sa_family != AF_INET6)
1188                         continue;
1189
1190                 in6 = &ifatoia6(ifa)->ia_addr.sin6_addr;
1191                 nd6_na_output(sc->sc_carpdev, &mcast, in6,
1192                     ND_NA_FLAG_OVERRIDE, 1, NULL);
1193                 DELAY(1000);    /* XXX */
1194         }
1195 }
1196 #endif /* INET6 */
1197
1198 static __inline const struct carp_vhaddr *
1199 carp_find_addr(const struct carp_softc *sc, const struct in_addr *addr)
1200 {
1201         struct carp_vhaddr *vha;
1202
1203         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
1204                 if (vha->vha_iaback == NULL)
1205                         continue;
1206
1207                 if (vha->vha_ia->ia_addr.sin_addr.s_addr == addr->s_addr)
1208                         return vha;
1209         }
1210         return NULL;
1211 }
1212
1213 #ifdef notyet
1214 static int
1215 carp_iamatch_balance(const struct carp_if *cif, const struct in_addr *itaddr,
1216                      const struct in_addr *isaddr, uint8_t **enaddr)
1217 {
1218         const struct carp_softc *vh;
1219         int index, count = 0;
1220
1221         /*
1222          * XXX proof of concept implementation.
1223          * We use the source ip to decide which virtual host should
1224          * handle the request. If we're master of that virtual host,
1225          * then we respond, otherwise, just drop the arp packet on
1226          * the floor.
1227          */
1228
1229         TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
1230                 if (!CARP_IS_RUNNING(&vh->sc_if))
1231                         continue;
1232
1233                 if (carp_find_addr(vh, itaddr) != NULL)
1234                         count++;
1235         }
1236         if (count == 0)
1237                 return 0;
1238
1239         /* this should be a hash, like pf_hash() */
1240         index = ntohl(isaddr->s_addr) % count;
1241         count = 0;
1242
1243         TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
1244                 if (!CARP_IS_RUNNING(&vh->sc_if))
1245                         continue;
1246
1247                 if (carp_find_addr(vh, itaddr) == NULL)
1248                         continue;
1249
1250                 if (count == index) {
1251                         if (vh->sc_state == MASTER) {
1252                                 *enaddr = IF_LLADDR(&vh->sc_if);
1253                                 return 1;
1254                         } else {
1255                                 return 0;
1256                         }
1257                 }
1258                 count++;
1259         }
1260         return 0;
1261 }
1262 #endif
1263
1264 int
1265 carp_iamatch(const struct in_ifaddr *ia)
1266 {
1267         const struct carp_softc *sc = ia->ia_ifp->if_softc;
1268
1269         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
1270
1271 #ifdef notyet
1272         if (carp_opts[CARPCTL_ARPBALANCE])
1273                 return carp_iamatch_balance(cif, itaddr, isaddr, enaddr);
1274 #endif
1275
1276         if (!CARP_IS_RUNNING(&sc->sc_if) || sc->sc_state != MASTER)
1277                 return 0;
1278
1279         return 1;
1280 }
1281
1282 #ifdef INET6
1283 struct ifaddr *
1284 carp_iamatch6(void *v, struct in6_addr *taddr)
1285 {
1286         struct carp_if *cif = v;
1287         struct carp_softc *vh;
1288
1289         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
1290
1291         TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
1292                 struct ifaddr_container *ifac;
1293
1294                 TAILQ_FOREACH(ifac, &vh->sc_if.if_addrheads[mycpuid],
1295                               ifa_link) {
1296                         struct ifaddr *ifa = ifac->ifa;
1297
1298                         if (IN6_ARE_ADDR_EQUAL(taddr,
1299                             &ifatoia6(ifa)->ia_addr.sin6_addr) &&
1300                             CARP_IS_RUNNING(&vh->sc_if) &&
1301                             vh->sc_state == MASTER) {
1302                                 return (ifa);
1303                         }
1304                 }
1305         }
1306         return (NULL);
1307 }
1308
1309 void *
1310 carp_macmatch6(void *v, struct mbuf *m, const struct in6_addr *taddr)
1311 {
1312         struct m_tag *mtag;
1313         struct carp_if *cif = v;
1314         struct carp_softc *sc;
1315
1316         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
1317
1318         TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
1319                 struct ifaddr_container *ifac;
1320
1321                 TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid],
1322                               ifa_link) {
1323                         struct ifaddr *ifa = ifac->ifa;
1324
1325                         if (IN6_ARE_ADDR_EQUAL(taddr,
1326                             &ifatoia6(ifa)->ia_addr.sin6_addr) &&
1327                             CARP_IS_RUNNING(&sc->sc_if)) {
1328                                 struct ifnet *ifp = &sc->sc_if;
1329
1330                                 mtag = m_tag_get(PACKET_TAG_CARP,
1331                                     sizeof(struct ifnet *), MB_DONTWAIT);
1332                                 if (mtag == NULL) {
1333                                         /* better a bit than nothing */
1334                                         return (IF_LLADDR(ifp));
1335                                 }
1336                                 bcopy(&ifp, (caddr_t)(mtag + 1),
1337                                     sizeof(struct ifnet *));
1338                                 m_tag_prepend(m, mtag);
1339
1340                                 return (IF_LLADDR(ifp));
1341                         }
1342                 }
1343         }
1344         return (NULL);
1345 }
1346 #endif
1347
1348 static struct ifnet *
1349 carp_forus(struct carp_if *cif, const uint8_t *dhost)
1350 {
1351         struct carp_softc *sc;
1352
1353         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
1354
1355         if (memcmp(dhost, carp_etheraddr, ETHER_ADDR_LEN - 1) != 0)
1356                 return NULL;
1357
1358         TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
1359                 struct ifnet *ifp = &sc->sc_if;
1360
1361                 if (CARP_IS_RUNNING(ifp) && sc->sc_state == MASTER &&
1362                     !bcmp(dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN))
1363                         return ifp;
1364         }
1365         return NULL;
1366 }
1367
1368 static void
1369 carp_master_down_timeout(void *xsc)
1370 {
1371         struct carp_softc *sc = xsc;
1372
1373         CARP_DEBUG("%s: BACKUP -> MASTER (master timed out)\n",
1374                    sc->sc_if.if_xname);
1375         carp_gettok();
1376         carp_master_down(sc);
1377         carp_reltok();
1378 }
1379
1380 static void
1381 carp_master_down(struct carp_softc *sc)
1382 {
1383         switch (sc->sc_state) {
1384         case INIT:
1385                 kprintf("%s: master_down event in INIT state\n",
1386                         sc->sc_if.if_xname);
1387                 break;
1388
1389         case MASTER:
1390                 break;
1391
1392         case BACKUP:
1393                 carp_set_state(sc, MASTER);
1394                 carp_send_ad(sc);
1395                 carp_send_arp(sc);
1396 #ifdef INET6
1397                 carp_send_na(sc);
1398 #endif /* INET6 */
1399                 carp_setrun(sc, 0);
1400                 carp_setroute(sc, RTM_ADD);
1401                 break;
1402         }
1403 }
1404
1405 /*
1406  * When in backup state, af indicates whether to reset the master down timer
1407  * for v4 or v6. If it's set to zero, reset the ones which are already pending.
1408  */
1409 static void
1410 carp_setrun(struct carp_softc *sc, sa_family_t af)
1411 {
1412         struct ifnet *cifp = &sc->sc_if;
1413         struct timeval tv;
1414
1415         if (sc->sc_carpdev == NULL) {
1416                 carp_set_state(sc, INIT);
1417                 return;
1418         }
1419
1420         if ((cifp->if_flags & IFF_RUNNING) && sc->sc_vhid > 0 &&
1421             (sc->sc_naddrs || sc->sc_naddrs6)) {
1422                 /* Nothing */
1423         } else {
1424                 carp_setroute(sc, RTM_DELETE);
1425                 return;
1426         }
1427
1428         switch (sc->sc_state) {
1429         case INIT:
1430                 if (carp_opts[CARPCTL_PREEMPT] && !carp_suppress_preempt) {
1431                         carp_send_ad(sc);
1432                         carp_send_arp(sc);
1433 #ifdef INET6
1434                         carp_send_na(sc);
1435 #endif /* INET6 */
1436                         CARP_DEBUG("%s: INIT -> MASTER (preempting)\n",
1437                                    cifp->if_xname);
1438                         carp_set_state(sc, MASTER);
1439                         carp_setroute(sc, RTM_ADD);
1440                 } else {
1441                         CARP_DEBUG("%s: INIT -> BACKUP\n", cifp->if_xname);
1442                         carp_set_state(sc, BACKUP);
1443                         carp_setroute(sc, RTM_DELETE);
1444                         carp_setrun(sc, 0);
1445                 }
1446                 break;
1447
1448         case BACKUP:
1449                 callout_stop(&sc->sc_ad_tmo);
1450                 tv.tv_sec = 3 * sc->sc_advbase;
1451                 tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1452                 switch (af) {
1453 #ifdef INET
1454                 case AF_INET:
1455                         callout_reset(&sc->sc_md_tmo, tvtohz_high(&tv),
1456                             carp_master_down_timeout, sc);
1457                         break;
1458 #endif /* INET */
1459 #ifdef INET6
1460                 case AF_INET6:
1461                         callout_reset(&sc->sc_md6_tmo, tvtohz_high(&tv),
1462                             carp_master_down_timeout, sc);
1463                         break;
1464 #endif /* INET6 */
1465                 default:
1466                         if (sc->sc_naddrs)
1467                                 callout_reset(&sc->sc_md_tmo, tvtohz_high(&tv),
1468                                     carp_master_down_timeout, sc);
1469                         if (sc->sc_naddrs6)
1470                                 callout_reset(&sc->sc_md6_tmo, tvtohz_high(&tv),
1471                                     carp_master_down_timeout, sc);
1472                         break;
1473                 }
1474                 break;
1475
1476         case MASTER:
1477                 tv.tv_sec = sc->sc_advbase;
1478                 tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1479                 callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
1480                     carp_send_ad_timeout, sc);
1481                 break;
1482         }
1483 }
1484
1485 static void
1486 carp_multicast_cleanup(struct carp_softc *sc)
1487 {
1488         struct ip_moptions *imo = &sc->sc_imo;
1489
1490         if (imo->imo_num_memberships == 0)
1491                 return;
1492         KKASSERT(imo->imo_num_memberships == 1);
1493
1494         in_delmulti(imo->imo_membership[0]);
1495         imo->imo_membership[0] = NULL;
1496         imo->imo_num_memberships = 0;
1497         imo->imo_multicast_ifp = NULL;
1498 }
1499
1500 #ifdef INET6
1501 static void
1502 carp_multicast6_cleanup(struct carp_softc *sc)
1503 {
1504         struct ip6_moptions *im6o = &sc->sc_im6o;
1505
1506         while (!LIST_EMPTY(&im6o->im6o_memberships)) {
1507                 struct in6_multi_mship *imm =
1508                     LIST_FIRST(&im6o->im6o_memberships);
1509
1510                 LIST_REMOVE(imm, i6mm_chain);
1511                 in6_leavegroup(imm);
1512         }
1513         im6o->im6o_multicast_ifp = NULL;
1514 }
1515 #endif
1516
1517 static void
1518 carp_ioctl_getvhaddr_dispatch(netmsg_t msg)
1519 {
1520         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
1521         struct carp_softc *sc = cmsg->nc_softc;
1522         const struct carp_vhaddr *vha;
1523         struct ifcarpvhaddr *carpa, *carpa0;
1524         int count, len, error = 0;
1525
1526         carp_gettok();
1527
1528         count = 0;
1529         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
1530                 ++count;
1531
1532         if (cmsg->nc_datalen == 0) {
1533                 cmsg->nc_datalen = count * sizeof(*carpa);
1534                 goto back;
1535         } else if (count == 0 || cmsg->nc_datalen < sizeof(*carpa)) {
1536                 cmsg->nc_datalen = 0;
1537                 goto back;
1538         }
1539         len = min(cmsg->nc_datalen, sizeof(*carpa) * count);
1540         KKASSERT(len >= sizeof(*carpa));
1541
1542         carpa0 = carpa = kmalloc(len, M_TEMP, M_WAITOK | M_NULLOK | M_ZERO);
1543         if (carpa == NULL) {
1544                 error = ENOMEM; 
1545                 goto back;
1546         }
1547
1548         count = 0;
1549         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
1550                 if (len < sizeof(*carpa))
1551                         break;
1552
1553                 carpa->carpa_flags = vha->vha_flags;
1554                 carpa->carpa_addr.sin_family = AF_INET;
1555                 carpa->carpa_addr.sin_addr = vha->vha_ia->ia_addr.sin_addr;
1556
1557                 carpa->carpa_baddr.sin_family = AF_INET;
1558                 if (vha->vha_iaback == NULL) {
1559                         carpa->carpa_baddr.sin_addr.s_addr = INADDR_ANY;
1560                 } else {
1561                         carpa->carpa_baddr.sin_addr =
1562                         vha->vha_iaback->ia_addr.sin_addr;
1563                 }
1564
1565                 ++carpa;
1566                 ++count;
1567                 len -= sizeof(*carpa);
1568         }
1569         cmsg->nc_datalen = sizeof(*carpa) * count;
1570         KKASSERT(cmsg->nc_datalen > 0);
1571
1572         cmsg->nc_data = carpa0;
1573
1574 back:
1575         carp_reltok();
1576         lwkt_replymsg(&cmsg->base.lmsg, error);
1577 }
1578
1579 static int
1580 carp_ioctl_getvhaddr(struct carp_softc *sc, struct ifdrv *ifd)
1581 {
1582         struct ifnet *ifp = &sc->arpcom.ac_if;
1583         struct netmsg_carp cmsg;
1584         int error;
1585
1586         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1587         ifnet_deserialize_all(ifp);
1588
1589         bzero(&cmsg, sizeof(cmsg));
1590         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
1591             carp_ioctl_getvhaddr_dispatch);
1592         cmsg.nc_softc = sc;
1593         cmsg.nc_datalen = ifd->ifd_len;
1594
1595         error = lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
1596
1597         if (!error) {
1598                 if (cmsg.nc_data != NULL) {
1599                         error = copyout(cmsg.nc_data, ifd->ifd_data,
1600                             cmsg.nc_datalen);
1601                         kfree(cmsg.nc_data, M_TEMP);
1602                 }
1603                 ifd->ifd_len = cmsg.nc_datalen;
1604         } else {
1605                 KASSERT(cmsg.nc_data == NULL,
1606                     ("%s temp vhaddr is alloc upon error\n", __func__));
1607         }
1608
1609         ifnet_serialize_all(ifp);
1610         return error;
1611 }
1612
1613 static int
1614 carp_config_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha,
1615     struct in_ifaddr *ia_del)
1616 {
1617         struct ifnet *ifp;
1618         struct in_ifaddr *ia_if;
1619         struct in_ifaddr_container *iac;
1620         const struct sockaddr_in *sin;
1621         u_long iaddr;
1622         int own;
1623
1624         KKASSERT(vha->vha_ia != NULL);
1625
1626         sin = &vha->vha_ia->ia_addr;
1627         iaddr = ntohl(sin->sin_addr.s_addr);
1628
1629         ia_if = NULL;
1630         own = 0;
1631         TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) {
1632                 struct in_ifaddr *ia = iac->ia;
1633
1634                 if (ia == ia_del)
1635                         continue;
1636
1637                 if (ia->ia_ifp->if_type == IFT_CARP)
1638                         continue;
1639
1640                 if ((ia->ia_ifp->if_flags & IFF_UP) == 0)
1641                         continue;
1642
1643                 /* and, yeah, we need a multicast-capable iface too */
1644                 if ((ia->ia_ifp->if_flags & IFF_MULTICAST) == 0)
1645                         continue;
1646
1647                 if ((iaddr & ia->ia_subnetmask) == ia->ia_subnet) {
1648                         if (sin->sin_addr.s_addr ==
1649                             ia->ia_addr.sin_addr.s_addr)
1650                                 own = 1;
1651                         if (ia_if == NULL)
1652                                 ia_if = ia;
1653                         else if (sc->sc_carpdev != NULL &&
1654                                  sc->sc_carpdev == ia->ia_ifp)
1655                                 ia_if = ia;
1656                 }
1657         }
1658
1659         carp_deactivate_vhaddr(sc, vha, FALSE);
1660         if (!ia_if)
1661                 return ENOENT;
1662
1663         ifp = ia_if->ia_ifp;
1664
1665         /* XXX Don't allow parent iface to be changed */
1666         if (sc->sc_carpdev != NULL && sc->sc_carpdev != ifp)
1667                 return EEXIST;
1668
1669         return carp_activate_vhaddr(sc, vha, ifp, ia_if, own);
1670 }
1671
1672 static void
1673 carp_add_addr(struct carp_softc *sc, struct ifaddr *carp_ifa)
1674 {
1675         struct carp_vhaddr *vha_new;
1676         struct in_ifaddr *carp_ia;
1677 #ifdef INVARIANTS
1678         struct carp_vhaddr *vha;
1679 #endif
1680
1681         KKASSERT(carp_ifa->ifa_addr->sa_family == AF_INET);
1682         carp_ia = ifatoia(carp_ifa);
1683
1684 #ifdef INVARIANTS
1685         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
1686                 KKASSERT(vha->vha_ia != NULL && vha->vha_ia != carp_ia);
1687 #endif
1688
1689         vha_new = kmalloc(sizeof(*vha_new), M_CARP, M_WAITOK | M_ZERO);
1690         vha_new->vha_ia = carp_ia;
1691         carp_insert_vhaddr(sc, vha_new);
1692
1693         if (carp_config_vhaddr(sc, vha_new, NULL) != 0) {
1694                 /*
1695                  * If the above configuration fails, it may only mean
1696                  * that the new address is problematic.  However, the
1697                  * carp(4) interface may already have several working
1698                  * addresses.  Since the expected behaviour of
1699                  * SIOC[AS]IFADDR is to put the NIC into working state,
1700                  * we try starting the state machine manually here with
1701                  * the hope that the carp(4)'s previously working
1702                  * addresses still could be brought up.
1703                  */
1704                 carp_hmac_prepare(sc);
1705                 carp_set_state(sc, INIT);
1706                 carp_setrun(sc, 0);
1707         }
1708 }
1709
1710 static void
1711 carp_del_addr(struct carp_softc *sc, struct ifaddr *carp_ifa)
1712 {
1713         struct carp_vhaddr *vha;
1714         struct in_ifaddr *carp_ia;
1715
1716         KKASSERT(carp_ifa->ifa_addr->sa_family == AF_INET);
1717         carp_ia = ifatoia(carp_ifa);
1718
1719         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
1720                 KKASSERT(vha->vha_ia != NULL);
1721                 if (vha->vha_ia == carp_ia)
1722                         break;
1723         }
1724         KASSERT(vha != NULL, ("no corresponding vhaddr %p\n", carp_ifa));
1725
1726         /*
1727          * Remove the vhaddr from the list before deactivating
1728          * the vhaddr, so that the HMAC could be correctly
1729          * updated in carp_deactivate_vhaddr()
1730          */
1731         carp_remove_vhaddr(sc, vha);
1732
1733         carp_deactivate_vhaddr(sc, vha, FALSE);
1734         kfree(vha, M_CARP);
1735 }
1736
1737 static void
1738 carp_config_addr(struct carp_softc *sc, struct ifaddr *carp_ifa)
1739 {
1740         struct carp_vhaddr *vha;
1741         struct in_ifaddr *carp_ia;
1742
1743         KKASSERT(carp_ifa->ifa_addr->sa_family == AF_INET);
1744         carp_ia = ifatoia(carp_ifa);
1745
1746         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
1747                 KKASSERT(vha->vha_ia != NULL);
1748                 if (vha->vha_ia == carp_ia)
1749                         break;
1750         }
1751         KASSERT(vha != NULL, ("no corresponding vhaddr %p\n", carp_ifa));
1752
1753         /* Remove then reinsert, to keep the vhaddr list sorted */
1754         carp_remove_vhaddr(sc, vha);
1755         carp_insert_vhaddr(sc, vha);
1756
1757         if (carp_config_vhaddr(sc, vha, NULL) != 0) {
1758                 /* See the comment in carp_add_addr() */
1759                 carp_hmac_prepare(sc);
1760                 carp_set_state(sc, INIT);
1761                 carp_setrun(sc, 0);
1762         }
1763 }
1764
1765 #ifdef notyet
1766
1767 #ifdef INET6
1768 static int
1769 carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
1770 {
1771         struct ifnet *ifp;
1772         struct carp_if *cif;
1773         struct in6_ifaddr *ia, *ia_if;
1774         struct ip6_moptions *im6o = &sc->sc_im6o;
1775         struct in6_multi_mship *imm;
1776         struct in6_addr in6;
1777         int own, error;
1778
1779         if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
1780                 carp_setrun(sc, 0);
1781                 return (0);
1782         }
1783
1784         /* we have to do it by hands to check we won't match on us */
1785         ia_if = NULL; own = 0;
1786         for (ia = in6_ifaddr; ia; ia = ia->ia_next) {
1787                 int i;
1788
1789                 for (i = 0; i < 4; i++) {
1790                         if ((sin6->sin6_addr.s6_addr32[i] &
1791                             ia->ia_prefixmask.sin6_addr.s6_addr32[i]) !=
1792                             (ia->ia_addr.sin6_addr.s6_addr32[i] &
1793                             ia->ia_prefixmask.sin6_addr.s6_addr32[i]))
1794                                 break;
1795                 }
1796                 /* and, yeah, we need a multicast-capable iface too */
1797                 if (ia->ia_ifp != &sc->sc_if &&
1798                     (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
1799                     (i == 4)) {
1800                         if (!ia_if)
1801                                 ia_if = ia;
1802                         if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
1803                             &ia->ia_addr.sin6_addr))
1804                                 own++;
1805                 }
1806         }
1807
1808         if (!ia_if)
1809                 return (EADDRNOTAVAIL);
1810         ia = ia_if;
1811         ifp = ia->ia_ifp;
1812
1813         if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
1814             (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp))
1815                 return (EADDRNOTAVAIL);
1816
1817         if (!sc->sc_naddrs6) {
1818                 im6o->im6o_multicast_ifp = ifp;
1819
1820                 /* join CARP multicast address */
1821                 bzero(&in6, sizeof(in6));
1822                 in6.s6_addr16[0] = htons(0xff02);
1823                 in6.s6_addr8[15] = 0x12;
1824                 if (in6_setscope(&in6, ifp, NULL) != 0)
1825                         goto cleanup;
1826                 if ((imm = in6_joingroup(ifp, &in6, &error)) == NULL)
1827                         goto cleanup;
1828                 LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
1829
1830                 /* join solicited multicast address */
1831                 bzero(&in6, sizeof(in6));
1832                 in6.s6_addr16[0] = htons(0xff02);
1833                 in6.s6_addr32[1] = 0;
1834                 in6.s6_addr32[2] = htonl(1);
1835                 in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3];
1836                 in6.s6_addr8[12] = 0xff;
1837                 if (in6_setscope(&in6, ifp, NULL) != 0)
1838                         goto cleanup;
1839                 if ((imm = in6_joingroup(ifp, &in6, &error)) == NULL)
1840                         goto cleanup;
1841                 LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
1842         }
1843
1844         if (!ifp->if_carp) {
1845                 cif = kmalloc(sizeof(*cif), M_CARP, M_WAITOK | M_ZERO);
1846
1847                 if ((error = ifpromisc(ifp, 1))) {
1848                         kfree(cif, M_CARP);
1849                         goto cleanup;
1850                 }
1851
1852                 TAILQ_INIT(&cif->vhif_vrs);
1853                 ifp->if_carp = cif;
1854         } else {
1855                 struct carp_softc *vr;
1856
1857                 cif = ifp->if_carp;
1858                 TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
1859                         if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
1860                                 error = EINVAL;
1861                                 goto cleanup;
1862                         }
1863                 }
1864         }
1865         sc->sc_ia6 = ia;
1866         sc->sc_carpdev = ifp;
1867
1868         { /* XXX prevent endless loop if already in queue */
1869         struct carp_softc *vr, *after = NULL;
1870         int myself = 0;
1871         cif = ifp->if_carp;
1872
1873         TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
1874                 if (vr == sc)
1875                         myself = 1;
1876                 if (vr->sc_vhid < sc->sc_vhid)
1877                         after = vr;
1878         }
1879
1880         if (!myself) {
1881                 /* We're trying to keep things in order */
1882                 if (after == NULL)
1883                         TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
1884                 else
1885                         TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
1886         }
1887         }
1888
1889         sc->sc_naddrs6++;
1890         if (own)
1891                 sc->sc_advskew = 0;
1892         carp_sc_state(sc);
1893         carp_setrun(sc, 0);
1894
1895         return (0);
1896
1897 cleanup:
1898         /* clean up multicast memberships */
1899         if (!sc->sc_naddrs6) {
1900                 while (!LIST_EMPTY(&im6o->im6o_memberships)) {
1901                         imm = LIST_FIRST(&im6o->im6o_memberships);
1902                         LIST_REMOVE(imm, i6mm_chain);
1903                         in6_leavegroup(imm);
1904                 }
1905         }
1906         return (error);
1907 }
1908
1909 static int
1910 carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
1911 {
1912         int error = 0;
1913
1914         if (!--sc->sc_naddrs6) {
1915                 struct carp_if *cif = sc->sc_carpdev->if_carp;
1916                 struct ip6_moptions *im6o = &sc->sc_im6o;
1917
1918                 callout_stop(&sc->sc_ad_tmo);
1919                 sc->sc_vhid = -1;
1920                 while (!LIST_EMPTY(&im6o->im6o_memberships)) {
1921                         struct in6_multi_mship *imm =
1922                             LIST_FIRST(&im6o->im6o_memberships);
1923
1924                         LIST_REMOVE(imm, i6mm_chain);
1925                         in6_leavegroup(imm);
1926                 }
1927                 im6o->im6o_multicast_ifp = NULL;
1928                 TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
1929                 if (TAILQ_EMPTY(&cif->vhif_vrs)) {
1930                         sc->sc_carpdev->if_carp = NULL;
1931                         kfree(cif, M_IFADDR);
1932                 }
1933         }
1934         return (error);
1935 }
1936 #endif /* INET6 */
1937
1938 #endif
1939
1940 static int
1941 carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr, struct ucred *cr)
1942 {
1943         struct carp_softc *sc = ifp->if_softc;
1944         struct ifreq *ifr = (struct ifreq *)addr;
1945         struct ifdrv *ifd = (struct ifdrv *)addr;
1946         int error = 0;
1947
1948         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1949
1950         carp_gettok();
1951
1952         switch (cmd) {
1953         case SIOCSIFFLAGS:
1954                 if (ifp->if_flags & IFF_UP) {
1955                         if ((ifp->if_flags & IFF_RUNNING) == 0)
1956                                 carp_init(sc);
1957                 } else if (ifp->if_flags & IFF_RUNNING) {
1958                         carp_ioctl_stop(sc);
1959                 }
1960                 break;
1961
1962         case SIOCSVH:
1963                 error = carp_ioctl_setvh(sc, ifr->ifr_data, cr);
1964                 break;
1965
1966         case SIOCGVH:
1967                 error = carp_ioctl_getvh(sc, ifr->ifr_data, cr);
1968                 break;
1969
1970         case SIOCGDRVSPEC:
1971                 switch (ifd->ifd_cmd) {
1972                 case CARPGDEVNAME:
1973                         error = carp_ioctl_getdevname(sc, ifd);
1974                         break;
1975
1976                 case CARPGVHADDR:
1977                         error = carp_ioctl_getvhaddr(sc, ifd);
1978                         break;
1979
1980                 default:
1981                         error = EINVAL;
1982                         break;
1983                 }
1984                 break;
1985
1986         default:
1987                 error = ether_ioctl(ifp, cmd, addr);
1988                 break;
1989         }
1990
1991         carp_reltok();
1992         return error;
1993 }
1994
1995 static void
1996 carp_ioctl_stop_dispatch(netmsg_t msg)
1997 {
1998         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
1999         struct carp_softc *sc = cmsg->nc_softc;
2000
2001         carp_gettok();
2002         carp_stop(sc, 0);
2003         carp_reltok();
2004
2005         lwkt_replymsg(&cmsg->base.lmsg, 0);
2006 }
2007
2008 static void
2009 carp_ioctl_stop(struct carp_softc *sc)
2010 {
2011         struct ifnet *ifp = &sc->arpcom.ac_if;
2012         struct netmsg_carp cmsg;
2013
2014         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2015
2016         ifnet_deserialize_all(ifp);
2017
2018         bzero(&cmsg, sizeof(cmsg));
2019         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
2020             carp_ioctl_stop_dispatch);
2021         cmsg.nc_softc = sc;
2022
2023         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
2024
2025         ifnet_serialize_all(ifp);
2026 }
2027
2028 static void
2029 carp_ioctl_setvh_dispatch(netmsg_t msg)
2030 {
2031         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
2032         struct carp_softc *sc = cmsg->nc_softc, *vr;
2033         struct ifnet *ifp = &sc->arpcom.ac_if;
2034         const struct carpreq *carpr = cmsg->nc_data;
2035         int error;
2036
2037         carp_gettok();
2038
2039         error = 1;
2040         if ((ifp->if_flags & IFF_RUNNING) &&
2041             sc->sc_state != INIT && carpr->carpr_state != sc->sc_state) {
2042                 switch (carpr->carpr_state) {
2043                 case BACKUP:
2044                         callout_stop(&sc->sc_ad_tmo);
2045                         carp_set_state(sc, BACKUP);
2046                         carp_setrun(sc, 0);
2047                         carp_setroute(sc, RTM_DELETE);
2048                         break;
2049
2050                 case MASTER:
2051                         carp_master_down(sc);
2052                         break;
2053
2054                 default:
2055                         break;
2056                 }
2057         }
2058         if (carpr->carpr_vhid > 0) {
2059                 if (carpr->carpr_vhid > 255) {
2060                         error = EINVAL;
2061                         goto back;
2062                 }
2063                 if (sc->sc_carpdev) {
2064                         struct carp_if *cif = sc->sc_carpdev->if_carp;
2065
2066                         TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
2067                                 if (vr != sc &&
2068                                     vr->sc_vhid == carpr->carpr_vhid) {
2069                                         error = EEXIST;
2070                                         goto back;
2071                                 }
2072                         }
2073                 }
2074                 sc->sc_vhid = carpr->carpr_vhid;
2075
2076                 IF_LLADDR(ifp)[5] = sc->sc_vhid;
2077                 bcopy(IF_LLADDR(ifp), sc->arpcom.ac_enaddr,
2078                     ETHER_ADDR_LEN);
2079
2080                 error--;
2081         }
2082         if (carpr->carpr_advbase > 0 || carpr->carpr_advskew > 0) {
2083                 if (carpr->carpr_advskew >= 255) {
2084                         error = EINVAL;
2085                         goto back;
2086                 }
2087                 if (carpr->carpr_advbase > 255) {
2088                         error = EINVAL;
2089                         goto back;
2090                 }
2091                 sc->sc_advbase = carpr->carpr_advbase;
2092                 sc->sc_advskew = carpr->carpr_advskew;
2093                 error--;
2094         }
2095         bcopy(carpr->carpr_key, sc->sc_key, sizeof(sc->sc_key));
2096         if (error > 0) {
2097                 error = EINVAL;
2098         } else {
2099                 error = 0;
2100                 carp_setrun(sc, 0);
2101         }
2102 back:
2103         carp_hmac_prepare(sc);
2104         carp_gettok();
2105
2106         lwkt_replymsg(&cmsg->base.lmsg, error);
2107 }
2108
2109 static int
2110 carp_ioctl_setvh(struct carp_softc *sc, void *udata, struct ucred *cr)
2111 {
2112         struct ifnet *ifp = &sc->arpcom.ac_if;
2113         struct netmsg_carp cmsg;
2114         struct carpreq carpr;
2115         int error;
2116
2117         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2118         ifnet_deserialize_all(ifp);
2119
2120         error = priv_check_cred(cr, PRIV_ROOT, NULL_CRED_OKAY);
2121         if (error)
2122                 goto back;
2123
2124         error = copyin(udata, &carpr, sizeof(carpr));
2125         if (error)
2126                 goto back;
2127
2128         bzero(&cmsg, sizeof(cmsg));
2129         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
2130             carp_ioctl_setvh_dispatch);
2131         cmsg.nc_softc = sc;
2132         cmsg.nc_data = &carpr;
2133
2134         error = lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
2135
2136 back:
2137         ifnet_serialize_all(ifp);
2138         return error;
2139 }
2140
2141 static void
2142 carp_ioctl_getvh_dispatch(netmsg_t msg)
2143 {
2144         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
2145         struct carp_softc *sc = cmsg->nc_softc;
2146         struct carpreq *carpr = cmsg->nc_data;
2147
2148         carp_gettok();
2149
2150         carpr->carpr_state = sc->sc_state;
2151         carpr->carpr_vhid = sc->sc_vhid;
2152         carpr->carpr_advbase = sc->sc_advbase;
2153         carpr->carpr_advskew = sc->sc_advskew;
2154         bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key));
2155
2156         carp_reltok();
2157
2158         lwkt_replymsg(&cmsg->base.lmsg, 0);
2159 }
2160
2161 static int
2162 carp_ioctl_getvh(struct carp_softc *sc, void *udata, struct ucred *cr)
2163 {
2164         struct ifnet *ifp = &sc->arpcom.ac_if;
2165         struct netmsg_carp cmsg;
2166         struct carpreq carpr;
2167         int error;
2168
2169         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2170         ifnet_deserialize_all(ifp);
2171
2172         bzero(&cmsg, sizeof(cmsg));
2173         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
2174             carp_ioctl_getvh_dispatch);
2175         cmsg.nc_softc = sc;
2176         cmsg.nc_data = &carpr;
2177
2178         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
2179
2180         error = priv_check_cred(cr, PRIV_ROOT, NULL_CRED_OKAY);
2181         if (error)
2182                 bzero(carpr.carpr_key, sizeof(carpr.carpr_key));
2183
2184         error = copyout(&carpr, udata, sizeof(carpr));
2185
2186         ifnet_serialize_all(ifp);
2187         return error;
2188 }
2189
2190 static void
2191 carp_ioctl_getdevname_dispatch(netmsg_t msg)
2192 {
2193         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
2194         struct carp_softc *sc = cmsg->nc_softc;
2195         char *devname = cmsg->nc_data;
2196
2197         bzero(devname, sizeof(devname));
2198
2199         carp_gettok();
2200         if (sc->sc_carpdev != NULL)
2201                 strlcpy(devname, sc->sc_carpdev->if_xname, sizeof(devname));
2202         carp_reltok();
2203
2204         lwkt_replymsg(&cmsg->base.lmsg, 0);
2205 }
2206
2207 static int
2208 carp_ioctl_getdevname(struct carp_softc *sc, struct ifdrv *ifd)
2209 {
2210         struct ifnet *ifp = &sc->arpcom.ac_if;
2211         struct netmsg_carp cmsg;
2212         char devname[IFNAMSIZ];
2213         int error;
2214
2215         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2216
2217         if (ifd->ifd_len != sizeof(devname))
2218                 return EINVAL;
2219
2220         ifnet_deserialize_all(ifp);
2221
2222         bzero(&cmsg, sizeof(cmsg));
2223         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
2224             carp_ioctl_getdevname_dispatch);
2225         cmsg.nc_softc = sc;
2226         cmsg.nc_data = devname;
2227
2228         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
2229
2230         error = copyout(devname, ifd->ifd_data, sizeof(devname));
2231
2232         ifnet_serialize_all(ifp);
2233         return error;
2234 }
2235
2236 static void
2237 carp_init_dispatch(netmsg_t msg)
2238 {
2239         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
2240         struct carp_softc *sc = cmsg->nc_softc;
2241
2242         carp_gettok();
2243
2244         sc->sc_if.if_flags |= IFF_RUNNING;
2245         carp_hmac_prepare(sc);
2246         carp_set_state(sc, INIT);
2247         carp_setrun(sc, 0);
2248
2249         carp_reltok();
2250
2251         lwkt_replymsg(&cmsg->base.lmsg, 0);
2252 }
2253
2254 static void
2255 carp_init(void *xsc)
2256 {
2257         struct carp_softc *sc = xsc;
2258         struct ifnet *ifp = &sc->arpcom.ac_if;
2259         struct netmsg_carp cmsg;
2260
2261         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2262
2263         ifnet_deserialize_all(ifp);
2264
2265         bzero(&cmsg, sizeof(cmsg));
2266         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
2267             carp_init_dispatch);
2268         cmsg.nc_softc = sc;
2269
2270         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
2271
2272         ifnet_serialize_all(ifp);
2273 }
2274
2275 static int
2276 carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
2277     struct rtentry *rt)
2278 {
2279         struct carp_softc *sc = ifp->if_softc;
2280         int error = 0;
2281
2282         carp_gettok();
2283         if (sc->sc_carpdev) {
2284                 /*
2285                  * NOTE:
2286                  * CARP's ifp is passed to backing device's
2287                  * if_output method.
2288                  */
2289                 sc->sc_carpdev->if_output(ifp, m, dst, rt);
2290         } else {
2291                 m_freem(m);
2292                 error = ENETUNREACH;
2293         }
2294         carp_reltok();
2295
2296         return error;
2297 }
2298
/*
 * Start output on carp interface.  This function should never be
 * called; it panics with the interface name if it ever is.
 */
static void
carp_start(struct ifnet *ifp)
{
        panic("%s: start called\n", ifp->if_xname);
}
2307
/*
 * Serialize hook of the carp interface.  Intentionally empty; both
 * arguments are ignored.
 */
static void
carp_serialize(struct ifnet *ifp __unused,
    enum ifnet_serialize slz __unused)
{
}
2313
/*
 * Deserialize hook of the carp interface.  Intentionally empty; both
 * arguments are ignored.
 */
static void
carp_deserialize(struct ifnet *ifp __unused,
    enum ifnet_serialize slz __unused)
{
}
2319
/*
 * Try-serialize hook of the carp interface.  Both arguments are
 * ignored and 1 is always returned.
 */
static int
carp_tryserialize(struct ifnet *ifp __unused,
    enum ifnet_serialize slz __unused)
{
        return 1;
}
2326
2327 #ifdef INVARIANTS
2328
/*
 * Serializer assertion hook of the carp interface (INVARIANTS
 * kernels only).  Intentionally empty; all arguments are ignored.
 */
static void
carp_serialize_assert(struct ifnet *ifp __unused,
    enum ifnet_serialize slz __unused, boolean_t serialized __unused)
{
}
2334
2335 #endif  /* INVARIANTS */
2336
2337 static void
2338 carp_set_state(struct carp_softc *sc, int state)
2339 {
2340         struct ifnet *cifp = &sc->sc_if;
2341
2342         if (sc->sc_state == state)
2343                 return;
2344         sc->sc_state = state;
2345
2346         switch (sc->sc_state) {
2347         case BACKUP:
2348                 cifp->if_link_state = LINK_STATE_DOWN;
2349                 break;
2350
2351         case MASTER:
2352                 cifp->if_link_state = LINK_STATE_UP;
2353                 break;
2354
2355         default:
2356                 cifp->if_link_state = LINK_STATE_UNKNOWN;
2357                 break;
2358         }
2359         rt_ifmsg(cifp);
2360 }
2361
2362 void
2363 carp_group_demote_adj(struct ifnet *ifp, int adj)
2364 {
2365         struct ifg_list *ifgl;
2366         int *dm;
2367
2368         carp_gettok();
2369
2370         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
2371                 if (!strcmp(ifgl->ifgl_group->ifg_group, IFG_ALL))
2372                         continue;
2373                 dm = &ifgl->ifgl_group->ifg_carp_demoted;
2374
2375                 if (*dm + adj >= 0)
2376                         *dm += adj;
2377                 else
2378                         *dm = 0;
2379
2380                 if (adj > 0 && *dm == 1)
2381                         carp_send_ad_all();
2382                 CARP_LOG("%s demoted group %s to %d", ifp->if_xname,
2383                     ifgl->ifgl_group->ifg_group, *dm);
2384         }
2385
2386         carp_reltok();
2387 }
2388
2389 void
2390 carp_carpdev_state(void *v)
2391 {
2392         struct carp_if *cif = v;
2393         struct carp_softc *sc;
2394
2395         carp_gettok();
2396
2397         TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list)
2398                 carp_sc_state(sc);
2399
2400         carp_reltok();
2401 }
2402
2403 static void
2404 carp_sc_state(struct carp_softc *sc)
2405 {
2406         if (!(sc->sc_carpdev->if_flags & IFF_UP)) {
2407                 callout_stop(&sc->sc_ad_tmo);
2408                 callout_stop(&sc->sc_md_tmo);
2409                 callout_stop(&sc->sc_md6_tmo);
2410                 carp_set_state(sc, INIT);
2411                 carp_setrun(sc, 0);
2412                 if (!sc->sc_suppress) {
2413                         carp_suppress_preempt++;
2414                         if (carp_suppress_preempt == 1)
2415                                 carp_send_ad_all();
2416                 }
2417                 sc->sc_suppress = 1;
2418         } else {
2419                 carp_set_state(sc, INIT);
2420                 carp_setrun(sc, 0);
2421                 if (sc->sc_suppress)
2422                         carp_suppress_preempt--;
2423                 sc->sc_suppress = 0;
2424         }
2425 }
2426
2427 static void
2428 carp_stop(struct carp_softc *sc, int detach)
2429 {
2430         sc->sc_if.if_flags &= ~IFF_RUNNING;
2431
2432         callout_stop(&sc->sc_ad_tmo);
2433         callout_stop(&sc->sc_md_tmo);
2434         callout_stop(&sc->sc_md6_tmo);
2435
2436         if (!detach && sc->sc_state == MASTER)
2437                 carp_send_ad(sc);
2438
2439         if (sc->sc_suppress)
2440                 carp_suppress_preempt--;
2441         sc->sc_suppress = 0;
2442
2443         if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS)
2444                 carp_suppress_preempt--;
2445         sc->sc_sendad_errors = 0;
2446         sc->sc_sendad_success = 0;
2447
2448         carp_set_state(sc, INIT);
2449         carp_setrun(sc, 0);
2450 }
2451
2452 static void
2453 carp_suspend(struct carp_softc *sc, int detach)
2454 {
2455         struct ifnet *cifp = &sc->sc_if;
2456
2457         carp_stop(sc, detach);
2458
2459         /* Retain the running state, if we are not dead yet */
2460         if (!sc->sc_dead && (cifp->if_flags & IFF_UP))
2461                 cifp->if_flags |= IFF_RUNNING;
2462 }
2463
/*
 * Activate the inactive vhaddr 'vha' on backing interface 'ifp',
 * using 'ia_if' as its backing address.
 *
 * The softc is hooked onto ifp's carp list (allocating the carp_if
 * and putting the interface into promiscuous mode for the first
 * softc), the route for the vhaddr is added, the CARP multicast
 * group is joined if no membership exists yet and the state machine
 * is restarted from INIT.  If 'own' is set, the vhaddr is flagged as
 * address owner and sc_advskew is reset to 0.
 *
 * Returns 0 on success, the ifpromisc() error, EINVAL if another
 * softc on 'ifp' already uses the same vhid, or ENOBUFS if joining
 * the multicast group fails (the vhaddr is deactivated again then).
 */
static int
carp_activate_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha,
    struct ifnet *ifp, struct in_ifaddr *ia_if, int own)
{
        struct ip_moptions *imo = &sc->sc_imo;
        struct carp_if *cif;
        struct carp_softc *vr, *after = NULL;
        int onlist, error;
#ifdef INVARIANTS
        int assert_onlist;
#endif

        KKASSERT(vha->vha_ia != NULL);

        KASSERT(ia_if != NULL, ("NULL backing address\n"));
        KASSERT(vha->vha_iaback == NULL, ("%p is already activated\n", vha));
        KASSERT((vha->vha_flags & CARP_VHAF_OWNER) == 0,
                ("inactive vhaddr %p is the address owner\n", vha));

        KASSERT(sc->sc_carpdev == NULL || sc->sc_carpdev == ifp,
                ("%s is already on %s\n", sc->sc_if.if_xname,
                 sc->sc_carpdev->if_xname));

        if (!ifp->if_carp) {
                /* First carp softc on this backing interface */
                KASSERT(sc->sc_carpdev == NULL,
                        ("%s is already on %s\n", sc->sc_if.if_xname,
                         sc->sc_carpdev->if_xname));

                cif = kmalloc(sizeof(*cif), M_CARP, M_WAITOK | M_ZERO);

                error = ifpromisc(ifp, 1);
                if (error) {
                        kfree(cif, M_CARP);
                        return error;
                }

                TAILQ_INIT(&cif->vhif_vrs);
                ifp->if_carp = cif;
        } else {
                /* The vhid must be unique on the backing interface */
                cif = ifp->if_carp;
                TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
                        if (vr != sc && vr->sc_vhid == sc->sc_vhid)
                                return EINVAL;
                }
        }

#ifdef INVARIANTS
        /*
         * A softc that already has a backing interface is expected
         * to be on that interface's carp list, and vice versa.
         */
        if (sc->sc_carpdev != NULL)
                assert_onlist = 1;
        else
                assert_onlist = 0;
#endif
        sc->sc_ia = ia_if;
        sc->sc_carpdev = ifp;

        /*
         * Check whether the softc is already on the list and remember
         * the last entry with a smaller vhid as insertion point.
         */
        cif = ifp->if_carp;
        onlist = 0;
        TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
                if (vr == sc)
                        onlist = 1;
                if (vr->sc_vhid < sc->sc_vhid)
                        after = vr;
        }

#ifdef INVARIANTS
        if (assert_onlist) {
                KASSERT(onlist, ("%s is not on %s carp list\n",
                        sc->sc_if.if_xname, ifp->if_xname));
        } else {
                KASSERT(!onlist, ("%s is already on %s carp list\n",
                        sc->sc_if.if_xname, ifp->if_xname));
        }
#endif

        if (!onlist) {
                /* We're trying to keep things in order */
                if (after == NULL)
                        TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
                else
                        TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
        }

        /* From here on the vhaddr counts as active */
        vha->vha_iaback = ia_if;
        sc->sc_naddrs++;

        if (own) {
                vha->vha_flags |= CARP_VHAF_OWNER;

                /* XXX save user configured advskew? */
                sc->sc_advskew = 0;
        }

        carp_addroute_vhaddr(sc, vha);

        /*
         * Join the multicast group only after the backing interface
         * has been hooked with the CARP interface.
         */
        KASSERT(imo->imo_multicast_ifp == NULL ||
                imo->imo_multicast_ifp == &sc->sc_if,
                ("%s didn't leave mcast group on %s\n",
                 sc->sc_if.if_xname, imo->imo_multicast_ifp->if_xname));

        if (imo->imo_num_memberships == 0) {
                struct in_addr addr;

                addr.s_addr = htonl(INADDR_CARP_GROUP);
                imo->imo_membership[0] = in_addmulti(&addr, &sc->sc_if);
                if (imo->imo_membership[0] == NULL) {
                        /* Undo the activation done above */
                        carp_deactivate_vhaddr(sc, vha, FALSE);
                        return ENOBUFS;
                }

                imo->imo_num_memberships++;
                imo->imo_multicast_ifp = &sc->sc_if;
                imo->imo_multicast_ttl = CARP_DFLTTL;
                imo->imo_multicast_loop = 0;
        }

        /* Restart the state machine with the new address set */
        carp_hmac_prepare(sc);
        carp_set_state(sc, INIT);
        carp_setrun(sc, 0);
        return 0;
}
2588
2589 static void
2590 carp_deactivate_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha,
2591     boolean_t del_iaback)
2592 {
2593         KKASSERT(vha->vha_ia != NULL);
2594
2595         carp_hmac_prepare(sc);
2596
2597         if (vha->vha_iaback == NULL) {
2598                 KASSERT((vha->vha_flags & CARP_VHAF_OWNER) == 0,
2599                         ("inactive vhaddr %p is the address owner\n", vha));
2600                 return;
2601         }
2602
2603         vha->vha_flags &= ~CARP_VHAF_OWNER;
2604         carp_delroute_vhaddr(sc, vha, del_iaback);
2605
2606         KKASSERT(sc->sc_naddrs > 0);
2607         vha->vha_iaback = NULL;
2608         sc->sc_naddrs--;
2609         if (!sc->sc_naddrs) {
2610                 if (sc->sc_naddrs6) {
2611                         carp_multicast_cleanup(sc);
2612                         sc->sc_ia = NULL;
2613                 } else {
2614                         carp_detach(sc, 0, del_iaback);
2615                 }
2616         }
2617 }
2618
2619 static void
2620 carp_link_addrs(struct carp_softc *sc, struct ifnet *ifp, struct ifaddr *ifa_if)
2621 {
2622         struct carp_vhaddr *vha;
2623         struct in_ifaddr *ia_if;
2624
2625         KKASSERT(ifa_if->ifa_addr->sa_family == AF_INET);
2626         ia_if = ifatoia(ifa_if);
2627
2628         /*
2629          * Test each inactive vhaddr against the newly added address.
2630          * If the newly added address could be the backing address,
2631          * then activate the matching vhaddr.
2632          */
2633         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
2634                 const struct in_ifaddr *ia;
2635                 u_long iaddr;
2636                 int own;
2637
2638                 if (vha->vha_iaback != NULL)
2639                         continue;
2640
2641                 ia = vha->vha_ia;
2642                 iaddr = ntohl(ia->ia_addr.sin_addr.s_addr);
2643
2644                 if ((iaddr & ia_if->ia_subnetmask) != ia_if->ia_subnet)
2645                         continue;
2646
2647                 own = 0;
2648                 if (ia->ia_addr.sin_addr.s_addr ==
2649                     ia_if->ia_addr.sin_addr.s_addr)
2650                         own = 1;
2651
2652                 carp_activate_vhaddr(sc, vha, ifp, ia_if, own);
2653         }
2654 }
2655
2656 static void
2657 carp_unlink_addrs(struct carp_softc *sc, struct ifnet *ifp,
2658                   struct ifaddr *ifa_if)
2659 {
2660         struct carp_vhaddr *vha;
2661         struct in_ifaddr *ia_if;
2662
2663         KKASSERT(ifa_if->ifa_addr->sa_family == AF_INET);
2664         ia_if = ifatoia(ifa_if);
2665
2666         /*
2667          * Ad src address is deleted; set it to NULL.
2668          * Following loop will try pick up a new ad src address
2669          * if one of the vhaddr could retain its backing address.
2670          */
2671         if (sc->sc_ia == ia_if)
2672                 sc->sc_ia = NULL;
2673
2674         /*
2675          * Test each active vhaddr against the deleted address.
2676          * If the deleted address is vhaddr address's backing
2677          * address, then deactivate the vhaddr.
2678          */
2679         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
2680                 if (vha->vha_iaback == NULL)
2681                         continue;
2682
2683                 if (vha->vha_iaback == ia_if)
2684                         carp_deactivate_vhaddr(sc, vha, TRUE);
2685                 else if (sc->sc_ia == NULL)
2686                         sc->sc_ia = vha->vha_iaback;
2687         }
2688 }
2689
2690 static void
2691 carp_update_addrs(struct carp_softc *sc, struct ifaddr *ifa_del)
2692 {
2693         struct carp_vhaddr *vha;
2694
2695         KKASSERT(sc->sc_carpdev == NULL);
2696
2697         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
2698                 carp_config_vhaddr(sc, vha, ifatoia(ifa_del));
2699 }
2700
2701 static void
2702 carp_ifaddr(void *arg __unused, struct ifnet *ifp,
2703             enum ifaddr_event event, struct ifaddr *ifa)
2704 {
2705         struct carp_softc *sc;
2706
2707         carp_gettok();
2708
2709         if (ifa->ifa_addr->sa_family != AF_INET)
2710                 goto back;
2711
2712         KASSERT(&curthread->td_msgport == cpu_portfn(0),
2713             ("not in netisr0"));
2714
2715         if (ifp->if_type == IFT_CARP) {
2716                 /*
2717                  * Address is changed on carp(4) interface
2718                  */
2719                 switch (event) {
2720                 case IFADDR_EVENT_ADD:
2721                         carp_add_addr(ifp->if_softc, ifa);
2722                         break;
2723
2724                 case IFADDR_EVENT_CHANGE:
2725                         carp_config_addr(ifp->if_softc, ifa);
2726                         break;
2727
2728                 case IFADDR_EVENT_DELETE:
2729                         carp_del_addr(ifp->if_softc, ifa);
2730                         break;
2731                 }
2732                 goto back;
2733         }
2734
2735         /*
2736          * Address is changed on non-carp(4) interface
2737          */
2738         if ((ifp->if_flags & IFF_MULTICAST) == 0)
2739                 goto back;
2740
2741         LIST_FOREACH(sc, &carpif_list, sc_next) {
2742                 if (sc->sc_carpdev != NULL && sc->sc_carpdev != ifp) {
2743                         /* Not the parent iface; skip */
2744                         continue;
2745                 }
2746
2747                 switch (event) {
2748                 case IFADDR_EVENT_ADD:
2749                         carp_link_addrs(sc, ifp, ifa);
2750                         break;
2751
2752                 case IFADDR_EVENT_DELETE:
2753                         if (sc->sc_carpdev != NULL) {
2754                                 carp_unlink_addrs(sc, ifp, ifa);
2755                                 if (sc->sc_carpdev == NULL) {
2756                                         /*
2757                                          * We no longer have the parent
2758                                          * interface, however, certain
2759                                          * virtual addresses, which are
2760                                          * not used because they can't
2761                                          * match the previous parent
2762                                          * interface's addresses, may now
2763                                          * match different interface's
2764                                          * addresses.
2765                                          */
2766                                         carp_update_addrs(sc, ifa);
2767                                 }
2768                         } else {
2769                                 /*
2770                                  * The carp(4) interface didn't have a
2771                                  * parent iface, so it is not possible
2772                                  * that it will contain any address to
2773                                  * be unlinked.
2774                                  */
2775                         }
2776                         break;
2777
2778                 case IFADDR_EVENT_CHANGE:
2779                         if (sc->sc_carpdev == NULL) {
2780                                 /*
2781                                  * The carp(4) interface didn't have a
2782                                  * parent iface, so it is not possible
2783                                  * that it will contain any address to
2784                                  * be updated.
2785                                  */
2786                                 carp_link_addrs(sc, ifp, ifa);
2787                         } else {
2788                                 /*
2789                                  * First try breaking tie with the old
2790                                  * address.  Then see whether we could
2791                                  * link certain vhaddr to the new address.
2792                                  * If that fails, i.e. carpdev is NULL,
2793                                  * we try a global update.
2794                                  *
2795                                  * NOTE: The above order is critical.
2796                                  */
2797                                 carp_unlink_addrs(sc, ifp, ifa);
2798                                 carp_link_addrs(sc, ifp, ifa);
2799                                 if (sc->sc_carpdev == NULL) {
2800                                         /*
2801                                          * See the comment in the above
2802                                          * IFADDR_EVENT_DELETE block.
2803                                          */
2804                                         carp_update_addrs(sc, NULL);
2805                                 }
2806                         }
2807                         break;
2808                 }
2809         }
2810
2811 back:
2812         carp_reltok();
2813 }
2814
2815 void
2816 carp_proto_ctlinput(netmsg_t msg)
2817 {
2818         int cmd = msg->ctlinput.nm_cmd;
2819         struct sockaddr *sa = msg->ctlinput.nm_arg;
2820         struct in_ifaddr_container *iac;
2821
2822         carp_gettok();
2823
2824         TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) {
2825                 struct in_ifaddr *ia = iac->ia;
2826                 struct ifnet *ifp = ia->ia_ifp;
2827
2828                 if (ifp->if_type == IFT_CARP)
2829                         continue;
2830
2831                 if (ia->ia_ifa.ifa_addr == sa) {
2832                         if (cmd == PRC_IFDOWN) {
2833                                 carp_ifaddr(NULL, ifp, IFADDR_EVENT_DELETE,
2834                                     &ia->ia_ifa);
2835                         } else if (cmd == PRC_IFUP) {
2836                                 carp_ifaddr(NULL, ifp, IFADDR_EVENT_ADD,
2837                                     &ia->ia_ifa);
2838                         }
2839                         break;
2840                 }
2841         }
2842
2843         carp_reltok();
2844         lwkt_replymsg(&msg->lmsg, 0);
2845 }
2846
2847 void
2848 carp_gettok(void)
2849 {
2850         lwkt_gettoken(&carp_tok);
2851 }
2852
2853 void
2854 carp_reltok(void)
2855 {
2856         lwkt_reltoken(&carp_tok);
2857 }
2858
2859 struct ifnet *
2860 carp_parent(struct ifnet *cifp)
2861 {
2862         struct carp_softc *sc;
2863
2864         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
2865
2866         KKASSERT(cifp->if_type == IFT_CARP);
2867         sc = cifp->if_softc;
2868
2869         return sc->sc_carpdev;
2870 }
2871
/*
 * Route flags to pass to rtinit() for address 'x': RTF_HOST when the
 * address sits on a loopback or point-to-point interface, 0 otherwise.
 */
#define rtinitflags(x)							\
	((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \
	    ? RTF_HOST : 0)
2875
2876 static int
2877 carp_addroute_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha)
2878 {
2879         struct in_ifaddr *ia, *iaback;
2880         int error;
2881
2882         if (sc->sc_state != MASTER)
2883                 return 0;
2884
2885         ia = vha->vha_ia;
2886         KKASSERT(ia != NULL);
2887
2888         iaback = vha->vha_iaback;
2889         KKASSERT(iaback != NULL);
2890
2891         rtinit(&iaback->ia_ifa, RTM_DELETE, rtinitflags(iaback));
2892         in_ifadown(&iaback->ia_ifa, 1);
2893         iaback->ia_flags &= ~IFA_ROUTE;
2894
2895         error = rtinit(&ia->ia_ifa, RTM_ADD, rtinitflags(ia) | RTF_UP);
2896         if (!error)
2897                 ia->ia_flags |= IFA_ROUTE;
2898         return error;
2899 }
2900
2901 static void
2902 carp_delroute_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha,
2903     boolean_t del_iaback)
2904 {
2905         struct in_ifaddr *ia, *iaback;
2906
2907         ia = vha->vha_ia;
2908         KKASSERT(ia != NULL);
2909
2910         iaback = vha->vha_iaback;
2911         KKASSERT(iaback != NULL);
2912
2913         rtinit(&ia->ia_ifa, RTM_DELETE, rtinitflags(ia));
2914         in_ifadown(&ia->ia_ifa, 1);
2915         ia->ia_flags &= ~IFA_ROUTE;
2916
2917         if (!del_iaback && (iaback->ia_ifp->if_flags & IFF_UP)) {
2918                 int error;
2919
2920                 error = rtinit(&iaback->ia_ifa, RTM_ADD,
2921                     rtinitflags(iaback) | RTF_UP);
2922                 if (!error)
2923                         iaback->ia_flags |= IFA_ROUTE;
2924         }
2925 }
2926
2927 static int
2928 carp_modevent(module_t mod, int type, void *data)
2929 {
2930         switch (type) {
2931         case MOD_LOAD:
2932                 LIST_INIT(&carpif_list);
2933                 carp_ifdetach_event =
2934                 EVENTHANDLER_REGISTER(ifnet_detach_event, carp_ifdetach, NULL,
2935                                       EVENTHANDLER_PRI_ANY);
2936                 carp_ifaddr_event =
2937                 EVENTHANDLER_REGISTER(ifaddr_event, carp_ifaddr, NULL,
2938                                       EVENTHANDLER_PRI_FIRST);
2939                 if_clone_attach(&carp_cloner);
2940                 break;
2941
2942         case MOD_UNLOAD:
2943                 EVENTHANDLER_DEREGISTER(ifnet_detach_event,
2944                                         carp_ifdetach_event);
2945                 EVENTHANDLER_DEREGISTER(ifaddr_event,
2946                                         carp_ifaddr_event);
2947                 if_clone_detach(&carp_cloner);
2948                 break;
2949
2950         default:
2951                 return (EINVAL);
2952         }
2953         return (0);
2954 }
2955
2956 static moduledata_t carp_mod = {
2957         "carp",
2958         carp_modevent,
2959         0
2960 };
2961 DECLARE_MODULE(carp, carp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);