132ca11d1b3f857447255b7f35e57602c3d75480
[dragonfly.git] / sys / netinet / ip_carp.c
1 /*
2  * Copyright (c) 2002 Michael Shalayeff. All rights reserved.
3  * Copyright (c) 2003 Ryan McBride. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
18  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
22  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
23  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
24  * THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 /*
27  * $FreeBSD: src/sys/netinet/ip_carp.c,v 1.48 2007/02/02 09:39:09 glebius Exp $
28  */
29
30 #include "opt_carp.h"
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/in_cksum.h>
38 #include <sys/limits.h>
39 #include <sys/malloc.h>
40 #include <sys/mbuf.h>
41 #include <sys/msgport2.h>
42 #include <sys/time.h>
43 #include <sys/proc.h>
44 #include <sys/priv.h>
45 #include <sys/sockio.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/thread.h>
50
51 #include <machine/stdarg.h>
52 #include <crypto/sha1.h>
53
54 #include <net/bpf.h>
55 #include <net/ethernet.h>
56 #include <net/if.h>
57 #include <net/if_dl.h>
58 #include <net/if_types.h>
59 #include <net/route.h>
60 #include <net/if_clone.h>
61 #include <net/if_var.h>
62 #include <net/ifq_var.h>
63 #include <net/netmsg2.h>
64
65 #ifdef INET
66 #include <netinet/in.h>
67 #include <netinet/in_var.h>
68 #include <netinet/in_systm.h>
69 #include <netinet/ip.h>
70 #include <netinet/ip_var.h>
71 #include <netinet/if_ether.h>
72 #endif
73
74 #ifdef INET6
75 #include <netinet/icmp6.h>
76 #include <netinet/ip6.h>
77 #include <netinet6/ip6_var.h>
78 #include <netinet6/scope6_var.h>
79 #include <netinet6/nd6.h>
80 #endif
81
82 #include <netinet/ip_carp.h>
83
/* Base name of the cloned pseudo interfaces; passed to the cloner and if_initname(). */
84 #define CARP_IFNAME             "carp"
/* True only when both IFF_UP and IFF_RUNNING are set in ifp->if_flags. */
85 #define CARP_IS_RUNNING(ifp)    \
86         (((ifp)->if_flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
87
88 struct carp_softc;
89
/*
 * One IPv4 virtual address of a carp interface.  Entries are linked on
 * carp_softc.sc_vha_list, which carp_insert_vhaddr() keeps sorted by
 * address (smallest first).
 */
90 struct carp_vhaddr {
91         uint32_t                vha_flags;      /* CARP_VHAF_ */
92         struct in_ifaddr        *vha_ia;        /* carp address */
93         struct in_ifaddr        *vha_iaback;    /* backing address */
94         TAILQ_ENTRY(carp_vhaddr) vha_link;       /* sc_vha_list linkage */
95 };
96 TAILQ_HEAD(carp_vhaddr_list, carp_vhaddr);
97
/*
 * Message used to run carp operations through a message port.  The
 * sender fills in whichever fields the dispatch function needs, e.g.
 * nc_softc for carp_clone_destroy_dispatch() and nc_carpdev for
 * carp_ifdetach_dispatch().
 */
98 struct netmsg_carp {
99         struct netmsg_base      base;           /* message header */
100         struct ifnet            *nc_carpdev;    /* interface argument */
101         struct carp_softc       *nc_softc;      /* softc argument */
102         void                    *nc_data;
103         size_t                  nc_datalen;
104 };
105
/*
 * State of one carp pseudo interface.  Allocated zeroed by
 * carp_clone_create() and released by carp_clone_destroy().
 */
106 struct carp_softc {
107         struct arpcom            arpcom;        /* holds the ifnet, see sc_if */
108         struct ifnet            *sc_carpdev;    /* parent interface */
109         struct carp_vhaddr_list  sc_vha_list;   /* virtual addr list */
110
111         const struct in_ifaddr  *sc_ia;         /* primary iface address v4 */
112         struct ip_moptions       sc_imo;
113
114 #ifdef INET6
115         struct in6_ifaddr       *sc_ia6;        /* primary iface address v6 */
116         struct ip6_moptions      sc_im6o;
117 #endif /* INET6 */
118         TAILQ_ENTRY(carp_softc)  sc_list;       /* parent's carp_if.vhif_vrs */
119
120         enum { INIT = 0, BACKUP, MASTER }
121                                  sc_state;
122         int                      sc_dead;       /* set when being destroyed */
123
124         int                      sc_suppress;
125
126         int                      sc_sendad_errors;
127 #define CARP_SENDAD_MAX_ERRORS  3
128         int                      sc_sendad_success;
129 #define CARP_SENDAD_MIN_SUCCESS 3
130
131         int                      sc_vhid;       /* -1 after create */
132         int                      sc_advskew;    /* 1/256 second units */
133         int                      sc_naddrs;     /* actually used IPv4 vha */
134         int                      sc_naddrs6;
135         int                      sc_advbase;    /* seconds */
136         int                      sc_init_counter; /* 1 after create, 0 once
                                                    a verified adv is seen */
137         uint64_t                 sc_counter;    /* advertisement counter */
138
139         /* authentication */
140 #define CARP_HMAC_PAD   64
141         unsigned char            sc_key[CARP_KEY_LEN];
142         unsigned char            sc_pad[CARP_HMAC_PAD]; /* HMAC pad block */
143         SHA1_CTX                 sc_sha1;       /* precomputed inner hash */
144
145         struct callout           sc_ad_tmo;     /* advertisement timeout */
146         struct netmsg_carp       sc_ad_msg;     /* adv timeout netmsg */
147         struct callout           sc_md_tmo;     /* ip4 master down timeout */
148         struct callout           sc_md6_tmo;    /* ip6 master down timeout */
149         struct netmsg_carp       sc_md_msg;     /* master down timeout netmsg */
150
151         LIST_ENTRY(carp_softc)   sc_next;       /* carpif_list linkage */
152 };
153
/* Shorthand for the ifnet embedded in the softc's arpcom. */
154 #define sc_if   arpcom.ac_if
155
/*
 * CARP state hung off a parent interface's if_carp pointer.
 * carp_detach() frees it once the last softc has been unlinked.
 */
156 struct carp_if {
157         TAILQ_HEAD(, carp_softc) vhif_vrs;      /* softcs on this parent */
158 };
159
160 SYSCTL_DECL(_net_inet_carp);
161
/*
 * Run-time tunables, indexed by CARPCTL_*.  Each SYSCTL_INT below exports
 * one slot of this array read/write.
 */
162 static int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 }; /* XXX for now */
163 SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW,
164     &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets");
165 SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW,
166     &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode");
167 SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW,
168     &carp_opts[CARPCTL_LOG], 0, "log bad carp packets");
169 SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW,
170     &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses");
171
/* Exported read-only (CTLFLAG_RD); consulted when comparing advertisement skews. */
172 static int carp_suppress_preempt = 0;
173 SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD,
174     &carp_suppress_preempt, 0, "Preemption is suppressed");
175
/* Packet and error counters bumped by the input paths. */
176 static struct carpstats carpstats;
177 SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW,
178     &carpstats, carpstats,
179     "CARP statistics (struct carpstats, netinet/ip_carp.h)");
180
/* Log at LOG_INFO when the CARPCTL_LOG tunable is at least 1. */
181 #define CARP_LOG(...)   do {                            \
182         if (carp_opts[CARPCTL_LOG] > 0)                 \
183                 log(LOG_INFO, __VA_ARGS__);             \
184 } while (0)
185
/* Log at LOG_DEBUG when the CARPCTL_LOG tunable is at least 2. */
186 #define CARP_DEBUG(...) do {                            \
187         if (carp_opts[CARPCTL_LOG] > 1)                 \
188                 log(LOG_DEBUG, __VA_ARGS__);            \
189 } while (0)
190
/* NOTE(review): presumably the token behind carp_gettok()/carp_reltok() -- confirm in ip_carp.h. */
191 static struct lwkt_token carp_tok = LWKT_TOKEN_INITIALIZER(carp_token);
192
193 static void     carp_hmac_prepare(struct carp_softc *);
194 static void     carp_hmac_generate(struct carp_softc *, uint32_t *,
195                     unsigned char *);
196 static int      carp_hmac_verify(struct carp_softc *, uint32_t *,
197                     unsigned char *);
198 static void     carp_setroute(struct carp_softc *, int);
199 static void     carp_proto_input_c(struct carp_softc *, struct mbuf *,
200                     struct carp_header *, sa_family_t);
201 static int      carp_clone_create(struct if_clone *, int, caddr_t);
202 static int      carp_clone_destroy(struct ifnet *);
203 static void     carp_detach(struct carp_softc *, int, boolean_t);
204 static void     carp_prepare_ad(struct carp_softc *, struct carp_header *);
205 static void     carp_send_ad_all(void);
206 static void     carp_send_ad_timeout(void *);
207 static void     carp_send_ad(struct carp_softc *);
208 static void     carp_send_arp(struct carp_softc *);
209 static void     carp_master_down_timeout(void *);
210 static void     carp_master_down(struct carp_softc *);
211 static void     carp_setrun(struct carp_softc *, sa_family_t);
212 static void     carp_set_state(struct carp_softc *, int);
213 static struct ifnet *carp_forus(struct carp_if *, const uint8_t *);
214
215 static void     carp_init(void *);
216 static int      carp_ioctl(struct ifnet *, u_long, caddr_t, struct ucred *);
217 static int      carp_output(struct ifnet *, struct mbuf *, struct sockaddr *,
218                     struct rtentry *);
219 static void     carp_start(struct ifnet *);
220 static void     carp_serialize(struct ifnet *, enum ifnet_serialize);
221 static void     carp_deserialize(struct ifnet *, enum ifnet_serialize);
222 static int      carp_tryserialize(struct ifnet *, enum ifnet_serialize);
223 #ifdef INVARIANTS
224 static void     carp_serialize_assert(struct ifnet *, enum ifnet_serialize,
225                     boolean_t);
226 #endif
227
228 static void     carp_multicast_cleanup(struct carp_softc *);
229 static void     carp_add_addr(struct carp_softc *, struct ifaddr *);
230 static void     carp_del_addr(struct carp_softc *, struct ifaddr *);
231 static void     carp_config_addr(struct carp_softc *, struct ifaddr *);
232 static void     carp_link_addrs(struct carp_softc *, struct ifnet *,
233                     struct ifaddr *);
234 static void     carp_unlink_addrs(struct carp_softc *, struct ifnet *,
235                     struct ifaddr *);
236 static void     carp_update_addrs(struct carp_softc *, struct ifaddr *);
237
238 static int      carp_config_vhaddr(struct carp_softc *, struct carp_vhaddr *,
239                     struct in_ifaddr *);
240 static int      carp_activate_vhaddr(struct carp_softc *, struct carp_vhaddr *,
241                     struct ifnet *, struct in_ifaddr *, int);
242 static void     carp_deactivate_vhaddr(struct carp_softc *,
243                     struct carp_vhaddr *, boolean_t);
244 static int      carp_addroute_vhaddr(struct carp_softc *, struct carp_vhaddr *);
245 static void     carp_delroute_vhaddr(struct carp_softc *, struct carp_vhaddr *,
246                     boolean_t);
247
248 static void     carp_sc_state(struct carp_softc *);
249 #ifdef INET6
250 static void     carp_send_na(struct carp_softc *);
251 #ifdef notyet
252 static int      carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *);
253 static int      carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *);
254 #endif
255 static void     carp_multicast6_cleanup(struct carp_softc *);
256 #endif
257 static void     carp_stop(struct carp_softc *, int);
258 static void     carp_suspend(struct carp_softc *, int);
259 static void     carp_ioctl_stop(struct carp_softc *);
260 static int      carp_ioctl_setvh(struct carp_softc *, void *, struct ucred *);
261 static int      carp_ioctl_getvh(struct carp_softc *, void *, struct ucred *);
262 static int      carp_ioctl_getdevname(struct carp_softc *, struct ifdrv *);
263 static int      carp_ioctl_getvhaddr(struct carp_softc *, struct ifdrv *);
264
265 static void     carp_ifaddr(void *, struct ifnet *, enum ifaddr_event,
266                             struct ifaddr *);
267 static void     carp_ifdetach(void *, struct ifnet *);
268
269 static void     carp_ifdetach_dispatch(netmsg_t);
270 static void     carp_clone_destroy_dispatch(netmsg_t);
271 static void     carp_init_dispatch(netmsg_t);
272 static void     carp_ioctl_stop_dispatch(netmsg_t);
273 static void     carp_ioctl_setvh_dispatch(netmsg_t);
274 static void     carp_ioctl_getvh_dispatch(netmsg_t);
275 static void     carp_ioctl_getdevname_dispatch(netmsg_t);
276 static void     carp_ioctl_getvhaddr_dispatch(netmsg_t);
277 static void     carp_send_ad_timeout_dispatch(netmsg_t);
278 static void     carp_master_down_timeout_dispatch(netmsg_t);
279
/* Malloc type used for carp_softc and carp_if allocations. */
280 static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces");
281
/*
 * Every carp softc; carp_clone_create() inserts and carp_clone_destroy()
 * removes entries while holding the carp token.
 */
282 static LIST_HEAD(, carp_softc) carpif_list;
283
/* Cloner for "carp" interfaces, units 0 .. IF_MAXUNIT. */
284 static struct if_clone carp_cloner =
285 IF_CLONE_INITIALIZER(CARP_IFNAME, carp_clone_create, carp_clone_destroy,
286                      0, IF_MAXUNIT);
287
/* Link-level address handed to ether_ifattach() for every new carp interface. */
288 static uint8_t  carp_etheraddr[ETHER_ADDR_LEN] = { 0, 0, 0x5e, 0, 1, 0 };
289
290 static eventhandler_tag carp_ifdetach_event;
291 static eventhandler_tag carp_ifaddr_event;
292
293 static __inline void
294 carp_insert_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha_new)
295 {
296         struct carp_vhaddr *vha;
297         u_long new_addr, addr;
298
299         KKASSERT((vha_new->vha_flags & CARP_VHAF_ONLIST) == 0);
300
301         /*
302          * Virtual address list is sorted; smaller one first
303          */
304         new_addr = ntohl(vha_new->vha_ia->ia_addr.sin_addr.s_addr);
305
306         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
307                 addr = ntohl(vha->vha_ia->ia_addr.sin_addr.s_addr);
308
309                 if (addr > new_addr)
310                         break;
311         }
312         if (vha == NULL)
313                 TAILQ_INSERT_TAIL(&sc->sc_vha_list, vha_new, vha_link);
314         else
315                 TAILQ_INSERT_BEFORE(vha, vha_new, vha_link);
316         vha_new->vha_flags |= CARP_VHAF_ONLIST;
317 }
318
319 static __inline void
320 carp_remove_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha)
321 {
322         KKASSERT(vha->vha_flags & CARP_VHAF_ONLIST);
323         vha->vha_flags &= ~CARP_VHAF_ONLIST;
324         TAILQ_REMOVE(&sc->sc_vha_list, vha, vha_link);
325 }
326
327 static void
328 carp_hmac_prepare(struct carp_softc *sc)
329 {
330         uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
331         uint8_t vhid = sc->sc_vhid & 0xff;
332         int i;
333 #ifdef INET6
334         struct ifaddr_container *ifac;
335         struct in6_addr in6;
336 #endif
337 #ifdef INET
338         struct carp_vhaddr *vha;
339 #endif
340
341         /* XXX: possible race here */
342
343         /* compute ipad from key */
344         bzero(sc->sc_pad, sizeof(sc->sc_pad));
345         bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
346         for (i = 0; i < sizeof(sc->sc_pad); i++)
347                 sc->sc_pad[i] ^= 0x36;
348
349         /* precompute first part of inner hash */
350         SHA1Init(&sc->sc_sha1);
351         SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
352         SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
353         SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
354         SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
355 #ifdef INET
356         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
357                 SHA1Update(&sc->sc_sha1,
358                     (const uint8_t *)&vha->vha_ia->ia_addr.sin_addr,
359                     sizeof(struct in_addr));
360         }
361 #endif /* INET */
362 #ifdef INET6
363         TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid], ifa_link) {
364                 struct ifaddr *ifa = ifac->ifa;
365
366                 if (ifa->ifa_addr->sa_family == AF_INET6) {
367                         in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
368                         in6_clearscope(&in6);
369                         SHA1Update(&sc->sc_sha1, (void *)&in6, sizeof(in6));
370                 }
371         }
372 #endif /* INET6 */
373
374         /* convert ipad to opad */
375         for (i = 0; i < sizeof(sc->sc_pad); i++)
376                 sc->sc_pad[i] ^= 0x36 ^ 0x5c;
377 }
378
379 static void
380 carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
381     unsigned char md[20])
382 {
383         SHA1_CTX sha1ctx;
384
385         /* fetch first half of inner hash */
386         bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
387
388         SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
389         SHA1Final(md, &sha1ctx);
390
391         /* outer hash */
392         SHA1Init(&sha1ctx);
393         SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
394         SHA1Update(&sha1ctx, md, 20);
395         SHA1Final(md, &sha1ctx);
396 }
397
398 static int
399 carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
400     unsigned char md[20])
401 {
402         unsigned char md2[20];
403
404         carp_hmac_generate(sc, counter, md2);
405         return (bcmp(md, md2, sizeof(md2)));
406 }
407
408 static void
409 carp_setroute(struct carp_softc *sc, int cmd)
410 {
411 #ifdef INET6
412         struct ifaddr_container *ifac;
413 #endif
414         struct carp_vhaddr *vha;
415
416         KKASSERT(cmd == RTM_DELETE || cmd == RTM_ADD);
417
418         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
419                 if (vha->vha_iaback == NULL)
420                         continue;
421                 if (cmd == RTM_DELETE)
422                         carp_delroute_vhaddr(sc, vha, FALSE);
423                 else
424                         carp_addroute_vhaddr(sc, vha);
425         }
426
427 #ifdef INET6
428         TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid], ifa_link) {
429                 struct ifaddr *ifa = ifac->ifa;
430
431                 if (ifa->ifa_addr->sa_family == AF_INET6) {
432                         if (cmd == RTM_ADD)
433                                 in6_ifaddloop(ifa);
434                         else
435                                 in6_ifremloop(ifa);
436                 }
437         }
438 #endif /* INET6 */
439 }
440
441 static int
442 carp_clone_create(struct if_clone *ifc, int unit, caddr_t param __unused)
443 {
444         struct carp_softc *sc;
445         struct ifnet *ifp;
446
447         sc = kmalloc(sizeof(*sc), M_CARP, M_WAITOK | M_ZERO);
448         ifp = &sc->sc_if;
449
450         sc->sc_suppress = 0;
451         sc->sc_advbase = CARP_DFLTINTV;
452         sc->sc_vhid = -1;       /* required setting */
453         sc->sc_advskew = 0;
454         sc->sc_init_counter = 1;
455         sc->sc_naddrs = 0;
456         sc->sc_naddrs6 = 0;
457
458         TAILQ_INIT(&sc->sc_vha_list);
459
460 #ifdef INET6
461         sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL;
462 #endif
463
464         callout_init_mp(&sc->sc_ad_tmo);
465         netmsg_init(&sc->sc_ad_msg.base, NULL, &netisr_adone_rport,
466             MSGF_DROPABLE | MSGF_PRIORITY, carp_send_ad_timeout_dispatch);
467         sc->sc_ad_msg.nc_softc = sc;
468
469         callout_init_mp(&sc->sc_md_tmo);
470         callout_init_mp(&sc->sc_md6_tmo);
471         netmsg_init(&sc->sc_md_msg.base, NULL, &netisr_adone_rport,
472             MSGF_DROPABLE | MSGF_PRIORITY, carp_master_down_timeout_dispatch);
473         sc->sc_md_msg.nc_softc = sc;
474
475         if_initname(ifp, CARP_IFNAME, unit);
476         ifp->if_softc = sc;
477         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
478         ifp->if_init = carp_init;
479         ifp->if_ioctl = carp_ioctl;
480         ifp->if_start = carp_start;
481         ifp->if_serialize = carp_serialize;
482         ifp->if_deserialize = carp_deserialize;
483         ifp->if_tryserialize = carp_tryserialize;
484 #ifdef INVARIANTS
485         ifp->if_serialize_assert = carp_serialize_assert;
486 #endif
487         ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
488         ifq_set_ready(&ifp->if_snd);
489
490         ether_ifattach(ifp, carp_etheraddr, NULL);
491
492         ifp->if_type = IFT_CARP;
493         ifp->if_output = carp_output;
494
495         carp_gettok();
496         LIST_INSERT_HEAD(&carpif_list, sc, sc_next);
497         carp_reltok();
498
499         return (0);
500 }
501
502 static void
503 carp_clone_destroy_dispatch(netmsg_t msg)
504 {
505         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
506         struct carp_softc *sc = cmsg->nc_softc;
507
508         carp_gettok();
509
510         sc->sc_dead = 1;
511         carp_detach(sc, 1, FALSE);
512
513         carp_reltok();
514
515         callout_stop_sync(&sc->sc_ad_tmo);
516         callout_stop_sync(&sc->sc_md_tmo);
517         callout_stop_sync(&sc->sc_md6_tmo);
518
519         crit_enter();
520         if ((sc->sc_ad_msg.base.lmsg.ms_flags & MSGF_DONE) == 0)
521                 lwkt_dropmsg(&sc->sc_ad_msg.base.lmsg);
522         if ((sc->sc_md_msg.base.lmsg.ms_flags & MSGF_DONE) == 0)
523                 lwkt_dropmsg(&sc->sc_md_msg.base.lmsg);
524         crit_exit();
525
526         lwkt_replymsg(&cmsg->base.lmsg, 0);
527 }
528
529 static int
530 carp_clone_destroy(struct ifnet *ifp)
531 {
532         struct carp_softc *sc = ifp->if_softc;
533         struct netmsg_carp cmsg;
534
535         bzero(&cmsg, sizeof(cmsg));
536         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
537             carp_clone_destroy_dispatch);
538         cmsg.nc_softc = sc;
539
540         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
541
542         carp_gettok();
543         LIST_REMOVE(sc, sc_next);
544         carp_reltok();
545
546         bpfdetach(ifp);
547         if_detach(ifp);
548
549         KASSERT(sc->sc_naddrs == 0, ("certain inet address is still active\n"));
550         kfree(sc, M_CARP);
551
552         return 0;
553 }
554
555 static void
556 carp_detach(struct carp_softc *sc, int detach, boolean_t del_iaback)
557 {
558         struct carp_if *cif;
559
560         carp_suspend(sc, detach);
561
562         carp_multicast_cleanup(sc);
563 #ifdef INET6
564         carp_multicast6_cleanup(sc);
565 #endif
566
567         if (!sc->sc_dead && detach) {
568                 struct carp_vhaddr *vha;
569
570                 TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
571                         carp_deactivate_vhaddr(sc, vha, del_iaback);
572                 KKASSERT(sc->sc_naddrs == 0);
573         }
574
575         if (sc->sc_carpdev != NULL) {
576                 cif = sc->sc_carpdev->if_carp;
577                 TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
578                 if (TAILQ_EMPTY(&cif->vhif_vrs)) {
579                         ifpromisc(sc->sc_carpdev, 0);
580                         sc->sc_carpdev->if_carp = NULL;
581                         kfree(cif, M_CARP);
582                 }
583                 sc->sc_carpdev = NULL;
584                 sc->sc_ia = NULL;
585         }
586 }
587
588 static void
589 carp_ifdetach_dispatch(netmsg_t msg)
590 {
591         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
592         struct ifnet *ifp = cmsg->nc_carpdev;
593         struct carp_if *cif = ifp->if_carp;
594         struct carp_softc *sc;
595
596         carp_gettok();
597
598         while (ifp->if_carp &&
599                (sc = TAILQ_FIRST(&cif->vhif_vrs)) != NULL)
600                 carp_detach(sc, 1, TRUE);
601
602         carp_reltok();
603
604         lwkt_replymsg(&cmsg->base.lmsg, 0);
605 }
606
607 /* Detach an interface from the carp. */
608 static void
609 carp_ifdetach(void *arg __unused, struct ifnet *ifp)
610 {
611         struct netmsg_carp cmsg;
612
613         ASSERT_IFNET_NOT_SERIALIZED_ALL(ifp);
614
615         bzero(&cmsg, sizeof(cmsg));
616         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
617             carp_ifdetach_dispatch);
618         cmsg.nc_carpdev = ifp;
619
620         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
621 }
622
623 /*
624  * process input packet.
625  * we have rearranged checks order compared to the rfc,
626  * but it seems more efficient this way or not possible otherwise.
627  */
628 int
629 carp_proto_input(struct mbuf **mp, int *offp, int proto)
630 {
631         struct mbuf *m = *mp;
632         struct ip *ip = mtod(m, struct ip *);
633         struct ifnet *ifp = m->m_pkthdr.rcvif;
634         struct carp_header *ch;
635         struct carp_softc *sc;
636         int len, iphlen;
637
638         carp_gettok();
639
640         iphlen = *offp;
641         *mp = NULL;
642
643         carpstats.carps_ipackets++;
644
645         if (!carp_opts[CARPCTL_ALLOW]) {
646                 m_freem(m);
647                 goto back;
648         }
649
650         /* Check if received on a valid carp interface */
651         if (ifp->if_type != IFT_CARP) {
652                 carpstats.carps_badif++;
653                 CARP_LOG("carp_proto_input: packet received on non-carp "
654                     "interface: %s\n", ifp->if_xname);
655                 m_freem(m);
656                 goto back;
657         }
658
659         if (!CARP_IS_RUNNING(ifp)) {
660                 carpstats.carps_badif++;
661                 CARP_LOG("carp_proto_input: packet received on stopped carp "
662                     "interface: %s\n", ifp->if_xname);
663                 m_freem(m);
664                 goto back;
665         }
666
667         sc = ifp->if_softc;
668         if (sc->sc_carpdev == NULL) {
669                 carpstats.carps_badif++;
670                 CARP_LOG("carp_proto_input: packet received on defunc carp "
671                     "interface: %s\n", ifp->if_xname);
672                 m_freem(m);
673                 goto back;
674         }
675
676         if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
677                 carpstats.carps_badif++;
678                 CARP_LOG("carp_proto_input: non-mcast packet on "
679                     "interface: %s\n", ifp->if_xname);
680                 m_freem(m);
681                 goto back;
682         }
683
684         /* Verify that the IP TTL is CARP_DFLTTL. */
685         if (ip->ip_ttl != CARP_DFLTTL) {
686                 carpstats.carps_badttl++;
687                 CARP_LOG("carp_proto_input: received ttl %d != %d on %s\n",
688                     ip->ip_ttl, CARP_DFLTTL, ifp->if_xname);
689                 m_freem(m);
690                 goto back;
691         }
692
693         /* Minimal CARP packet size */
694         len = iphlen + sizeof(*ch);
695
696         /*
697          * Verify that the received packet length is
698          * not less than the CARP header
699          */
700         if (m->m_pkthdr.len < len) {
701                 carpstats.carps_badlen++;
702                 CARP_LOG("packet too short %d on %s\n", m->m_pkthdr.len,
703                     ifp->if_xname);
704                 m_freem(m);
705                 goto back;
706         }
707
708         /* Make sure that CARP header is contiguous */
709         if (len > m->m_len) {
710                 m = m_pullup(m, len);
711                 if (m == NULL) {
712                         carpstats.carps_hdrops++;
713                         CARP_LOG("carp_proto_input: m_pullup failed\n");
714                         goto back;
715                 }
716                 ip = mtod(m, struct ip *);
717         }
718         ch = (struct carp_header *)((uint8_t *)ip + iphlen);
719
720         /* Verify the CARP checksum */
721         if (in_cksum_skip(m, len, iphlen)) {
722                 carpstats.carps_badsum++;
723                 CARP_LOG("carp_proto_input: checksum failed on %s\n",
724                     ifp->if_xname);
725                 m_freem(m);
726                 goto back;
727         }
728         carp_proto_input_c(sc, m, ch, AF_INET);
729 back:
730         carp_reltok();
731         return(IPPROTO_DONE);
732 }
733
734 #ifdef INET6
735 int
736 carp6_proto_input(struct mbuf **mp, int *offp, int proto)
737 {
738         struct mbuf *m = *mp;
739         struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
740         struct ifnet *ifp = m->m_pkthdr.rcvif;
741         struct carp_header *ch;
742         struct carp_softc *sc;
743         u_int len;
744
745         carp_gettok();
746
747         carpstats.carps_ipackets6++;
748
749         if (!carp_opts[CARPCTL_ALLOW]) {
750                 m_freem(m);
751                 goto back;
752         }
753
754         /* check if received on a valid carp interface */
755         if (ifp->if_type != IFT_CARP) {
756                 carpstats.carps_badif++;
757                 CARP_LOG("carp6_proto_input: packet received on non-carp "
758                     "interface: %s\n", ifp->if_xname);
759                 m_freem(m);
760                 goto back;
761         }
762
763         if (!CARP_IS_RUNNING(ifp)) {
764                 carpstats.carps_badif++;
765                 CARP_LOG("carp_proto_input: packet received on stopped carp "
766                     "interface: %s\n", ifp->if_xname);
767                 m_freem(m);
768                 goto back;
769         }
770
771         sc = ifp->if_softc;
772         if (sc->sc_carpdev == NULL) {
773                 carpstats.carps_badif++;
774                 CARP_LOG("carp6_proto_input: packet received on defunc-carp "
775                     "interface: %s\n", ifp->if_xname);
776                 m_freem(m);
777                 goto back;
778         }
779
780         /* verify that the IP TTL is 255 */
781         if (ip6->ip6_hlim != CARP_DFLTTL) {
782                 carpstats.carps_badttl++;
783                 CARP_LOG("carp6_proto_input: received ttl %d != 255 on %s\n",
784                     ip6->ip6_hlim, ifp->if_xname);
785                 m_freem(m);
786                 goto back;
787         }
788
789         /* verify that we have a complete carp packet */
790         len = m->m_len;
791         IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
792         if (ch == NULL) {
793                 carpstats.carps_badlen++;
794                 CARP_LOG("carp6_proto_input: packet size %u too small\n", len);
795                 goto back;
796         }
797
798         /* verify the CARP checksum */
799         if (in_cksum_range(m, 0, *offp, sizeof(*ch))) {
800                 carpstats.carps_badsum++;
801                 CARP_LOG("carp6_proto_input: checksum failed, on %s\n",
802                     ifp->if_xname);
803                 m_freem(m);
804                 goto back;
805         }
806
807         carp_proto_input_c(sc, m, ch, AF_INET6);
808 back:
809         carp_reltok();
810         return (IPPROTO_DONE);
811 }
812 #endif /* INET6 */
813
814 static void
815 carp_proto_input_c(struct carp_softc *sc, struct mbuf *m,
816     struct carp_header *ch, sa_family_t af)
817 {
818         struct ifnet *cifp;
819         uint64_t tmp_counter;
820         struct timeval sc_tv, ch_tv;
821
822         if (sc->sc_vhid != ch->carp_vhid) {
823                 /*
824                  * CARP uses multicast, however, multicast packets
825                  * are tapped to all CARP interfaces on the physical
826                  * interface receiving the CARP packets, so we don't
827                  * update any stats here.
828                  */
829                 m_freem(m);
830                 return;
831         }
832         cifp = &sc->sc_if;
833
834         /* verify the CARP version. */
835         if (ch->carp_version != CARP_VERSION) {
836                 carpstats.carps_badver++;
837                 CARP_LOG("%s; invalid version %d\n", cifp->if_xname,
838                          ch->carp_version);
839                 m_freem(m);
840                 return;
841         }
842
843         /* verify the hash */
844         if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
845                 carpstats.carps_badauth++;
846                 CARP_LOG("%s: incorrect hash\n", cifp->if_xname);
847                 m_freem(m);
848                 return;
849         }
850
851         tmp_counter = ntohl(ch->carp_counter[0]);
852         tmp_counter = tmp_counter<<32;
853         tmp_counter += ntohl(ch->carp_counter[1]);
854
855         /* XXX Replay protection goes here */
856
857         sc->sc_init_counter = 0;
858         sc->sc_counter = tmp_counter;
859
860         sc_tv.tv_sec = sc->sc_advbase;
861         if (carp_suppress_preempt && sc->sc_advskew <  240)
862                 sc_tv.tv_usec = 240 * 1000000 / 256;
863         else
864                 sc_tv.tv_usec = sc->sc_advskew * 1000000 / 256;
865         ch_tv.tv_sec = ch->carp_advbase;
866         ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
867
868         switch (sc->sc_state) {
869         case INIT:
870                 break;
871
872         case MASTER:
873                 /*
874                  * If we receive an advertisement from a master who's going to
875                  * be more frequent than us, go into BACKUP state.
876                  */
877                 if (timevalcmp(&sc_tv, &ch_tv, >) ||
878                     timevalcmp(&sc_tv, &ch_tv, ==)) {
879                         callout_stop(&sc->sc_ad_tmo);
880                         CARP_DEBUG("%s: MASTER -> BACKUP "
881                            "(more frequent advertisement received)\n",
882                            cifp->if_xname);
883                         carp_set_state(sc, BACKUP);
884                         carp_setrun(sc, 0);
885                         carp_setroute(sc, RTM_DELETE);
886                 }
887                 break;
888
889         case BACKUP:
890                 /*
891                  * If we're pre-empting masters who advertise slower than us,
892                  * and this one claims to be slower, treat him as down.
893                  */
894                 if (carp_opts[CARPCTL_PREEMPT] &&
895                     timevalcmp(&sc_tv, &ch_tv, <)) {
896                         CARP_DEBUG("%s: BACKUP -> MASTER "
897                             "(preempting a slower master)\n", cifp->if_xname);
898                         carp_master_down(sc);
899                         break;
900                 }
901
902                 /*
903                  *  If the master is going to advertise at such a low frequency
904                  *  that he's guaranteed to time out, we'd might as well just
905                  *  treat him as timed out now.
906                  */
907                 sc_tv.tv_sec = sc->sc_advbase * 3;
908                 if (timevalcmp(&sc_tv, &ch_tv, <)) {
909                         CARP_DEBUG("%s: BACKUP -> MASTER (master timed out)\n",
910                                    cifp->if_xname);
911                         carp_master_down(sc);
912                         break;
913                 }
914
915                 /*
916                  * Otherwise, we reset the counter and wait for the next
917                  * advertisement.
918                  */
919                 carp_setrun(sc, af);
920                 break;
921         }
922         m_freem(m);
923 }
924
925 struct mbuf *
926 carp_input(void *v, struct mbuf *m)
927 {
928         struct carp_if *cif = v;
929         struct ether_header *eh;
930         struct carp_softc *sc;
931         struct ifnet *ifp;
932
933         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
934
935         eh = mtod(m, struct ether_header *);
936
937         ifp = carp_forus(cif, eh->ether_dhost);
938         if (ifp != NULL) {
939                 ether_reinput_oncpu(ifp, m, REINPUT_RUNBPF);
940                 return NULL;
941         }
942
943         if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
944                 return m;
945
946         /*
947          * XXX Should really check the list of multicast addresses
948          * for each CARP interface _before_ copying.
949          */
950         TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
951                 struct mbuf *m0;
952
953                 if ((sc->sc_if.if_flags & IFF_UP) == 0)
954                         continue;
955
956                 m0 = m_dup(m, MB_DONTWAIT);
957                 if (m0 == NULL)
958                         continue;
959
960                 ether_reinput_oncpu(&sc->sc_if, m0, REINPUT_RUNBPF);
961         }
962         return m;
963 }
964
965 static void
966 carp_prepare_ad(struct carp_softc *sc, struct carp_header *ch)
967 {
968         if (sc->sc_init_counter) {
969                 /* this could also be seconds since unix epoch */
970                 sc->sc_counter = karc4random();
971                 sc->sc_counter = sc->sc_counter << 32;
972                 sc->sc_counter += karc4random();
973         } else {
974                 sc->sc_counter++;
975         }
976
977         ch->carp_counter[0] = htonl((sc->sc_counter >> 32) & 0xffffffff);
978         ch->carp_counter[1] = htonl(sc->sc_counter & 0xffffffff);
979
980         carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
981 }
982
983 static void
984 carp_send_ad_all(void)
985 {
986         struct carp_softc *sc;
987
988         LIST_FOREACH(sc, &carpif_list, sc_next) {
989                 if (sc->sc_carpdev == NULL)
990                         continue;
991
992                 if (CARP_IS_RUNNING(&sc->sc_if) && sc->sc_state == MASTER)
993                         carp_send_ad(sc);
994         }
995 }
996
997 static void
998 carp_send_ad_timeout(void *xsc)
999 {
1000         struct carp_softc *sc = xsc;
1001         struct netmsg_carp *cmsg = &sc->sc_ad_msg;
1002
1003         KASSERT(mycpuid == 0, ("%s not on cpu0 but on cpu%d\n",
1004             __func__, mycpuid));
1005
1006         crit_enter();
1007         if (cmsg->base.lmsg.ms_flags & MSGF_DONE)
1008                 lwkt_sendmsg(cpu_portfn(0), &cmsg->base.lmsg);
1009         crit_exit();
1010 }
1011
1012 static void
1013 carp_send_ad_timeout_dispatch(netmsg_t msg)
1014 {
1015         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
1016         struct carp_softc *sc = cmsg->nc_softc;
1017
1018         /* Reply ASAP */
1019         crit_enter();
1020         lwkt_replymsg(&cmsg->base.lmsg, 0);
1021         crit_exit();
1022
1023         carp_gettok();
1024         carp_send_ad(sc);
1025         carp_reltok();
1026 }
1027
1028 static void
1029 carp_send_ad(struct carp_softc *sc)
1030 {
1031         struct ifnet *cifp = &sc->sc_if;
1032         struct carp_header ch;
1033         struct timeval tv;
1034         struct carp_header *ch_ptr;
1035         struct mbuf *m;
1036         int len, advbase, advskew;
1037
1038         if (!CARP_IS_RUNNING(cifp)) {
1039                 /* Bow out */
1040                 advbase = 255;
1041                 advskew = 255;
1042         } else {
1043                 advbase = sc->sc_advbase;
1044                 if (!carp_suppress_preempt || sc->sc_advskew > 240)
1045                         advskew = sc->sc_advskew;
1046                 else
1047                         advskew = 240;
1048                 tv.tv_sec = advbase;
1049                 tv.tv_usec = advskew * 1000000 / 256;
1050         }
1051
1052         ch.carp_version = CARP_VERSION;
1053         ch.carp_type = CARP_ADVERTISEMENT;
1054         ch.carp_vhid = sc->sc_vhid;
1055         ch.carp_advbase = advbase;
1056         ch.carp_advskew = advskew;
1057         ch.carp_authlen = 7;    /* XXX DEFINE */
1058         ch.carp_pad1 = 0;       /* must be zero */
1059         ch.carp_cksum = 0;
1060
1061 #ifdef INET
1062         if (sc->sc_ia != NULL) {
1063                 struct ip *ip;
1064
1065                 MGETHDR(m, MB_DONTWAIT, MT_HEADER);
1066                 if (m == NULL) {
1067                         cifp->if_oerrors++;
1068                         carpstats.carps_onomem++;
1069                         /* XXX maybe less ? */
1070                         if (advbase != 255 || advskew != 255)
1071                                 callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
1072                                     carp_send_ad_timeout, sc);
1073                         return;
1074                 }
1075                 len = sizeof(*ip) + sizeof(ch);
1076                 m->m_pkthdr.len = len;
1077                 m->m_pkthdr.rcvif = NULL;
1078                 m->m_len = len;
1079                 MH_ALIGN(m, m->m_len);
1080                 m->m_flags |= M_MCAST;
1081                 ip = mtod(m, struct ip *);
1082                 ip->ip_v = IPVERSION;
1083                 ip->ip_hl = sizeof(*ip) >> 2;
1084                 ip->ip_tos = IPTOS_LOWDELAY;
1085                 ip->ip_len = len;
1086                 ip->ip_id = ip_newid();
1087                 ip->ip_off = IP_DF;
1088                 ip->ip_ttl = CARP_DFLTTL;
1089                 ip->ip_p = IPPROTO_CARP;
1090                 ip->ip_sum = 0;
1091                 ip->ip_src = sc->sc_ia->ia_addr.sin_addr;
1092                 ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
1093
1094                 ch_ptr = (struct carp_header *)(&ip[1]);
1095                 bcopy(&ch, ch_ptr, sizeof(ch));
1096                 carp_prepare_ad(sc, ch_ptr);
1097                 ch_ptr->carp_cksum = in_cksum_skip(m, len, sizeof(*ip));
1098
1099                 getmicrotime(&cifp->if_lastchange);
1100                 cifp->if_opackets++;
1101                 cifp->if_obytes += len;
1102                 carpstats.carps_opackets++;
1103
1104                 if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) {
1105                         cifp->if_oerrors++;
1106                         if (sc->sc_sendad_errors < INT_MAX)
1107                                 sc->sc_sendad_errors++;
1108                         if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
1109                                 carp_suppress_preempt++;
1110                                 if (carp_suppress_preempt == 1) {
1111                                         carp_send_ad_all();
1112                                 }
1113                         }
1114                         sc->sc_sendad_success = 0;
1115                 } else {
1116                         if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
1117                                 if (++sc->sc_sendad_success >=
1118                                     CARP_SENDAD_MIN_SUCCESS) {
1119                                         carp_suppress_preempt--;
1120                                         sc->sc_sendad_errors = 0;
1121                                 }
1122                         } else {
1123                                 sc->sc_sendad_errors = 0;
1124                         }
1125                 }
1126         }
1127 #endif /* INET */
1128 #ifdef INET6
1129         if (sc->sc_ia6) {
1130                 struct ip6_hdr *ip6;
1131
1132                 MGETHDR(m, MB_DONTWAIT, MT_HEADER);
1133                 if (m == NULL) {
1134                         cifp->if_oerrors++;
1135                         carpstats.carps_onomem++;
1136                         /* XXX maybe less ? */
1137                         if (advbase != 255 || advskew != 255)
1138                                 callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
1139                                     carp_send_ad_timeout, sc);
1140                         return;
1141                 }
1142                 len = sizeof(*ip6) + sizeof(ch);
1143                 m->m_pkthdr.len = len;
1144                 m->m_pkthdr.rcvif = NULL;
1145                 m->m_len = len;
1146                 MH_ALIGN(m, m->m_len);
1147                 m->m_flags |= M_MCAST;
1148                 ip6 = mtod(m, struct ip6_hdr *);
1149                 bzero(ip6, sizeof(*ip6));
1150                 ip6->ip6_vfc |= IPV6_VERSION;
1151                 ip6->ip6_hlim = CARP_DFLTTL;
1152                 ip6->ip6_nxt = IPPROTO_CARP;
1153                 bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src,
1154                     sizeof(struct in6_addr));
1155                 /* set the multicast destination */
1156
1157                 ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
1158                 ip6->ip6_dst.s6_addr8[15] = 0x12;
1159                 if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
1160                         cifp->if_oerrors++;
1161                         m_freem(m);
1162                         CARP_LOG("%s: in6_setscope failed\n", __func__);
1163                         return;
1164                 }
1165
1166                 ch_ptr = (struct carp_header *)(&ip6[1]);
1167                 bcopy(&ch, ch_ptr, sizeof(ch));
1168                 carp_prepare_ad(sc, ch_ptr);
1169                 ch_ptr->carp_cksum = in_cksum_skip(m, len, sizeof(*ip6));
1170
1171                 getmicrotime(&cifp->if_lastchange);
1172                 cifp->if_opackets++;
1173                 cifp->if_obytes += len;
1174                 carpstats.carps_opackets6++;
1175
1176                 if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) {
1177                         cifp->if_oerrors++;
1178                         if (sc->sc_sendad_errors < INT_MAX)
1179                                 sc->sc_sendad_errors++;
1180                         if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
1181                                 carp_suppress_preempt++;
1182                                 if (carp_suppress_preempt == 1) {
1183                                         carp_send_ad_all();
1184                                 }
1185                         }
1186                         sc->sc_sendad_success = 0;
1187                 } else {
1188                         if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
1189                                 if (++sc->sc_sendad_success >=
1190                                     CARP_SENDAD_MIN_SUCCESS) {
1191                                         carp_suppress_preempt--;
1192                                         sc->sc_sendad_errors = 0;
1193                                 }
1194                         } else {
1195                                 sc->sc_sendad_errors = 0;
1196                         }
1197                 }
1198         }
1199 #endif /* INET6 */
1200
1201         if (advbase != 255 || advskew != 255)
1202                 callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
1203                     carp_send_ad_timeout, sc);
1204 }
1205
1206 /*
1207  * Broadcast a gratuitous ARP request containing
1208  * the virtual router MAC address for each IP address
1209  * associated with the virtual router.
1210  */
1211 static void
1212 carp_send_arp(struct carp_softc *sc)
1213 {
1214         const struct carp_vhaddr *vha;
1215
1216         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
1217                 if (vha->vha_iaback == NULL)
1218                         continue;
1219                 arp_gratuitous(&sc->sc_if, &vha->vha_ia->ia_ifa);
1220         }
1221 }
1222
#ifdef INET6
/*
 * Send a neighbor advertisement with the override flag, out of the
 * parent device and addressed to the link-local all-nodes group, for
 * every IPv6 address configured on the carp interface.
 */
static void
carp_send_na(struct carp_softc *sc)
{
	static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
	struct ifaddr_container *ifac;

	TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid], ifa_link) {
		struct ifaddr *ifa = ifac->ifa;
		struct in6_addr *in6;

		if (ifa->ifa_addr->sa_family == AF_INET6) {
			in6 = &ifatoia6(ifa)->ia_addr.sin6_addr;
			nd6_na_output(sc->sc_carpdev, &mcast, in6,
			    ND_NA_FLAG_OVERRIDE, 1, NULL);
			DELAY(1000);	/* XXX */
		}
	}
}
#endif /* INET6 */
1244
1245 static __inline const struct carp_vhaddr *
1246 carp_find_addr(const struct carp_softc *sc, const struct in_addr *addr)
1247 {
1248         struct carp_vhaddr *vha;
1249
1250         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
1251                 if (vha->vha_iaback == NULL)
1252                         continue;
1253
1254                 if (vha->vha_ia->ia_addr.sin_addr.s_addr == addr->s_addr)
1255                         return vha;
1256         }
1257         return NULL;
1258 }
1259
#ifdef notyet
/*
 * ARP load balancing (not compiled in yet).
 *
 * XXX proof of concept implementation.
 * We use the source ip to decide which virtual host should handle the
 * request.  If we're master of that virtual host, then we respond,
 * otherwise, just drop the arp packet on the floor.
 *
 * Returns 1 and stores the link-level address of the chosen virtual
 * host in *enaddr if we should answer, 0 otherwise.
 */
static int
carp_iamatch_balance(const struct carp_if *cif, const struct in_addr *itaddr,
		     const struct in_addr *isaddr, uint8_t **enaddr)
{
	const struct carp_softc *vh;
	int target, seen = 0;

	/* First pass: count the running virtual hosts owning itaddr. */
	TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
		if (CARP_IS_RUNNING(&vh->sc_if) &&
		    carp_find_addr(vh, itaddr) != NULL)
			seen++;
	}
	if (seen == 0)
		return 0;

	/* this should be a hash, like pf_hash() */
	target = ntohl(isaddr->s_addr) % seen;
	seen = 0;

	/* Second pass: find the target'th candidate. */
	TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
		if (!CARP_IS_RUNNING(&vh->sc_if) ||
		    carp_find_addr(vh, itaddr) == NULL)
			continue;

		if (seen++ != target)
			continue;

		if (vh->sc_state != MASTER)
			return 0;

		*enaddr = IF_LLADDR(&vh->sc_if);
		return 1;
	}
	return 0;
}
#endif
1310
1311 int
1312 carp_iamatch(const struct in_ifaddr *ia)
1313 {
1314         const struct carp_softc *sc = ia->ia_ifp->if_softc;
1315
1316         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
1317
1318 #ifdef notyet
1319         if (carp_opts[CARPCTL_ARPBALANCE])
1320                 return carp_iamatch_balance(cif, itaddr, isaddr, enaddr);
1321 #endif
1322
1323         if (!CARP_IS_RUNNING(&sc->sc_if) || sc->sc_state != MASTER)
1324                 return 0;
1325
1326         return 1;
1327 }
1328
1329 #ifdef INET6
1330 struct ifaddr *
1331 carp_iamatch6(void *v, struct in6_addr *taddr)
1332 {
1333         struct carp_if *cif = v;
1334         struct carp_softc *vh;
1335
1336         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
1337
1338         TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
1339                 struct ifaddr_container *ifac;
1340
1341                 TAILQ_FOREACH(ifac, &vh->sc_if.if_addrheads[mycpuid],
1342                               ifa_link) {
1343                         struct ifaddr *ifa = ifac->ifa;
1344
1345                         if (IN6_ARE_ADDR_EQUAL(taddr,
1346                             &ifatoia6(ifa)->ia_addr.sin6_addr) &&
1347                             CARP_IS_RUNNING(&vh->sc_if) &&
1348                             vh->sc_state == MASTER) {
1349                                 return (ifa);
1350                         }
1351                 }
1352         }
1353         return (NULL);
1354 }
1355
1356 void *
1357 carp_macmatch6(void *v, struct mbuf *m, const struct in6_addr *taddr)
1358 {
1359         struct m_tag *mtag;
1360         struct carp_if *cif = v;
1361         struct carp_softc *sc;
1362
1363         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
1364
1365         TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
1366                 struct ifaddr_container *ifac;
1367
1368                 TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid],
1369                               ifa_link) {
1370                         struct ifaddr *ifa = ifac->ifa;
1371
1372                         if (IN6_ARE_ADDR_EQUAL(taddr,
1373                             &ifatoia6(ifa)->ia_addr.sin6_addr) &&
1374                             CARP_IS_RUNNING(&sc->sc_if)) {
1375                                 struct ifnet *ifp = &sc->sc_if;
1376
1377                                 mtag = m_tag_get(PACKET_TAG_CARP,
1378                                     sizeof(struct ifnet *), MB_DONTWAIT);
1379                                 if (mtag == NULL) {
1380                                         /* better a bit than nothing */
1381                                         return (IF_LLADDR(ifp));
1382                                 }
1383                                 bcopy(&ifp, (caddr_t)(mtag + 1),
1384                                     sizeof(struct ifnet *));
1385                                 m_tag_prepend(m, mtag);
1386
1387                                 return (IF_LLADDR(ifp));
1388                         }
1389                 }
1390         }
1391         return (NULL);
1392 }
1393 #endif
1394
1395 static struct ifnet *
1396 carp_forus(struct carp_if *cif, const uint8_t *dhost)
1397 {
1398         struct carp_softc *sc;
1399
1400         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
1401
1402         if (memcmp(dhost, carp_etheraddr, ETHER_ADDR_LEN - 1) != 0)
1403                 return NULL;
1404
1405         TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
1406                 struct ifnet *ifp = &sc->sc_if;
1407
1408                 if (CARP_IS_RUNNING(ifp) && sc->sc_state == MASTER &&
1409                     !bcmp(dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN))
1410                         return ifp;
1411         }
1412         return NULL;
1413 }
1414
1415 static void
1416 carp_master_down_timeout(void *xsc)
1417 {
1418         struct carp_softc *sc = xsc;
1419         struct netmsg_carp *cmsg = &sc->sc_md_msg;
1420
1421         KASSERT(mycpuid == 0, ("%s not on cpu0 but on cpu%d\n",
1422             __func__, mycpuid));
1423
1424         crit_enter();
1425         if (cmsg->base.lmsg.ms_flags & MSGF_DONE)
1426                 lwkt_sendmsg(cpu_portfn(0), &cmsg->base.lmsg);
1427         crit_exit();
1428 }
1429
1430 static void
1431 carp_master_down_timeout_dispatch(netmsg_t msg)
1432 {
1433         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
1434         struct carp_softc *sc = cmsg->nc_softc;
1435
1436         /* Reply ASAP */
1437         crit_enter();
1438         lwkt_replymsg(&cmsg->base.lmsg, 0);
1439         crit_exit();
1440
1441         CARP_DEBUG("%s: BACKUP -> MASTER (master timed out)\n",
1442                    sc->sc_if.if_xname);
1443         carp_gettok();
1444         carp_master_down(sc);
1445         carp_reltok();
1446 }
1447
1448 static void
1449 carp_master_down(struct carp_softc *sc)
1450 {
1451         switch (sc->sc_state) {
1452         case INIT:
1453                 kprintf("%s: master_down event in INIT state\n",
1454                         sc->sc_if.if_xname);
1455                 break;
1456
1457         case MASTER:
1458                 break;
1459
1460         case BACKUP:
1461                 carp_set_state(sc, MASTER);
1462                 carp_send_ad(sc);
1463                 carp_send_arp(sc);
1464 #ifdef INET6
1465                 carp_send_na(sc);
1466 #endif /* INET6 */
1467                 carp_setrun(sc, 0);
1468                 carp_setroute(sc, RTM_ADD);
1469                 break;
1470         }
1471 }
1472
1473 /*
1474  * When in backup state, af indicates whether to reset the master down timer
1475  * for v4 or v6. If it's set to zero, reset the ones which are already pending.
1476  */
1477 static void
1478 carp_setrun(struct carp_softc *sc, sa_family_t af)
1479 {
1480         struct ifnet *cifp = &sc->sc_if;
1481         struct timeval tv;
1482
1483         if (sc->sc_carpdev == NULL) {
1484                 carp_set_state(sc, INIT);
1485                 return;
1486         }
1487
1488         if ((cifp->if_flags & IFF_RUNNING) && sc->sc_vhid > 0 &&
1489             (sc->sc_naddrs || sc->sc_naddrs6)) {
1490                 /* Nothing */
1491         } else {
1492                 carp_setroute(sc, RTM_DELETE);
1493                 return;
1494         }
1495
1496         switch (sc->sc_state) {
1497         case INIT:
1498                 if (carp_opts[CARPCTL_PREEMPT] && !carp_suppress_preempt) {
1499                         carp_send_ad(sc);
1500                         carp_send_arp(sc);
1501 #ifdef INET6
1502                         carp_send_na(sc);
1503 #endif /* INET6 */
1504                         CARP_DEBUG("%s: INIT -> MASTER (preempting)\n",
1505                                    cifp->if_xname);
1506                         carp_set_state(sc, MASTER);
1507                         carp_setroute(sc, RTM_ADD);
1508                 } else {
1509                         CARP_DEBUG("%s: INIT -> BACKUP\n", cifp->if_xname);
1510                         carp_set_state(sc, BACKUP);
1511                         carp_setroute(sc, RTM_DELETE);
1512                         carp_setrun(sc, 0);
1513                 }
1514                 break;
1515
1516         case BACKUP:
1517                 callout_stop(&sc->sc_ad_tmo);
1518                 tv.tv_sec = 3 * sc->sc_advbase;
1519                 tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1520                 switch (af) {
1521 #ifdef INET
1522                 case AF_INET:
1523                         callout_reset(&sc->sc_md_tmo, tvtohz_high(&tv),
1524                             carp_master_down_timeout, sc);
1525                         break;
1526 #endif /* INET */
1527 #ifdef INET6
1528                 case AF_INET6:
1529                         callout_reset(&sc->sc_md6_tmo, tvtohz_high(&tv),
1530                             carp_master_down_timeout, sc);
1531                         break;
1532 #endif /* INET6 */
1533                 default:
1534                         if (sc->sc_naddrs)
1535                                 callout_reset(&sc->sc_md_tmo, tvtohz_high(&tv),
1536                                     carp_master_down_timeout, sc);
1537                         if (sc->sc_naddrs6)
1538                                 callout_reset(&sc->sc_md6_tmo, tvtohz_high(&tv),
1539                                     carp_master_down_timeout, sc);
1540                         break;
1541                 }
1542                 break;
1543
1544         case MASTER:
1545                 tv.tv_sec = sc->sc_advbase;
1546                 tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1547                 callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
1548                     carp_send_ad_timeout, sc);
1549                 break;
1550         }
1551 }
1552
1553 static void
1554 carp_multicast_cleanup(struct carp_softc *sc)
1555 {
1556         struct ip_moptions *imo = &sc->sc_imo;
1557
1558         if (imo->imo_num_memberships == 0)
1559                 return;
1560         KKASSERT(imo->imo_num_memberships == 1);
1561
1562         in_delmulti(imo->imo_membership[0]);
1563         imo->imo_membership[0] = NULL;
1564         imo->imo_num_memberships = 0;
1565         imo->imo_multicast_ifp = NULL;
1566 }
1567
#ifdef INET6
/*
 * Leave every IPv6 multicast group recorded in the multicast options
 * of sc and clear their multicast interface.
 */
static void
carp_multicast6_cleanup(struct carp_softc *sc)
{
	struct ip6_moptions *im6o = &sc->sc_im6o;
	struct in6_multi_mship *imm;

	/* Unlink each membership from the list before leaving the group. */
	while ((imm = LIST_FIRST(&im6o->im6o_memberships)) != NULL) {
		LIST_REMOVE(imm, i6mm_chain);
		in6_leavegroup(imm);
	}
	im6o->im6o_multicast_ifp = NULL;
}
#endif
1584
1585 static void
1586 carp_ioctl_getvhaddr_dispatch(netmsg_t msg)
1587 {
1588         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
1589         struct carp_softc *sc = cmsg->nc_softc;
1590         const struct carp_vhaddr *vha;
1591         struct ifcarpvhaddr *carpa, *carpa0;
1592         int count, len, error = 0;
1593
1594         carp_gettok();
1595
1596         count = 0;
1597         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
1598                 ++count;
1599
1600         if (cmsg->nc_datalen == 0) {
1601                 cmsg->nc_datalen = count * sizeof(*carpa);
1602                 goto back;
1603         } else if (count == 0 || cmsg->nc_datalen < sizeof(*carpa)) {
1604                 cmsg->nc_datalen = 0;
1605                 goto back;
1606         }
1607         len = min(cmsg->nc_datalen, sizeof(*carpa) * count);
1608         KKASSERT(len >= sizeof(*carpa));
1609
1610         carpa0 = carpa = kmalloc(len, M_TEMP, M_WAITOK | M_NULLOK | M_ZERO);
1611         if (carpa == NULL) {
1612                 error = ENOMEM; 
1613                 goto back;
1614         }
1615
1616         count = 0;
1617         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
1618                 if (len < sizeof(*carpa))
1619                         break;
1620
1621                 carpa->carpa_flags = vha->vha_flags;
1622                 carpa->carpa_addr.sin_family = AF_INET;
1623                 carpa->carpa_addr.sin_addr = vha->vha_ia->ia_addr.sin_addr;
1624
1625                 carpa->carpa_baddr.sin_family = AF_INET;
1626                 if (vha->vha_iaback == NULL) {
1627                         carpa->carpa_baddr.sin_addr.s_addr = INADDR_ANY;
1628                 } else {
1629                         carpa->carpa_baddr.sin_addr =
1630                         vha->vha_iaback->ia_addr.sin_addr;
1631                 }
1632
1633                 ++carpa;
1634                 ++count;
1635                 len -= sizeof(*carpa);
1636         }
1637         cmsg->nc_datalen = sizeof(*carpa) * count;
1638         KKASSERT(cmsg->nc_datalen > 0);
1639
1640         cmsg->nc_data = carpa0;
1641
1642 back:
1643         carp_reltok();
1644         lwkt_replymsg(&cmsg->base.lmsg, error);
1645 }
1646
1647 static int
1648 carp_ioctl_getvhaddr(struct carp_softc *sc, struct ifdrv *ifd)
1649 {
1650         struct ifnet *ifp = &sc->arpcom.ac_if;
1651         struct netmsg_carp cmsg;
1652         int error;
1653
1654         ASSERT_IFNET_SERIALIZED_ALL(ifp);
1655         ifnet_deserialize_all(ifp);
1656
1657         bzero(&cmsg, sizeof(cmsg));
1658         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
1659             carp_ioctl_getvhaddr_dispatch);
1660         cmsg.nc_softc = sc;
1661         cmsg.nc_datalen = ifd->ifd_len;
1662
1663         error = lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
1664
1665         if (!error) {
1666                 if (cmsg.nc_data != NULL) {
1667                         error = copyout(cmsg.nc_data, ifd->ifd_data,
1668                             cmsg.nc_datalen);
1669                         kfree(cmsg.nc_data, M_TEMP);
1670                 }
1671                 ifd->ifd_len = cmsg.nc_datalen;
1672         } else {
1673                 KASSERT(cmsg.nc_data == NULL,
1674                     ("%s temp vhaddr is alloc upon error\n", __func__));
1675         }
1676
1677         ifnet_serialize_all(ifp);
1678         return error;
1679 }
1680
1681 static int
1682 carp_config_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha,
1683     struct in_ifaddr *ia_del)
1684 {
1685         struct ifnet *ifp;
1686         struct in_ifaddr *ia_if;
1687         struct in_ifaddr_container *iac;
1688         const struct sockaddr_in *sin;
1689         u_long iaddr;
1690         int own;
1691
1692         KKASSERT(vha->vha_ia != NULL);
1693
1694         sin = &vha->vha_ia->ia_addr;
1695         iaddr = ntohl(sin->sin_addr.s_addr);
1696
1697         ia_if = NULL;
1698         own = 0;
1699         TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) {
1700                 struct in_ifaddr *ia = iac->ia;
1701
1702                 if (ia == ia_del)
1703                         continue;
1704
1705                 if (ia->ia_ifp->if_type == IFT_CARP)
1706                         continue;
1707
1708                 if ((ia->ia_ifp->if_flags & IFF_UP) == 0)
1709                         continue;
1710
1711                 /* and, yeah, we need a multicast-capable iface too */
1712                 if ((ia->ia_ifp->if_flags & IFF_MULTICAST) == 0)
1713                         continue;
1714
1715                 if ((iaddr & ia->ia_subnetmask) == ia->ia_subnet) {
1716                         if (sin->sin_addr.s_addr ==
1717                             ia->ia_addr.sin_addr.s_addr)
1718                                 own = 1;
1719                         if (ia_if == NULL)
1720                                 ia_if = ia;
1721                         else if (sc->sc_carpdev != NULL &&
1722                                  sc->sc_carpdev == ia->ia_ifp)
1723                                 ia_if = ia;
1724                 }
1725         }
1726
1727         carp_deactivate_vhaddr(sc, vha, FALSE);
1728         if (!ia_if)
1729                 return ENOENT;
1730
1731         ifp = ia_if->ia_ifp;
1732
1733         /* XXX Don't allow parent iface to be changed */
1734         if (sc->sc_carpdev != NULL && sc->sc_carpdev != ifp)
1735                 return EEXIST;
1736
1737         return carp_activate_vhaddr(sc, vha, ifp, ia_if, own);
1738 }
1739
1740 static void
1741 carp_add_addr(struct carp_softc *sc, struct ifaddr *carp_ifa)
1742 {
1743         struct carp_vhaddr *vha_new;
1744         struct in_ifaddr *carp_ia;
1745 #ifdef INVARIANTS
1746         struct carp_vhaddr *vha;
1747 #endif
1748
1749         KKASSERT(carp_ifa->ifa_addr->sa_family == AF_INET);
1750         carp_ia = ifatoia(carp_ifa);
1751
1752 #ifdef INVARIANTS
1753         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
1754                 KKASSERT(vha->vha_ia != NULL && vha->vha_ia != carp_ia);
1755 #endif
1756
1757         vha_new = kmalloc(sizeof(*vha_new), M_CARP, M_WAITOK | M_ZERO);
1758         vha_new->vha_ia = carp_ia;
1759         carp_insert_vhaddr(sc, vha_new);
1760
1761         if (carp_config_vhaddr(sc, vha_new, NULL) != 0) {
1762                 /*
1763                  * If the above configuration fails, it may only mean
1764                  * that the new address is problematic.  However, the
1765                  * carp(4) interface may already have several working
1766                  * addresses.  Since the expected behaviour of
1767                  * SIOC[AS]IFADDR is to put the NIC into working state,
1768                  * we try starting the state machine manually here with
1769                  * the hope that the carp(4)'s previously working
1770                  * addresses still could be brought up.
1771                  */
1772                 carp_hmac_prepare(sc);
1773                 carp_set_state(sc, INIT);
1774                 carp_setrun(sc, 0);
1775         }
1776 }
1777
1778 static void
1779 carp_del_addr(struct carp_softc *sc, struct ifaddr *carp_ifa)
1780 {
1781         struct carp_vhaddr *vha;
1782         struct in_ifaddr *carp_ia;
1783
1784         KKASSERT(carp_ifa->ifa_addr->sa_family == AF_INET);
1785         carp_ia = ifatoia(carp_ifa);
1786
1787         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
1788                 KKASSERT(vha->vha_ia != NULL);
1789                 if (vha->vha_ia == carp_ia)
1790                         break;
1791         }
1792         KASSERT(vha != NULL, ("no corresponding vhaddr %p\n", carp_ifa));
1793
1794         /*
1795          * Remove the vhaddr from the list before deactivating
1796          * the vhaddr, so that the HMAC could be correctly
1797          * updated in carp_deactivate_vhaddr()
1798          */
1799         carp_remove_vhaddr(sc, vha);
1800
1801         carp_deactivate_vhaddr(sc, vha, FALSE);
1802         kfree(vha, M_CARP);
1803 }
1804
1805 static void
1806 carp_config_addr(struct carp_softc *sc, struct ifaddr *carp_ifa)
1807 {
1808         struct carp_vhaddr *vha;
1809         struct in_ifaddr *carp_ia;
1810
1811         KKASSERT(carp_ifa->ifa_addr->sa_family == AF_INET);
1812         carp_ia = ifatoia(carp_ifa);
1813
1814         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
1815                 KKASSERT(vha->vha_ia != NULL);
1816                 if (vha->vha_ia == carp_ia)
1817                         break;
1818         }
1819         KASSERT(vha != NULL, ("no corresponding vhaddr %p\n", carp_ifa));
1820
1821         /* Remove then reinsert, to keep the vhaddr list sorted */
1822         carp_remove_vhaddr(sc, vha);
1823         carp_insert_vhaddr(sc, vha);
1824
1825         if (carp_config_vhaddr(sc, vha, NULL) != 0) {
1826                 /* See the comment in carp_add_addr() */
1827                 carp_hmac_prepare(sc);
1828                 carp_set_state(sc, INIT);
1829                 carp_setrun(sc, 0);
1830         }
1831 }
1832
1833 #ifdef notyet
1834
1835 #ifdef INET6
1836 static int
1837 carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
1838 {
1839         struct ifnet *ifp;
1840         struct carp_if *cif;
1841         struct in6_ifaddr *ia, *ia_if;
1842         struct ip6_moptions *im6o = &sc->sc_im6o;
1843         struct in6_multi_mship *imm;
1844         struct in6_addr in6;
1845         int own, error;
1846
1847         if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
1848                 carp_setrun(sc, 0);
1849                 return (0);
1850         }
1851
1852         /* we have to do it by hands to check we won't match on us */
1853         ia_if = NULL; own = 0;
1854         for (ia = in6_ifaddr; ia; ia = ia->ia_next) {
1855                 int i;
1856
1857                 for (i = 0; i < 4; i++) {
1858                         if ((sin6->sin6_addr.s6_addr32[i] &
1859                             ia->ia_prefixmask.sin6_addr.s6_addr32[i]) !=
1860                             (ia->ia_addr.sin6_addr.s6_addr32[i] &
1861                             ia->ia_prefixmask.sin6_addr.s6_addr32[i]))
1862                                 break;
1863                 }
1864                 /* and, yeah, we need a multicast-capable iface too */
1865                 if (ia->ia_ifp != &sc->sc_if &&
1866                     (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
1867                     (i == 4)) {
1868                         if (!ia_if)
1869                                 ia_if = ia;
1870                         if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
1871                             &ia->ia_addr.sin6_addr))
1872                                 own++;
1873                 }
1874         }
1875
1876         if (!ia_if)
1877                 return (EADDRNOTAVAIL);
1878         ia = ia_if;
1879         ifp = ia->ia_ifp;
1880
1881         if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
1882             (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp))
1883                 return (EADDRNOTAVAIL);
1884
1885         if (!sc->sc_naddrs6) {
1886                 im6o->im6o_multicast_ifp = ifp;
1887
1888                 /* join CARP multicast address */
1889                 bzero(&in6, sizeof(in6));
1890                 in6.s6_addr16[0] = htons(0xff02);
1891                 in6.s6_addr8[15] = 0x12;
1892                 if (in6_setscope(&in6, ifp, NULL) != 0)
1893                         goto cleanup;
1894                 if ((imm = in6_joingroup(ifp, &in6, &error)) == NULL)
1895                         goto cleanup;
1896                 LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
1897
1898                 /* join solicited multicast address */
1899                 bzero(&in6, sizeof(in6));
1900                 in6.s6_addr16[0] = htons(0xff02);
1901                 in6.s6_addr32[1] = 0;
1902                 in6.s6_addr32[2] = htonl(1);
1903                 in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3];
1904                 in6.s6_addr8[12] = 0xff;
1905                 if (in6_setscope(&in6, ifp, NULL) != 0)
1906                         goto cleanup;
1907                 if ((imm = in6_joingroup(ifp, &in6, &error)) == NULL)
1908                         goto cleanup;
1909                 LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
1910         }
1911
1912         if (!ifp->if_carp) {
1913                 cif = kmalloc(sizeof(*cif), M_CARP, M_WAITOK | M_ZERO);
1914
1915                 if ((error = ifpromisc(ifp, 1))) {
1916                         kfree(cif, M_CARP);
1917                         goto cleanup;
1918                 }
1919
1920                 TAILQ_INIT(&cif->vhif_vrs);
1921                 ifp->if_carp = cif;
1922         } else {
1923                 struct carp_softc *vr;
1924
1925                 cif = ifp->if_carp;
1926                 TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
1927                         if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
1928                                 error = EINVAL;
1929                                 goto cleanup;
1930                         }
1931                 }
1932         }
1933         sc->sc_ia6 = ia;
1934         sc->sc_carpdev = ifp;
1935
1936         { /* XXX prevent endless loop if already in queue */
1937         struct carp_softc *vr, *after = NULL;
1938         int myself = 0;
1939         cif = ifp->if_carp;
1940
1941         TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
1942                 if (vr == sc)
1943                         myself = 1;
1944                 if (vr->sc_vhid < sc->sc_vhid)
1945                         after = vr;
1946         }
1947
1948         if (!myself) {
1949                 /* We're trying to keep things in order */
1950                 if (after == NULL)
1951                         TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
1952                 else
1953                         TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
1954         }
1955         }
1956
1957         sc->sc_naddrs6++;
1958         if (own)
1959                 sc->sc_advskew = 0;
1960         carp_sc_state(sc);
1961         carp_setrun(sc, 0);
1962
1963         return (0);
1964
1965 cleanup:
1966         /* clean up multicast memberships */
1967         if (!sc->sc_naddrs6) {
1968                 while (!LIST_EMPTY(&im6o->im6o_memberships)) {
1969                         imm = LIST_FIRST(&im6o->im6o_memberships);
1970                         LIST_REMOVE(imm, i6mm_chain);
1971                         in6_leavegroup(imm);
1972                 }
1973         }
1974         return (error);
1975 }
1976
1977 static int
1978 carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
1979 {
1980         int error = 0;
1981
1982         if (!--sc->sc_naddrs6) {
1983                 struct carp_if *cif = sc->sc_carpdev->if_carp;
1984                 struct ip6_moptions *im6o = &sc->sc_im6o;
1985
1986                 callout_stop(&sc->sc_ad_tmo);
1987                 sc->sc_vhid = -1;
1988                 while (!LIST_EMPTY(&im6o->im6o_memberships)) {
1989                         struct in6_multi_mship *imm =
1990                             LIST_FIRST(&im6o->im6o_memberships);
1991
1992                         LIST_REMOVE(imm, i6mm_chain);
1993                         in6_leavegroup(imm);
1994                 }
1995                 im6o->im6o_multicast_ifp = NULL;
1996                 TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
1997                 if (TAILQ_EMPTY(&cif->vhif_vrs)) {
1998                         sc->sc_carpdev->if_carp = NULL;
1999                         kfree(cif, M_IFADDR);
2000                 }
2001         }
2002         return (error);
2003 }
2004 #endif /* INET6 */
2005
2006 #endif
2007
2008 static int
2009 carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr, struct ucred *cr)
2010 {
2011         struct carp_softc *sc = ifp->if_softc;
2012         struct ifreq *ifr = (struct ifreq *)addr;
2013         struct ifdrv *ifd = (struct ifdrv *)addr;
2014         int error = 0;
2015
2016         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2017
2018         carp_gettok();
2019
2020         switch (cmd) {
2021         case SIOCSIFFLAGS:
2022                 if (ifp->if_flags & IFF_UP) {
2023                         if ((ifp->if_flags & IFF_RUNNING) == 0)
2024                                 carp_init(sc);
2025                 } else if (ifp->if_flags & IFF_RUNNING) {
2026                         carp_ioctl_stop(sc);
2027                 }
2028                 break;
2029
2030         case SIOCSVH:
2031                 error = carp_ioctl_setvh(sc, ifr->ifr_data, cr);
2032                 break;
2033
2034         case SIOCGVH:
2035                 error = carp_ioctl_getvh(sc, ifr->ifr_data, cr);
2036                 break;
2037
2038         case SIOCGDRVSPEC:
2039                 switch (ifd->ifd_cmd) {
2040                 case CARPGDEVNAME:
2041                         error = carp_ioctl_getdevname(sc, ifd);
2042                         break;
2043
2044                 case CARPGVHADDR:
2045                         error = carp_ioctl_getvhaddr(sc, ifd);
2046                         break;
2047
2048                 default:
2049                         error = EINVAL;
2050                         break;
2051                 }
2052                 break;
2053
2054         default:
2055                 error = ether_ioctl(ifp, cmd, addr);
2056                 break;
2057         }
2058
2059         carp_reltok();
2060         return error;
2061 }
2062
2063 static void
2064 carp_ioctl_stop_dispatch(netmsg_t msg)
2065 {
2066         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
2067         struct carp_softc *sc = cmsg->nc_softc;
2068
2069         carp_gettok();
2070         carp_stop(sc, 0);
2071         carp_reltok();
2072
2073         lwkt_replymsg(&cmsg->base.lmsg, 0);
2074 }
2075
2076 static void
2077 carp_ioctl_stop(struct carp_softc *sc)
2078 {
2079         struct ifnet *ifp = &sc->arpcom.ac_if;
2080         struct netmsg_carp cmsg;
2081
2082         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2083
2084         ifnet_deserialize_all(ifp);
2085
2086         bzero(&cmsg, sizeof(cmsg));
2087         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
2088             carp_ioctl_stop_dispatch);
2089         cmsg.nc_softc = sc;
2090
2091         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
2092
2093         ifnet_serialize_all(ifp);
2094 }
2095
2096 static void
2097 carp_ioctl_setvh_dispatch(netmsg_t msg)
2098 {
2099         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
2100         struct carp_softc *sc = cmsg->nc_softc, *vr;
2101         struct ifnet *ifp = &sc->arpcom.ac_if;
2102         const struct carpreq *carpr = cmsg->nc_data;
2103         int error;
2104
2105         carp_gettok();
2106
2107         error = 1;
2108         if ((ifp->if_flags & IFF_RUNNING) &&
2109             sc->sc_state != INIT && carpr->carpr_state != sc->sc_state) {
2110                 switch (carpr->carpr_state) {
2111                 case BACKUP:
2112                         callout_stop(&sc->sc_ad_tmo);
2113                         carp_set_state(sc, BACKUP);
2114                         carp_setrun(sc, 0);
2115                         carp_setroute(sc, RTM_DELETE);
2116                         break;
2117
2118                 case MASTER:
2119                         carp_master_down(sc);
2120                         break;
2121
2122                 default:
2123                         break;
2124                 }
2125         }
2126         if (carpr->carpr_vhid > 0) {
2127                 if (carpr->carpr_vhid > 255) {
2128                         error = EINVAL;
2129                         goto back;
2130                 }
2131                 if (sc->sc_carpdev) {
2132                         struct carp_if *cif = sc->sc_carpdev->if_carp;
2133
2134                         TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
2135                                 if (vr != sc &&
2136                                     vr->sc_vhid == carpr->carpr_vhid) {
2137                                         error = EEXIST;
2138                                         goto back;
2139                                 }
2140                         }
2141                 }
2142                 sc->sc_vhid = carpr->carpr_vhid;
2143
2144                 IF_LLADDR(ifp)[5] = sc->sc_vhid;
2145                 bcopy(IF_LLADDR(ifp), sc->arpcom.ac_enaddr,
2146                     ETHER_ADDR_LEN);
2147
2148                 error--;
2149         }
2150         if (carpr->carpr_advbase > 0 || carpr->carpr_advskew > 0) {
2151                 if (carpr->carpr_advskew >= 255) {
2152                         error = EINVAL;
2153                         goto back;
2154                 }
2155                 if (carpr->carpr_advbase > 255) {
2156                         error = EINVAL;
2157                         goto back;
2158                 }
2159                 sc->sc_advbase = carpr->carpr_advbase;
2160                 sc->sc_advskew = carpr->carpr_advskew;
2161                 error--;
2162         }
2163         bcopy(carpr->carpr_key, sc->sc_key, sizeof(sc->sc_key));
2164         if (error > 0) {
2165                 error = EINVAL;
2166         } else {
2167                 error = 0;
2168                 carp_setrun(sc, 0);
2169         }
2170 back:
2171         carp_hmac_prepare(sc);
2172         carp_gettok();
2173
2174         lwkt_replymsg(&cmsg->base.lmsg, error);
2175 }
2176
2177 static int
2178 carp_ioctl_setvh(struct carp_softc *sc, void *udata, struct ucred *cr)
2179 {
2180         struct ifnet *ifp = &sc->arpcom.ac_if;
2181         struct netmsg_carp cmsg;
2182         struct carpreq carpr;
2183         int error;
2184
2185         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2186         ifnet_deserialize_all(ifp);
2187
2188         error = priv_check_cred(cr, PRIV_ROOT, NULL_CRED_OKAY);
2189         if (error)
2190                 goto back;
2191
2192         error = copyin(udata, &carpr, sizeof(carpr));
2193         if (error)
2194                 goto back;
2195
2196         bzero(&cmsg, sizeof(cmsg));
2197         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
2198             carp_ioctl_setvh_dispatch);
2199         cmsg.nc_softc = sc;
2200         cmsg.nc_data = &carpr;
2201
2202         error = lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
2203
2204 back:
2205         ifnet_serialize_all(ifp);
2206         return error;
2207 }
2208
2209 static void
2210 carp_ioctl_getvh_dispatch(netmsg_t msg)
2211 {
2212         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
2213         struct carp_softc *sc = cmsg->nc_softc;
2214         struct carpreq *carpr = cmsg->nc_data;
2215
2216         carp_gettok();
2217
2218         carpr->carpr_state = sc->sc_state;
2219         carpr->carpr_vhid = sc->sc_vhid;
2220         carpr->carpr_advbase = sc->sc_advbase;
2221         carpr->carpr_advskew = sc->sc_advskew;
2222         bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key));
2223
2224         carp_reltok();
2225
2226         lwkt_replymsg(&cmsg->base.lmsg, 0);
2227 }
2228
2229 static int
2230 carp_ioctl_getvh(struct carp_softc *sc, void *udata, struct ucred *cr)
2231 {
2232         struct ifnet *ifp = &sc->arpcom.ac_if;
2233         struct netmsg_carp cmsg;
2234         struct carpreq carpr;
2235         int error;
2236
2237         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2238         ifnet_deserialize_all(ifp);
2239
2240         bzero(&cmsg, sizeof(cmsg));
2241         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
2242             carp_ioctl_getvh_dispatch);
2243         cmsg.nc_softc = sc;
2244         cmsg.nc_data = &carpr;
2245
2246         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
2247
2248         error = priv_check_cred(cr, PRIV_ROOT, NULL_CRED_OKAY);
2249         if (error)
2250                 bzero(carpr.carpr_key, sizeof(carpr.carpr_key));
2251
2252         error = copyout(&carpr, udata, sizeof(carpr));
2253
2254         ifnet_serialize_all(ifp);
2255         return error;
2256 }
2257
2258 static void
2259 carp_ioctl_getdevname_dispatch(netmsg_t msg)
2260 {
2261         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
2262         struct carp_softc *sc = cmsg->nc_softc;
2263         char *devname = cmsg->nc_data;
2264
2265         bzero(devname, sizeof(devname));
2266
2267         carp_gettok();
2268         if (sc->sc_carpdev != NULL)
2269                 strlcpy(devname, sc->sc_carpdev->if_xname, sizeof(devname));
2270         carp_reltok();
2271
2272         lwkt_replymsg(&cmsg->base.lmsg, 0);
2273 }
2274
2275 static int
2276 carp_ioctl_getdevname(struct carp_softc *sc, struct ifdrv *ifd)
2277 {
2278         struct ifnet *ifp = &sc->arpcom.ac_if;
2279         struct netmsg_carp cmsg;
2280         char devname[IFNAMSIZ];
2281         int error;
2282
2283         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2284
2285         if (ifd->ifd_len != sizeof(devname))
2286                 return EINVAL;
2287
2288         ifnet_deserialize_all(ifp);
2289
2290         bzero(&cmsg, sizeof(cmsg));
2291         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
2292             carp_ioctl_getdevname_dispatch);
2293         cmsg.nc_softc = sc;
2294         cmsg.nc_data = devname;
2295
2296         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
2297
2298         error = copyout(devname, ifd->ifd_data, sizeof(devname));
2299
2300         ifnet_serialize_all(ifp);
2301         return error;
2302 }
2303
2304 static void
2305 carp_init_dispatch(netmsg_t msg)
2306 {
2307         struct netmsg_carp *cmsg = (struct netmsg_carp *)msg;
2308         struct carp_softc *sc = cmsg->nc_softc;
2309
2310         carp_gettok();
2311
2312         sc->sc_if.if_flags |= IFF_RUNNING;
2313         carp_hmac_prepare(sc);
2314         carp_set_state(sc, INIT);
2315         carp_setrun(sc, 0);
2316
2317         carp_reltok();
2318
2319         lwkt_replymsg(&cmsg->base.lmsg, 0);
2320 }
2321
2322 static void
2323 carp_init(void *xsc)
2324 {
2325         struct carp_softc *sc = xsc;
2326         struct ifnet *ifp = &sc->arpcom.ac_if;
2327         struct netmsg_carp cmsg;
2328
2329         ASSERT_IFNET_SERIALIZED_ALL(ifp);
2330
2331         ifnet_deserialize_all(ifp);
2332
2333         bzero(&cmsg, sizeof(cmsg));
2334         netmsg_init(&cmsg.base, NULL, &curthread->td_msgport, 0,
2335             carp_init_dispatch);
2336         cmsg.nc_softc = sc;
2337
2338         lwkt_domsg(cpu_portfn(0), &cmsg.base.lmsg, 0);
2339
2340         ifnet_serialize_all(ifp);
2341 }
2342
2343 static int
2344 carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
2345     struct rtentry *rt)
2346 {
2347         struct carp_softc *sc = ifp->if_softc;
2348         int error = 0;
2349
2350         carp_gettok();
2351         if (sc->sc_carpdev) {
2352                 /*
2353                  * NOTE:
2354                  * CARP's ifp is passed to backing device's
2355                  * if_output method.
2356                  */
2357                 sc->sc_carpdev->if_output(ifp, m, dst, rt);
2358         } else {
2359                 m_freem(m);
2360                 error = ENETUNREACH;
2361         }
2362         carp_reltok();
2363
2364         return error;
2365 }
2366
2367 /*
2368  * Start output on carp interface. This function should never be called.
2369  */
2370 static void
2371 carp_start(struct ifnet *ifp)
2372 {
2373         panic("%s: start called\n", ifp->if_xname);
2374 }
2375
2376 static void
2377 carp_serialize(struct ifnet *ifp __unused,
2378     enum ifnet_serialize slz __unused)
2379 {
2380 }
2381
2382 static void
2383 carp_deserialize(struct ifnet *ifp __unused,
2384     enum ifnet_serialize slz __unused)
2385 {
2386 }
2387
2388 static int
2389 carp_tryserialize(struct ifnet *ifp __unused,
2390     enum ifnet_serialize slz __unused)
2391 {
2392         return 1;
2393 }
2394
2395 #ifdef INVARIANTS
2396
2397 static void
2398 carp_serialize_assert(struct ifnet *ifp __unused,
2399     enum ifnet_serialize slz __unused, boolean_t serialized __unused)
2400 {
2401 }
2402
2403 #endif  /* INVARIANTS */
2404
2405 static void
2406 carp_set_state(struct carp_softc *sc, int state)
2407 {
2408         struct ifnet *cifp = &sc->sc_if;
2409
2410         if (sc->sc_state == state)
2411                 return;
2412         sc->sc_state = state;
2413
2414         switch (sc->sc_state) {
2415         case BACKUP:
2416                 cifp->if_link_state = LINK_STATE_DOWN;
2417                 break;
2418
2419         case MASTER:
2420                 cifp->if_link_state = LINK_STATE_UP;
2421                 break;
2422
2423         default:
2424                 cifp->if_link_state = LINK_STATE_UNKNOWN;
2425                 break;
2426         }
2427         rt_ifmsg(cifp);
2428 }
2429
2430 void
2431 carp_group_demote_adj(struct ifnet *ifp, int adj)
2432 {
2433         struct ifg_list *ifgl;
2434         int *dm;
2435
2436         carp_gettok();
2437
2438         TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
2439                 if (!strcmp(ifgl->ifgl_group->ifg_group, IFG_ALL))
2440                         continue;
2441                 dm = &ifgl->ifgl_group->ifg_carp_demoted;
2442
2443                 if (*dm + adj >= 0)
2444                         *dm += adj;
2445                 else
2446                         *dm = 0;
2447
2448                 if (adj > 0 && *dm == 1)
2449                         carp_send_ad_all();
2450                 CARP_LOG("%s demoted group %s to %d", ifp->if_xname,
2451                     ifgl->ifgl_group->ifg_group, *dm);
2452         }
2453
2454         carp_reltok();
2455 }
2456
2457 void
2458 carp_carpdev_state(void *v)
2459 {
2460         struct carp_if *cif = v;
2461         struct carp_softc *sc;
2462
2463         carp_gettok();
2464
2465         TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list)
2466                 carp_sc_state(sc);
2467
2468         carp_reltok();
2469 }
2470
2471 static void
2472 carp_sc_state(struct carp_softc *sc)
2473 {
2474         if (!(sc->sc_carpdev->if_flags & IFF_UP)) {
2475                 callout_stop(&sc->sc_ad_tmo);
2476                 callout_stop(&sc->sc_md_tmo);
2477                 callout_stop(&sc->sc_md6_tmo);
2478                 carp_set_state(sc, INIT);
2479                 carp_setrun(sc, 0);
2480                 if (!sc->sc_suppress) {
2481                         carp_suppress_preempt++;
2482                         if (carp_suppress_preempt == 1)
2483                                 carp_send_ad_all();
2484                 }
2485                 sc->sc_suppress = 1;
2486         } else {
2487                 carp_set_state(sc, INIT);
2488                 carp_setrun(sc, 0);
2489                 if (sc->sc_suppress)
2490                         carp_suppress_preempt--;
2491                 sc->sc_suppress = 0;
2492         }
2493 }
2494
2495 static void
2496 carp_stop(struct carp_softc *sc, int detach)
2497 {
2498         sc->sc_if.if_flags &= ~IFF_RUNNING;
2499
2500         callout_stop(&sc->sc_ad_tmo);
2501         callout_stop(&sc->sc_md_tmo);
2502         callout_stop(&sc->sc_md6_tmo);
2503
2504         if (!detach && sc->sc_state == MASTER)
2505                 carp_send_ad(sc);
2506
2507         if (sc->sc_suppress)
2508                 carp_suppress_preempt--;
2509         sc->sc_suppress = 0;
2510
2511         if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS)
2512                 carp_suppress_preempt--;
2513         sc->sc_sendad_errors = 0;
2514         sc->sc_sendad_success = 0;
2515
2516         carp_set_state(sc, INIT);
2517         carp_setrun(sc, 0);
2518 }
2519
2520 static void
2521 carp_suspend(struct carp_softc *sc, int detach)
2522 {
2523         struct ifnet *cifp = &sc->sc_if;
2524
2525         carp_stop(sc, detach);
2526
2527         /* Retain the running state, if we are not dead yet */
2528         if (!sc->sc_dead && (cifp->if_flags & IFF_UP))
2529                 cifp->if_flags |= IFF_RUNNING;
2530 }
2531
2532 static int
2533 carp_activate_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha,
2534     struct ifnet *ifp, struct in_ifaddr *ia_if, int own)
2535 {
2536         struct ip_moptions *imo = &sc->sc_imo;
2537         struct carp_if *cif;
2538         struct carp_softc *vr, *after = NULL;
2539         int onlist, error;
2540 #ifdef INVARIANTS
2541         int assert_onlist;
2542 #endif
2543
2544         KKASSERT(vha->vha_ia != NULL);
2545
2546         KASSERT(ia_if != NULL, ("NULL backing address\n"));
2547         KASSERT(vha->vha_iaback == NULL, ("%p is already activated\n", vha));
2548         KASSERT((vha->vha_flags & CARP_VHAF_OWNER) == 0,
2549                 ("inactive vhaddr %p is the address owner\n", vha));
2550
2551         KASSERT(sc->sc_carpdev == NULL || sc->sc_carpdev == ifp,
2552                 ("%s is already on %s\n", sc->sc_if.if_xname,
2553                  sc->sc_carpdev->if_xname));
2554
2555         if (!ifp->if_carp) {
2556                 KASSERT(sc->sc_carpdev == NULL,
2557                         ("%s is already on %s\n", sc->sc_if.if_xname,
2558                          sc->sc_carpdev->if_xname));
2559
2560                 cif = kmalloc(sizeof(*cif), M_CARP, M_WAITOK | M_ZERO);
2561
2562                 error = ifpromisc(ifp, 1);
2563                 if (error) {
2564                         kfree(cif, M_CARP);
2565                         return error;
2566                 }
2567
2568                 TAILQ_INIT(&cif->vhif_vrs);
2569                 ifp->if_carp = cif;
2570         } else {
2571                 cif = ifp->if_carp;
2572                 TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
2573                         if (vr != sc && vr->sc_vhid == sc->sc_vhid)
2574                                 return EINVAL;
2575                 }
2576         }
2577
2578 #ifdef INVARIANTS
2579         if (sc->sc_carpdev != NULL)
2580                 assert_onlist = 1;
2581         else
2582                 assert_onlist = 0;
2583 #endif
2584         sc->sc_ia = ia_if;
2585         sc->sc_carpdev = ifp;
2586
2587         cif = ifp->if_carp;
2588         onlist = 0;
2589         TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
2590                 if (vr == sc)
2591                         onlist = 1;
2592                 if (vr->sc_vhid < sc->sc_vhid)
2593                         after = vr;
2594         }
2595
2596 #ifdef INVARIANTS
2597         if (assert_onlist) {
2598                 KASSERT(onlist, ("%s is not on %s carp list\n",
2599                         sc->sc_if.if_xname, ifp->if_xname));
2600         } else {
2601                 KASSERT(!onlist, ("%s is already on %s carp list\n",
2602                         sc->sc_if.if_xname, ifp->if_xname));
2603         }
2604 #endif
2605
2606         if (!onlist) {
2607                 /* We're trying to keep things in order */
2608                 if (after == NULL)
2609                         TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
2610                 else
2611                         TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
2612         }
2613
2614         vha->vha_iaback = ia_if;
2615         sc->sc_naddrs++;
2616
2617         if (own) {
2618                 vha->vha_flags |= CARP_VHAF_OWNER;
2619
2620                 /* XXX save user configured advskew? */
2621                 sc->sc_advskew = 0;
2622         }
2623
2624         carp_addroute_vhaddr(sc, vha);
2625
2626         /*
2627          * Join the multicast group only after the backing interface
2628          * has been hooked with the CARP interface.
2629          */
2630         KASSERT(imo->imo_multicast_ifp == NULL ||
2631                 imo->imo_multicast_ifp == &sc->sc_if,
2632                 ("%s didn't leave mcast group on %s\n",
2633                  sc->sc_if.if_xname, imo->imo_multicast_ifp->if_xname));
2634
2635         if (imo->imo_num_memberships == 0) {
2636                 struct in_addr addr;
2637
2638                 addr.s_addr = htonl(INADDR_CARP_GROUP);
2639                 imo->imo_membership[0] = in_addmulti(&addr, &sc->sc_if);
2640                 if (imo->imo_membership[0] == NULL) {
2641                         carp_deactivate_vhaddr(sc, vha, FALSE);
2642                         return ENOBUFS;
2643                 }
2644
2645                 imo->imo_num_memberships++;
2646                 imo->imo_multicast_ifp = &sc->sc_if;
2647                 imo->imo_multicast_ttl = CARP_DFLTTL;
2648                 imo->imo_multicast_loop = 0;
2649         }
2650
2651         carp_hmac_prepare(sc);
2652         carp_set_state(sc, INIT);
2653         carp_setrun(sc, 0);
2654         return 0;
2655 }
2656
2657 static void
2658 carp_deactivate_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha,
2659     boolean_t del_iaback)
2660 {
2661         KKASSERT(vha->vha_ia != NULL);
2662
2663         carp_hmac_prepare(sc);
2664
2665         if (vha->vha_iaback == NULL) {
2666                 KASSERT((vha->vha_flags & CARP_VHAF_OWNER) == 0,
2667                         ("inactive vhaddr %p is the address owner\n", vha));
2668                 return;
2669         }
2670
2671         vha->vha_flags &= ~CARP_VHAF_OWNER;
2672         carp_delroute_vhaddr(sc, vha, del_iaback);
2673
2674         KKASSERT(sc->sc_naddrs > 0);
2675         vha->vha_iaback = NULL;
2676         sc->sc_naddrs--;
2677         if (!sc->sc_naddrs) {
2678                 if (sc->sc_naddrs6) {
2679                         carp_multicast_cleanup(sc);
2680                         sc->sc_ia = NULL;
2681                 } else {
2682                         carp_detach(sc, 0, del_iaback);
2683                 }
2684         }
2685 }
2686
2687 static void
2688 carp_link_addrs(struct carp_softc *sc, struct ifnet *ifp, struct ifaddr *ifa_if)
2689 {
2690         struct carp_vhaddr *vha;
2691         struct in_ifaddr *ia_if;
2692
2693         KKASSERT(ifa_if->ifa_addr->sa_family == AF_INET);
2694         ia_if = ifatoia(ifa_if);
2695
2696         /*
2697          * Test each inactive vhaddr against the newly added address.
2698          * If the newly added address could be the backing address,
2699          * then activate the matching vhaddr.
2700          */
2701         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
2702                 const struct in_ifaddr *ia;
2703                 u_long iaddr;
2704                 int own;
2705
2706                 if (vha->vha_iaback != NULL)
2707                         continue;
2708
2709                 ia = vha->vha_ia;
2710                 iaddr = ntohl(ia->ia_addr.sin_addr.s_addr);
2711
2712                 if ((iaddr & ia_if->ia_subnetmask) != ia_if->ia_subnet)
2713                         continue;
2714
2715                 own = 0;
2716                 if (ia->ia_addr.sin_addr.s_addr ==
2717                     ia_if->ia_addr.sin_addr.s_addr)
2718                         own = 1;
2719
2720                 carp_activate_vhaddr(sc, vha, ifp, ia_if, own);
2721         }
2722 }
2723
2724 static void
2725 carp_unlink_addrs(struct carp_softc *sc, struct ifnet *ifp,
2726                   struct ifaddr *ifa_if)
2727 {
2728         struct carp_vhaddr *vha;
2729         struct in_ifaddr *ia_if;
2730
2731         KKASSERT(ifa_if->ifa_addr->sa_family == AF_INET);
2732         ia_if = ifatoia(ifa_if);
2733
2734         /*
2735          * Ad src address is deleted; set it to NULL.
2736          * Following loop will try pick up a new ad src address
2737          * if one of the vhaddr could retain its backing address.
2738          */
2739         if (sc->sc_ia == ia_if)
2740                 sc->sc_ia = NULL;
2741
2742         /*
2743          * Test each active vhaddr against the deleted address.
2744          * If the deleted address is vhaddr address's backing
2745          * address, then deactivate the vhaddr.
2746          */
2747         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
2748                 if (vha->vha_iaback == NULL)
2749                         continue;
2750
2751                 if (vha->vha_iaback == ia_if)
2752                         carp_deactivate_vhaddr(sc, vha, TRUE);
2753                 else if (sc->sc_ia == NULL)
2754                         sc->sc_ia = vha->vha_iaback;
2755         }
2756 }
2757
2758 static void
2759 carp_update_addrs(struct carp_softc *sc, struct ifaddr *ifa_del)
2760 {
2761         struct carp_vhaddr *vha;
2762
2763         KKASSERT(sc->sc_carpdev == NULL);
2764
2765         TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
2766                 carp_config_vhaddr(sc, vha, ifatoia(ifa_del));
2767 }
2768
2769 static void
2770 carp_ifaddr(void *arg __unused, struct ifnet *ifp,
2771             enum ifaddr_event event, struct ifaddr *ifa)
2772 {
2773         struct carp_softc *sc;
2774
2775         carp_gettok();
2776
2777         if (ifa->ifa_addr->sa_family != AF_INET)
2778                 goto back;
2779
2780         KASSERT(&curthread->td_msgport == cpu_portfn(0),
2781             ("not in netisr0"));
2782
2783         if (ifp->if_type == IFT_CARP) {
2784                 /*
2785                  * Address is changed on carp(4) interface
2786                  */
2787                 switch (event) {
2788                 case IFADDR_EVENT_ADD:
2789                         carp_add_addr(ifp->if_softc, ifa);
2790                         break;
2791
2792                 case IFADDR_EVENT_CHANGE:
2793                         carp_config_addr(ifp->if_softc, ifa);
2794                         break;
2795
2796                 case IFADDR_EVENT_DELETE:
2797                         carp_del_addr(ifp->if_softc, ifa);
2798                         break;
2799                 }
2800                 goto back;
2801         }
2802
2803         /*
2804          * Address is changed on non-carp(4) interface
2805          */
2806         if ((ifp->if_flags & IFF_MULTICAST) == 0)
2807                 goto back;
2808
2809         LIST_FOREACH(sc, &carpif_list, sc_next) {
2810                 if (sc->sc_carpdev != NULL && sc->sc_carpdev != ifp) {
2811                         /* Not the parent iface; skip */
2812                         continue;
2813                 }
2814
2815                 switch (event) {
2816                 case IFADDR_EVENT_ADD:
2817                         carp_link_addrs(sc, ifp, ifa);
2818                         break;
2819
2820                 case IFADDR_EVENT_DELETE:
2821                         if (sc->sc_carpdev != NULL) {
2822                                 carp_unlink_addrs(sc, ifp, ifa);
2823                                 if (sc->sc_carpdev == NULL) {
2824                                         /*
2825                                          * We no longer have the parent
2826                                          * interface, however, certain
2827                                          * virtual addresses, which are
2828                                          * not used because they can't
2829                                          * match the previous parent
2830                                          * interface's addresses, may now
2831                                          * match different interface's
2832                                          * addresses.
2833                                          */
2834                                         carp_update_addrs(sc, ifa);
2835                                 }
2836                         } else {
2837                                 /*
2838                                  * The carp(4) interface didn't have a
2839                                  * parent iface, so it is not possible
2840                                  * that it will contain any address to
2841                                  * be unlinked.
2842                                  */
2843                         }
2844                         break;
2845
2846                 case IFADDR_EVENT_CHANGE:
2847                         if (sc->sc_carpdev == NULL) {
2848                                 /*
2849                                  * The carp(4) interface didn't have a
2850                                  * parent iface, so it is not possible
2851                                  * that it will contain any address to
2852                                  * be updated.
2853                                  */
2854                                 carp_link_addrs(sc, ifp, ifa);
2855                         } else {
2856                                 /*
2857                                  * First try breaking tie with the old
2858                                  * address.  Then see whether we could
2859                                  * link certain vhaddr to the new address.
2860                                  * If that fails, i.e. carpdev is NULL,
2861                                  * we try a global update.
2862                                  *
2863                                  * NOTE: The above order is critical.
2864                                  */
2865                                 carp_unlink_addrs(sc, ifp, ifa);
2866                                 carp_link_addrs(sc, ifp, ifa);
2867                                 if (sc->sc_carpdev == NULL) {
2868                                         /*
2869                                          * See the comment in the above
2870                                          * IFADDR_EVENT_DELETE block.
2871                                          */
2872                                         carp_update_addrs(sc, NULL);
2873                                 }
2874                         }
2875                         break;
2876                 }
2877         }
2878
2879 back:
2880         carp_reltok();
2881 }
2882
2883 void
2884 carp_proto_ctlinput(netmsg_t msg)
2885 {
2886         int cmd = msg->ctlinput.nm_cmd;
2887         struct sockaddr *sa = msg->ctlinput.nm_arg;
2888         struct in_ifaddr_container *iac;
2889
2890         carp_gettok();
2891
2892         TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) {
2893                 struct in_ifaddr *ia = iac->ia;
2894                 struct ifnet *ifp = ia->ia_ifp;
2895
2896                 if (ifp->if_type == IFT_CARP)
2897                         continue;
2898
2899                 if (ia->ia_ifa.ifa_addr == sa) {
2900                         if (cmd == PRC_IFDOWN) {
2901                                 carp_ifaddr(NULL, ifp, IFADDR_EVENT_DELETE,
2902                                     &ia->ia_ifa);
2903                         } else if (cmd == PRC_IFUP) {
2904                                 carp_ifaddr(NULL, ifp, IFADDR_EVENT_ADD,
2905                                     &ia->ia_ifa);
2906                         }
2907                         break;
2908                 }
2909         }
2910
2911         carp_reltok();
2912         lwkt_replymsg(&msg->lmsg, 0);
2913 }
2914
2915 void
2916 carp_gettok(void)
2917 {
2918         lwkt_gettoken(&carp_tok);
2919 }
2920
2921 void
2922 carp_reltok(void)
2923 {
2924         lwkt_reltoken(&carp_tok);
2925 }
2926
2927 struct ifnet *
2928 carp_parent(struct ifnet *cifp)
2929 {
2930         struct carp_softc *sc;
2931
2932         ASSERT_LWKT_TOKEN_HELD(&carp_tok);
2933
2934         KKASSERT(cifp->if_type == IFT_CARP);
2935         sc = cifp->if_softc;
2936
2937         return sc->sc_carpdev;
2938 }
2939
/*
 * Flags handed to rtinit() for the address x: RTF_HOST if the address
 * sits on a loopback or point-to-point interface, 0 otherwise.
 */
#define rtinitflags(x) \
	((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \
		 ? RTF_HOST : 0)
2943
2944 static int
2945 carp_addroute_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha)
2946 {
2947         struct in_ifaddr *ia, *iaback;
2948         int error;
2949
2950         if (sc->sc_state != MASTER)
2951                 return 0;
2952
2953         ia = vha->vha_ia;
2954         KKASSERT(ia != NULL);
2955
2956         iaback = vha->vha_iaback;
2957         KKASSERT(iaback != NULL);
2958
2959         rtinit(&iaback->ia_ifa, RTM_DELETE, rtinitflags(iaback));
2960         in_ifadown(&iaback->ia_ifa, 1);
2961         iaback->ia_flags &= ~IFA_ROUTE;
2962
2963         error = rtinit(&ia->ia_ifa, RTM_ADD, rtinitflags(ia) | RTF_UP);
2964         if (!error)
2965                 ia->ia_flags |= IFA_ROUTE;
2966         return error;
2967 }
2968
2969 static void
2970 carp_delroute_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha,
2971     boolean_t del_iaback)
2972 {
2973         struct in_ifaddr *ia, *iaback;
2974
2975         ia = vha->vha_ia;
2976         KKASSERT(ia != NULL);
2977
2978         iaback = vha->vha_iaback;
2979         KKASSERT(iaback != NULL);
2980
2981         rtinit(&ia->ia_ifa, RTM_DELETE, rtinitflags(ia));
2982         in_ifadown(&ia->ia_ifa, 1);
2983         ia->ia_flags &= ~IFA_ROUTE;
2984
2985         if (!del_iaback && (iaback->ia_ifp->if_flags & IFF_UP)) {
2986                 int error;
2987
2988                 error = rtinit(&iaback->ia_ifa, RTM_ADD,
2989                     rtinitflags(iaback) | RTF_UP);
2990                 if (!error)
2991                         iaback->ia_flags |= IFA_ROUTE;
2992         }
2993 }
2994
2995 static int
2996 carp_modevent(module_t mod, int type, void *data)
2997 {
2998         switch (type) {
2999         case MOD_LOAD:
3000                 LIST_INIT(&carpif_list);
3001                 carp_ifdetach_event =
3002                 EVENTHANDLER_REGISTER(ifnet_detach_event, carp_ifdetach, NULL,
3003                                       EVENTHANDLER_PRI_ANY);
3004                 carp_ifaddr_event =
3005                 EVENTHANDLER_REGISTER(ifaddr_event, carp_ifaddr, NULL,
3006                                       EVENTHANDLER_PRI_FIRST);
3007                 if_clone_attach(&carp_cloner);
3008                 break;
3009
3010         case MOD_UNLOAD:
3011                 EVENTHANDLER_DEREGISTER(ifnet_detach_event,
3012                                         carp_ifdetach_event);
3013                 EVENTHANDLER_DEREGISTER(ifaddr_event,
3014                                         carp_ifaddr_event);
3015                 if_clone_detach(&carp_cloner);
3016                 break;
3017
3018         default:
3019                 return (EINVAL);
3020         }
3021         return (0);
3022 }
3023
3024 static moduledata_t carp_mod = {
3025         "carp",
3026         carp_modevent,
3027         0
3028 };
3029 DECLARE_MODULE(carp, carp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);