Merge branches 'master' and 'suser_to_priv'
author Michael Neumann <mneumann@ntecs.de>
Wed, 31 Dec 2008 13:06:40 +0000 (13:06 +0000)
committer Michael Neumann <mneumann@ntecs.de>
Wed, 31 Dec 2008 13:06:40 +0000 (13:06 +0000)
Conflicts:

sys/netinet/ip_carp.c
sys/platform/pc64/amd64/machdep.c

18 files changed:
1  2 
sys/dev/misc/syscons/syscons.c
sys/dev/netif/cx/cx.c
sys/dev/netif/wi/if_wi.c
sys/dev/netif/wl/if_wl.c
sys/kern/vfs_syscalls.c
sys/kern/vfs_vnops.c
sys/net/bridge/if_bridge.c
sys/netinet/in.c
sys/netinet/ip_carp.c
sys/netinet/ip_output.c
sys/netinet/tcp_subr.c
sys/netinet6/in6.c
sys/netinet6/in6_src.c
sys/netinet6/ip6_input.c
sys/netproto/ipx/ipx_usrreq.c
sys/platform/pc64/amd64/machdep.c
sys/vfs/hammer/hammer.h
sys/vm/vm_swap.c

@@@ -35,9 -35,7 +35,9 @@@
  #include "use_splash.h"
  #include "opt_syscons.h"
  #include "opt_ddb.h"
 +#ifdef __i386__
  #include "use_apm.h"
 +#endif
  
  #include <sys/param.h>
  #include <sys/systm.h>
@@@ -45,6 -43,7 +45,7 @@@
  #include <sys/reboot.h>
  #include <sys/conf.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/signalvar.h>
  #include <sys/sysctl.h>
  #include <sys/tty.h>
@@@ -57,9 -56,7 +58,9 @@@
  #include <machine/console.h>
  #include <machine/psl.h>
  #include <machine/pc/display.h>
 +#ifdef __i386__
  #include <machine/apm_bios.h>
 +#endif
  #include <machine/frame.h>
  
  #include <dev/misc/kbd/kbdreg.h>
@@@ -496,7 -493,7 +497,7 @@@ scopen(struct dev_open_args *ap
        (*linesw[tp->t_line].l_modem)(tp, 1);
      }
      else
-       if (tp->t_state & TS_XCLUDE && suser_cred(ap->a_cred, 0))
+       if (tp->t_state & TS_XCLUDE && priv_check_cred(ap->a_cred, PRIV_ROOT, 0))
            return(EBUSY);
  
      error = (*linesw[tp->t_line].l_open)(dev, tp);
@@@ -1006,24 -1003,16 +1007,24 @@@ scioctl(struct dev_ioctl_args *ap
        return 0;
  
      case KDENABIO:            /* allow io operations */
-       error = suser_cred(ap->a_cred, 0);
+       error = priv_check_cred(ap->a_cred, PRIV_ROOT, 0);
        if (error != 0)
            return error;
        if (securelevel > 0)
            return EPERM;
 +#if defined(__i386__)
        curthread->td_lwp->lwp_md.md_regs->tf_eflags |= PSL_IOPL;
 +#elif defined(__amd64__)
 +      curthread->td_lwp->lwp_md.md_regs->tf_rflags |= PSL_IOPL;
 +#endif
        return 0;
  
      case KDDISABIO:           /* disallow io operations (default) */
 +#if defined(__i386__)
        curthread->td_lwp->lwp_md.md_regs->tf_eflags &= ~PSL_IOPL;
 +#elif defined(__amd64__)
 +      curthread->td_lwp->lwp_md.md_regs->tf_rflags &= ~PSL_IOPL;
 +#endif
        return 0;
  
      case KDSKBSTATE:          /* set keyboard state (locks) */
diff --combined sys/dev/netif/cx/cx.c
@@@ -29,6 -29,7 +29,7 @@@
  #include <sys/fcntl.h>
  #include <sys/conf.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/tty.h>
  #include <sys/socket.h>
  #include <sys/thread2.h>
@@@ -162,7 -163,7 +163,7 @@@ cxopen (struct dev_open_args *ap
        tp = c->ttyp;
        tp->t_dev = dev;
        if ((tp->t_state & TS_ISOPEN) && (tp->t_state & TS_XCLUDE) &&
-           suser_cred(ap->a_cred, 0))
+           priv_check_cred(ap->a_cred, PRIV_ROOT, 0))
                return (EBUSY);
        if (! (tp->t_state & TS_ISOPEN)) {
                ttychars (tp);
@@@ -727,12 -728,12 +728,12 @@@ cxparam (struct tty *tp, struct termio
  void
  cxstop (struct tty *tp, int flag)
  {
 -      cx_chan_t *c = cxchan[UNIT(tp->t_dev)];
 -      unsigned short port = c->chip->port;
 -
        crit_enter();
  
        if (tp->t_state & TS_BUSY) {
 +              cx_chan_t *c = cxchan[UNIT(tp->t_dev)];
 +              unsigned short port = c->chip->port;
 +
                print (("cx%d.%d: cxstop\n", c->board->num, c->num));
  
                /* Set current channel number */
diff --combined sys/dev/netif/wi/if_wi.c
@@@ -75,6 -75,7 +75,7 @@@
  #include <sys/sockio.h>
  #include <sys/mbuf.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/kernel.h>
  #include <sys/socket.h>
  #include <sys/module.h>
@@@ -1110,7 -1111,7 +1111,7 @@@ wi_ioctl(struct ifnet *ifp, u_long cmd
                error = wi_get_cfg(ifp, cmd, data, cr);
                break;
        case SIOCSIFGENERIC:
-               error = suser_cred(cr, NULL_CRED_OKAY);
+               error = priv_check_cred(cr, PRIV_ROOT, NULL_CRED_OKAY);
                if (error)
                        break;
                error = wi_set_cfg(ifp, cmd, data);
                        error = copyout(&wreq, ifr->ifr_data, sizeof(wreq));
                break;
        case SIOCSPRISM2DEBUG:
-               if ((error = suser_cred(cr, NULL_CRED_OKAY)))
+               if ((error = priv_check_cred(cr, PRIV_ROOT, NULL_CRED_OKAY)))
                        goto out;
                error = copyin(ifr->ifr_data, &wreq, sizeof(wreq));
                if (error)
                }
                break;
        case SIOCS80211:
-               error = suser_cred(cr, NULL_CRED_OKAY);
+               error = priv_check_cred(cr, PRIV_ROOT, NULL_CRED_OKAY);
                if (error)
                        break;
                ireq = (struct ieee80211req *) data;
@@@ -1946,7 -1947,7 +1947,7 @@@ wi_get_cfg(struct ifnet *ifp, u_long cm
                        n = (len - off) / reslen;
                len = off + reslen * n;
                if (off != 0) {
 -                      struct wi_scan_p2_hdr *p2 = (struct wi_scan_p2_hdr *)wreq.wi_val;
 +                      struct wi_scan_p2_hdr *p2;
                        /*
                         * Prepend Prism-specific header.
                         */
diff --combined sys/dev/netif/wl/if_wl.c
@@@ -200,6 -200,7 +200,7 @@@ WITH THE USE OR PERFORMANCE OF THIS SOF
  #include <sys/socket.h>
  #include <sys/syslog.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/serialize.h>
  #include <sys/sysctl.h>
  #include <sys/bus.h>
@@@ -1341,7 -1342,7 +1342,7 @@@ wlioctl(struct ifnet *ifp, u_long cmd, 
        /* pointer to buffer in user space */
        up = (void *)ifr->ifr_data;
        /* work out if they're root */
-       isroot = (suser(td) == 0);
+       isroot = (priv_check(td, PRIV_ROOT) == 0);
        
        for (i = 0; i < 0x40; i++) {
            /* don't hand the DES key out to non-root users */
        /* copy the PSA in from the caller; we only copy _some_ values */
      case SIOCSWLPSA:
        /* root only */
-       if ((error = suser(td)))
+       if ((error = priv_check(td, PRIV_ROOT)))
            break;
        error = EINVAL; /* assume the worst */
        /* pointer to buffer in user space containing data */
         */
      case SIOCSWLCNWID:
        /* root only */
-       if ((error = suser(td)))
+       if ((error = priv_check(td, PRIV_ROOT)))
            break;
        if (!(ifp->if_flags & IFF_UP)) {
            error = EIO;        /* only allowed while up */
        /* copy the EEPROM in 2.4 Gz WaveMODEM  out to the caller */
      case SIOCGWLEEPROM:
        /* root only */
-       if ((error = suser(td)))
+       if ((error = priv_check(td, PRIV_ROOT)))
            break;
        /* pointer to buffer in user space */
        up = (void *)ifr->ifr_data;
        /* zero (Delete) the wl cache */
      case SIOCDWLCACHE:
        /* root only */
-       if ((error = suser(td)))
+       if ((error = priv_check(td, PRIV_ROOT)))
            break;
        wl_cache_zero(sc);
        break;
@@@ -2313,7 -2314,7 +2314,7 @@@ static voi
  wlhdwsleaze(u_short *countp, u_char **mb_pp, struct mbuf **tm_pp)
  {
      struct mbuf       *tm_p = *tm_pp;
 -    u_char            *mb_p = *mb_pp;
 +    u_char            *mb_p;
      u_short           count = 0;
      u_char            *cp;
      int               len;
diff --combined sys/kern/vfs_syscalls.c
@@@ -58,6 -58,7 +58,7 @@@
  #include <sys/unistd.h>
  #include <sys/vnode.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/namei.h>
  #include <sys/nlookup.h>
  #include <sys/dirent.h>
@@@ -125,22 -126,22 +126,22 @@@ sys_mount(struct mount_args *uap
        struct ucred *cred = p->p_ucred;
  
        KKASSERT(p);
-       if (cred->cr_prison != NULL)
+       if (jailed(cred))
                return (EPERM);
-       if (usermount == 0 && (error = suser(td)))
+       if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
                return (error);
        /*
         * Do not allow NFS export by non-root users.
         */
        if (uap->flags & MNT_EXPORTED) {
-               error = suser(td);
+               error = priv_check(td, PRIV_ROOT);
                if (error)
                        return (error);
        }
        /*
         * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
         */
-       if (suser(td)) 
+       if (priv_check(td, PRIV_ROOT)) 
                uap->flags |= MNT_NOSUID | MNT_NODEV;
  
        /*
                 * permitted to update it.
                 */
                if (mp->mnt_stat.f_owner != cred->cr_uid &&
-                   (error = suser(td))) {
+                   (error = priv_check(td, PRIV_ROOT))) {
                        cache_drop(&nch);
                        vput(vp);
                        return (error);
         * onto which we are attempting to mount.
         */
        if ((error = VOP_GETATTR(vp, &va)) ||
-           (va.va_uid != cred->cr_uid && (error = suser(td)))) {
+           (va.va_uid != cred->cr_uid && (error = priv_check(td, PRIV_ROOT)))) {
                cache_drop(&nch);
                vput(vp);
                return (error);
                vput(vp);
                return (error);
        }
 -      for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
 -              if (!strcmp(vfsp->vfc_name, fstypename))
 -                      break;
 -      }
 +      vfsp = vfsconf_find_by_name(fstypename);
        if (vfsp == NULL) {
                linker_file_t lf;
  
                /* Only load modules for root (very important!) */
-               if ((error = suser(td)) != 0) {
+               if ((error = priv_check(td, PRIV_ROOT)) != 0) {
                        cache_drop(&nch);
                        vput(vp);
                        return error;
                }
                lf->userrefs++;
                /* lookup again, see if the VFS was loaded */
 -              for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
 -                      if (!strcmp(vfsp->vfc_name, fstypename))
 -                              break;
 -              }
 +              vfsp = vfsconf_find_by_name(fstypename);
                if (vfsp == NULL) {
                        lf->userrefs--;
                        linker_file_unload(lf);
@@@ -542,7 -549,7 +543,7 @@@ sys_unmount(struct unmount_args *uap
        KKASSERT(p);
        if (p->p_ucred->cr_prison != NULL)
                return (EPERM);
-       if (usermount == 0 && (error = suser(td)))
+       if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
                return (error);
  
        error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
         * permitted to unmount this filesystem.
         */
        if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) &&
-           (error = suser(td)))
+           (error = priv_check(td, PRIV_ROOT)))
                goto out;
  
        /*
@@@ -905,7 -912,7 +906,7 @@@ sys_mountctl(struct mountctl_args *uap
        KKASSERT(p);
        if (p->p_ucred->cr_prison != NULL)
                return (EPERM);
-       if ((error = suser(td)) != 0)
+       if ((error = priv_check(td, PRIV_ROOT)) != 0)
                return (error);
  
        /*
@@@ -1035,7 -1042,7 +1036,7 @@@ kern_statfs(struct nlookupdata *nd, str
        sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
        bcopy(sp, buf, sizeof(*buf));
        /* Only root should have access to the fsid's. */
-       if (suser(td))
+       if (priv_check(td, PRIV_ROOT))
                buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
        return (0);
  }
@@@ -1098,7 -1105,7 +1099,7 @@@ kern_fstatfs(int fd, struct statfs *buf
        bcopy(sp, buf, sizeof(*buf));
  
        /* Only root should have access to the fsid's. */
-       if (suser(td))
+       if (priv_check(td, PRIV_ROOT))
                buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
        error = 0;
  done:
@@@ -1596,7 -1603,7 +1597,7 @@@ kern_chroot(struct nchandle *nch
        /*
         * Only root can chroot
         */
-       if ((error = suser_cred(p->p_ucred, PRISON_ROOT)) != 0)
+       if ((error = priv_check_cred(p->p_ucred, PRIV_ROOT, PRISON_ROOT)) != 0)
                return (error);
  
        /*
@@@ -1847,10 -1854,10 +1848,10 @@@ kern_mknod(struct nlookupdata *nd, int 
        switch (mode & S_IFMT) {
        case S_IFCHR:
        case S_IFBLK:
-               error = suser(td);
+               error = priv_check(td, PRIV_ROOT);
                break;
        default:
-               error = suser_cred(p->p_ucred, PRISON_ROOT);
+               error = priv_check_cred(p->p_ucred, PRIV_ROOT, PRISON_ROOT);
                break;
        }
        if (error)
@@@ -2000,7 -2007,7 +2001,7 @@@ can_hardlink(struct vnode *vp, struct t
        /*
         * root cred can always hardlink
         */
-       if (suser_cred(cred, PRISON_ROOT) == 0)
+       if (priv_check_cred(cred, PRIV_ROOT, PRISON_ROOT) == 0)
                return (0);
  
        /*
@@@ -2531,7 -2538,7 +2532,7 @@@ setfflags(struct vnode *vp, int flags
         * chown can't fail when done as root.
         */
        if ((vp->v_type == VCHR || vp->v_type == VBLK) && 
-           ((error = suser_cred(p->p_ucred, PRISON_ROOT)) != 0))
+           ((error = priv_check_cred(p->p_ucred, PRIV_ROOT, PRISON_ROOT)) != 0))
                return (error);
  
        /*
@@@ -3560,7 -3567,7 +3561,7 @@@ sys_revoke(struct revoke_args *uap
                if (error == 0)
                        error = VOP_GETATTR(vp, &vattr);
                if (error == 0 && cred->cr_uid != vattr.va_uid)
-                       error = suser_cred(cred, PRISON_ROOT);
+                       error = priv_check_cred(cred, PRIV_ROOT, PRISON_ROOT);
                if (error == 0 && count_udev(vp->v_umajor, vp->v_uminor) > 0) {
                        error = 0;
                        vx_lock(vp);
@@@ -3601,7 -3608,7 +3602,7 @@@ sys_getfh(struct getfh_args *uap
        /*
         * Must be super user
         */
-       if ((error = suser(td)) != 0)
+       if ((error = priv_check(td, PRIV_ROOT)) != 0)
                return (error);
  
        vp = NULL;
   * syscall for the rpc.lockd to use to translate a NFS file handle into
   * an open descriptor.
   *
-  * warning: do not remove the suser() call or this becomes one giant
+  * warning: do not remove the priv_check() call or this becomes one giant
   * security hole.
   */
  int
@@@ -3651,7 -3658,7 +3652,7 @@@ sys_fhopen(struct fhopen_args *uap
        /*
         * Must be super user
         */
-       error = suser(td);
+       error = priv_check(td, PRIV_ROOT);
        if (error)
                return (error);
  
@@@ -3812,7 -3819,7 +3813,7 @@@ sys_fhstat(struct fhstat_args *uap
        /*
         * Must be super user
         */
-       error = suser(td);
+       error = priv_check(td, PRIV_ROOT);
        if (error)
                return (error);
        
@@@ -3851,7 -3858,7 +3852,7 @@@ sys_fhstatfs(struct fhstatfs_args *uap
        /*
         * Must be super user
         */
-       if ((error = suser(td)))
+       if ((error = priv_check(td, PRIV_ROOT)))
                return (error);
  
        if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
        kfree(freepath, M_TEMP);
  
        sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
-       if (suser(td)) {
+       if (priv_check(td, PRIV_ROOT)) {
                bcopy(sp, &sb, sizeof(sb));
                sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
                sp = &sb;
@@@ -3904,7 -3911,7 +3905,7 @@@ sys_fhstatvfs(struct fhstatvfs_args *ua
        /*
         * Must be super user
         */
-       if ((error = suser(td)))
+       if ((error = priv_check(td, PRIV_ROOT)))
                return (error);
  
        if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
diff --combined sys/kern/vfs_vnops.c
@@@ -46,6 -46,7 +46,7 @@@
  #include <sys/file.h>
  #include <sys/stat.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/mount.h>
  #include <sys/nlookup.h>
  #include <sys/vnode.h>
@@@ -409,8 -410,7 +410,8 @@@ vn_close(struct vnode *vp, int flags
  {
        int error;
  
 -      if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) == 0) {
 +      error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 +      if (error == 0) {
                error = VOP_CLOSE(vp, flags);
                vn_unlock(vp);
        }
@@@ -881,7 -881,7 +882,7 @@@ vn_stat(struct vnode *vp, struct stat *
                break;
        default:
                return (EBADF);
 -      };
 +      }
        sb->st_mode = mode;
        if (vap->va_nlink > (nlink_t)-1)
                sb->st_nlink = (nlink_t)-1;
         * because device read and write calls may bypass the filesystem.
         */
        if (vp->v_type == VCHR || vp->v_type == VBLK) {
 -              if ((dev = vp->v_rdev) != NULL) {
 +              dev = vp->v_rdev;
 +              if (dev != NULL) {
                        if (dev->si_lastread) {
                                sb->st_atimespec.tv_sec = dev->si_lastread;
                                sb->st_atimespec.tv_nsec = 0;
                 * stat() call, aka v_rdev == NULL), how are we supposed
                 * to get a valid block size out of it?
                 */
 -              cdev_t dev;
 -
 -              if ((dev = vp->v_rdev) == NULL) {
 -                      if (vp->v_type == VCHR)
 -                              dev = get_dev(vp->v_umajor, vp->v_uminor);
 +              dev = vp->v_rdev;
 +              if (dev == NULL && vp->v_type == VCHR) {
 +                      dev = get_dev(vp->v_umajor, vp->v_uminor);
                }
                sb->st_blksize = dev->si_bsize_best;
                if (sb->st_blksize < dev->si_bsize_phys)
        }
        
        sb->st_flags = vap->va_flags;
-       if (suser_cred(cred, 0))
+       error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0);
+       if (error)
                sb->st_gen = 0;
        else
                sb->st_gen = (u_int32_t)vap->va_gen;
@@@ -971,8 -974,7 +974,8 @@@ vn_ioctl(struct file *fp, u_long com, c
        case VREG:
        case VDIR:
                if (com == FIONREAD) {
 -                      if ((error = VOP_GETATTR(vp, &vattr)) != 0)
 +                      error = VOP_GETATTR(vp, &vattr);
 +                      if (error)
                                break;
                        *(int *)data = vattr.va_size - fp->f_offset;
                        error = 0;
@@@ -1105,7 -1107,7 +1108,7 @@@ vn_closefile(struct file *fp
        fp->f_ops = &badfileops;
        error = vn_close(((struct vnode *)fp->f_data), fp->f_flag);
        rel_mplock();
 -      return(error);
 +      return (error);
  }
  
  /*
  #include <sys/sysctl.h>
  #include <sys/module.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/lock.h>
  #include <sys/thread.h>
  #include <sys/thread2.h>
@@@ -806,7 -807,7 +807,7 @@@ bridge_ioctl(struct ifnet *ifp, u_long 
                }
  
                if (bc->bc_flags & BC_F_SUSER) {
-                       error = suser_cred(cr, NULL_CRED_OKAY);
+                       error = priv_check_cred(cr, PRIV_ROOT, NULL_CRED_OKAY);
                        if (error)
                                break;
                }
@@@ -1346,7 -1347,7 +1347,7 @@@ bridge_ioctl_gifs(struct bridge_softc *
        len = min(bifc->ifbic_len, sizeof(*breq) * count);
        KKASSERT(len >= sizeof(*breq));
  
 -      breq = kmalloc(len, M_TEMP, M_INTWAIT | M_NULLOK | M_ZERO);
 +      breq = kmalloc(len, M_TEMP, M_WAITOK | M_NULLOK | M_ZERO);
        if (breq == NULL) {
                bifc->ifbic_len = 0;
                return ENOMEM;
@@@ -1414,7 -1415,7 +1415,7 @@@ bridge_ioctl_rts(struct bridge_softc *s
        len = min(bac->ifbac_len, sizeof(*bareq) * count);
        KKASSERT(len >= sizeof(*bareq));
  
 -      bareq = kmalloc(len, M_TEMP, M_INTWAIT | M_NULLOK | M_ZERO);
 +      bareq = kmalloc(len, M_TEMP, M_WAITOK | M_NULLOK | M_ZERO);
        if (bareq == NULL) {
                bac->ifbac_len = 0;
                return ENOMEM;
@@@ -1821,6 -1822,7 +1822,6 @@@ voi
  bridge_enqueue(struct ifnet *dst_ifp, struct mbuf *m)
  {
        struct netmsg_packet *nmp;
 -      lwkt_port_t port;
  
        nmp = &m->m_hdr.mh_netmsg;
        netmsg_init(&nmp->nm_netmsg, &netisr_apanic_rport, 0,
        nmp->nm_packet = m;
        nmp->nm_netmsg.nm_lmsg.u.ms_resultp = dst_ifp;
  
 -      if (curthread->td_flags & TDF_NETWORK)
 -              port = &curthread->td_msgport;
 -      else
 -              port = cpu_portfn(mycpuid);
 -      lwkt_sendmsg(port, &nmp->nm_netmsg.nm_lmsg);
 +      lwkt_sendmsg(curnetport, &nmp->nm_netmsg.nm_lmsg);
  }
  
  /*
diff --combined sys/netinet/in.c
@@@ -42,6 -42,7 +42,7 @@@
  #include <sys/sockio.h>
  #include <sys/malloc.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/msgport.h>
  #include <sys/socket.h>
  
@@@ -75,9 -76,6 +76,9 @@@ static void   in_control_dispatch(struct 
  static int    in_control_internal(u_long, caddr_t, struct ifnet *,
                    struct thread *);
  
 +static int    in_addprefix(struct in_ifaddr *, int);
 +static void   in_scrubprefix(struct in_ifaddr *);
 +
  static int subnetsarelocal = 0;
  SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW,
        &subnetsarelocal, 0, "");
@@@ -230,7 -228,7 +231,7 @@@ in_control(struct socket *so, u_long cm
        switch (cmd) {
        case SIOCALIFADDR:
        case SIOCDLIFADDR:
-               if (td && (error = suser(td)) != 0)
+               if (td && (error = priv_check(td, PRIV_ROOT)) != 0)
                        return error;
                /* FALLTHROUGH */
        case SIOCGLIFADDR:
@@@ -454,11 -452,10 +455,11 @@@ in_control_internal(u_long cmd, caddr_
        struct ifaddr_container *ifac;
        struct in_ifaddr_container *iac;
        struct sockaddr_in oldaddr;
 -      int hostIsNew, iaIsNew, maskIsNew;
 +      int hostIsNew, iaIsNew, maskIsNew, ifpWasUp;
        int error = 0;
  
        iaIsNew = 0;
 +      ifpWasUp = 0;
  
        /*
         * Find address for this interface, if it exists.
                                }
                        }
                }
 +
 +              if (ifp->if_flags & IFF_UP)
 +                      ifpWasUp = 1;
        }
  
        switch (cmd) {
        case SIOCSIFADDR:
        case SIOCSIFNETMASK:
        case SIOCSIFDSTADDR:
-               if (td && (error = suser(td)) != 0)
+               if (td && (error = priv_check(td, PRIV_ROOT)) != 0)
                        return error;
  
                if (ifp == NULL)
                break;
  
        case SIOCSIFBRDADDR:
-               if (td && (error = suser(td)) != 0)
+               if (td && (error = priv_check(td, PRIV_ROOT)) != 0)
                        return error;
                /* FALLTHROUGH */
  
                    (const struct sockaddr_in *)&ifr->ifr_addr, 1);
                if (error != 0 && iaIsNew)
                        break;
 -              if (error == 0)
 -                      EVENTHANDLER_INVOKE(ifaddr_event, ifp);
 +              if (error == 0) {
 +                      EVENTHANDLER_INVOKE(ifaddr_event, ifp,
 +                      iaIsNew ? IFADDR_EVENT_ADD : IFADDR_EVENT_CHANGE,
 +                      &ia->ia_ifa);
 +              }
 +              if (!ifpWasUp && (ifp->if_flags & IFF_UP)) {
 +                      /*
 +                       * Interface is brought up by in_ifinit()
 +                       * (via ifp->if_ioctl).  We act as if the
 +                       * interface got IFF_UP flag turned on.
 +                       */
 +                      if_up(ifp);
 +              }
                return (0);
  
        case SIOCSIFNETMASK:
                if ((ifp->if_flags & IFF_BROADCAST) &&
                    ifra->ifra_broadaddr.sin_family == AF_INET)
                        ia->ia_broadaddr = ifra->ifra_broadaddr;
 -              if (error == 0)
 -                      EVENTHANDLER_INVOKE(ifaddr_event, ifp);
 +              if (error == 0) {
 +                      EVENTHANDLER_INVOKE(ifaddr_event, ifp,
 +                      iaIsNew ? IFADDR_EVENT_ADD : IFADDR_EVENT_CHANGE,
 +                      &ia->ia_ifa);
 +              }
 +              if (!ifpWasUp && (ifp->if_flags & IFF_UP)) {
 +                      /* See the comment in SIOCSIFADDR */
 +                      if_up(ifp);
 +              }
                return (error);
  
        case SIOCDIFADDR:
                 * a routing process they will come back.
                 */
                in_ifadown(&ia->ia_ifa, 1);
 -              EVENTHANDLER_INVOKE(ifaddr_event, ifp);
 +              EVENTHANDLER_INVOKE(ifaddr_event, ifp, IFADDR_EVENT_DELETE,
 +                                  &ia->ia_ifa);
                error = 0;
                break;
  
  
        ifa_destroy(&ia->ia_ifa);
  
 +      if ((cmd == SIOCAIFADDR || cmd == SIOCSIFADDR) &&
 +          !ifpWasUp && (ifp->if_flags & IFF_UP)) {
 +              /*
 +               * Though the address assignment failed, the
 +               * interface is brought up by in_ifinit()
 +               * (via ifp->if_ioctl).  With the hope that
 +               * the interface has some valid addresses, we
 +               * act as if IFF_UP flag was just set on the
 +               * interface.
 +               *
 +               * NOTE:
 +               * This could only be done after the failed
 +               * address is unlinked from the global address
 +               * list.
 +               */
 +              if_up(ifp);
 +      }
 +
        return (error);
  }
  
@@@ -967,9 -924,16 +968,9 @@@ in_lifaddr_ioctl(struct socket *so, u_l
   * Delete any existing route for an interface.
   */
  void
 -in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia)
 +in_ifscrub(struct ifnet *ifp __unused, struct in_ifaddr *ia)
  {
 -
 -      if ((ia->ia_flags & IFA_ROUTE) == 0)
 -              return;
 -      if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT))
 -              rtinit(&ia->ia_ifa, RTM_DELETE, RTF_HOST);
 -      else
 -              rtinit(&ia->ia_ifa, RTM_DELETE, 0);
 -      ia->ia_flags &= ~IFA_ROUTE;
 +      in_scrubprefix(ia);
  }
  
  /*
@@@ -1054,7 -1018,7 +1055,7 @@@ in_ifinit(struct ifnet *ifp, struct in_
                ia->ia_netbroadcast.s_addr =
                        htonl(ia->ia_net | ~ ia->ia_netmask);
        } else if (ifp->if_flags & IFF_LOOPBACK) {
 -              ia->ia_ifa.ifa_dstaddr = ia->ia_ifa.ifa_addr;
 +              ia->ia_dstaddr = ia->ia_addr;
                flags |= RTF_HOST;
        } else if (ifp->if_flags & IFF_POINTOPOINT) {
                if (ia->ia_dstaddr.sin_family != AF_INET)
        if (ia->ia_addr.sin_addr.s_addr != INADDR_ANY ||
            ia->ia_netmask != IN_CLASSA_NET ||
            ia->ia_dstaddr.sin_addr.s_addr != htonl(IN_CLASSA_HOST)) {
 -              if ((error = rtinit(&ia->ia_ifa, RTM_ADD, flags)) != 0)
 +              error = in_addprefix(ia, flags);
 +              if (error)
                        goto fail;
 -              ia->ia_flags |= IFA_ROUTE;
        }
  
        /*
        return (error);
  }
  
 +#define rtinitflags(x) \
 +      (((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) \
 +       ? RTF_HOST : 0)
 +
 +/*
 + * Add a route to prefix ("connected route" in cisco terminology).
 + * Do nothing, if there are some interface addresses with the same
 + * prefix already.  This function assumes that the 'target' parent
 + * interface is UP.
 + */
 +static int
 +in_addprefix(struct in_ifaddr *target, int flags)
 +{
 +      struct in_ifaddr_container *iac;
 +      struct in_addr prefix, mask;
 +      int error;
 +
 +      mask = target->ia_sockmask.sin_addr;
 +      if (flags & RTF_HOST) {
 +              prefix = target->ia_dstaddr.sin_addr;
 +      } else {
 +              prefix = target->ia_addr.sin_addr;
 +              prefix.s_addr &= mask.s_addr;
 +      }
 +
 +      TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) {
 +              struct in_ifaddr *ia = iac->ia;
 +              struct in_addr p;
 +
 +              /* Don't test against self */
 +              if (ia == target)
 +                      continue;
 +
 +              /* The tested address does not own a route entry */
 +              if ((ia->ia_flags & IFA_ROUTE) == 0)
 +                      continue;
 +
 +              /* Prefix test */
 +              if (rtinitflags(ia)) {
 +                      p = ia->ia_dstaddr.sin_addr;
 +              } else {
 +                      p = ia->ia_addr.sin_addr;
 +                      p.s_addr &= ia->ia_sockmask.sin_addr.s_addr;
 +              }
 +              if (prefix.s_addr != p.s_addr)
 +                      continue;
 +
 +              /*
 +               * If the to-be-added address and the currently being
 +               * tested address are not host addresses, we need to
 +               * take subnetmask into consideration.
 +               */
 +              if (!(flags & RTF_HOST) && !rtinitflags(ia) &&
 +                  mask.s_addr != ia->ia_sockmask.sin_addr.s_addr)
 +                      continue;
 +
 +              /*
 +               * If we got a matching prefix route inserted by other
 +               * interface address, we don't need to bother.
 +               */
 +              return 0;
 +      }
 +
 +      /*
 +       * No one seems to have the prefix route; insert it.
 +       */
 +      error = rtinit(&target->ia_ifa, RTM_ADD, flags);
 +      if (!error)
 +              target->ia_flags |= IFA_ROUTE;
 +      return error;
 +}
 +
 +/*
 + * Remove a route to prefix ("connected route" in cisco terminology).
 + * Re-installs the route by using another interface address, if there's
 + * one with the same prefix (otherwise we lose the route mistakenly).
 + */
 +static void
 +in_scrubprefix(struct in_ifaddr *target)
 +{
 +      struct in_ifaddr_container *iac;
 +      struct in_addr prefix, mask;
 +      int error;
 +
 +      if ((target->ia_flags & IFA_ROUTE) == 0)
 +              return;
 +
 +      mask = target->ia_sockmask.sin_addr;
 +      if (rtinitflags(target)) {
 +              prefix = target->ia_dstaddr.sin_addr;
 +      } else {
 +              prefix = target->ia_addr.sin_addr;
 +              prefix.s_addr &= mask.s_addr;
 +      }
 +
 +      TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) {
 +              struct in_ifaddr *ia = iac->ia;
 +              struct in_addr p;
 +
 +              /* Don't test against self */
 +              if (ia == target)
 +                      continue;
 +
 +              /* The tested address already owns a route entry */
 +              if (ia->ia_flags & IFA_ROUTE)
 +                      continue;
 +
 +              /*
 +               * The prefix route of the tested address should
 +               * never be installed if its parent interface is
 +               * not UP yet.
 +               */
 +              if ((ia->ia_ifp->if_flags & IFF_UP) == 0)
 +                      continue;
 +
 +              /* Prefix test */
 +              if (rtinitflags(ia)) {
 +                      p = ia->ia_dstaddr.sin_addr;
 +              } else {
 +                      p = ia->ia_addr.sin_addr;
 +                      p.s_addr &= ia->ia_sockmask.sin_addr.s_addr;
 +              }
 +              if (prefix.s_addr != p.s_addr)
 +                      continue;
 +
 +              /*
 +               * We don't need to test subnetmask here, as what we do
 +               * in in_addprefix(), since if the tested address's
 +               * parent interface is UP, the tested address should own
 +               * a prefix route entry and we would never reach here.
 +               */
 +
 +              /*
 +               * If we got a matching prefix route, move IFA_ROUTE to him
 +               */
 +              rtinit(&target->ia_ifa, RTM_DELETE, rtinitflags(target));
 +              target->ia_flags &= ~IFA_ROUTE;
 +
 +              error = rtinit(&ia->ia_ifa, RTM_ADD, rtinitflags(ia) | RTF_UP);
 +              if (!error)
 +                      ia->ia_flags |= IFA_ROUTE;
 +              return;
 +      }
 +
 +      /*
 +       * No candidates for this prefix route; just remove it.
 +       */
 +      rtinit(&target->ia_ifa, RTM_DELETE, rtinitflags(target));
 +      target->ia_flags &= ~IFA_ROUTE;
 +}
 +
 +#undef rtinitflags
  
  /*
   * Return 1 if the address might be a local broadcast address.
diff --combined sys/netinet/ip_carp.c
@@@ -41,6 -41,7 +41,7 @@@
  #include <sys/mbuf.h>
  #include <sys/time.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/sockio.h>
  #include <sys/socket.h>
  #include <sys/sysctl.h>
  #define CARP_IS_RUNNING(ifp)  \
        (((ifp)->if_flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
  
 +struct carp_vhaddr {  /* one entry of a carp_softc's virtual addr list */
 +      uint32_t                vha_flags;      /* CARP_VHAF_ flag bits */
 +      const struct in_ifaddr  *vha_ia;        /* carp (virtual) address */
 +      const struct in_ifaddr  *vha_iaback;    /* backing address; may be NULL */
 +      TAILQ_ENTRY(carp_vhaddr) vha_link;      /* linkage on sc_vha_list */
 +};
 +TAILQ_HEAD(carp_vhaddr_list, carp_vhaddr);    /* type of sc_vha_list */
 +
  struct carp_softc {
        struct ifnet             sc_if;
 -      struct ifnet            *sc_ifp;        /* compat shim */
        struct ifnet            *sc_carpdev;    /* parent interface */
 -      struct in_ifaddr        *sc_ia;         /* primary iface address */
 +      struct carp_vhaddr_list  sc_vha_list;   /* virtual addr list */
 +
 +      const struct in_ifaddr  *sc_ia;         /* primary iface address v4 */
        struct ip_moptions       sc_imo;
 +
  #ifdef INET6
        struct in6_ifaddr       *sc_ia6;        /* primary iface address v6 */
        struct ip6_moptions      sc_im6o;
  
        enum { INIT = 0, BACKUP, MASTER }
                                 sc_state;
 +      int                      sc_dead;
  
 -      int                      sc_flags_backup;
        int                      sc_suppress;
  
        int                      sc_sendad_errors;
  
        int                      sc_vhid;
        int                      sc_advskew;
 -      int                      sc_naddrs;
 +      int                      sc_naddrs;     /* actually used IPv4 vha */
        int                      sc_naddrs6;
        int                      sc_advbase;    /* seconds */
        int                      sc_init_counter;
  
        LIST_ENTRY(carp_softc)   sc_next;       /* Interface clue */
  };
 -#define       SC2IFP(sc)      ((sc)->sc_ifp)
  
  struct carp_if {
        TAILQ_HEAD(, carp_softc) vhif_vrs;
 -      int             vhif_nvrs;
 -
 -      struct ifnet    *vhif_ifp;
  };
  
  enum  { CARP_COUNT_MASTER, CARP_COUNT_RUNNING };
@@@ -180,51 -175,34 +181,51 @@@ static void     carp_setroute(struct carp_s
  static void   carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
  static int    carp_clone_create(struct if_clone *, int);
  static void   carp_clone_destroy(struct ifnet *);
 -static void   carpdetach(struct carp_softc *, int);
 +static void   carp_detach(struct carp_softc *, int);
  static int    carp_prepare_ad(struct mbuf *, struct carp_softc *,
                    struct carp_header *);
  static void   carp_send_ad_all(void);
 -static void   carp_send_ad(void *);
 -static void   carp_send_ad_locked(struct carp_softc *);
 +static void   carp_send_ad_timeout(void *);
 +static void   carp_send_ad(struct carp_softc *);
  static void   carp_send_arp(struct carp_softc *);
 -static void   carp_master_down(void *);
 -static void   carp_master_down_locked(struct carp_softc *);
 +static void   carp_master_down_timeout(void *);
 +static void   carp_master_down(struct carp_softc *);
  static int    carp_ioctl(struct ifnet *, u_long, caddr_t, struct ucred *);
  static int    carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *,
                    struct rtentry *);
  static void   carp_start(struct ifnet *);
  static void   carp_setrun(struct carp_softc *, sa_family_t);
  static void   carp_set_state(struct carp_softc *, int);
 -static int    carp_addrcount(struct carp_if *, struct in_ifaddr *, int);
  
  static void   carp_multicast_cleanup(struct carp_softc *);
 -static int    carp_set_addr(struct carp_softc *, struct sockaddr_in *);
 -static int    carp_del_addr(struct carp_softc *, struct sockaddr_in *);
 -static void   carp_carpdev_state_locked(struct carp_if *);
 -static void   carp_sc_state_locked(struct carp_softc *);
 +static void   carp_add_addr(struct carp_softc *, struct ifaddr *);
 +static void   carp_del_addr(struct carp_softc *, struct ifaddr *);
 +static void   carp_config_addr(struct carp_softc *, struct ifaddr *);
 +static void   carp_link_addrs(struct carp_softc *, struct ifnet *,
 +                  struct ifaddr *);
 +static void   carp_unlink_addrs(struct carp_softc *, struct ifnet *,
 +                  struct ifaddr *);
 +
 +static int    carp_get_vhaddr(struct carp_softc *, struct ifdrv *);
 +static int    carp_config_vhaddr(struct carp_softc *, struct carp_vhaddr *);
 +static int    carp_activate_vhaddr(struct carp_softc *, struct carp_vhaddr *,
 +                  struct ifnet *, const struct in_ifaddr *, int);
 +static void   carp_deactivate_vhaddr(struct carp_softc *,
 +                  struct carp_vhaddr *);
 +
 +static void   carp_sc_state(struct carp_softc *);
  #ifdef INET6
  static void   carp_send_na(struct carp_softc *);
  static int    carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *);
  static int    carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *);
  static void   carp_multicast6_cleanup(struct carp_softc *);
  #endif
 +static void   carp_stop(struct carp_softc *, int);
 +static void   carp_reset(struct carp_softc *, int);
 +
 +static void   carp_ifaddr(void *, struct ifnet *, enum ifaddr_event,
 +                          struct ifaddr *);
 +static void   carp_ifdetach(void *, struct ifnet *);
  
  static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces");
  
@@@ -235,41 -213,6 +236,41 @@@ IF_CLONE_INITIALIZER(CARP_IFNAME, carp_
                     0, IF_MAXUNIT);
  
  static eventhandler_tag carp_ifdetach_event;
 +static eventhandler_tag carp_ifaddr_event;
 +
 +static __inline void
 +carp_insert_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha_new)
 +{      /* Link vha_new into sc->sc_vha_list, keeping the list sorted */
 +      struct carp_vhaddr *vha;
 +      u_long new_addr, addr;
 +
 +      KKASSERT((vha_new->vha_flags & CARP_VHAF_ONLIST) == 0); /* not linked yet */
 +
 +      /*
 +       * List is kept sorted by host byte order address; smaller one first
 +       */
 +      new_addr = ntohl(vha_new->vha_ia->ia_addr.sin_addr.s_addr);
 +
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
 +              addr = ntohl(vha->vha_ia->ia_addr.sin_addr.s_addr);
 +
 +              if (addr > new_addr)    /* first strictly larger entry */
 +                      break;
 +      }
 +      if (vha == NULL)        /* no larger entry (or empty list): append */
 +              TAILQ_INSERT_TAIL(&sc->sc_vha_list, vha_new, vha_link);
 +      else
 +              TAILQ_INSERT_BEFORE(vha, vha_new, vha_link);
 +      vha_new->vha_flags |= CARP_VHAF_ONLIST; /* record list membership */
 +}
 +
 +static __inline void
 +carp_remove_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha)
 +{      /* Unlink vha from sc->sc_vha_list; vha itself is not freed here */
 +      KKASSERT(vha->vha_flags & CARP_VHAF_ONLIST);    /* must be linked */
 +      vha->vha_flags &= ~CARP_VHAF_ONLIST;    /* clear list membership */
 +      TAILQ_REMOVE(&sc->sc_vha_list, vha, vha_link);
 +}
  
  static void
  carp_hmac_prepare(struct carp_softc *sc)
  #ifdef INET6
        struct in6_addr in6;
  #endif
 +#ifdef INET
 +      struct carp_vhaddr *vha;
 +#endif
  
        /* XXX: possible race here */
  
        SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
        SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
  #ifdef INET
 -      TAILQ_FOREACH(ifac, &SC2IFP(sc)->if_addrheads[mycpuid], ifa_link) {
 -              struct ifaddr *ifa = ifac->ifa;
 -
 -              if (ifa->ifa_addr->sa_family == AF_INET)
 -                      SHA1Update(&sc->sc_sha1,
 -                          (void *)&ifatoia(ifa)->ia_addr.sin_addr.s_addr,
 -                          sizeof(struct in_addr));
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
 +              SHA1Update(&sc->sc_sha1,
 +                  (const uint8_t *)&vha->vha_ia->ia_addr.sin_addr,
 +                  sizeof(struct in_addr));
        }
  #endif /* INET */
  #ifdef INET6
 -      TAILQ_FOREACH(ifac, &SC2IFP(sc)->if_addrheads[mycpuid], ifa_link) {
 +      TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid], ifa_link) {
                struct ifaddr *ifa = ifac->ifa;
  
                if (ifa->ifa_addr->sa_family == AF_INET6) {
@@@ -355,22 -298,32 +356,22 @@@ carp_hmac_verify(struct carp_softc *sc
  static void
  carp_setroute(struct carp_softc *sc, int cmd)
  {
 +#ifdef INET6
        struct ifaddr_container *ifac;
  
        crit_enter();
 -      TAILQ_FOREACH(ifac, &SC2IFP(sc)->if_addrheads[mycpuid], ifa_link) {
 +      TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid], ifa_link) {
                struct ifaddr *ifa = ifac->ifa;
  
 -              if (ifa->ifa_addr->sa_family == AF_INET &&
 -                  sc->sc_carpdev != NULL) {
 -                      int count = carp_addrcount(
 -                          (struct carp_if *)sc->sc_carpdev->if_carp,
 -                          ifatoia(ifa), CARP_COUNT_MASTER);
 -
 -                      if ((cmd == RTM_ADD && count == 1) ||
 -                          (cmd == RTM_DELETE && count == 0))
 -                              rtinit(ifa, cmd, RTF_UP | RTF_HOST);
 -              }
 -#ifdef INET6
                if (ifa->ifa_addr->sa_family == AF_INET6) {
                        if (cmd == RTM_ADD)
                                in6_ifaddloop(ifa);
                        else
                                in6_ifremloop(ifa);
                }
 -#endif /* INET6 */
        }
        crit_exit();
 +#endif /* INET6 */
  }
  
  static int
@@@ -380,17 -333,15 +381,17 @@@ carp_clone_create(struct if_clone *ifc
        struct ifnet *ifp;
  
        sc = kmalloc(sizeof(*sc), M_CARP, M_WAITOK | M_ZERO);
 -      ifp = sc->sc_ifp = &sc->sc_if;
 +      ifp = &sc->sc_if;
  
 -      sc->sc_flags_backup = 0;
        sc->sc_suppress = 0;
        sc->sc_advbase = CARP_DFLTINTV;
        sc->sc_vhid = -1;       /* required setting */
        sc->sc_advskew = 0;
        sc->sc_init_counter = 1;
 -      sc->sc_naddrs = sc->sc_naddrs6 = 0;
 +      sc->sc_naddrs = 0;
 +      sc->sc_naddrs6 = 0;
 +
 +      TAILQ_INIT(&sc->sc_vha_list);
  
  #ifdef INET6
        sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL;
@@@ -425,49 -376,58 +426,49 @@@ carp_clone_destroy(struct ifnet *ifp
  {
        struct carp_softc *sc = ifp->if_softc;
  
 -      carpdetach(sc, 1);
 +      sc->sc_dead = 1;
 +      carp_detach(sc, 1);
  
        crit_enter();
        LIST_REMOVE(sc, sc_next);
        crit_exit();
        bpfdetach(ifp);
        if_detach(ifp);
 +
 +      KASSERT(sc->sc_naddrs == 0, ("certain inet address is still active\n"));
        kfree(sc, M_CARP);
  }
  
 -/*
 - * This function can be called on CARP interface destroy path,
 - * and in case of the removal of the underlying interface as
 - * well. We differentiate these two cases. In the latter case
 - * we do not cleanup our multicast memberships, since they
 - * are already freed.
 - */
  static void
 -carpdetach(struct carp_softc *sc, int unlock)
 +carp_detach(struct carp_softc *sc, int detach)
  {
        struct carp_if *cif;
  
 -      callout_stop(&sc->sc_ad_tmo);
 -      callout_stop(&sc->sc_md_tmo);
 -      callout_stop(&sc->sc_md6_tmo);
 +      carp_reset(sc, detach);
  
 -      if (sc->sc_suppress)
 -              carp_suppress_preempt--;
 -      sc->sc_suppress = 0;
 -
 -      if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS)
 -              carp_suppress_preempt--;
 -      sc->sc_sendad_errors = 0;
 -
 -      carp_set_state(sc, INIT);
 -      SC2IFP(sc)->if_flags &= ~IFF_UP;
 -      carp_setrun(sc, 0);
 -      if (unlock)
 -              carp_multicast_cleanup(sc);
 +      carp_multicast_cleanup(sc);
  #ifdef INET6
        carp_multicast6_cleanup(sc);
  #endif
  
 +      if (!sc->sc_dead && detach) {
 +              struct carp_vhaddr *vha;
 +
 +              TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
 +                      carp_deactivate_vhaddr(sc, vha);
 +              KKASSERT(sc->sc_naddrs == 0);
 +      }
 +
        if (sc->sc_carpdev != NULL) {
 -              cif = (struct carp_if *)sc->sc_carpdev->if_carp;
 +              cif = sc->sc_carpdev->if_carp;
                TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
 -              if (!--cif->vhif_nvrs) {
 +              if (TAILQ_EMPTY(&cif->vhif_vrs)) {
                        ifpromisc(sc->sc_carpdev, 0);
                        sc->sc_carpdev->if_carp = NULL;
 -                      kfree(cif, M_IFADDR);
 +                      kfree(cif, M_CARP);
                }
                sc->sc_carpdev = NULL;
 +              sc->sc_ia = NULL;
        }
  }
  
  static void
  carp_ifdetach(void *arg __unused, struct ifnet *ifp)
  {
 -      struct carp_if *cif = (struct carp_if *)ifp->if_carp;
 -      struct carp_softc *sc, *nextsc;
 -
 -      if (cif == NULL)
 -              return;
 +      struct carp_if *cif = ifp->if_carp;
 +      struct carp_softc *sc;
  
 -      for (sc = TAILQ_FIRST(&cif->vhif_vrs); sc; sc = nextsc) {
 -              nextsc = TAILQ_NEXT(sc, sc_list);
 -              carpdetach(sc, 0);
 -      }
 +      while (ifp->if_carp &&
 +             (sc = TAILQ_FIRST(&cif->vhif_vrs)) != NULL)
 +              carp_detach(sc, 1);
  }
  
  /*
@@@ -628,7 -592,6 +629,7 @@@ static voi
  carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
  {
        struct ifnet *ifp = m->m_pkthdr.rcvif;
 +      struct ifnet *cifp;
        struct carp_softc *sc;
        uint64_t tmp_counter;
        struct timeval sc_tv, ch_tv;
                if (sc->sc_vhid == ch->carp_vhid)
                        break;
  
 -      if (!sc || !CARP_IS_RUNNING(SC2IFP(sc))) {
 +      if (!sc || !CARP_IS_RUNNING(&sc->sc_if)) {
                carpstats.carps_badvhid++;
                m_freem(m);
                return;
        }
 +      cifp = &sc->sc_if;
  
 -      getmicrotime(&SC2IFP(sc)->if_lastchange);
 -      SC2IFP(sc)->if_ipackets++;
 -      SC2IFP(sc)->if_ibytes += m->m_pkthdr.len;
 +      getmicrotime(&cifp->if_lastchange);
 +      cifp->if_ipackets++;
 +      cifp->if_ibytes += m->m_pkthdr.len;
  
 -      if (SC2IFP(sc)->if_bpf) {
 +      if (cifp->if_bpf) {
                struct ip *ip = mtod(m, struct ip *);
  
                /* BPF wants net byte order */
                ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2));
                ip->ip_off = htons(ip->ip_off);
 -              bpf_mtap(SC2IFP(sc)->if_bpf, m);
 +              bpf_mtap(cifp->if_bpf, m);
        }
  
        /* verify the CARP version. */
        if (ch->carp_version != CARP_VERSION) {
                carpstats.carps_badver++;
 -              SC2IFP(sc)->if_ierrors++;
 -              CARP_LOG("%s; invalid version %d\n",
 -                  SC2IFP(sc)->if_xname,
 -                  ch->carp_version);
 +              cifp->if_ierrors++;
 +              CARP_LOG("%s; invalid version %d\n", cifp->if_xname,
 +                       ch->carp_version);
                m_freem(m);
                return;
        }
        /* verify the hash */
        if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
                carpstats.carps_badauth++;
 -              SC2IFP(sc)->if_ierrors++;
 -              CARP_LOG("%s: incorrect hash\n", SC2IFP(sc)->if_xname);
 +              cifp->if_ierrors++;
 +              CARP_LOG("%s: incorrect hash\n", cifp->if_xname);
                m_freem(m);
                return;
        }
                        callout_stop(&sc->sc_ad_tmo);
                        CARP_DEBUG("%s: MASTER -> BACKUP "
                           "(more frequent advertisement received)\n",
 -                         SC2IFP(sc)->if_xname);
 +                         cifp->if_xname);
                        carp_set_state(sc, BACKUP);
                        carp_setrun(sc, 0);
                        carp_setroute(sc, RTM_DELETE);
                if (carp_opts[CARPCTL_PREEMPT] &&
                    timevalcmp(&sc_tv, &ch_tv, <)) {
                        CARP_DEBUG("%s: BACKUP -> MASTER "
 -                          "(preempting a slower master)\n",
 -                          SC2IFP(sc)->if_xname);
 -                      carp_master_down_locked(sc);
 +                          "(preempting a slower master)\n", cifp->if_xname);
 +                      carp_master_down(sc);
                        break;
                }
  
                 */
                sc_tv.tv_sec = sc->sc_advbase * 3;
                if (timevalcmp(&sc_tv, &ch_tv, <)) {
 -                      CARP_DEBUG("%s: BACKUP -> MASTER "
 -                          "(master timed out)\n",
 -                          SC2IFP(sc)->if_xname);
 -                      carp_master_down_locked(sc);
 +                      CARP_DEBUG("%s: BACKUP -> MASTER (master timed out)\n",
 +                                 cifp->if_xname);
 +                      carp_master_down(sc);
                        break;
                }
  
  static int
  carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
  {
 +      struct ifnet *cifp = &sc->sc_if;
        struct m_tag *mtag;
 -      struct ifnet *ifp = SC2IFP(sc);
  
        if (sc->sc_init_counter) {
                /* this could also be seconds since unix epoch */
                sc->sc_counter++;
        }
  
 -      ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
 -      ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
 +      ch->carp_counter[0] = htonl((sc->sc_counter >> 32) & 0xffffffff);
 +      ch->carp_counter[1] = htonl(sc->sc_counter & 0xffffffff);
  
        carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
  
        mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), MB_DONTWAIT);
        if (mtag == NULL) {
                m_freem(m);
 -              SC2IFP(sc)->if_oerrors++;
 -              return (ENOMEM);
 +              cifp->if_oerrors++;
 +              return ENOMEM;
        }
 -      bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *));
 +      bcopy(&cifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *));
        m_tag_prepend(m, mtag);
  
 -      return (0);
 +      return 0;
  }
  
  static void
@@@ -793,29 -758,30 +794,29 @@@ carp_send_ad_all(void
                if (sc->sc_carpdev == NULL)
                        continue;
  
 -              if (CARP_IS_RUNNING(SC2IFP(sc)) && sc->sc_state == MASTER)
 -                      carp_send_ad_locked(sc);
 +              if (CARP_IS_RUNNING(&sc->sc_if) && sc->sc_state == MASTER)
 +                      carp_send_ad(sc);
        }
  }
  
  static void
 -carp_send_ad(void *v)
 +carp_send_ad_timeout(void *xsc)
  {
 -      struct carp_softc *sc = v;
 -
 -      carp_send_ad_locked(sc);
 +      carp_send_ad(xsc);
  }
  
  static void
 -carp_send_ad_locked(struct carp_softc *sc)
 +carp_send_ad(struct carp_softc *sc)
  {
 +      struct ifnet *cifp = &sc->sc_if;
        struct carp_header ch;
        struct timeval tv;
        struct carp_header *ch_ptr;
        struct mbuf *m;
        int len, advbase, advskew;
  
 -      /* bow out if we've lost our UPness or RUNNINGuiness */
 -      if (!CARP_IS_RUNNING(SC2IFP(sc))) {
 +      if (!CARP_IS_RUNNING(cifp)) {
 +              /* Bow out */
                advbase = 255;
                advskew = 255;
        } else {
        ch.carp_cksum = 0;
  
  #ifdef INET
 -      if (sc->sc_ia) {
 +      if (sc->sc_ia != NULL) {
                struct ip *ip;
  
 -              MGETHDR(m, M_NOWAIT, MT_HEADER);
 +              MGETHDR(m, MB_DONTWAIT, MT_HEADER);
                if (m == NULL) {
 -                      SC2IFP(sc)->if_oerrors++;
 +                      cifp->if_oerrors++;
                        carpstats.carps_onomem++;
                        /* XXX maybe less ? */
                        if (advbase != 255 || advskew != 255)
                                callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
 -                                  carp_send_ad, sc);
 +                                  carp_send_ad_timeout, sc);
                        return;
                }
                len = sizeof(*ip) + sizeof(ch);
                ip->ip_ttl = CARP_DFLTTL;
                ip->ip_p = IPPROTO_CARP;
                ip->ip_sum = 0;
 -              ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr;
 +              ip->ip_src = sc->sc_ia->ia_addr.sin_addr;
                ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
  
                ch_ptr = (struct carp_header *)(&ip[1]);
                        return;
                ch_ptr->carp_cksum = in_cksum_skip(m, len, sizeof(*ip));
  
 -              getmicrotime(&SC2IFP(sc)->if_lastchange);
 -              SC2IFP(sc)->if_opackets++;
 -              SC2IFP(sc)->if_obytes += len;
 +              getmicrotime(&cifp->if_lastchange);
 +              cifp->if_opackets++;
 +              cifp->if_obytes += len;
                carpstats.carps_opackets++;
  
                if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) {
 -                      SC2IFP(sc)->if_oerrors++;
 +                      cifp->if_oerrors++;
                        if (sc->sc_sendad_errors < INT_MAX)
                                sc->sc_sendad_errors++;
                        if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
        if (sc->sc_ia6) {
                struct ip6_hdr *ip6;
  
 -              MGETHDR(m, M_NOWAIT, MT_HEADER);
 +              MGETHDR(m, MB_DONTWAIT, MT_HEADER);
                if (m == NULL) {
 -                      SC2IFP(sc)->if_oerrors++;
 +                      cifp->if_oerrors++;
                        carpstats.carps_onomem++;
                        /* XXX maybe less ? */
                        if (advbase != 255 || advskew != 255)
                                callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
 -                                  carp_send_ad, sc);
 +                                  carp_send_ad_timeout, sc);
                        return;
                }
                len = sizeof(*ip6) + sizeof(ch);
                ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
                ip6->ip6_dst.s6_addr8[15] = 0x12;
                if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
 -                      SC2IFP(sc)->if_oerrors++;
 +                      cifp->if_oerrors++;
                        m_freem(m);
                        CARP_LOG("%s: in6_setscope failed\n", __func__);
                        return;
                        return;
                ch_ptr->carp_cksum = in_cksum_skip(m, len, sizeof(*ip6));
  
 -              getmicrotime(&SC2IFP(sc)->if_lastchange);
 -              SC2IFP(sc)->if_opackets++;
 -              SC2IFP(sc)->if_obytes += len;
 +              getmicrotime(&cifp->if_lastchange);
 +              cifp->if_opackets++;
 +              cifp->if_obytes += len;
                carpstats.carps_opackets6++;
  
                if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) {
 -                      SC2IFP(sc)->if_oerrors++;
 +                      cifp->if_oerrors++;
                        if (sc->sc_sendad_errors < INT_MAX)
                                sc->sc_sendad_errors++;
                        if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
  
        if (advbase != 255 || advskew != 255)
                callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
 -                  carp_send_ad, sc);
 +                  carp_send_ad_timeout, sc);
  }
  
  /*
  static void
  carp_send_arp(struct carp_softc *sc)
  {
 -      struct ifaddr_container *ifac;
 -
 -      TAILQ_FOREACH(ifac, &SC2IFP(sc)->if_addrheads[mycpuid], ifa_link) {
 -              struct ifaddr *ifa = ifac->ifa;
 +      const struct carp_vhaddr *vha;
  
 -              if (ifa->ifa_addr->sa_family != AF_INET)
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
 +              if (vha->vha_iaback == NULL)
                        continue;
 -              arp_ifinit2(sc->sc_carpdev, ifa, IF_LLADDR(sc->sc_ifp));        
  
 -              DELAY(1000);    /* XXX */
 +              arp_iainit(sc->sc_carpdev, &vha->vha_ia->ia_addr.sin_addr,
 +                         IF_LLADDR(&sc->sc_if));
        }
  }
  
@@@ -1011,7 -979,7 +1012,7 @@@ carp_send_na(struct carp_softc *sc
        struct in6_addr *in6;
        static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
  
 -      TAILQ_FOREACH(ifac, &SC2IFP(sc)->if_addrheads[mycpuid], ifa_link) {
 +      TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid], ifa_link) {
                struct ifaddr *ifa = ifac->ifa;
  
                if (ifa->ifa_addr->sa_family != AF_INET6)
  }
  #endif /* INET6 */
  
 +static __inline const struct carp_vhaddr *
 +carp_find_addr(const struct carp_softc *sc, const struct in_addr *addr)
 +{      /* Return sc's list entry whose carp address equals *addr, or NULL */
 +      struct carp_vhaddr *vha;
 +
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
 +              if (vha->vha_iaback == NULL)    /* no backing address: skip */
 +                      continue;
 +
 +              /* s_addr values are compared as stored, without conversion */
 +              if (vha->vha_ia->ia_addr.sin_addr.s_addr == addr->s_addr)
 +                      return vha;
 +      }
 +      return NULL;    /* no matching entry with a backing address */
 +}
 +
  static int
 -carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type)
 +carp_iamatch_balance(const struct carp_if *cif, const struct in_addr *itaddr,
 +                   const struct in_addr *isaddr, uint8_t **enaddr)
  {
 -      struct carp_softc *vh;
 -      int count = 0;
 +      const struct carp_softc *vh;
 +      int index, count = 0;
 +
 +      /*
 +       * XXX proof of concept implementation.
 +       * We use the source ip to decide which virtual host should
 +       * handle the request. If we're master of that virtual host,
 +       * then we respond, otherwise, just drop the arp packet on
 +       * the floor.
 +       */
 +
 +      TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
 +              if (!CARP_IS_RUNNING(&vh->sc_if))
 +                      continue;
 +
 +              if (carp_find_addr(vh, itaddr) != NULL)
 +                      count++;
 +      }
 +      if (count == 0)
 +              return 0;
 +
 +      /* this should be a hash, like pf_hash() */
 +      index = ntohl(isaddr->s_addr) % count;
 +      count = 0;
  
        TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
 -              if ((type == CARP_COUNT_RUNNING &&
 -                   CARP_IS_RUNNING(SC2IFP(vh))) ||
 -                  (type == CARP_COUNT_MASTER && vh->sc_state == MASTER)) {
 -                      struct ifaddr_container *ifac;
 -
 -                      TAILQ_FOREACH(ifac, &SC2IFP(vh)->if_addrheads[mycpuid],
 -                                    ifa_link) {
 -                              struct ifaddr *ifa = ifac->ifa;
 -
 -                              if (ifa->ifa_addr->sa_family == AF_INET &&
 -                                  ia->ia_addr.sin_addr.s_addr ==
 -                                  ifatoia(ifa)->ia_addr.sin_addr.s_addr)
 -                                      count++;
 +              if (!CARP_IS_RUNNING(&vh->sc_if))
 +                      continue;
 +
 +              if (carp_find_addr(vh, itaddr) == NULL)
 +                      continue;
 +
 +              if (count == index) {
 +                      if (vh->sc_state == MASTER) {
 +                              *enaddr = IF_LLADDR(&vh->sc_if);
 +                              return 1;
 +                      } else {
 +                              return 0;
                        }
                }
 +              count++;
        }
 -      return (count);
 +      return 0;
  }
  
  int
 -carp_iamatch(void *v, struct in_ifaddr *ia,
 -    struct in_addr *isaddr, uint8_t **enaddr)
 +carp_iamatch(const void *v, const struct in_addr *itaddr,
 +           const struct in_addr *isaddr, uint8_t **enaddr)
  {
 -      struct carp_if *cif = v;
 -      struct carp_softc *vh;
 -      int index, count = 0;
 +      const struct carp_if *cif = v;
 +      const struct carp_softc *vh;
  
 -      if (carp_opts[CARPCTL_ARPBALANCE]) {
 -              /*
 -               * XXX proof of concept implementation.
 -               * We use the source ip to decide which virtual host should
 -               * handle the request. If we're master of that virtual host,
 -               * then we respond, otherwise, just drop the arp packet on
 -               * the floor.
 -               */
 -              count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING);
 -              if (count == 0) {
 -                      /* should never reach this */
 -                      return (0);
 -              }
 +      if (carp_opts[CARPCTL_ARPBALANCE])
 +              return carp_iamatch_balance(cif, itaddr, isaddr, enaddr);
  
 -              /* this should be a hash, like pf_hash() */
 -              index = ntohl(isaddr->s_addr) % count;
 -              count = 0;
 -
 -              TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
 -                      if (CARP_IS_RUNNING(SC2IFP(vh))) {
 -                              struct ifaddr_container *ifac;
 -
 -                              TAILQ_FOREACH(ifac,
 -                              &SC2IFP(vh)->if_addrheads[mycpuid], ifa_link) {
 -                                      struct ifaddr *ifa = ifac->ifa;
 -
 -                                      if (ifa->ifa_addr->sa_family ==
 -                                          AF_INET &&
 -                                          ia->ia_addr.sin_addr.s_addr ==
 -                                          ifatoia(ifa)->ia_addr.sin_addr.s_addr) {
 -                                              if (count == index) {
 -                                                      if (vh->sc_state == MASTER) {
 -                                                              *enaddr = IF_LLADDR(vh->sc_ifp);
 -                                                              return (1);
 -                                                      } else {
 -                                                              return (0);
 -                                                      }
 -                                              }
 -                                              count++;
 -                                      }
 -                              }
 -                      }
 -              }
 -      } else {
 -              TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
 -                      if (CARP_IS_RUNNING(SC2IFP(vh)) &&
 -                          vh->sc_state == MASTER) {
 -                              *enaddr = IF_LLADDR(vh->sc_ifp);
 -                              return (1);
 -                      }
 +      TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
 +              if (!CARP_IS_RUNNING(&vh->sc_if) || vh->sc_state != MASTER)
 +                      continue;
 +
 +              if (carp_find_addr(vh, itaddr) != NULL) {
 +                      *enaddr = IF_LLADDR(&vh->sc_if);
 +                      return 1;
                }
        }
 -      return(0);
 +      return 0;
  }
  
  #ifdef INET6
@@@ -1121,13 -1092,13 +1122,13 @@@ carp_iamatch6(void *v, struct in6_addr 
        TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
                struct ifaddr_container *ifac;
  
 -              TAILQ_FOREACH(ifac, &SC2IFP(vh)->if_addrheads[mycpuid],
 +              TAILQ_FOREACH(ifac, &vh->sc_if.if_addrheads[mycpuid],
                              ifa_link) {
                        struct ifaddr *ifa = ifac->ifa;
  
                        if (IN6_ARE_ADDR_EQUAL(taddr,
                            &ifatoia6(ifa)->ia_addr.sin6_addr) &&
 -                          CARP_IS_RUNNING(SC2IFP(vh)) &&
 +                          CARP_IS_RUNNING(&vh->sc_if) &&
                            vh->sc_state == MASTER) {
                                return (ifa);
                        }
@@@ -1146,26 -1117,25 +1147,26 @@@ carp_macmatch6(void *v, struct mbuf *m
        TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
                struct ifaddr_container *ifac;
  
 -              TAILQ_FOREACH(ifac, &SC2IFP(sc)->if_addrheads[mycpuid],
 +              TAILQ_FOREACH(ifac, &sc->sc_if.if_addrheads[mycpuid],
                              ifa_link) {
                        struct ifaddr *ifa = ifac->ifa;
  
                        if (IN6_ARE_ADDR_EQUAL(taddr,
                            &ifatoia6(ifa)->ia_addr.sin6_addr) &&
 -                          CARP_IS_RUNNING(SC2IFP(sc))) {
 -                              struct ifnet *ifp = SC2IFP(sc);
 +                          CARP_IS_RUNNING(&sc->sc_if)) {
 +                              struct ifnet *ifp = &sc->sc_if;
 +
                                mtag = m_tag_get(PACKET_TAG_CARP,
                                    sizeof(struct ifnet *), MB_DONTWAIT);
                                if (mtag == NULL) {
                                        /* better a bit than nothing */
 -                                      return (IF_LLADDR(sc->sc_ifp));
 +                                      return (IF_LLADDR(ifp));
                                }
                                bcopy(&ifp, (caddr_t)(mtag + 1),
                                    sizeof(struct ifnet *));
                                m_tag_prepend(m, mtag);
  
 -                              return (IF_LLADDR(sc->sc_ifp));
 +                              return (IF_LLADDR(ifp));
                        }
                }
        }
  }
  #endif
  
 -struct ifnet *
 -carp_forus(void *v, void *dhost)
 +int
 +carp_forus(const void *v, const void *dhost)
  {
 -      struct carp_if *cif = v;
 -      struct carp_softc *vh;
 -      uint8_t *ena = dhost;
 -      
 -      /**
 -         * XXX: See here for check on MAC adr is not for virtual use
 -         *
 -         **/
 +      const struct carp_if *cif = v;
 +      const struct carp_softc *vh;
 +      const uint8_t *ena = dhost;
  
        if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
 -              return (NULL);
 +              return 0;
  
        TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
 -              if (CARP_IS_RUNNING(SC2IFP(vh)) && vh->sc_state == MASTER &&
 -                  !bcmp(dhost, IF_LLADDR(vh->sc_ifp), ETHER_ADDR_LEN)) {
 -                      return (SC2IFP(vh));
 -              }
 +              const struct ifnet *cifp = &vh->sc_if;
 +
 +              if (CARP_IS_RUNNING(cifp) && vh->sc_state == MASTER &&
 +                  !bcmp(dhost, IF_LLADDR(cifp), ETHER_ADDR_LEN))
 +                      return 1;
        }
 -      return (NULL);
 +      return 0;
  }
  
  static void
 -carp_master_down(void *v)
 +carp_master_down_timeout(void *xsc)
  {
 -      struct carp_softc *sc = v;
 +      struct carp_softc *sc = xsc;
  
 -      lwkt_serialize_enter(sc->sc_ifp->if_serializer);
 -      carp_master_down_locked(sc);
 -      lwkt_serialize_exit(sc->sc_ifp->if_serializer);
 +      CARP_DEBUG("%s: BACKUP -> MASTER (master timed out)\n",
 +                 sc->sc_if.if_xname);
 +      carp_master_down(sc);
  }
  
  static void
 -carp_master_down_locked(struct carp_softc *sc)
 +carp_master_down(struct carp_softc *sc)
  {
        switch (sc->sc_state) {
        case INIT:
                kprintf("%s: master_down event in INIT state\n",
 -                  SC2IFP(sc)->if_xname);
 +                      sc->sc_if.if_xname);
                break;
  
        case MASTER:
  
        case BACKUP:
                carp_set_state(sc, MASTER);
 -              carp_send_ad_locked(sc);
 +              carp_send_ad(sc);
                carp_send_arp(sc);
  #ifdef INET6
                carp_send_na(sc);
  static void
  carp_setrun(struct carp_softc *sc, sa_family_t af)
  {
 +      struct ifnet *cifp = &sc->sc_if;
        struct timeval tv;
  
        if (sc->sc_carpdev == NULL) {
 -              SC2IFP(sc)->if_flags &= ~IFF_RUNNING;
                carp_set_state(sc, INIT);
                return;
        }
  
 -      if (SC2IFP(sc)->if_flags & IFF_UP &&
 -          sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6)) {
 -              SC2IFP(sc)->if_flags |= IFF_RUNNING;
 +      if ((cifp->if_flags & IFF_RUNNING) && sc->sc_vhid > 0 &&
 +          (sc->sc_naddrs || sc->sc_naddrs6)) {
 +              /* Nothing */
        } else {
 -              SC2IFP(sc)->if_flags &= ~IFF_RUNNING;
                carp_setroute(sc, RTM_DELETE);
                return;
        }
        switch (sc->sc_state) {
        case INIT:
                if (carp_opts[CARPCTL_PREEMPT] && !carp_suppress_preempt) {
 -                      carp_send_ad_locked(sc);
 +                      carp_send_ad(sc);
                        carp_send_arp(sc);
  #ifdef INET6
                        carp_send_na(sc);
  #endif /* INET6 */
                        CARP_DEBUG("%s: INIT -> MASTER (preempting)\n",
 -                                 SC2IFP(sc)->if_xname);
 +                                 cifp->if_xname);
                        carp_set_state(sc, MASTER);
                        carp_setroute(sc, RTM_ADD);
                } else {
 -                      CARP_DEBUG("%s: INIT -> BACKUP\n",
 -                                 SC2IFP(sc)->if_xname);
 +                      CARP_DEBUG("%s: INIT -> BACKUP\n", cifp->if_xname);
                        carp_set_state(sc, BACKUP);
                        carp_setroute(sc, RTM_DELETE);
                        carp_setrun(sc, 0);
  #ifdef INET
                case AF_INET:
                        callout_reset(&sc->sc_md_tmo, tvtohz_high(&tv),
 -                          carp_master_down, sc);
 +                          carp_master_down_timeout, sc);
                        break;
  #endif /* INET */
  #ifdef INET6
                case AF_INET6:
                        callout_reset(&sc->sc_md6_tmo, tvtohz_high(&tv),
 -                          carp_master_down, sc);
 +                          carp_master_down_timeout, sc);
                        break;
  #endif /* INET6 */
                default:
                        if (sc->sc_naddrs)
                                callout_reset(&sc->sc_md_tmo, tvtohz_high(&tv),
 -                                  carp_master_down, sc);
 +                                  carp_master_down_timeout, sc);
                        if (sc->sc_naddrs6)
                                callout_reset(&sc->sc_md6_tmo, tvtohz_high(&tv),
 -                                  carp_master_down, sc);
 +                                  carp_master_down_timeout, sc);
                        break;
                }
                break;
                tv.tv_sec = sc->sc_advbase;
                tv.tv_usec = sc->sc_advskew * 1000000 / 256;
                callout_reset(&sc->sc_ad_tmo, tvtohz_high(&tv),
 -                  carp_send_ad, sc);
 +                  carp_send_ad_timeout, sc);
                break;
        }
  }
@@@ -1312,13 -1288,15 +1313,13 @@@ static voi
  carp_multicast_cleanup(struct carp_softc *sc)
  {
        struct ip_moptions *imo = &sc->sc_imo;
 -      uint16_t n = imo->imo_num_memberships;
  
 -      /* Clean up our own multicast memberships */
 -      while (n-- > 0) {
 -              if (imo->imo_membership[n] != NULL) {
 -                      in_delmulti(imo->imo_membership[n]);
 -                      imo->imo_membership[n] = NULL;
 -              }
 -      }
 +      if (imo->imo_num_memberships == 0)
 +              return;
 +      KKASSERT(imo->imo_num_memberships == 1);
 +
 +      in_delmulti(imo->imo_membership[0]);
 +      imo->imo_membership[0] = NULL;
        imo->imo_num_memberships = 0;
        imo->imo_multicast_ifp = NULL;
  }
@@@ -1341,205 -1319,144 +1342,205 @@@ carp_multicast6_cleanup(struct carp_sof
  #endif
  
  static int
 -carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin)
 +carp_get_vhaddr(struct carp_softc *sc, struct ifdrv *ifd)
 +{
 +      const struct carp_vhaddr *vha;
 +      struct ifcarpvhaddr *carpa, *carpa0;
 +      int count, len, error;
 +
 +      count = 0;
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
 +              ++count;
 +
 +      if (ifd->ifd_len == 0) {
 +              ifd->ifd_len = count * sizeof(*carpa);
 +              return 0;
 +      } else if (count == 0 || ifd->ifd_len < sizeof(*carpa)) {
 +              ifd->ifd_len = 0;
 +              return 0;
 +      }
 +      len = min(ifd->ifd_len, sizeof(*carpa) * count);
 +      KKASSERT(len >= sizeof(*carpa));
 +
 +      carpa0 = carpa = kmalloc(len, M_TEMP, M_WAITOK | M_NULLOK | M_ZERO);
 +      if (carpa == NULL)
 +              return ENOMEM;
 +
 +      count = 0;
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
 +              if (len < sizeof(*carpa))
 +                      break;
 +
 +              carpa->carpa_flags = vha->vha_flags;
 +              carpa->carpa_addr.sin_family = AF_INET;
 +              carpa->carpa_addr.sin_addr = vha->vha_ia->ia_addr.sin_addr;
 +
 +              carpa->carpa_baddr.sin_family = AF_INET;
 +              if (vha->vha_iaback == NULL) {
 +                      carpa->carpa_baddr.sin_addr.s_addr = INADDR_ANY;
 +              } else {
 +                      carpa->carpa_baddr.sin_addr =
 +                      vha->vha_iaback->ia_addr.sin_addr;
 +              }
 +
 +              ++carpa;
 +              ++count;
 +              len -= sizeof(*carpa);
 +      }
 +      ifd->ifd_len = sizeof(*carpa) * count;
 +      KKASSERT(ifd->ifd_len > 0);
 +
 +      error = copyout(carpa0, ifd->ifd_data, ifd->ifd_len);
 +      kfree(carpa0, M_TEMP);
 +      return error;
 +}
 +
 +static int
 +carp_config_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha)
  {
        struct ifnet *ifp;
 -      struct carp_if *cif;
 -      struct in_ifaddr *ia, *ia_if;
 +      struct in_ifaddr *ia_if;
        struct in_ifaddr_container *iac;
 -      struct ip_moptions *imo = &sc->sc_imo;
 -      struct in_addr addr;
 -      u_long iaddr = htonl(sin->sin_addr.s_addr);
 -      int own, error;
 -      
 -      if (sin->sin_addr.s_addr == 0) {
 -              if (!(SC2IFP(sc)->if_flags & IFF_UP))
 -                      carp_set_state(sc, INIT);
 -              if (sc->sc_naddrs)
 -                      SC2IFP(sc)->if_flags |= IFF_UP;
 -              carp_setrun(sc, 0);
 -              return (0);
 -      }
 -      /* we have to do it by hands to check we won't match on us */
 -      ia_if = NULL; own = 0;
 +      const struct sockaddr_in *sin;
 +      u_long iaddr;
 +      int own;
 +
 +      KKASSERT(vha->vha_ia != NULL);
 +
 +      sin = &vha->vha_ia->ia_addr;
 +      iaddr = ntohl(sin->sin_addr.s_addr);
 +
 +      ia_if = NULL;
 +      own = 0;
        TAILQ_FOREACH(iac, &in_ifaddrheads[mycpuid], ia_link) {
 -              ia = iac->ia;
 +              struct in_ifaddr *ia = iac->ia;
 +
 +              if ((ia->ia_flags & IFA_ROUTE) == 0)
 +                      continue;
 +
 +              if (ia->ia_ifp->if_type == IFT_CARP)
 +                      continue;
  
                /* and, yeah, we need a multicast-capable iface too */
 -              if (ia->ia_ifp != SC2IFP(sc) &&
 -                  (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
 -                  (iaddr & ia->ia_subnetmask) == ia->ia_subnet) {
 -                      if (!ia_if)
 -                              ia_if = ia;
 +              if ((ia->ia_ifp->if_flags & IFF_MULTICAST) == 0)
 +                      continue;
 +
 +              if ((iaddr & ia->ia_subnetmask) == ia->ia_subnet) {
                        if (sin->sin_addr.s_addr ==
                            ia->ia_addr.sin_addr.s_addr)
 -                              own++;
 +                              own = 1;
 +                      if (ia_if == NULL)
 +                              ia_if = ia;
 +                      else if (sc->sc_carpdev != NULL &&
 +                               sc->sc_carpdev == ia->ia_ifp)
 +                              ia_if = ia;
                }
        }
  
 +      carp_deactivate_vhaddr(sc, vha);
        if (!ia_if)
 -              return (EADDRNOTAVAIL);
 +              return ENOENT;
  
 -      ia = ia_if;
 -      ifp = ia->ia_ifp;
 +      ifp = ia_if->ia_ifp;
  
 -      if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
 -          (imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp))
 -              return (EADDRNOTAVAIL);
 +      /* XXX Don't allow parent iface to be changed */
 +      if (sc->sc_carpdev != NULL && sc->sc_carpdev != ifp)
 +              return EEXIST;
  
 -      if (imo->imo_num_memberships == 0) {
 -              addr.s_addr = htonl(INADDR_CARP_GROUP);
 -              if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) == NULL)
 -                      return (ENOBUFS);
 -              imo->imo_num_memberships++;
 -              imo->imo_multicast_ifp = ifp;
 -              imo->imo_multicast_ttl = CARP_DFLTTL;
 -              imo->imo_multicast_loop = 0;
 -      }
 +      return carp_activate_vhaddr(sc, vha, ifp, ia_if, own);
 +}
  
 -      if (!ifp->if_carp) {
 -              cif = kmalloc(sizeof(*cif), M_CARP, M_WAITOK | M_ZERO);
 -              if ((error = ifpromisc(ifp, 1))) {
 -                      kfree(cif, M_CARP);
 -                      goto cleanup;
 -              }
 -              
 -              cif->vhif_ifp = ifp;
 -              TAILQ_INIT(&cif->vhif_vrs);
 -              ifp->if_carp = cif;
 -      } else {
 -              struct carp_softc *vr;
 +static void
 +carp_add_addr(struct carp_softc *sc, struct ifaddr *carp_ifa)
 +{
 +      struct carp_vhaddr *vha_new;
 +      struct in_ifaddr *carp_ia;
 +#ifdef INVARIANTS
 +      struct carp_vhaddr *vha;
 +#endif
  
 -              cif = (struct carp_if *)ifp->if_carp;
 -              TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
 -                      if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
 -                              error = EINVAL;
 -                              goto cleanup;
 -                      }
 -              }
 -      }
 -      sc->sc_ia = ia;
 -      sc->sc_carpdev = ifp;
 +      KKASSERT(carp_ifa->ifa_addr->sa_family == AF_INET);
 +      carp_ia = ifatoia(carp_ifa);
  
 -      { /* XXX prevent endless loop if already in queue */
 -      struct carp_softc *vr, *after = NULL;
 -      int myself = 0;
 -      cif = (struct carp_if *)ifp->if_carp;
 +#ifdef INVARIANTS
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
 +              KKASSERT(vha->vha_ia != NULL && vha->vha_ia != carp_ia);
 +#endif
  
 -      TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
 -              if (vr == sc)
 -                      myself = 1;
 -              if (vr->sc_vhid < sc->sc_vhid)
 -                      after = vr;
 -      }
 +      vha_new = kmalloc(sizeof(*vha_new), M_CARP, M_WAITOK | M_ZERO);
 +      vha_new->vha_ia = carp_ia;
 +      carp_insert_vhaddr(sc, vha_new);
  
 -      if (!myself) {
 -              /* We're trying to keep things in order */
 -              if (after == NULL)
 -                      TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
 -              else
 -                      TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
 -              cif->vhif_nvrs++;
 -      }
 +      if (carp_config_vhaddr(sc, vha_new) != 0) {
 +              /*
 +               * If the above configuration fails, it may only mean
 +               * that the new address is problematic.  However, the
 +               * carp(4) interface may already have several working
 +               * addresses.  Since the expected behaviour of
 +               * SIOC[AS]IFADDR is to put the NIC into working state,
 +               * we try starting the state machine manually here with
 +               * the hope that the carp(4)'s previously working
 +               * addresses still could be brought up.
 +               */
 +              carp_hmac_prepare(sc);
 +              carp_set_state(sc, INIT);
 +              carp_setrun(sc, 0);
        }
 +}
  
 -      sc->sc_naddrs++;
 -      SC2IFP(sc)->if_flags |= IFF_UP;
 -      if (own)
 -              sc->sc_advskew = 0;
 +static void
 +carp_del_addr(struct carp_softc *sc, struct ifaddr *carp_ifa)
 +{
 +      struct carp_vhaddr *vha;
 +      struct in_ifaddr *carp_ia;
  
 -      carp_sc_state_locked(sc);
 -      carp_setrun(sc, 0);
 +      KKASSERT(carp_ifa->ifa_addr->sa_family == AF_INET);
 +      carp_ia = ifatoia(carp_ifa);
  
 -      return (0);
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
 +              KKASSERT(vha->vha_ia != NULL);
 +              if (vha->vha_ia == carp_ia)
 +                      break;
 +      }
 +      KASSERT(vha != NULL, ("no corresponding vhaddr %p\n", carp_ifa));
  
 -cleanup:
 -      in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
 -      return (error);
 +      /*
 +       * Remove the vhaddr from the list before deactivating
 +       * the vhaddr, so that the HMAC could be correctly
 +       * updated in carp_deactivate_vhaddr()
 +       */
 +      carp_remove_vhaddr(sc, vha);
 +
 +      carp_deactivate_vhaddr(sc, vha);
 +      kfree(vha, M_CARP);
  }
  
 -static int
 -carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin)
 +static void
 +carp_config_addr(struct carp_softc *sc, struct ifaddr *carp_ifa)
  {
 -      int error = 0;
 +      struct carp_vhaddr *vha;
 +      struct in_ifaddr *carp_ia;
  
 -      if (!--sc->sc_naddrs) {
 -              struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
 -              struct ip_moptions *imo = &sc->sc_imo;
 +      KKASSERT(carp_ifa->ifa_addr->sa_family == AF_INET);
 +      carp_ia = ifatoia(carp_ifa);
  
 -              callout_stop(&sc->sc_ad_tmo);
 -              SC2IFP(sc)->if_flags &= ~(IFF_UP | IFF_RUNNING);
 -              sc->sc_vhid = -1;
 -              in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
 -              imo->imo_multicast_ifp = NULL;
 -              TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
 -              if (!--cif->vhif_nvrs) {
 -                      sc->sc_carpdev->if_carp = NULL;
 -                      kfree(cif, M_IFADDR);
 -              }
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
 +              KKASSERT(vha->vha_ia != NULL);
 +              if (vha->vha_ia == carp_ia)
 +                      break;
 +      }
 +      KASSERT(vha != NULL, ("no corresponding vhaddr %p\n", carp_ifa));
 +
 +      /* Remove then reinsert, to keep the vhaddr list sorted */
 +      carp_remove_vhaddr(sc, vha);
 +      carp_insert_vhaddr(sc, vha);
 +
 +      if (carp_config_vhaddr(sc, vha) != 0) {
 +              /* See the comment in carp_add_addr() */
 +              carp_hmac_prepare(sc);
 +              carp_set_state(sc, INIT);
 +              carp_setrun(sc, 0);
        }
 -      return (error);
  }
  
  #ifdef INET6
@@@ -1555,6 -1472,10 +1556,6 @@@ carp_set_addr6(struct carp_softc *sc, s
        int own, error;
  
        if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 -              if (!(SC2IFP(sc)->if_flags & IFF_UP))
 -                      carp_set_state(sc, INIT);
 -              if (sc->sc_naddrs6)
 -                      SC2IFP(sc)->if_flags |= IFF_UP;
                carp_setrun(sc, 0);
                return (0);
        }
                                break;
                }
                /* and, yeah, we need a multicast-capable iface too */
 -              if (ia->ia_ifp != SC2IFP(sc) &&
 +              if (ia->ia_ifp != &sc->sc_if &&
                    (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
                    (i == 4)) {
                        if (!ia_if)
  
        if (!ifp->if_carp) {
                cif = kmalloc(sizeof(*cif), M_CARP, M_WAITOK | M_ZERO);
 +
                if ((error = ifpromisc(ifp, 1))) {
                        kfree(cif, M_CARP);
                        goto cleanup;
                }
  
 -              cif->vhif_ifp = ifp;
                TAILQ_INIT(&cif->vhif_vrs);
                ifp->if_carp = cif;
        } else {
                struct carp_softc *vr;
  
 -              cif = (struct carp_if *)ifp->if_carp;
 +              cif = ifp->if_carp;
                TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
                        if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
                                error = EINVAL;
        { /* XXX prevent endless loop if already in queue */
        struct carp_softc *vr, *after = NULL;
        int myself = 0;
 -      cif = (struct carp_if *)ifp->if_carp;
 +      cif = ifp->if_carp;
  
        TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
                if (vr == sc)
                        TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
                else
                        TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
 -              cif->vhif_nvrs++;
        }
        }
  
        sc->sc_naddrs6++;
 -      SC2IFP(sc)->if_flags |= IFF_UP;
        if (own)
                sc->sc_advskew = 0;
 -      carp_sc_state_locked(sc);
 +      carp_sc_state(sc);
        carp_setrun(sc, 0);
  
        return (0);
@@@ -1690,10 -1613,11 +1691,10 @@@ carp_del_addr6(struct carp_softc *sc, s
        int error = 0;
  
        if (!--sc->sc_naddrs6) {
 -              struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
 +              struct carp_if *cif = sc->sc_carpdev->if_carp;
                struct ip6_moptions *im6o = &sc->sc_im6o;
  
                callout_stop(&sc->sc_ad_tmo);
 -              SC2IFP(sc)->if_flags &= ~(IFF_UP | IFF_RUNNING);
                sc->sc_vhid = -1;
                while (!LIST_EMPTY(&im6o->im6o_memberships)) {
                        struct in6_multi_mship *imm =
                }
                im6o->im6o_multicast_ifp = NULL;
                TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
 -              if (!--cif->vhif_nvrs) {
 +              if (TAILQ_EMPTY(&cif->vhif_vrs)) {
                        sc->sc_carpdev->if_carp = NULL;
                        kfree(cif, M_IFADDR);
                }
@@@ -1721,26 -1645,26 +1722,26 @@@ carp_ioctl(struct ifnet *ifp, u_long cm
        struct ifaddr *ifa;
        struct ifreq *ifr;
        struct ifaliasreq *ifra;
 +      struct ifdrv *ifd;
 +      char devname[IFNAMSIZ];
        int error = 0;
  
        ifa = (struct ifaddr *)addr;
        ifra = (struct ifaliasreq *)addr;
        ifr = (struct ifreq *)addr;
 +      ifd = (struct ifdrv *)addr;
  
        switch (cmd) {
        case SIOCSIFADDR:
                switch (ifa->ifa_addr->sa_family) {
  #ifdef INET
                case AF_INET:
 -                      SC2IFP(sc)->if_flags |= IFF_UP;
 -                      bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
 -                          sizeof(struct sockaddr));
 -                      error = carp_set_addr(sc, satosin(ifa->ifa_addr));
 +                      ifp->if_flags |= IFF_UP | IFF_RUNNING;
                        break;
  #endif /* INET */
  #ifdef INET6
                case AF_INET6:
 -                      SC2IFP(sc)->if_flags |= IFF_UP;
 +                      ifp->if_flags |= IFF_UP | IFF_RUNNING;
                        error = carp_set_addr6(sc, satosin6(ifa->ifa_addr));
                        break;
  #endif /* INET6 */
                switch (ifa->ifa_addr->sa_family) {
  #ifdef INET
                case AF_INET:
 -                      SC2IFP(sc)->if_flags |= IFF_UP;
 -                      bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
 -                          sizeof(struct sockaddr));
 -                      error = carp_set_addr(sc, satosin(&ifra->ifra_addr));
 -                      break;
 +                      panic("SIOCAIFADDR should never be seen\n");
  #endif /* INET */
  #ifdef INET6
                case AF_INET6:
 -                      SC2IFP(sc)->if_flags |= IFF_UP;
 +                      ifp->if_flags |= IFF_UP | IFF_RUNNING;
                        error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr));
                        break;
  #endif /* INET6 */
                switch (ifa->ifa_addr->sa_family) {
  #ifdef INET
                case AF_INET:
 -                      error = carp_del_addr(sc, satosin(&ifra->ifra_addr));
 -                      break;
 +                      panic("SIOCDIFADDR should never be seen\n");
  #endif /* INET */
  #ifdef INET6
                case AF_INET6:
                break;
  
        case SIOCSIFFLAGS:
 -              if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) {
 -                      callout_stop(&sc->sc_ad_tmo);
 -                      callout_stop(&sc->sc_md_tmo);
 -                      callout_stop(&sc->sc_md6_tmo);
 -                      if (sc->sc_state == MASTER)
 -                              carp_send_ad_locked(sc);
 -                      carp_set_state(sc, INIT);
 -                      carp_setrun(sc, 0);
 -              } else if (sc->sc_state == INIT && (ifr->ifr_flags & IFF_UP)) {
 -                      SC2IFP(sc)->if_flags |= IFF_UP;
 -                      carp_setrun(sc, 0);
 +              if (ifp->if_flags & IFF_UP) {
 +                      if ((ifp->if_flags & IFF_RUNNING) == 0) {
 +                              ifp->if_flags |= IFF_RUNNING;
 +                              carp_set_state(sc, INIT);
 +                              carp_setrun(sc, 0);
 +                      }
 +              } else if (ifp->if_flags & IFF_RUNNING) {
 +                      carp_stop(sc, 0);
                }
                break;
  
        case SIOCSVH:
-               error = suser_cred(cr, NULL_CRED_OKAY);
+               error = priv_check_cred(cr, PRIV_ROOT, NULL_CRED_OKAY);
                if (error)
                        break;
 -              if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
 +              error = copyin(ifr->ifr_data, &carpr, sizeof(carpr));
 +              if (error)
                        break;
 +
                error = 1;
 -              if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) {
 +              if ((ifp->if_flags & IFF_RUNNING) &&
 +                  sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) {
                        switch (carpr.carpr_state) {
                        case BACKUP:
                                callout_stop(&sc->sc_ad_tmo);
                                break;
  
                        case MASTER:
 -                              carp_master_down_locked(sc);
 +                              carp_master_down(sc);
                                break;
  
                        default:
                                break;
                        }
                        if (sc->sc_carpdev) {
 -                              struct carp_if *cif;
 -                              cif = (struct carp_if *)sc->sc_carpdev->if_carp;
 +                              struct carp_if *cif = sc->sc_carpdev->if_carp;
 +
                                TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
                                        if (vr != sc &&
                                            vr->sc_vhid == carpr.carpr_vhid)
                                }
                        }
                        sc->sc_vhid = carpr.carpr_vhid;
 -                      IF_LLADDR(sc->sc_ifp)[0] = 0;
 -                      IF_LLADDR(sc->sc_ifp)[1] = 0;
 -                      IF_LLADDR(sc->sc_ifp)[2] = 0x5e;
 -                      IF_LLADDR(sc->sc_ifp)[3] = 0;
 -                      IF_LLADDR(sc->sc_ifp)[4] = 1;
 -                      IF_LLADDR(sc->sc_ifp)[5] = sc->sc_vhid;
 +                      IF_LLADDR(ifp)[0] = 0;
 +                      IF_LLADDR(ifp)[1] = 0;
 +                      IF_LLADDR(ifp)[2] = 0x5e;
 +                      IF_LLADDR(ifp)[3] = 0;
 +                      IF_LLADDR(ifp)[4] = 1;
 +                      IF_LLADDR(ifp)[5] = sc->sc_vhid;
                        error--;
                }
                if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) {
                carpr.carpr_vhid = sc->sc_vhid;
                carpr.carpr_advbase = sc->sc_advbase;
                carpr.carpr_advskew = sc->sc_advskew;
-               error = suser_cred(cr, NULL_CRED_OKAY);
+               error = priv_check_cred(cr, PRIV_ROOT, NULL_CRED_OKAY);
                if (error == 0) {
                        bcopy(sc->sc_key, carpr.carpr_key,
 -                          sizeof(carpr.carpr_key));
 +                            sizeof(carpr.carpr_key));
                }
 +
                error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
                break;
  
 +      case SIOCGDRVSPEC:
 +              switch (ifd->ifd_cmd) {
 +              case CARPGDEVNAME:
 +                      if (ifd->ifd_len != sizeof(devname))
 +                              error = EINVAL;
 +                      break;
 +
 +              case CARPGVHADDR:
 +                      break;
 +
 +              default:
 +                      error = EINVAL;
 +                      break;
 +              }
 +              if (error)
 +                      break;
 +
 +              switch (ifd->ifd_cmd) {
 +              case CARPGVHADDR:
 +                      error = carp_get_vhaddr(sc, ifd);
 +                      break;
 +
 +              case CARPGDEVNAME:
 +                      bzero(devname, sizeof(devname));
 +                      if (sc->sc_carpdev != NULL) {
 +                              strlcpy(devname, sc->sc_carpdev->if_xname,
 +                                      sizeof(devname));
 +                      }
 +                      error = copyout(devname, ifd->ifd_data,
 +                                      sizeof(devname));
 +                      break;
 +              }
 +              break;
 +
        default:
                error = EINVAL;
 +              break;
        }
        carp_hmac_prepare(sc);
 -      return (error);
 +      return error;
  }
  
  /*
@@@ -2036,42 -1928,49 +2036,42 @@@ carp_output(struct ifnet *ifp, struct m
  static void
  carp_set_state(struct carp_softc *sc, int state)
  {
 +      struct ifnet *cifp = &sc->sc_if;
 +
        if (sc->sc_state == state)
                return;
 -
        sc->sc_state = state;
 -      switch (state) {
 +
 +      switch (sc->sc_state) {
        case BACKUP:
 -              SC2IFP(sc)->if_link_state = LINK_STATE_DOWN;
 +              cifp->if_link_state = LINK_STATE_DOWN;
                break;
  
        case MASTER:
 -              SC2IFP(sc)->if_link_state = LINK_STATE_UP;
 +              cifp->if_link_state = LINK_STATE_UP;
                break;
  
        default:
 -              SC2IFP(sc)->if_link_state = LINK_STATE_UNKNOWN;
 +              cifp->if_link_state = LINK_STATE_UNKNOWN;
                break;
        }
 -      rt_ifmsg(SC2IFP(sc));
 +      rt_ifmsg(cifp);
  }
  
  void
  carp_carpdev_state(void *v)
  {
        struct carp_if *cif = v;
 -
 -      carp_carpdev_state_locked(cif);
 -}
 -
 -static void
 -carp_carpdev_state_locked(struct carp_if *cif)
 -{
        struct carp_softc *sc;
  
        TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list)
 -              carp_sc_state_locked(sc);
 +              carp_sc_state(sc);
  }
  
  static void
 -carp_sc_state_locked(struct carp_softc *sc)
 +carp_sc_state(struct carp_softc *sc)
  {
        if (!(sc->sc_carpdev->if_flags & IFF_UP)) {
 -              sc->sc_flags_backup = SC2IFP(sc)->if_flags;
 -              SC2IFP(sc)->if_flags &= ~(IFF_UP | IFF_RUNNING);
                callout_stop(&sc->sc_ad_tmo);
                callout_stop(&sc->sc_md_tmo);
                callout_stop(&sc->sc_md6_tmo);
                }
                sc->sc_suppress = 1;
        } else {
 -              SC2IFP(sc)->if_flags |= sc->sc_flags_backup;
                carp_set_state(sc, INIT);
                carp_setrun(sc, 0);
                if (sc->sc_suppress)
        }
  }
  
 +static void
 +carp_stop(struct carp_softc *sc, int detach)
 +{
 +      sc->sc_if.if_flags &= ~IFF_RUNNING;
 +
 +      callout_stop(&sc->sc_ad_tmo);
 +      callout_stop(&sc->sc_md_tmo);
 +      callout_stop(&sc->sc_md6_tmo);
 +
 +      if (!detach && sc->sc_state == MASTER)
 +              carp_send_ad(sc);
 +
 +      if (sc->sc_suppress)
 +              carp_suppress_preempt--;
 +      sc->sc_suppress = 0;
 +
 +      if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS)
 +              carp_suppress_preempt--;
 +      sc->sc_sendad_errors = 0;
 +      sc->sc_sendad_success = 0;
 +
 +      carp_set_state(sc, INIT);
 +      carp_setrun(sc, 0);
 +}
 +
 +static void
 +carp_reset(struct carp_softc *sc, int detach)
 +{
 +      struct ifnet *cifp = &sc->sc_if;
 +
 +      carp_stop(sc, detach);
 +      if (!sc->sc_dead && (cifp->if_flags & IFF_UP))
 +              cifp->if_flags |= IFF_RUNNING;
 +}
 +
 +static int
 +carp_activate_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha,
 +                   struct ifnet *ifp, const struct in_ifaddr *ia_if, int own)
 +{
 +      struct ip_moptions *imo = &sc->sc_imo;
 +      struct carp_if *cif;
 +      struct carp_softc *vr, *after = NULL;
 +      int onlist, error;
 +#ifdef INVARIANTS
 +      int assert_onlist;
 +#endif
 +
 +      KKASSERT(vha->vha_ia != NULL);
 +
 +      KASSERT(ia_if != NULL, ("NULL backing address\n"));
 +      KASSERT(vha->vha_iaback == NULL, ("%p is already activated\n", vha));
 +      KASSERT((vha->vha_flags & CARP_VHAF_OWNER) == 0,
 +              ("inactive vhaddr %p is the address owner\n", vha));
 +
 +      KASSERT(sc->sc_carpdev == NULL || sc->sc_carpdev == ifp,
 +              ("%s is already on %s\n", sc->sc_if.if_xname,
 +               sc->sc_carpdev->if_xname));
 +
 +      KASSERT(imo->imo_multicast_ifp == NULL ||
 +              imo->imo_multicast_ifp == ifp,
 +              ("%s didn't leave mcast group on %s\n",
 +               sc->sc_if.if_xname, imo->imo_multicast_ifp->if_xname));
 +
 +      if (imo->imo_num_memberships == 0) {
 +              struct in_addr addr;
 +
 +              addr.s_addr = htonl(INADDR_CARP_GROUP);
 +              if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) == NULL)
 +                      return ENOBUFS;
 +              imo->imo_num_memberships++;
 +              imo->imo_multicast_ifp = ifp;
 +              imo->imo_multicast_ttl = CARP_DFLTTL;
 +              imo->imo_multicast_loop = 0;
 +      }
 +
 +      if (!ifp->if_carp) {
 +              KASSERT(sc->sc_carpdev == NULL,
 +                      ("%s is already on %s\n", sc->sc_if.if_xname,
 +                       sc->sc_carpdev->if_xname));
 +
 +              cif = kmalloc(sizeof(*cif), M_CARP, M_WAITOK | M_ZERO);
 +
 +              error = ifpromisc(ifp, 1);
 +              if (error) {
 +                      kfree(cif, M_CARP);
 +                      goto cleanup;
 +              }
 +
 +              TAILQ_INIT(&cif->vhif_vrs);
 +              ifp->if_carp = cif;
 +      } else {
 +              cif = ifp->if_carp;
 +              TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
 +                      if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
 +                              error = EINVAL;
 +                              goto cleanup;
 +                      }
 +              }
 +      }
 +
 +#ifdef INVARIANTS
 +      if (sc->sc_carpdev != NULL)
 +              assert_onlist = 1;
 +      else
 +              assert_onlist = 0;
 +#endif
 +      sc->sc_ia = ia_if;
 +      sc->sc_carpdev = ifp;
 +
 +      cif = ifp->if_carp;
 +      onlist = 0;
 +      TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
 +              if (vr == sc)
 +                      onlist = 1;
 +              if (vr->sc_vhid < sc->sc_vhid)
 +                      after = vr;
 +      }
 +
 +#ifdef INVARIANTS
 +      if (assert_onlist) {
 +              KASSERT(onlist, ("%s is not on %s carp list\n",
 +                      sc->sc_if.if_xname, ifp->if_xname));
 +      } else {
 +              KASSERT(!onlist, ("%s is already on %s carp list\n",
 +                      sc->sc_if.if_xname, ifp->if_xname));
 +      }
 +#endif
 +
 +      if (!onlist) {
 +              /* We're trying to keep things in order */
 +              if (after == NULL)
 +                      TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
 +              else
 +                      TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
 +      }
 +
 +      vha->vha_iaback = ia_if;
 +      sc->sc_naddrs++;
 +
 +      if (own) {
 +              vha->vha_flags |= CARP_VHAF_OWNER;
 +
 +              /* XXX save user configured advskew? */
 +              sc->sc_advskew = 0;
 +      }
 +
 +      carp_hmac_prepare(sc);
 +      carp_set_state(sc, INIT);
 +      carp_setrun(sc, 0);
 +      return 0;
 +cleanup:
 +      carp_multicast_cleanup(sc);
 +      return error;
 +}
 +
 +static void
 +carp_deactivate_vhaddr(struct carp_softc *sc, struct carp_vhaddr *vha)
 +{
 +      KKASSERT(vha->vha_ia != NULL);
 +
 +      carp_hmac_prepare(sc);
 +
 +      if (vha->vha_iaback == NULL) {
 +              KASSERT((vha->vha_flags & CARP_VHAF_OWNER) == 0,
 +                      ("inactive vhaddr %p is the address owner\n", vha));
 +              return;
 +      }
 +
 +      vha->vha_flags &= ~CARP_VHAF_OWNER;
 +
 +      KKASSERT(sc->sc_naddrs > 0);
 +      vha->vha_iaback = NULL;
 +      sc->sc_naddrs--;
 +      if (!sc->sc_naddrs) {
 +              if (sc->sc_naddrs6) {
 +                      carp_multicast_cleanup(sc);
 +                      sc->sc_ia = NULL;
 +              } else {
 +                      carp_detach(sc, 0);
 +              }
 +      }
 +}
 +
 +static void
 +carp_link_addrs(struct carp_softc *sc, struct ifnet *ifp, struct ifaddr *ifa_if)
 +{
 +      struct carp_vhaddr *vha;
 +      struct in_ifaddr *ia_if;
 +
 +      KKASSERT(ifa_if->ifa_addr->sa_family == AF_INET);
 +      ia_if = ifatoia(ifa_if);
 +
 +      if ((ia_if->ia_flags & IFA_ROUTE) == 0)
 +              return;
 +
 +      /*
 +       * Test each inactive vhaddr against the newly added address.
 +       * If the newly added address could be the backing address,
 +       * then activate the matching vhaddr.
 +       */
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
 +              const struct in_ifaddr *ia;
 +              u_long iaddr;
 +              int own;
 +
 +              if (vha->vha_iaback != NULL)
 +                      continue;
 +
 +              ia = vha->vha_ia;
 +              iaddr = ntohl(ia->ia_addr.sin_addr.s_addr);
 +
 +              if ((iaddr & ia_if->ia_subnetmask) != ia_if->ia_subnet)
 +                      continue;
 +
 +              own = 0;
 +              if (ia->ia_addr.sin_addr.s_addr ==
 +                  ia_if->ia_addr.sin_addr.s_addr)
 +                      own = 1;
 +
 +              carp_activate_vhaddr(sc, vha, ifp, ia_if, own);
 +      }
 +}
 +
 +static void
 +carp_unlink_addrs(struct carp_softc *sc, struct ifnet *ifp,
 +                struct ifaddr *ifa_if)
 +{
 +      struct carp_vhaddr *vha;
 +      struct in_ifaddr *ia_if;
 +
 +      KKASSERT(ifa_if->ifa_addr->sa_family == AF_INET);
 +      ia_if = ifatoia(ifa_if);
 +
 +      /*
 +       * Ad src address is deleted; set it to NULL.
 +       * Following loop will try pick up a new ad src address
 +       * if one of the vhaddr could retain its backing address.
 +       */
 +      if (sc->sc_ia == ia_if)
 +              sc->sc_ia = NULL;
 +
 +      /*
 +       * Test each active vhaddr against the deleted address.
 +       * If the deleted address is vhaddr address's backing
 +       * address, then deactivate the vhaddr.
 +       */
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link) {
 +              if (vha->vha_iaback == NULL)
 +                      continue;
 +
 +              if (vha->vha_iaback == ia_if)
 +                      carp_deactivate_vhaddr(sc, vha);
 +              else if (sc->sc_ia == NULL)
 +                      sc->sc_ia = vha->vha_iaback;
 +      }
 +}
 +
 +static void
 +carp_update_addrs(struct carp_softc *sc)
 +{
 +      struct carp_vhaddr *vha;
 +
 +      KKASSERT(sc->sc_carpdev == NULL);
 +
 +      TAILQ_FOREACH(vha, &sc->sc_vha_list, vha_link)
 +              carp_config_vhaddr(sc, vha);
 +}
 +
 +static void
 +carp_ifaddr(void *arg __unused, struct ifnet *ifp,
 +          enum ifaddr_event event, struct ifaddr *ifa)
 +{
 +      struct carp_softc *sc;
 +
 +      if (ifa->ifa_addr->sa_family != AF_INET)
 +              return;
 +
 +      if (ifp->if_type == IFT_CARP) {
 +              /*
 +               * Address is changed on carp(4) interface
 +               */
 +              switch (event) {
 +              case IFADDR_EVENT_ADD:
 +                      carp_add_addr(ifp->if_softc, ifa);
 +                      break;
 +
 +              case IFADDR_EVENT_CHANGE:
 +                      carp_config_addr(ifp->if_softc, ifa);
 +                      break;
 +
 +              case IFADDR_EVENT_DELETE:
 +                      carp_del_addr(ifp->if_softc, ifa);
 +                      break;
 +              }
 +              return;
 +      }
 +
 +      /*
 +       * Address is changed on non-carp(4) interface
 +       */
 +      if ((ifp->if_flags & IFF_MULTICAST) == 0)
 +              return;
 +
 +      crit_enter();
 +      LIST_FOREACH(sc, &carpif_list, sc_next) {
 +              if (sc->sc_carpdev != NULL && sc->sc_carpdev != ifp) {
 +                      /* Not the parent iface; skip */
 +                      continue;
 +              }
 +
 +              switch (event) {
 +              case IFADDR_EVENT_ADD:
 +                      carp_link_addrs(sc, ifp, ifa);
 +                      break;
 +
 +              case IFADDR_EVENT_DELETE:
 +                      if (sc->sc_carpdev != NULL) {
 +                              carp_unlink_addrs(sc, ifp, ifa);
 +                              if (sc->sc_carpdev == NULL)
 +                                      carp_update_addrs(sc);
 +                      } else {
 +                              /*
 +                               * The carp(4) interface didn't have a
 +                               * parent iface, so it is not possible
 +                               * that it will contain any address to
 +                               * be unlinked.
 +                               */
 +                      }
 +                      break;
 +
 +              case IFADDR_EVENT_CHANGE:
 +                      if (sc->sc_carpdev == NULL) {
 +                              /*
 +                               * The carp(4) interface didn't have a
 +                               * parent iface, so it is not possible
 +                               * that it will contain any address to
 +                               * be updated.
 +                               */
 +                              carp_link_addrs(sc, ifp, ifa);
 +                      } else {
 +                              /*
 +                               * First try breaking tie with the old
 +                               * address.  Then see whether we could
 +                               * link certain vhaddr to the new address.
 +                               * If that fails, i.e. carpdev is NULL,
 +                               * we try a global update.
 +                               *
 +                               * NOTE: The above order is critical.
 +                               */
 +                              carp_unlink_addrs(sc, ifp, ifa);
 +                              carp_link_addrs(sc, ifp, ifa);
 +                              if (sc->sc_carpdev == NULL)
 +                                      carp_update_addrs(sc);
 +                      }
 +                      break;
 +              }
 +      }
 +      crit_exit();
 +}
 +
  static int
  carp_modevent(module_t mod, int type, void *data)
  {
                carp_ifdetach_event =
                EVENTHANDLER_REGISTER(ifnet_detach_event, carp_ifdetach, NULL,
                                      EVENTHANDLER_PRI_ANY);
 +              carp_ifaddr_event =
 +              EVENTHANDLER_REGISTER(ifaddr_event, carp_ifaddr, NULL,
 +                                    EVENTHANDLER_PRI_ANY);
                if_clone_attach(&carp_cloner);
                break;
  
        case MOD_UNLOAD:
                EVENTHANDLER_DEREGISTER(ifnet_detach_event,
                                        carp_ifdetach_event);
 +              EVENTHANDLER_DEREGISTER(ifaddr_event,
 +                                      carp_ifaddr_event);
                if_clone_detach(&carp_cloner);
                break;
  
diff --combined sys/netinet/ip_output.c
@@@ -50,6 -50,7 +50,7 @@@
  #include <sys/socket.h>
  #include <sys/socketvar.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/sysctl.h>
  #include <sys/thread2.h>
  #include <sys/in_cksum.h>
@@@ -117,6 -118,7 +118,6 @@@ static int ip_setmoption
  int   ip_optcopy(struct ip *, struct ip *);
  
  extern        int route_assert_owner_access;
 -extern        void db_print_backtrace(void);
  
  extern        struct protosw inetsw[];
  
@@@ -226,7 -228,7 +227,7 @@@ ip_output(struct mbuf *m0, struct mbuf 
                                kprintf("ip_output: "
                                        "rt rt_cpuid %d accessed on cpu %d\n",
                                        ro->ro_rt->rt_cpuid, mycpuid);
 -                              db_print_backtrace();
 +                              backtrace();
                        }
                }
  
@@@ -1488,7 -1490,7 +1489,7 @@@ ip_ctloutput(struct socket *so, struct 
                                break;
                        soopt_to_mbuf(sopt, m);
                        priv = (sopt->sopt_td != NULL &&
-                               suser(sopt->sopt_td) != 0) ? 0 : 1;
+                               priv_check(sopt->sopt_td, PRIV_ROOT) != 0) ? 0 : 1;
                        req = mtod(m, caddr_t);
                        len = m->m_len;
                        optname = sopt->sopt_name;
diff --combined sys/netinet/tcp_subr.c
@@@ -85,6 -85,7 +85,7 @@@
  #include <sys/domain.h>
  #endif
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/socket.h>
  #include <sys/socketvar.h>
  #include <sys/protosw.h>
  #include <netinet/tcp_fsm.h>
  #include <netinet/tcp_seq.h>
  #include <netinet/tcp_timer.h>
 +#include <netinet/tcp_timer2.h>
  #include <netinet/tcp_var.h>
  #include <netinet6/tcp6_var.h>
  #include <netinet/tcpip.h>
@@@ -294,11 -294,8 +295,11 @@@ struct   inp_tp 
                char    align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
        } inp_tp_u;
        struct  tcpcb tcb;
 -      struct  callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
 -      struct  callout inp_tp_delack;
 +      struct  tcp_callout inp_tp_rexmt;
 +      struct  tcp_callout inp_tp_persist;
 +      struct  tcp_callout inp_tp_keep;
 +      struct  tcp_callout inp_tp_2msl;
 +      struct  tcp_callout inp_tp_delack;
        struct  netmsg_tcp_timer inp_tp_timermsg;
  };
  #undef ALIGNMENT
@@@ -672,7 -669,6 +673,7 @@@ tcp_respond(struct tcpcb *tp, void *ipg
                        ro6->ro_rt = NULL;
                }
        } else {
 +              ipflags |= IP_DEBUGROUTE;
                ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL);
                if ((ro == &sro) && (ro->ro_rt != NULL)) {
                        RTFREE(ro->ro_rt);
@@@ -705,12 -701,11 +706,12 @@@ tcp_newtcpcb(struct inpcb *inp
        tp->t_maxseg = tp->t_maxopd = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
  
        /* Set up our timeouts. */
 -      callout_init(tp->tt_rexmt = &it->inp_tp_rexmt);
 -      callout_init(tp->tt_persist = &it->inp_tp_persist);
 -      callout_init(tp->tt_keep = &it->inp_tp_keep);
 -      callout_init(tp->tt_2msl = &it->inp_tp_2msl);
 -      callout_init(tp->tt_delack = &it->inp_tp_delack);
 +      tp->tt_rexmt = &it->inp_tp_rexmt;
 +      tp->tt_persist = &it->inp_tp_persist;
 +      tp->tt_keep = &it->inp_tp_keep;
 +      tp->tt_2msl = &it->inp_tp_2msl;
 +      tp->tt_delack = &it->inp_tp_delack;
 +      tcp_inittimers(tp);
  
        tp->tt_msg = &it->inp_tp_timermsg;
        if (isipv6) {
@@@ -862,16 -857,13 +863,16 @@@ tcp_close(struct tcpcb *tp
  
        /*
         * Make sure that all of our timers are stopped before we
 -       * delete the PCB.
 +       * delete the PCB.  For listen TCP socket (tp->tt_msg == NULL),
 +       * timers are never used.
         */
 -      callout_stop(tp->tt_rexmt);
 -      callout_stop(tp->tt_persist);
 -      callout_stop(tp->tt_keep);
 -      callout_stop(tp->tt_2msl);
 -      callout_stop(tp->tt_delack);
 +      if (tp->tt_msg != NULL) {
 +              tcp_callout_stop(tp, tp->tt_rexmt);
 +              tcp_callout_stop(tp, tp->tt_persist);
 +              tcp_callout_stop(tp, tp->tt_keep);
 +              tcp_callout_stop(tp, tp->tt_2msl);
 +              tcp_callout_stop(tp, tp->tt_delack);
 +      }
  
        if (tp->t_flags & TF_ONOUTPUTQ) {
                KKASSERT(tp->tt_cpu == mycpu->gd_cpuid);
@@@ -1264,7 -1256,7 +1265,7 @@@ tcp_getcred(SYSCTL_HANDLER_ARGS
        int cpu;
        int error;
  
-       error = suser(req->td);
+       error = priv_check(req->td, PRIV_ROOT);
        if (error != 0)
                return (error);
        error = SYSCTL_IN(req, addrs, sizeof addrs);
@@@ -1297,7 -1289,7 +1298,7 @@@ tcp6_getcred(SYSCTL_HANDLER_ARGS
        int error;
        boolean_t mapped = FALSE;
  
-       error = suser(req->td);
+       error = priv_check(req->td, PRIV_ROOT);
        if (error != 0)
                return (error);
        error = SYSCTL_IN(req, addrs, sizeof addrs);
diff --combined sys/netinet6/in6.c
@@@ -77,6 -77,7 +77,7 @@@
  #include <sys/sockio.h>
  #include <sys/systm.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/time.h>
  #include <sys/kernel.h>
  #include <sys/syslog.h>
@@@ -385,7 -386,7 +386,7 @@@ in6_control(struct socket *so, u_long c
        int error;
  
        privileged = 0;
-       if (suser(td) == 0)
+       if (priv_check(td, PRIV_ROOT) == 0)
                privileged++;
  
        switch (cmd) {
                return (mrt6_ioctl(cmd, data));
        }
  
 +      switch(cmd) {
 +      case SIOCAADDRCTL_POLICY:
 +      case SIOCDADDRCTL_POLICY:
 +              if (!privileged)
 +                      return (EPERM);
 +              return (in6_src_ioctl(cmd, data));
 +      }
 +
        if (ifp == NULL)
                return (EOPNOTSUPP);
  
  
        case SIOCAIFADDR_IN6:
        {
 -              int i, error = 0;
 +              int i, error = 0, iaIsNew;
                struct nd_prefix pr0, *pr;
  
 +              if (ia != NULL)
 +                      iaIsNew = 0;
 +              else
 +                      iaIsNew = 1;
 +
                /*
                 * first, make or update the interface address structure,
                 * and link it to the list.
                         */
                        pfxlist_onlink_check();
                }
 -              if (error == 0 && ia)
 -                      EVENTHANDLER_INVOKE(ifaddr_event, ifp);
 +              if (error == 0 && ia) {
 +                      EVENTHANDLER_INVOKE(ifaddr_event, ifp,
 +                      iaIsNew ? IFADDR_EVENT_ADD : IFADDR_EVENT_CHANGE,
 +                      &ia->ia_ifa);
 +              }
                break;
        }
  
                        pr->ndpr_expire = 1; /* XXX: just for expiration */
                }
  
 -        purgeaddr:
 +purgeaddr:
 +              EVENTHANDLER_INVOKE(ifaddr_event, ifp, IFADDR_EVENT_DELETE,
 +                                  &ia->ia_ifa);
                in6_purgeaddr(&ia->ia_ifa);
 -              EVENTHANDLER_INVOKE(ifaddr_event, ifp);
                break;
        }
  
diff --combined sys/netinet6/in6_src.c
  #include <sys/param.h>
  #include <sys/systm.h>
  #include <sys/jail.h>
 +#include <sys/kernel.h>
  #include <sys/malloc.h>
  #include <sys/mbuf.h>
  #include <sys/protosw.h>
  #include <sys/socket.h>
  #include <sys/socketvar.h>
 +#include <sys/sockio.h>
 +#include <sys/sysctl.h>
  #include <sys/errno.h>
  #include <sys/time.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  
  #include <net/if.h>
  #include <net/route.h>
  
  #include "use_loop.h"
  
 +#define ADDR_LABEL_NOTAPP (-1)
 +struct in6_addrpolicy defaultaddrpolicy;
 +
 +static void   init_policy_queue(void);
 +static int    add_addrsel_policyent(struct in6_addrpolicy *);
 +static int    delete_addrsel_policyent(struct in6_addrpolicy *);
 +static int    walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *),
 +                                  void *);
 +static int    dump_addrsel_policyent(struct in6_addrpolicy *, void *);
 +
 +
  /*
   * Return an IPv6 address, which is the most appropriate for a given
   * destination and user specified options.
@@@ -421,7 -408,7 +422,7 @@@ in6_pcbsetport(struct in6_addr *laddr, 
                last  = ipport_hilastauto;
                lastport = &pcbinfo->lasthi;
        } else if (inp->inp_flags & INP_LOWPORT) {
-               if ((error = suser(td)) != 0)
+               if ((error = priv_check(td, PRIV_ROOT)) != 0)
                        return error;
                first = ipport_lowfirstauto;    /* 1023 */
                last  = ipport_lowlastauto;     /* 600 */
@@@ -624,169 -611,3 +625,169 @@@ in6_clearscope(struct in6_addr *addr
        if (IN6_IS_SCOPE_LINKLOCAL(addr))
                addr->s6_addr16[1] = 0;
  }
 +
 +void
 +addrsel_policy_init(void)
 +{
 +
 +      init_policy_queue();
 +
 +      /* initialize the "last resort" policy */
 +      bzero(&defaultaddrpolicy, sizeof(defaultaddrpolicy));
 +      defaultaddrpolicy.label = ADDR_LABEL_NOTAPP;
 +}
 +
 +/*
 + * Subroutines to manage the address selection policy table via sysctl.
 + */
 +struct walkarg {
 +      struct sysctl_req *w_req;       /* request the walk copies out to */
 +};
 +
 +static int in6_src_sysctl(SYSCTL_HANDLER_ARGS);
 +SYSCTL_DECL(_net_inet6_ip6);
 +SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy,
 +      CTLFLAG_RD, in6_src_sysctl, "");
 +
 +static int
 +in6_src_sysctl(SYSCTL_HANDLER_ARGS)
 +{
 +      struct walkarg w;
 +
 +      if (req->newptr)
 +              return EPERM;
 +
 +      bzero(&w, sizeof(w));
 +      w.w_req = req;
 +
 +      return (walk_addrsel_policy(dump_addrsel_policyent, &w));
 +}
 +
 +int
 +in6_src_ioctl(u_long cmd, caddr_t data)
 +{
 +      int i;
 +      struct in6_addrpolicy ent0;
 +
 +      if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY)
 +              return (EOPNOTSUPP); /* check for safety */
 +
 +      ent0 = *(struct in6_addrpolicy *)data;
 +
 +      if (ent0.label == ADDR_LABEL_NOTAPP)
 +              return (EINVAL);
 +      /* check if the prefix mask is consecutive. */
 +      if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0)
 +              return (EINVAL);
 +      /* clear trailing garbages (if any) of the prefix address. */
 +      for (i = 0; i < 4; i++) {
 +              ent0.addr.sin6_addr.s6_addr32[i] &=
 +                      ent0.addrmask.sin6_addr.s6_addr32[i];
 +      }
 +      ent0.use = 0;
 +
 +      switch (cmd) {
 +      case SIOCAADDRCTL_POLICY:
 +              return (add_addrsel_policyent(&ent0));
 +      case SIOCDADDRCTL_POLICY:
 +              return (delete_addrsel_policyent(&ent0));
 +      }
 +
 +      return (0);             /* XXX: compromise compilers */
 +}
 +
 +/*
 + * The following is an implementation of the policy table using a
 + * simple tail queue.
 + * XXX such details should be hidden.
 + * XXX an implementation using a binary tree should be more efficient.
 + */
 +struct addrsel_policyent {
 +      TAILQ_ENTRY(addrsel_policyent) ape_entry;       /* link on addrsel_policytab */
 +      struct in6_addrpolicy ape_policy;       /* the stored policy */
 +};
 +
 +TAILQ_HEAD(addrsel_policyhead, addrsel_policyent);
 +
 +struct addrsel_policyhead addrsel_policytab;
 +
 +static void
 +init_policy_queue(void)
 +{
 +      TAILQ_INIT(&addrsel_policytab); /* start from an empty policy table */
 +}
 +
 +static int
 +add_addrsel_policyent(struct in6_addrpolicy *newpolicy)
 +{
 +      struct addrsel_policyent *new, *pol;
 +
 +      /* duplication check */
 +      for (pol = TAILQ_FIRST(&addrsel_policytab); pol;
 +           pol = TAILQ_NEXT(pol, ape_entry)) {
 +              if (SA6_ARE_ADDR_EQUAL(&newpolicy->addr,
 +                                     &pol->ape_policy.addr) &&
 +                  SA6_ARE_ADDR_EQUAL(&newpolicy->addrmask,
 +                                     &pol->ape_policy.addrmask)) {
 +                      return (EEXIST);        /* or override it? */
 +              }
 +      }
 +
 +      new = kmalloc(sizeof(*new), M_IFADDR, M_WAITOK | M_ZERO);
 +
 +      /* XXX: should validate entry */
 +      new->ape_policy = *newpolicy;
 +
 +      TAILQ_INSERT_TAIL(&addrsel_policytab, new, ape_entry);
 +
 +      return (0);
 +}
 +
 +static int
 +delete_addrsel_policyent(struct in6_addrpolicy *key)
 +{
 +      struct addrsel_policyent *pol;
 +
 +      /* search for the entry in the table */
 +      for (pol = TAILQ_FIRST(&addrsel_policytab); pol;
 +           pol = TAILQ_NEXT(pol, ape_entry)) {
 +              if (SA6_ARE_ADDR_EQUAL(&key->addr, &pol->ape_policy.addr) &&
 +                  SA6_ARE_ADDR_EQUAL(&key->addrmask,
 +                                     &pol->ape_policy.addrmask)) {
 +                      break;
 +              }
 +      }
 +      if (pol == NULL)
 +              return (ESRCH);
 +
 +      TAILQ_REMOVE(&addrsel_policytab, pol, ape_entry);
 +      kfree(pol, M_IFADDR);
 +
 +      return (0);
 +}
 +
 +static int
 +walk_addrsel_policy(int(*callback)(struct in6_addrpolicy *, void *), void *w)
 +{
 +      struct addrsel_policyent *pol;
 +      int error = 0;
 +
 +      for (pol = TAILQ_FIRST(&addrsel_policytab); pol;
 +           pol = TAILQ_NEXT(pol, ape_entry)) {
 +              if ((error = (*callback)(&pol->ape_policy, w)) != 0)
 +                      return (error);
 +      }
 +
 +      return (error);
 +}
 +
 +static int
 +dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg)
 +{
 +      int error = 0;
 +      struct walkarg *w = arg;
 +
 +      error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol));
 +
 +      return (error);
 +}
diff --combined sys/netinet6/ip6_input.c
@@@ -84,6 -84,7 +84,7 @@@
  #include <sys/kernel.h>
  #include <sys/syslog.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  
  #include <sys/thread2.h>
  #include <sys/msgport2.h>
@@@ -200,7 -201,6 +201,7 @@@ ip6_init(void
        netisr_register(NETISR_IPV6, cpu0_portfn, ip6_input,
                        NETISR_FLAG_NOTMPSAFE);
        scope6_init();
 +      addrsel_policy_init();
        nd6_init();
        frag6_init();
        /*
@@@ -1131,7 -1131,7 +1132,7 @@@ ip6_savecontrol(struct inpcb *in6p, str
        int rthdr_exist = 0;
  
  
-       if (suser(td) == 0)
+       if (priv_check(td, PRIV_ROOT) == 0)
                privileged++;
  
  #ifdef SO_TIMESTAMP
@@@ -44,6 -44,7 +44,7 @@@
  #include <sys/kernel.h>
  #include <sys/mbuf.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/protosw.h>
  #include <sys/socket.h>
  #include <sys/socketvar.h>
@@@ -595,9 -596,9 +596,9 @@@ static in
  ripx_attach(struct socket *so, int proto, struct pru_attach_info *ai)
  {
        int error = 0;
 -      struct ipxpcb *ipxp = sotoipxpcb(so);
 +      struct ipxpcb *ipxp;
  
-       if ((error = suser_cred(ai->p_ucred, NULL_CRED_OKAY)) != 0)
+       if ((error = priv_check_cred(ai->p_ucred, PRIV_ROOT, NULL_CRED_OKAY)) != 0)
                return (error);
        crit_enter();
        error = ipx_pcballoc(so, &ipxrawpcb);
@@@ -62,6 -62,7 +62,7 @@@
  #include <sys/linker.h>
  #include <sys/malloc.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/buf.h>
  #include <sys/reboot.h>
  #include <sys/mbuf.h>
@@@ -1101,7 -1102,7 +1102,7 @@@ exec_setregs(u_long entry, u_long stack
                pcb->pcb_dr2 = 0;
                pcb->pcb_dr3 = 0;
                pcb->pcb_dr6 = 0;
 -              pcb->pcb_dr7 = 0;
 +              pcb->pcb_dr7 = 0; /* JG set bit 10? */
                if (pcb == td->td_pcb) {
                        /*
                         * Clear the debug registers on the running
  fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
  {
          if (lp == NULL) {
 -#if JG
 -                dbregs->dr0 = rdr0();
 -                dbregs->dr1 = rdr1();
 -                dbregs->dr2 = rdr2();
 -                dbregs->dr3 = rdr3();
 -                dbregs->dr4 = rdr4();
 -                dbregs->dr5 = rdr5();
 -                dbregs->dr6 = rdr6();
 -                dbregs->dr7 = rdr7();
 -#endif
 +                dbregs->dr[0] = rdr0();
 +                dbregs->dr[1] = rdr1();
 +                dbregs->dr[2] = rdr2();
 +                dbregs->dr[3] = rdr3();
 +                dbregs->dr[4] = rdr4();
 +                dbregs->dr[5] = rdr5();
 +                dbregs->dr[6] = rdr6();
 +                dbregs->dr[7] = rdr7();
          } else {
                struct pcb *pcb;
  
                  pcb = lp->lwp_thread->td_pcb;
 -#if JG
 -                dbregs->dr0 = pcb->pcb_dr0;
 -                dbregs->dr1 = pcb->pcb_dr1;
 -                dbregs->dr2 = pcb->pcb_dr2;
 -                dbregs->dr3 = pcb->pcb_dr3;
 -                dbregs->dr4 = 0;
 -                dbregs->dr5 = 0;
 -                dbregs->dr6 = pcb->pcb_dr6;
 -                dbregs->dr7 = pcb->pcb_dr7;
 -#endif
 +                dbregs->dr[0] = pcb->pcb_dr0;
 +                dbregs->dr[1] = pcb->pcb_dr1;
 +                dbregs->dr[2] = pcb->pcb_dr2;
 +                dbregs->dr[3] = pcb->pcb_dr3;
 +                dbregs->dr[4] = 0;
 +                dbregs->dr[5] = 0;
 +                dbregs->dr[6] = pcb->pcb_dr6;
 +                dbregs->dr[7] = pcb->pcb_dr7;
          }
        return (0);
  }
  set_dbregs(struct lwp *lp, struct dbreg *dbregs)
  {
        if (lp == NULL) {
 -#if JG
 -              load_dr0(dbregs->dr0);
 -              load_dr1(dbregs->dr1);
 -              load_dr2(dbregs->dr2);
 -              load_dr3(dbregs->dr3);
 -              load_dr4(dbregs->dr4);
 -              load_dr5(dbregs->dr5);
 -              load_dr6(dbregs->dr6);
 -              load_dr7(dbregs->dr7);
 -#endif
 +              load_dr0(dbregs->dr[0]);
 +              load_dr1(dbregs->dr[1]);
 +              load_dr2(dbregs->dr[2]);
 +              load_dr3(dbregs->dr[3]);
 +              load_dr4(dbregs->dr[4]);
 +              load_dr5(dbregs->dr[5]);
 +              load_dr6(dbregs->dr[6]);
 +              load_dr7(dbregs->dr[7]);
        } else {
                struct pcb *pcb;
                struct ucred *ucred;
                int i;
 -              uint32_t mask1, mask2;
 +              uint64_t mask1, mask2;
  
                /*
                 * Don't let an illegal value for dr7 get set.  Specifically,
                 * result in undefined behaviour and can lead to an unexpected
                 * TRCTRAP.
                 */
 -              for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8; 
 -                   i++, mask1 <<= 2, mask2 <<= 2)
 -#if JG
 -                      if ((dbregs->dr7 & mask1) == mask2)
 +              /* JG this loop looks unreadable */
 +              /* Check 4 2-bit fields for invalid patterns.
 +               * These fields are R/Wi, for i = 0..3
 +               */
 +              /* Is 10 in LENi allowed when running in compatibility mode? */
 +              /* Pattern 10 in R/Wi might be used to indicate
 +               * breakpoint on I/O. Further analysis should be
 +               * carried to decide if it is safe and useful to
 +               * provide access to that capability
 +               */
 +              for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4; 
 +                   i++, mask1 <<= 4, mask2 <<= 4)
 +                      if ((dbregs->dr[7] & mask1) == mask2)
                                return (EINVAL);
 -#endif
                
                pcb = lp->lwp_thread->td_pcb;
                ucred = lp->lwp_proc->p_ucred;
                 * from within kernel mode?
                 */
  
-               if (suser_cred(ucred, 0) != 0) {
+               if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
 -#if JG
 -                      if (dbregs->dr7 & 0x3) {
 +                      if (dbregs->dr[7] & 0x3) {
                                /* dr0 is enabled */
 -                              if (dbregs->dr0 >= VM_MAX_USER_ADDRESS)
 +                              if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
                                        return (EINVAL);
                        }
  
 -                      if (dbregs->dr7 & (0x3<<2)) {
 +                      if (dbregs->dr[7] & (0x3<<2)) {
                                /* dr1 is enabled */
 -                              if (dbregs->dr1 >= VM_MAX_USER_ADDRESS)
 +                              if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
                                        return (EINVAL);
                        }
  
 -                      if (dbregs->dr7 & (0x3<<4)) {
 +                      if (dbregs->dr[7] & (0x3<<4)) {
                                /* dr2 is enabled */
 -                              if (dbregs->dr2 >= VM_MAX_USER_ADDRESS)
 +                              if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
                                        return (EINVAL);
                        }
  
 -                      if (dbregs->dr7 & (0x3<<6)) {
 +                      if (dbregs->dr[7] & (0x3<<6)) {
                                /* dr3 is enabled */
 -                              if (dbregs->dr3 >= VM_MAX_USER_ADDRESS)
 +                              if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
                                        return (EINVAL);
                        }
 -#endif
                }
  
 -#if JG
 -              pcb->pcb_dr0 = dbregs->dr0;
 -              pcb->pcb_dr1 = dbregs->dr1;
 -              pcb->pcb_dr2 = dbregs->dr2;
 -              pcb->pcb_dr3 = dbregs->dr3;
 -              pcb->pcb_dr6 = dbregs->dr6;
 -              pcb->pcb_dr7 = dbregs->dr7;
 -#endif
 +              pcb->pcb_dr0 = dbregs->dr[0];
 +              pcb->pcb_dr1 = dbregs->dr[1];
 +              pcb->pcb_dr2 = dbregs->dr[2];
 +              pcb->pcb_dr3 = dbregs->dr[3];
 +              pcb->pcb_dr6 = dbregs->dr[6];
 +              pcb->pcb_dr7 = dbregs->dr[7];
  
                pcb->pcb_flags |= PCB_DBREGS;
        }
  int
  user_dbreg_trap(void)
  {
 -        u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
 -        u_int32_t bp;       /* breakpoint bits extracted from dr6 */
 +        u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
 +        u_int64_t bp;       /* breakpoint bits extracted from dr6 */
          int nbp;            /* number of breakpoints that triggered */
          caddr_t addr[4];    /* breakpoint addresses */
          int i;
          
          dr7 = rdr7();
 -        if ((dr7 & 0x000000ff) == 0) {
 +        if ((dr7 & 0xff) == 0) {
                  /*
                   * all GE and LE bits in the dr7 register are zero,
                   * thus the trap couldn't have been caused by the
  
          nbp = 0;
          dr6 = rdr6();
 -        bp = dr6 & 0x0000000f;
 +        bp = dr6 & 0xf;
  
 -        if (!bp) {
 +        if (bp == 0) {
                  /*
                   * None of the breakpoint bits are set meaning this
                   * trap was not caused by any of the debug registers
diff --combined sys/vfs/hammer/hammer.h
@@@ -49,6 -49,7 +49,7 @@@
  #include <sys/mountctl.h>
  #include <sys/vnode.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/stat.h>
  #include <sys/globaldata.h>
  #include <sys/lockf.h>
@@@ -116,7 -117,6 +117,7 @@@ struct hammer_transaction 
  typedef struct hammer_transaction *hammer_transaction_t;
  
  #define HAMMER_TRANSF_NEWINODE        0x0001
 +#define HAMMER_TRANSF_DIDIO   0x0002
  
  /*
   * HAMMER locks
@@@ -353,12 -353,11 +354,12 @@@ typedef struct hammer_inode *hammer_ino
  /*
   * Used by the inode reclaim code to pipeline reclaims and avoid
   * blowing out kernel memory or letting the flusher get too far
 - * behind.
 + * behind.  The reclaim wakes up when count reaches 0 or the
 + * timer expires.
   */
  struct hammer_reclaim {
        TAILQ_ENTRY(hammer_reclaim) entry;
 -      int     okydoky;
 +      int     count;
  };
  
  #define HAMMER_RECLAIM_FLUSH  2000
@@@ -755,7 -754,6 +756,7 @@@ struct hammer_mount 
  typedef struct hammer_mount   *hammer_mount_t;
  
  #define HAMMER_MOUNT_CRITICAL_ERROR   0x0001
 +#define HAMMER_MOUNT_FLUSH_RECOVERY   0x0002
  
  struct hammer_sync_info {
        int error;
@@@ -824,6 -822,7 +825,6 @@@ extern int hammer_count_io_running_read
  extern int hammer_count_io_running_write;
  extern int hammer_count_io_locked;
  extern int hammer_limit_dirtybufspace;
 -extern int hammer_limit_iqueued;
  extern int hammer_limit_recs;
  extern int hammer_bio_count;
  extern int hammer_verify_zone;
@@@ -848,7 -847,6 +849,7 @@@ void       hammer_scan_inode_snapshots(hammer
  void  hammer_put_inode(struct hammer_inode *ip);
  void  hammer_put_inode_ref(struct hammer_inode *ip);
  void  hammer_inode_waitreclaims(hammer_mount_t hmp);
 +void  hammer_inode_waithard(hammer_mount_t hmp);
  
  int   hammer_unload_volume(hammer_volume_t volume, void *data __unused);
  int   hammer_adjust_volume_mode(hammer_volume_t volume, void *data __unused);
@@@ -966,8 -964,7 +967,8 @@@ int        hammer_btree_lock_children(hammer_c
  void  hammer_btree_unlock_children(hammer_cursor_t cursor,
                        struct hammer_node_locklist **locklistp);
  int   hammer_btree_search_node(hammer_base_elm_t elm, hammer_node_ondisk_t node);
 -hammer_node_t hammer_btree_get_parent(hammer_node_t node, int *parent_indexp,
 +hammer_node_t hammer_btree_get_parent(hammer_transaction_t trans,
 +                      hammer_node_t node, int *parent_indexp,
                        int *errorp, int try_exclusive);
  
  void  hammer_print_btree_node(hammer_node_ondisk_t ondisk);
@@@ -1003,8 -1000,8 +1004,8 @@@ void            hammer_rel_buffer(hammer_buffer_
  
  int           hammer_vfs_export(struct mount *mp, int op,
                        const struct export_args *export);
 -hammer_node_t hammer_get_node(hammer_mount_t hmp, hammer_off_t node_offset,
 -                      int isnew, int *errorp);
 +hammer_node_t hammer_get_node(hammer_transaction_t trans,
 +                      hammer_off_t node_offset, int isnew, int *errorp);
  void          hammer_ref_node(hammer_node_t node);
  hammer_node_t hammer_ref_node_safe(struct hammer_mount *hmp,
                        hammer_node_cache_t cache, int *errorp);
@@@ -1173,7 -1170,6 +1174,7 @@@ void hammer_flusher_sync(hammer_mount_
  int  hammer_flusher_async(hammer_mount_t hmp, hammer_flush_group_t flg);
  int  hammer_flusher_async_one(hammer_mount_t hmp);
  void hammer_flusher_wait(hammer_mount_t hmp, int seq);
 +void hammer_flusher_wait_next(hammer_mount_t hmp);
  int  hammer_flusher_meta_limit(hammer_mount_t hmp);
  int  hammer_flusher_meta_halflimit(hammer_mount_t hmp);
  int  hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter);
diff --combined sys/vm/vm_swap.c
@@@ -42,6 -42,7 +42,7 @@@
  #include <sys/sysproto.h>
  #include <sys/buf.h>
  #include <sys/proc.h>
+ #include <sys/priv.h>
  #include <sys/nlookup.h>
  #include <sys/dmap.h>         /* XXX */
  #include <sys/vnode.h>
@@@ -183,7 -184,7 +184,7 @@@ sys_swapon(struct swapon_args *uap
        KKASSERT(td->td_proc);
        cred = td->td_proc->p_ucred;
  
-       error = suser(td);
+       error = priv_check(td, PRIV_ROOT);
        if (error)
                return (error);
  
@@@ -337,7 -338,6 +338,7 @@@ swaponvp(struct thread *td, struct vnod
                blist_free(swapblist, vsbase, blk);
                vm_swap_size += blk;
        }
 +      swap_pager_newswap();
  
        return (0);
  }