Protect v_usecount with a critical section for now (we depend on the BGL),
[dragonfly.git] / sys / kern / vfs_subr.c
index 8af1514..cd438a2 100644
@@ -37,7 +37,7 @@
  *
  *     @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
- * $DragonFly: src/sys/kern/vfs_subr.c,v 1.6 2003/06/25 03:55:57 dillon Exp $
+ * $DragonFly: src/sys/kern/vfs_subr.c,v 1.28 2004/03/28 07:54:00 dillon Exp $
  */
 
 /*
@@ -73,6 +73,7 @@
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_zone.h>
 
 #include <sys/buf2.h>
+#include <sys/thread2.h>
 
 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
 
-static void    insmntque __P((struct vnode *vp, struct mount *mp));
-static void    vclean __P((struct vnode *vp, int flags, struct thread *td));
+static void    insmntque (struct vnode *vp, struct mount *mp);
+static void    vclean (struct vnode *vp, lwkt_tokref_t vlock, int flags, struct thread *td);
 static unsigned long   numvnodes;
-static void    vlruvp(struct vnode *vp);
 SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
 
 enum vtype iftovt_tab[16] = {
@@ -116,8 +117,6 @@ static int reassignbufsortbad;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
 static int reassignbufmethod = 1;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
-static int nameileafonly = 0;
-SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
 
 #ifdef ENABLE_VFS_IOOPT
 int vfs_ioopt = 0;
@@ -125,14 +124,12 @@ SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
 #endif
 
 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
-struct simplelock mountlist_slock;
-struct simplelock mntvnode_slock;
+struct lwkt_token mountlist_token;
+struct lwkt_token mntvnode_token;
 int    nfs_mount_type = -1;
-#ifndef NULL_SIMPLELOCKS
-static struct simplelock mntid_slock;
-static struct simplelock vnode_free_list_slock;
-static struct simplelock spechash_slock;
-#endif
+static struct lwkt_token mntid_token;
+static struct lwkt_token vnode_free_list_token;
+static struct lwkt_token spechash_token;
 struct nfs_public nfs_pub;     /* publicly exported FS */
 static vm_zone_t vnode_zone;
 
@@ -142,12 +139,17 @@ static vm_zone_t vnode_zone;
 #define SYNCER_MAXDELAY                32
 static int syncer_maxdelay = SYNCER_MAXDELAY;  /* maximum delay time */
 time_t syncdelay = 30;         /* max time to delay syncing data */
+SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW, &syncdelay, 0,
+       "VFS data synchronization delay");
 time_t filedelay = 30;         /* time to delay syncing files */
-SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
+SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
+       "File synchronization delay");
 time_t dirdelay = 29;          /* time to delay syncing directories */
-SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
+SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
+       "Directory synchronization delay");
 time_t metadelay = 28;         /* time to delay syncing metadata */
-SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
+SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
+       "VFS metadata synchronization delay");
 static int rushjob;                    /* number of slots to run ASAP */
 static int stat_rush_requests; /* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
@@ -167,11 +169,39 @@ static int vnlru_nowhere = 0;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
     "Number of times the vnlru process ran without success");
 
-static void    vfs_free_addrlist __P((struct netexport *nep));
-static int     vfs_free_netcred __P((struct radix_node *rn, void *w));
-static int     vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
-                                      struct export_args *argp));
+static void    vfs_free_addrlist (struct netexport *nep);
+static int     vfs_free_netcred (struct radix_node *rn, void *w);
+static int     vfs_hang_addrlist (struct mount *mp, struct netexport *nep,
+                                      struct export_args *argp);
+
+#define VSHOULDFREE(vp) \
+       (!((vp)->v_flag & (VFREE|VDOOMED)) && \
+        !(vp)->v_holdcnt && !(vp)->v_usecount && \
+        (!(vp)->v_object || \
+         !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count)))
+#define VMIGHTFREE(vp) \
+       (((vp)->v_flag & (VFREE|VDOOMED|VXLOCK)) == 0 &&   \
+        cache_leaf_test(vp) == 0 && (vp)->v_usecount == 0)
+#define VSHOULDBUSY(vp) \
+       (((vp)->v_flag & VFREE) && \
+        ((vp)->v_holdcnt || (vp)->v_usecount))
+
+static void vbusy(struct vnode *vp);
+static void vfree(struct vnode *vp);
+static void vmaybefree(struct vnode *vp);
 
+/*
+ * NOTE: the vnode interlock must be held on call.
+ */
+static __inline void
+vmaybefree(struct vnode *vp)
+{
+       if (VSHOULDFREE(vp))
+               vfree(vp);
+}
 /*
  * Initialize the vnode management data structures.
  */
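
As the NOTE above says, vmaybefree() (like vfree() and vbusy()) expects the caller to already hold the vnode's interlock token.  A minimal sketch of the calling pattern, modeled on the converted vdrop() later in this diff; the function name is illustrative only:

/*
 * Sketch: release a hold and let the vnode move to the free list if
 * nothing else references it.  Mirrors the converted vdrop().
 */
static void
example_drop_hold(struct vnode *vp)
{
        lwkt_tokref vlock;

        lwkt_gettoken(&vlock, vp->v_interlock); /* vmaybefree() needs the interlock */
        if (vp->v_holdcnt <= 0)
                panic("example_drop_hold: holdcnt");
        --vp->v_holdcnt;
        vmaybefree(vp);         /* calls vfree() only when VSHOULDFREE(vp) is true */
        lwkt_reltoken(&vlock);
}
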
@@ -179,13 +209,25 @@ void
 vntblinit()
 {
 
-       desiredvnodes = maxproc + cnt.v_page_count / 4;
+       /*
+        * Desired vnodes is a result of the physical page count
+        * and the size of the kernel's heap.  It scales in proportion
+        * to the amount of available physical memory.  This can
+        * cause trouble on 64-bit and large memory platforms.
+        */
+       /* desiredvnodes = maxproc + vmstats.v_page_count / 4; */
+       desiredvnodes =
+               min(maxproc + vmstats.v_page_count /4,
+                   2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
+                   (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
+
        minvnodes = desiredvnodes / 4;
-       simple_lock_init(&mntvnode_slock);
-       simple_lock_init(&mntid_slock);
-       simple_lock_init(&spechash_slock);
+       lwkt_token_init(&mountlist_token);
+       lwkt_token_init(&mntvnode_token);
+       lwkt_token_init(&mntid_token);
+       lwkt_token_init(&spechash_token);
        TAILQ_INIT(&vnode_free_list);
-       simple_lock_init(&vnode_free_list_slock);
+       lwkt_token_init(&vnode_free_list_token);
        vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
        /*
         * Initialize the filesystem syncer.
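
For concreteness, here is the new clamp worked through with illustrative numbers (assumed values, not taken from any particular platform): with a 1 GB kernel virtual address range and sizeof(struct vm_object) + sizeof(struct vnode) totalling about 512 bytes,

        2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
            (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))
          = 2 * 1073741824 / (5 * 512)
         ~= 838,860 vnodes

so the min() caps desiredvnodes at roughly two fifths of the kernel address space divided by the per-vnode structure cost, rather than letting the old page-count term grow without bound on large-memory machines.
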
@@ -200,8 +242,7 @@ vntblinit()
  * unmounting. Interlock is not released on failure.
  */
 int
-vfs_busy(struct mount *mp, int flags, struct simplelock *interlkp,
-       struct thread *td)
+vfs_busy(struct mount *mp, int flags, lwkt_tokref_t interlkp, struct thread *td)
 {
        int lkflags;
 
@@ -209,19 +250,16 @@ vfs_busy(struct mount *mp, int flags, struct simplelock *interlkp,
                if (flags & LK_NOWAIT)
                        return (ENOENT);
                mp->mnt_kern_flag |= MNTK_MWAIT;
-               if (interlkp) {
-                       simple_unlock(interlkp);
-               }
                /*
                 * Since all busy locks are shared except the exclusive
                 * lock granted when unmounting, the only place that a
                 * wakeup needs to be done is at the release of the
                 * exclusive lock at the end of dounmount.
+                *
+                * note: interlkp is a serializer and thus can be safely
+                * held through any sleep
                 */
-               tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
-               if (interlkp) {
-                       simple_lock(interlkp);
-               }
+               tsleep((caddr_t)mp, 0, "vfs_busy", 0);
                return (ENOENT);
        }
        lkflags = LK_SHARED | LK_NOPAUSE;
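
The practical consequence of the note is that callers may keep their token reference across vfs_busy() even though it can sleep, where the old code had to juggle simple_unlock()/simple_lock() pairs.  A condensed sketch of a caller, modeled on the vnlru_proc() loop converted later in this diff (the function name is illustrative):

static void
example_scan_mounts(struct thread *td)
{
        struct mount *mp, *nmp;
        lwkt_tokref ilock;

        lwkt_gettoken(&ilock, &mountlist_token);
        for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
                /*
                 * The interlock token is a serializer and may be held
                 * across any sleep inside vfs_busy() (see the note above).
                 */
                if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) {
                        nmp = TAILQ_NEXT(mp, mnt_list);
                        continue;
                }
                /* ... operate on the busied mount; this may block ... */
                lwkt_gettokref(&ilock); /* re-ratify the reference, as vnlru_proc() does */
                nmp = TAILQ_NEXT(mp, mnt_list);
                vfs_unbusy(mp, td);
        }
        lwkt_reltoken(&ilock);
}
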
@@ -263,8 +301,8 @@ vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp)
                return (ENODEV);
        mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
        bzero((char *)mp, (u_long)sizeof(struct mount));
-       lockinit(&mp->mnt_lock, PVFS, "vfslock", VLKTIMEOUT, LK_NOPAUSE);
-       (void)vfs_busy(mp, LK_NOWAIT, 0, td);
+       lockinit(&mp->mnt_lock, 0, "vfslock", VLKTIMEOUT, LK_NOPAUSE);
+       vfs_busy(mp, LK_NOWAIT, NULL, td);
        TAILQ_INIT(&mp->mnt_nvnodelist);
        TAILQ_INIT(&mp->mnt_reservedvnlist);
        mp->mnt_nvnodelistsize = 0;
@@ -295,7 +333,7 @@ int
 lite2_vfs_mountroot()
 {
        struct vfsconf *vfsp;
-       extern int (*lite2_mountroot) __P((void));
+       extern int (*lite2_mountroot) (void);
        int error;
 
        if (lite2_mountroot != NULL)
@@ -318,18 +356,18 @@ struct mount *
 vfs_getvfs(fsid)
        fsid_t *fsid;
 {
-       register struct mount *mp;
+       struct mount *mp;
+       lwkt_tokref ilock;
 
-       simple_lock(&mountlist_slock);
+       lwkt_gettoken(&ilock, &mountlist_token);
        TAILQ_FOREACH(mp, &mountlist, mnt_list) {
                if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
                    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
-                       simple_unlock(&mountlist_slock);
-                       return (mp);
+                       break;
            }
        }
-       simple_unlock(&mountlist_slock);
-       return ((struct mount *) 0);
+       lwkt_reltoken(&ilock);
+       return (mp);
 }
 
 /*
@@ -349,10 +387,11 @@ vfs_getnewfsid(mp)
        struct mount *mp;
 {
        static u_int16_t mntid_base;
+       lwkt_tokref ilock;
        fsid_t tfsid;
        int mtype;
 
-       simple_lock(&mntid_slock);
+       lwkt_gettoken(&ilock, &mntid_token);
        mtype = mp->mnt_vfc->vfc_typenum;
        tfsid.val[1] = mtype;
        mtype = (mtype & 0xFF) << 24;
@@ -365,7 +404,7 @@ vfs_getnewfsid(mp)
        }
        mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
        mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
-       simple_unlock(&mntid_slock);
+       lwkt_reltoken(&ilock);
 }
 
 /*
@@ -415,7 +454,7 @@ vfs_timestamp(tsp)
  */
 void
 vattr_null(vap)
-       register struct vattr *vap;
+       struct vattr *vap;
 {
 
        vap->va_type = VNON;
@@ -459,6 +498,8 @@ static int
 vlrureclaim(struct mount *mp)
 {
        struct vnode *vp;
+       lwkt_tokref ilock;
+       lwkt_tokref vlock;
        int done;
        int trigger;
        int usevnodes;
@@ -474,33 +515,64 @@ vlrureclaim(struct mount *mp)
        usevnodes = desiredvnodes;
        if (usevnodes <= 0)
                usevnodes = 1;
-       trigger = cnt.v_page_count * 2 / usevnodes;
+       trigger = vmstats.v_page_count * 2 / usevnodes;
 
        done = 0;
-       simple_lock(&mntvnode_slock);
+       lwkt_gettoken(&ilock, &mntvnode_token);
        count = mp->mnt_nvnodelistsize / 10 + 1;
        while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
+               /*
+                * __VNODESCAN__
+                *
+                * The VP will stick around while we hold mntvnode_token,
+                * at least until we block, so we can safely do an initial
+                * check.  But we have to check again after obtaining
+                * the vnode interlock.  vp->v_interlock points to stable
+                * storage so it's ok if the vp gets ripped out from
+                * under us while we are blocked.
+                */
+               if (vp->v_type == VNON ||
+                   vp->v_type == VBAD ||
+                   !VMIGHTFREE(vp) ||          /* critical path opt */
+                   (vp->v_object &&
+                    vp->v_object->resident_page_count >= trigger)
+               ) {
+                       TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+                       TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist,vp, v_nmntvnodes);
+                       --count;
+                       continue;
+               }
+
+               /*
+                * Get the interlock, delay moving the node to the tail so
+                * we don't race against new additions to the mountlist.
+                */
+               lwkt_gettoken(&vlock, vp->v_interlock);
+               if (TAILQ_FIRST(&mp->mnt_nvnodelist) != vp) {
+                       lwkt_reltoken(&vlock);
+                       continue;
+               }
                TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
-               TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+               TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist,vp, v_nmntvnodes);
 
-               if (vp->v_type != VNON &&
-                   vp->v_type != VBAD &&
-                   VMIGHTFREE(vp) &&           /* critical path opt */
-                   (vp->v_object == NULL || vp->v_object->resident_page_count < trigger) &&
-                   simple_lock_try(&vp->v_interlock)
+               /*
+                * Must check again
+                */
+               if (vp->v_type == VNON ||
+                   vp->v_type == VBAD ||
+                   !VMIGHTFREE(vp) ||          /* critical path opt */
+                   (vp->v_object &&
+                    vp->v_object->resident_page_count >= trigger)
                ) {
-                       simple_unlock(&mntvnode_slock);
-                       if (VMIGHTFREE(vp)) {
-                               vgonel(vp, curthread);
-                               done++;
-                       } else {
-                               simple_unlock(&vp->v_interlock);
-                       }
-                       simple_lock(&mntvnode_slock);
+                       lwkt_reltoken(&vlock);
+                       --count;
+                       continue;
                }
+               vgonel(vp, &vlock, curthread);
+               ++done;
                --count;
        }
-       simple_unlock(&mntvnode_slock);
+       lwkt_reltoken(&ilock);
        return done;
 }
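
The __VNODESCAN__ comment describes a check / take-interlock / re-check idiom: the first test is only a cheap filter, valid while mntvnode_token keeps the list stable, and because lwkt_gettoken() can block, every condition must be re-evaluated once the vnode interlock is actually held.  The skeleton of the idiom, with candidate_ok() and still_at_head() as hypothetical stand-ins for the VNON/VBAD/VMIGHTFREE/trigger and TAILQ_FIRST tests used above:

        if (candidate_ok(vp)) {                         /* cheap, list-stable check */
                lwkt_gettoken(&vlock, vp->v_interlock); /* may block */
                if (still_at_head(mp, vp) && candidate_ok(vp)) {
                        vgonel(vp, &vlock, curthread);  /* consumes vlock */
                } else {
                        lwkt_reltoken(&vlock);          /* lost a race; skip */
                }
        }
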
 
@@ -516,39 +588,39 @@ static void
 vnlru_proc(void)
 {
        struct mount *mp, *nmp;
+       lwkt_tokref ilock;
        int s;
        int done;
-       struct thread *td = vnlruthread;
-       struct proc *p = td->td_proc;
+       struct thread *td = curthread;
 
        EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
            SHUTDOWN_PRI_FIRST);   
 
        s = splbio();
        for (;;) {
-               kproc_suspend_loop(td);
+               kproc_suspend_loop();
                if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
                        vnlruproc_sig = 0;
                        wakeup(&vnlruproc_sig);
-                       tsleep(p, PVFS, "vlruwt", hz);
+                       tsleep(td, 0, "vlruwt", hz);
                        continue;
                }
                done = 0;
-               simple_lock(&mountlist_slock);
+               lwkt_gettoken(&ilock, &mountlist_token);
                for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
-                       if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, td)) {
+                       if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) {
                                nmp = TAILQ_NEXT(mp, mnt_list);
                                continue;
                        }
                        done += vlrureclaim(mp);
-                       simple_lock(&mountlist_slock);
+                       lwkt_gettokref(&ilock);
                        nmp = TAILQ_NEXT(mp, mnt_list);
                        vfs_unbusy(mp, td);
                }
-               simple_unlock(&mountlist_slock);
+               lwkt_reltoken(&ilock);
                if (done == 0) {
                        vnlru_nowhere++;
-                       tsleep(p, PPAUSE, "vlrup", hz * 3);
+                       tsleep(td, 0, "vlrup", hz * 3);
                }
        }
        splx(s);
@@ -578,9 +650,11 @@ getnewvnode(tag, mp, vops, vpp)
 {
        int s;
        struct thread *td = curthread;  /* XXX */
-       struct proc *p = td->td_proc;
        struct vnode *vp = NULL;
+       struct vnode *xvp;
        vm_object_t object;
+       lwkt_tokref ilock;
+       lwkt_tokref vlock;
 
        s = splbio();
 
@@ -595,7 +669,7 @@ getnewvnode(tag, mp, vops, vpp)
                        vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
                        wakeup(vnlruthread);
                }
-               tsleep(&vnlruproc_sig, PVFS, "vlruwk", hz);
+               tsleep(&vnlruproc_sig, 0, "vlruwk", hz);
        }
 
 
@@ -604,71 +678,114 @@ getnewvnode(tag, mp, vops, vpp)
         * a new vnode if we can't find one or if we have not reached a
         * good minimum for good LRU performance.
         */
-       simple_lock(&vnode_free_list_slock);
+       lwkt_gettoken(&ilock, &vnode_free_list_token);
        if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
                int count;
 
                for (count = 0; count < freevnodes; count++) {
+                       /*
+                        * __VNODESCAN__
+                        *
+                        * Pull the next vnode off the free list and do some
+                        * sanity checks.  Note that regardless of how we
+                        * block, if freevnodes is non-zero there had better
+                        * be something on the list.
+                        */
                        vp = TAILQ_FIRST(&vnode_free_list);
-                       if (vp == NULL || vp->v_usecount)
+                       if (vp == NULL)
                                panic("getnewvnode: free vnode isn't");
 
+                       /*
+                        * Move the vnode to the end of the list so other
+                        * processes do not double-block trying to recycle
+                        * the same vnode (as an optimization), then get
+                        * the interlock.
+                        */
                        TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+                       TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+
+                       /*
+                        * Skip vnodes that are in the process of being
+                        * held or referenced.  Since the act of adding or
+                        * removing a vnode on the freelist requires a token
+                        * and may block, the ref count may be adjusted
+                        * prior to its addition or removal.
+                        */
+                       if (VSHOULDBUSY(vp)) {
+                               vp = NULL;
+                               continue;
+                       }
+
+
+                       /*
+                        * Obtain the vnode interlock and check that the
+                        * vnode is still on the free list.
+                        *
+                        * This normally devolves into a degenerate case so
+                        * it is optimal.   Loop up if it isn't.  Note that
+                        * the vnode could be in the middle of being moved
+                        * off the free list (the VSHOULDBUSY() check) and
+                        * must be skipped if so.
+                        */
+                       lwkt_gettoken(&vlock, vp->v_interlock);
+                       TAILQ_FOREACH_REVERSE(xvp, &vnode_free_list, 
+                           freelst, v_freelist) {
+                               if (vp == xvp)
+                                       break;
+                       }
+                       if (vp != xvp || VSHOULDBUSY(vp)) {
+                               vp = NULL;
+                               continue;
+                       }
+
+                       /*
+                        * We now safely own the vnode.  If the vnode has
+                        * an object do not recycle it if its VM object
+                        * has resident pages or references.
+                        */
                        if ((VOP_GETVOBJECT(vp, &object) == 0 &&
-                           (object->resident_page_count || object->ref_count)) ||
-                           !simple_lock_try(&vp->v_interlock)) {
-                               TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+                           (object->resident_page_count || object->ref_count))
+                       ) {
+                               lwkt_reltoken(&vlock);
                                vp = NULL;
                                continue;
                        }
-                       if (LIST_FIRST(&vp->v_cache_src)) {
-                               /*
-                                * note: nameileafonly sysctl is temporary,
-                                * for debugging only, and will eventually be
-                                * removed.
-                                */
-                               if (nameileafonly > 0) {
-                                       /*
-                                        * Do not reuse namei-cached directory
-                                        * vnodes that have cached
-                                        * subdirectories.
-                                        */
-                                       if (cache_leaf_test(vp) < 0) {
-                                               simple_unlock(&vp->v_interlock);
-                                               TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
-                                               vp = NULL;
-                                               continue;
-                                       }
-                               } else if (nameileafonly < 0 || 
-                                           vmiodirenable == 0) {
-                                       /*
-                                        * Do not reuse namei-cached directory
-                                        * vnodes if nameileafonly is -1 or
-                                        * if VMIO backing for directories is
-                                        * turned off (otherwise we reuse them
-                                        * too quickly).
-                                        */
-                                       simple_unlock(&vp->v_interlock);
-                                       TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
-                                       vp = NULL;
-                                       continue;
-                               }
+
+                       /*
+                        * We can almost reuse this vnode.  But we don't want
+                        * to recycle it if the vnode has children in the
+                        * namecache because that breaks the namecache's
+                        * path element chain.  (YYY use nc_refs for the
+                        * check?)
+                        */
+                       KKASSERT(vp->v_flag & VFREE);
+                       TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+
+                       if (TAILQ_FIRST(&vp->v_namecache) == NULL ||
+                           cache_leaf_test(vp) >= 0) {
+                               /* ok, we can reuse this vnode */
+                               break;
                        }
-                       break;
+                       lwkt_reltoken(&vlock);
+                       TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+                       vp = NULL;
                }
        }
 
+       /*
+        * If vp is non-NULL we hold its interlock.
+        */
        if (vp) {
                vp->v_flag |= VDOOMED;
                vp->v_flag &= ~VFREE;
                freevnodes--;
-               simple_unlock(&vnode_free_list_slock);
-               cache_purge(vp);
+               lwkt_reltoken(&ilock);
+               cache_purge(vp);        /* YYY may block */
                vp->v_lease = NULL;
                if (vp->v_type != VBAD) {
-                       vgonel(vp, td);
+                       vgonel(vp, &vlock, td);
                } else {
-                       simple_unlock(&vp->v_interlock);
+                       lwkt_reltoken(&vlock);
                }
 
 #ifdef INVARIANTS
@@ -691,14 +808,14 @@ getnewvnode(tag, mp, vops, vpp)
                vp->v_socket = 0;
                vp->v_writecount = 0;   /* XXX */
        } else {
-               simple_unlock(&vnode_free_list_slock);
-               vp = (struct vnode *) zalloc(vnode_zone);
-               bzero((char *) vp, sizeof *vp);
-               simple_lock_init(&vp->v_interlock);
+               lwkt_reltoken(&ilock);
+               vp = zalloc(vnode_zone);
+               bzero(vp, sizeof(*vp));
+               vp->v_interlock = lwkt_token_pool_get(vp);
+               lwkt_token_init(&vp->v_pollinfo.vpi_token);
                vp->v_dd = vp;
                cache_purge(vp);
-               LIST_INIT(&vp->v_cache_src);
-               TAILQ_INIT(&vp->v_cache_dst);
+               TAILQ_INIT(&vp->v_namecache);
                numvnodes++;
        }
 
@@ -713,7 +830,7 @@ getnewvnode(tag, mp, vops, vpp)
        vp->v_data = 0;
        splx(s);
 
-       vfs_object_create(vp, td, p->p_ucred);
+       vfs_object_create(vp, td);
        return (0);
 }
 
@@ -722,11 +839,12 @@ getnewvnode(tag, mp, vops, vpp)
  */
 static void
 insmntque(vp, mp)
-       register struct vnode *vp;
-       register struct mount *mp;
+       struct vnode *vp;
+       struct mount *mp;
 {
+       lwkt_tokref ilock;
 
-       simple_lock(&mntvnode_slock);
+       lwkt_gettoken(&ilock, &mntvnode_token);
        /*
         * Delete from old mount point vnode list, if on one.
         */
@@ -740,12 +858,12 @@ insmntque(vp, mp)
         * Insert into list of vnodes for the new mount point, if available.
         */
        if ((vp->v_mount = mp) == NULL) {
-               simple_unlock(&mntvnode_slock);
+               lwkt_reltoken(&ilock);
                return;
        }
        TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
        mp->mnt_nvnodelistsize++;
-       simple_unlock(&mntvnode_slock);
+       lwkt_reltoken(&ilock);
 }
 
 /*
@@ -753,9 +871,9 @@ insmntque(vp, mp)
  */
 void
 vwakeup(bp)
-       register struct buf *bp;
+       struct buf *bp;
 {
-       register struct vnode *vp;
+       struct vnode *vp;
 
        bp->b_flags &= ~B_WRITEINPROG;
        if ((vp = bp->b_vp)) {
@@ -774,20 +892,21 @@ vwakeup(bp)
  * Called with the underlying object locked.
  */
 int
-vinvalbuf(struct vnode *vp, int flags, struct ucred *cred,
-       struct thread *td, int slpflag, int slptimeo)
+vinvalbuf(struct vnode *vp, int flags, struct thread *td,
+       int slpflag, int slptimeo)
 {
-       register struct buf *bp;
+       struct buf *bp;
        struct buf *nbp, *blist;
        int s, error;
        vm_object_t object;
+       lwkt_tokref vlock;
 
        if (flags & V_SAVE) {
                s = splbio();
                while (vp->v_numoutput) {
                        vp->v_flag |= VBWAIT;
                        error = tsleep((caddr_t)&vp->v_numoutput,
-                           slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
+                           slpflag, "vinvlbuf", slptimeo);
                        if (error) {
                                splx(s);
                                return (error);
@@ -795,7 +914,7 @@ vinvalbuf(struct vnode *vp, int flags, struct ucred *cred,
                }
                if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
                        splx(s);
-                       if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
+                       if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0)
                                return (error);
                        s = splbio();
                        if (vp->v_numoutput > 0 ||
@@ -864,7 +983,7 @@ vinvalbuf(struct vnode *vp, int flags, struct ucred *cred,
        do {
                while (vp->v_numoutput > 0) {
                        vp->v_flag |= VBWAIT;
-                       tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
+                       tsleep(&vp->v_numoutput, 0, "vnvlbv", 0);
                }
                if (VOP_GETVOBJECT(vp, &object) == 0) {
                        while (object->paging_in_progress)
@@ -877,12 +996,12 @@ vinvalbuf(struct vnode *vp, int flags, struct ucred *cred,
        /*
         * Destroy the copy in the VM cache, too.
         */
-       simple_lock(&vp->v_interlock);
+       lwkt_gettoken(&vlock, vp->v_interlock);
        if (VOP_GETVOBJECT(vp, &object) == 0) {
                vm_object_page_remove(object, 0, 0,
                        (flags & V_SAVE) ? TRUE : FALSE);
        }
-       simple_unlock(&vp->v_interlock);
+       lwkt_reltoken(&vlock);
 
        if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
                panic("vinvalbuf: flush failed");
@@ -895,8 +1014,7 @@ vinvalbuf(struct vnode *vp, int flags, struct ucred *cred,
  * sync activity.
  */
 int
-vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
-       off_t length, int blksize)
+vtruncbuf(struct vnode *vp, struct thread *td, off_t length, int blksize)
 {
        struct buf *bp;
        struct buf *nbp;
@@ -983,7 +1101,7 @@ restartsync:
 
        while (vp->v_numoutput > 0) {
                vp->v_flag |= VBWAIT;
-               tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
+               tsleep(&vp->v_numoutput, 0, "vbtrunc", 0);
        }
 
        splx(s);
@@ -998,8 +1116,8 @@ restartsync:
  */
 void
 bgetvp(vp, bp)
-       register struct vnode *vp;
-       register struct buf *bp;
+       struct vnode *vp;
+       struct buf *bp;
 {
        int s;
 
@@ -1023,7 +1141,7 @@ bgetvp(vp, bp)
  */
 void
 brelvp(bp)
-       register struct buf *bp;
+       struct buf *bp;
 {
        struct vnode *vp;
        struct buflists *listheadp;
@@ -1103,7 +1221,7 @@ vn_syncer_add_to_worklist(struct vnode *vp, int delay)
 }
 
 struct  thread *updatethread;
-static void sched_sync __P((void));
+static void sched_sync (void);
 static struct kproc_desc up_kp = {
        "syncer",
        sched_sync,
@@ -1121,14 +1239,13 @@ sched_sync(void)
        struct vnode *vp;
        long starttime;
        int s;
-       struct thread *td = updatethread;
-       struct proc *p = td->td_proc;
+       struct thread *td = curthread;
 
        EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
            SHUTDOWN_PRI_LAST);   
 
        for (;;) {
-               kproc_suspend_loop(td);
+               kproc_suspend_loop();
 
                starttime = time_second;
 
@@ -1145,9 +1262,9 @@ sched_sync(void)
 
                while ((vp = LIST_FIRST(slp)) != NULL) {
                        if (VOP_ISLOCKED(vp, NULL) == 0) {
-                               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-                               (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, td);
-                               VOP_UNLOCK(vp, 0, td);
+                               vn_lock(vp, NULL, LK_EXCLUSIVE | LK_RETRY, td);
+                               (void) VOP_FSYNC(vp, MNT_LAZY, td);
+                               VOP_UNLOCK(vp, NULL, 0, td);
                        }
                        s = splbio();
                        if (LIST_FIRST(slp) == vp) {
@@ -1200,7 +1317,7 @@ sched_sync(void)
                 * filesystem activity.
                 */
                if (time_second == starttime)
-                       tsleep(&lbolt, PPAUSE, "syncer", 0);
+                       tsleep(&lbolt, 0, "syncer", 0);
        }
 }
 
@@ -1208,16 +1325,18 @@ sched_sync(void)
  * Request the syncer daemon to speed up its work.
  * We never push it to speed up more than half of its
  * normal turn time, otherwise it could take over the cpu.
+ *
+ * YYY wchan field protected by the BGL.
  */
 int
 speedup_syncer()
 {
-       int s;
-
-       s = splhigh();
-       if (updatethread->td_proc->p_wchan == &lbolt) /* YYY */
-               setrunnable(updatethread->td_proc);
-       splx(s);
+       crit_enter();
+       if (updatethread->td_wchan == &lbolt) { /* YYY */
+               unsleep(updatethread);
+               lwkt_schedule(updatethread);
+       }
+       crit_exit();
        if (rushjob < syncdelay / 2) {
                rushjob += 1;
                stat_rush_requests += 1;
@@ -1235,8 +1354,8 @@ speedup_syncer()
  */
 void
 pbgetvp(vp, bp)
-       register struct vnode *vp;
-       register struct buf *bp;
+       struct vnode *vp;
+       struct buf *bp;
 {
 
        KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
@@ -1251,7 +1370,7 @@ pbgetvp(vp, bp)
  */
 void
 pbrelvp(bp)
-       register struct buf *bp;
+       struct buf *bp;
 {
 
        KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
@@ -1289,8 +1408,8 @@ pbreassignbuf(bp, newvp)
  */
 void
 reassignbuf(bp, newvp)
-       register struct buf *bp;
-       register struct vnode *newvp;
+       struct buf *bp;
+       struct vnode *newvp;
 {
        struct buflists *listheadp;
        int delay;
@@ -1427,7 +1546,7 @@ bdevvp(dev, vpp)
        dev_t dev;
        struct vnode **vpp;
 {
-       register struct vnode *vp;
+       struct vnode *vp;
        struct vnode *nvp;
        int error;
 
@@ -1448,7 +1567,7 @@ bdevvp(dev, vpp)
 }
 
 /*
- * Add vnode to the alias list hung off the dev_t.
+ * Add vnode to the alias list hung off the dev_t.
  *
  * The reason for this gunk is that multiple vnodes can reference
  * the same physical device, so checking vp->v_usecount to see
@@ -1456,29 +1575,32 @@ bdevvp(dev, vpp)
  * the vnodes need to be accumulated.  vcount() does that.
  */
 void
-addaliasu(nvp, nvp_rdev)
-       struct vnode *nvp;
-       udev_t nvp_rdev;
+addaliasu(struct vnode *nvp, udev_t nvp_rdev)
 {
+       dev_t dev;
 
        if (nvp->v_type != VBLK && nvp->v_type != VCHR)
                panic("addaliasu on non-special vnode");
-       addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
+       dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
+       if (dev != NODEV) {
+               nvp->v_rdev = dev;
+               addalias(nvp, dev);
+       } else
+               nvp->v_rdev = NULL;
 }
 
 void
-addalias(nvp, dev)
-       struct vnode *nvp;
-       dev_t dev;
+addalias(struct vnode *nvp, dev_t dev)
 {
+       lwkt_tokref ilock;
 
        if (nvp->v_type != VBLK && nvp->v_type != VCHR)
                panic("addalias on non-special vnode");
 
        nvp->v_rdev = dev;
-       simple_lock(&spechash_slock);
+       lwkt_gettoken(&ilock, &spechash_token);
        SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
-       simple_unlock(&spechash_slock);
+       lwkt_reltoken(&ilock);
 }
 
 /*
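
Every walker of the per-device alias list now takes spechash_token in place of the old simplelock.  As an illustration (not part of this change), a vcount()-style helper that sums use counts over the aliases would look like this under the new scheme, assuming vp is a VBLK/VCHR vnode with a valid v_rdev:

static int
example_count_aliases(struct vnode *vp)
{
        lwkt_tokref ilock;
        struct vnode *vq;
        int count = 0;

        lwkt_gettoken(&ilock, &spechash_token);
        SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
                count += vq->v_usecount;
        lwkt_reltoken(&ilock);
        return (count);
}
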
@@ -1488,44 +1610,58 @@ addalias(nvp, dev)
  * when the transition is completed, and an error returned to
  * indicate that the vnode is no longer usable (possibly having
  * been changed to a new file system type).
+ *
+ * This code is very sensitive.  We are depending on the vnode interlock
+ * to be maintained through to the vn_lock() call, which means that we
+ * cannot block which means that we cannot call vbusy() until after vn_lock().
+ * If the interlock is not maintained, the VXLOCK check will not properly
+ * interlock against a vclean()'s LK_DRAIN operation on the lock.
  */
 int
-vget(vp, flags, td)
-       struct vnode *vp;
-       int flags;
-       struct thread *td;
+vget(struct vnode *vp, lwkt_tokref_t vlock, int flags, thread_t td)
 {
        int error;
+       lwkt_tokref vvlock;
+
+       /*
+        * We need the interlock to safely modify the v_ fields.  ZZZ it is
+        * only legal to pass (1) the vnode's interlock, or (2) NULL without
+        * LK_INTERLOCK if the vnode is *ALREADY* referenced or
+        * held.
+        */
+       if ((flags & LK_INTERLOCK) == 0) {
+               lwkt_gettoken(&vvlock, vp->v_interlock);
+               vlock = &vvlock;
+       }
 
        /*
         * If the vnode is in the process of being cleaned out for
         * another use, we wait for the cleaning to finish and then
         * return failure. Cleaning is determined by checking that
-        * the VXLOCK flag is set.
+        * the VXLOCK flag is set.  It is possible for the vnode to be
+        * self-referenced during the cleaning operation.
         */
-       if ((flags & LK_INTERLOCK) == 0) {
-               simple_lock(&vp->v_interlock);
-       }
        if (vp->v_flag & VXLOCK) {
-               if (vp->v_vxproc == curproc) {
+               if (vp->v_vxthread == curthread) {
 #if 0
                        /* this can now occur in normal operation */
                        log(LOG_INFO, "VXLOCK interlock avoided\n");
 #endif
                } else {
                        vp->v_flag |= VXWANT;
-                       simple_unlock(&vp->v_interlock);
-                       tsleep((caddr_t)vp, PINOD, "vget", 0);
+                       lwkt_reltoken(vlock);
+                       tsleep((caddr_t)vp, 0, "vget", 0);
                        return (ENOENT);
                }
        }
 
+       /*
+        * Bump v_usecount to prevent the vnode from being recycled.  The
+        * usecount needs to be bumped before we successfully get our lock.
+        */
        vp->v_usecount++;
-
-       if (VSHOULDBUSY(vp))
-               vbusy(vp);
        if (flags & LK_TYPE_MASK) {
-               if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
+               if ((error = vn_lock(vp, vlock, flags | LK_INTERLOCK, td)) != 0) {
                        /*
                         * must expand vrele here because we do not want
                         * to call VOP_INACTIVE if the reference count
@@ -1534,26 +1670,25 @@ vget(vp, flags, td)
                         * before sleeping so that multiple processes do
                         * not try to recycle it.
                         */
-                       simple_lock(&vp->v_interlock);
+                       lwkt_gettokref(vlock);
                        vp->v_usecount--;
-                       if (VSHOULDFREE(vp))
-                               vfree(vp);
-                       else
-                               vlruvp(vp);
-                       simple_unlock(&vp->v_interlock);
+                       vmaybefree(vp);
+                       lwkt_reltoken(vlock);
                }
                return (error);
        }
-       simple_unlock(&vp->v_interlock);
+       if (VSHOULDBUSY(vp))
+               vbusy(vp);      /* interlock must be held on call */
+       lwkt_reltoken(vlock);
        return (0);
 }
 
 void
 vref(struct vnode *vp)
 {
-       simple_lock(&vp->v_interlock);
+       crit_enter();   /* YYY use crit section for moment / BGL protected */
        vp->v_usecount++;
-       simple_unlock(&vp->v_interlock);
+       crit_exit();
 }
 
 /*
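
The new vget() takes an explicit lwkt_tokref_t.  Per the ZZZ note in the body there are two legal calling forms; a sketch of both follows (the wrapper name is illustrative, and form (2) assumes the caller already has a reference or hold on vp):

static int
example_vget_forms(struct vnode *vp, struct thread *td)
{
        lwkt_tokref vlock;
        int error;

        /*
         * Form (1): we hold the vnode interlock; pass it with LK_INTERLOCK.
         * vget() inherits the reference, which is gone by the time it
         * returns (released directly or handed to vn_lock()).
         */
        lwkt_gettoken(&vlock, vp->v_interlock);
        error = vget(vp, &vlock, LK_EXCLUSIVE | LK_INTERLOCK, td);
        if (error == 0)
                vput(vp);

        /*
         * Form (2): the vnode is already referenced or held, so pass NULL
         * without LK_INTERLOCK; vget() acquires its own interlock.
         */
        error = vget(vp, NULL, LK_EXCLUSIVE, td);
        if (error == 0)
                vput(vp);
        return (error);
}
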
@@ -1564,39 +1699,37 @@ void
 vrele(struct vnode *vp)
 {
        struct thread *td = curthread;  /* XXX */
+       lwkt_tokref vlock;
 
-       KASSERT(vp != NULL, ("vrele: null vp"));
+       KASSERT(vp != NULL && vp->v_usecount >= 0,
+           ("vrele: null vp or <=0 v_usecount"));
 
-       simple_lock(&vp->v_interlock);
+       lwkt_gettoken(&vlock, vp->v_interlock);
 
        if (vp->v_usecount > 1) {
-
                vp->v_usecount--;
-               simple_unlock(&vp->v_interlock);
-
+               lwkt_reltoken(&vlock);
                return;
        }
 
        if (vp->v_usecount == 1) {
                vp->v_usecount--;
                /*
-                * We must call VOP_INACTIVE with the node locked.
-                * If we are doing a vpu, the node is already locked,
-                * but, in the case of vrele, we must explicitly lock
-                * the vnode before calling VOP_INACTIVE
+                * We must call VOP_INACTIVE with the node locked and the
+                * usecount 0.  If we are doing a vpu, the node is already
+                * locked, but, in the case of vrele, we must explicitly lock
+                * the vnode before calling VOP_INACTIVE.
                 */
 
-               if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0)
+               if (vn_lock(vp, NULL, LK_EXCLUSIVE, td) == 0)
                        VOP_INACTIVE(vp, td);
-               if (VSHOULDFREE(vp))
-                       vfree(vp);
-               else
-                       vlruvp(vp);
+               vmaybefree(vp);
+               lwkt_reltoken(&vlock);
        } else {
 #ifdef DIAGNOSTIC
                vprint("vrele: negative ref count", vp);
-               simple_unlock(&vp->v_interlock);
 #endif
+               lwkt_reltoken(&vlock);
                panic("vrele: negative ref cnt");
        }
 }
@@ -1605,14 +1738,15 @@ void
 vput(struct vnode *vp)
 {
        struct thread *td = curthread;  /* XXX */
+       lwkt_tokref vlock;
 
        KASSERT(vp != NULL, ("vput: null vp"));
 
-       simple_lock(&vp->v_interlock);
+       lwkt_gettoken(&vlock, vp->v_interlock);
 
        if (vp->v_usecount > 1) {
                vp->v_usecount--;
-               VOP_UNLOCK(vp, LK_INTERLOCK, td);
+               VOP_UNLOCK(vp, &vlock, LK_INTERLOCK, td);
                return;
        }
 
@@ -1623,33 +1757,32 @@ vput(struct vnode *vp)
                 * If we are doing a vpu, the node is already locked,
                 * so we just need to release the vnode mutex.
                 */
-               simple_unlock(&vp->v_interlock);
                VOP_INACTIVE(vp, td);
-               if (VSHOULDFREE(vp))
-                       vfree(vp);
-               else
-                       vlruvp(vp);
+               vmaybefree(vp);
+               lwkt_reltoken(&vlock);
        } else {
 #ifdef DIAGNOSTIC
                vprint("vput: negative ref count", vp);
 #endif
+               lwkt_reltoken(&vlock);
                panic("vput: negative ref cnt");
        }
 }
 
 /*
- * Somebody doesn't want the vnode recycled.
+ * Somebody doesn't want the vnode recycled. ZZZ vnode interlock should
+ * be held but isn't.
  */
 void
 vhold(vp)
-       register struct vnode *vp;
+       struct vnode *vp;
 {
        int s;
 
        s = splbio();
        vp->v_holdcnt++;
        if (VSHOULDBUSY(vp))
-               vbusy(vp);
+               vbusy(vp);      /* interlock must be held on call */
        splx(s);
 }
 
@@ -1658,17 +1791,88 @@ vhold(vp)
  */
 void
 vdrop(vp)
-       register struct vnode *vp;
+       struct vnode *vp;
 {
-       int s;
+       lwkt_tokref vlock;
 
-       s = splbio();
+       lwkt_gettoken(&vlock, vp->v_interlock);
        if (vp->v_holdcnt <= 0)
                panic("vdrop: holdcnt");
        vp->v_holdcnt--;
-       if (VSHOULDFREE(vp))
-               vfree(vp);
-       splx(s);
+       vmaybefree(vp);
+       lwkt_reltoken(&vlock);
+}
+
+int
+vmntvnodescan(
+    struct mount *mp, 
+    int (*fastfunc)(struct mount *mp, struct vnode *vp, void *data),
+    int (*slowfunc)(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data),
+    void *data
+) {
+       lwkt_tokref ilock;
+       lwkt_tokref vlock;
+       struct vnode *pvp;
+       struct vnode *vp;
+       int r = 0;
+
+       /*
+        * Scan the vnodes on the mount's vnode list.  Use a placemarker
+        * to hold our place in the list in case we block.
+        */
+       pvp = zalloc(vnode_zone);
+       pvp->v_flag |= VPLACEMARKER;
+
+       lwkt_gettoken(&ilock, &mntvnode_token);
+       TAILQ_INSERT_HEAD(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);
+
+       while ((vp = TAILQ_NEXT(pvp, v_nmntvnodes)) != NULL) {
+               /*
+                * Move the placemarker and skip any other placemarkers we
+                * encounter.  Nothing can get in our way, so the
+                * mount point on the vp must be valid.
+                */
+               TAILQ_REMOVE(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);
+               TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, pvp, v_nmntvnodes);
+               if (vp->v_flag & VPLACEMARKER)
+                       continue;
+               KKASSERT(vp->v_mount == mp);
+
+               /*
+                * Quick test
+                */
+               if (fastfunc) {
+                       if ((r = fastfunc(mp, vp, data)) < 0)
+                               continue;
+                       if (r)
+                               break;
+               }
+
+               /*
+                * Get the vnode's interlock and make sure it is still on the
+                * mount list.  Skip it if it has moved (we may encounter it
+                * later).  Then do the with-interlock test.  The callback
+                * is responsible for releasing the vnode interlock.
+                *
+                * The interlock is type-stable.
+                */
+               if (slowfunc) {
+                       lwkt_gettoken(&vlock, vp->v_interlock);
+                       if (vp != TAILQ_PREV(pvp, vnodelst, v_nmntvnodes)) {
+                               printf("vmntvnodescan (debug info only): f=%p vp=%p vnode ripped out from under us\n", slowfunc, vp);
+                               lwkt_reltoken(&vlock);
+                               continue;
+                       }
+                       if ((r = slowfunc(mp, vp, &vlock, data)) != 0) {
+                               KKASSERT(lwkt_havetokref(&vlock) == 0);
+                               break;
+                       }
+                       KKASSERT(lwkt_havetokref(&vlock) == 0);
+               }
+       }
+       TAILQ_REMOVE(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);
+       zfree(vnode_zone, pvp);
+       lwkt_reltoken(&ilock);
+       return(r);
 }
 
 /*
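
vmntvnodescan() is the new generic per-mount vnode scanner (vflush() below is converted to use it).  The contract, as implemented above: fastfunc runs without the vnode interlock and returns a negative value to skip the vnode or a positive value to abort the scan; slowfunc runs with the interlock held, must release it itself, and aborts the scan by returning nonzero.  A minimal sketch of a caller that counts vnodes with dirty buffers; all example_* names are illustrative and not part of the commit:

struct example_count_info {
        int ndirty;
};

static int
example_fastfunc(struct mount *mp, struct vnode *vp, void *data)
{
        /* Cheap filter: skip clean vnodes without taking the interlock. */
        if (TAILQ_EMPTY(&vp->v_dirtyblkhd))
                return (-1);
        return (0);
}

static int
example_slowfunc(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock,
    void *data)
{
        struct example_count_info *info = data;

        /* Re-check now that the interlock is held, then count the vnode. */
        if (!TAILQ_EMPTY(&vp->v_dirtyblkhd))
                ++info->ndirty;
        lwkt_reltoken(vlock);   /* the callback must release the interlock */
        return (0);             /* zero means keep scanning */
}

static int
example_count_dirty(struct mount *mp)
{
        struct example_count_info info = { 0 };

        vmntvnodescan(mp, example_fastfunc, example_slowfunc, &info);
        return (info.ndirty);
}
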
@@ -1696,6 +1900,14 @@ static int busyprt = 0;          /* print out busy vnodes */
 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
 #endif
 
+static int vflush_scan(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data);
+
+struct vflush_info {
+       int flags;
+       int busy;
+       thread_t td;
+};
+
 int
 vflush(mp, rootrefs, flags)
        struct mount *mp;
@@ -1703,10 +1915,10 @@ vflush(mp, rootrefs, flags)
        int flags;
 {
        struct thread *td = curthread;  /* XXX */
-       struct proc *p = td->td_proc;
-       struct vnode *vp, *nvp, *rootvp = NULL;
-       struct vattr vattr;
-       int busy = 0, error;
+       struct vnode *rootvp = NULL;
+       int error;
+       lwkt_tokref vlock;
+       struct vflush_info vflush_info;
 
        if (rootrefs > 0) {
                KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
@@ -1719,90 +1931,28 @@ vflush(mp, rootrefs, flags)
                        return (error);
                vput(rootvp);
        }
-       simple_lock(&mntvnode_slock);
-loop:
-       for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
-               /*
-                * Make sure this vnode wasn't reclaimed in getnewvnode().
-                * Start over if it has (it won't be on the list anymore).
-                */
-               if (vp->v_mount != mp)
-                       goto loop;
-               nvp = TAILQ_NEXT(vp, v_nmntvnodes);
-
-               simple_lock(&vp->v_interlock);
-               /*
-                * Skip over a vnodes marked VSYSTEM.
-                */
-               if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
-                       simple_unlock(&vp->v_interlock);
-                       continue;
-               }
-               /*
-                * If WRITECLOSE is set, flush out unlinked but still open
-                * files (even if open only for reading) and regular file
-                * vnodes open for writing. 
-                */
-               if ((flags & WRITECLOSE) &&
-                   (vp->v_type == VNON ||
-                   (VOP_GETATTR(vp, &vattr, p->p_ucred, td) == 0 &&
-                   vattr.va_nlink > 0)) &&
-                   (vp->v_writecount == 0 || vp->v_type != VREG)) {
-                       simple_unlock(&vp->v_interlock);
-                       continue;
-               }
 
-               /*
-                * With v_usecount == 0, all we need to do is clear out the
-                * vnode data structures and we are done.
-                */
-               if (vp->v_usecount == 0) {
-                       simple_unlock(&mntvnode_slock);
-                       vgonel(vp, td);
-                       simple_lock(&mntvnode_slock);
-                       continue;
-               }
+       vflush_info.busy = 0;
+       vflush_info.flags = flags;
+       vflush_info.td = td;
+       vmntvnodescan(mp, NULL, vflush_scan, &vflush_info);
 
-               /*
-                * If FORCECLOSE is set, forcibly close the vnode. For block
-                * or character devices, revert to an anonymous device. For
-                * all other files, just kill them.
-                */
-               if (flags & FORCECLOSE) {
-                       simple_unlock(&mntvnode_slock);
-                       if (vp->v_type != VBLK && vp->v_type != VCHR) {
-                               vgonel(vp, td);
-                       } else {
-                               vclean(vp, 0, td);
-                               vp->v_op = spec_vnodeop_p;
-                               insmntque(vp, (struct mount *) 0);
-                       }
-                       simple_lock(&mntvnode_slock);
-                       continue;
-               }
-#ifdef DIAGNOSTIC
-               if (busyprt)
-                       vprint("vflush: busy vnode", vp);
-#endif
-               simple_unlock(&vp->v_interlock);
-               busy++;
-       }
-       simple_unlock(&mntvnode_slock);
        if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
                /*
                 * If just the root vnode is busy, and if its refcount
                 * is equal to `rootrefs', then go ahead and kill it.
                 */
-               simple_lock(&rootvp->v_interlock);
-               KASSERT(busy > 0, ("vflush: not busy"));
+               lwkt_gettoken(&vlock, rootvp->v_interlock);
+               KASSERT(vflush_info.busy > 0, ("vflush: not busy"));
                KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
-               if (busy == 1 && rootvp->v_usecount == rootrefs) {
-                       vgonel(rootvp, td);
-                       busy = 0;
-               } else
-                       simple_unlock(&rootvp->v_interlock);
+               if (vflush_info.busy == 1 && rootvp->v_usecount == rootrefs) {
+                       vgonel(rootvp, &vlock, td);
+                       vflush_info.busy = 0;
+               } else {
+                       lwkt_reltoken(&vlock);
+               }
        }
-       if (busy)
+       if (vflush_info.busy)
                return (EBUSY);
        for (; rootrefs > 0; rootrefs--)
                vrele(rootvp);
@@ -1810,32 +1960,74 @@ loop:
 }
 
 /*
- * We do not want to recycle the vnode too quickly.
- *
- * XXX we can't move vp's around the nvnodelist without really screwing
- * up the efficiency of filesystem SYNC and friends.  This code is 
- * disabled until we fix the syncing code's scanning algorithm.
+ * The scan callback is made with an interlocked vnode.
  */
-static void
-vlruvp(struct vnode *vp)
+static int
+vflush_scan(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data)
 {
-#if 0
-       struct mount *mp;
+       struct vflush_info *info = data;
+       struct vattr vattr;
 
-       if ((mp = vp->v_mount) != NULL) {
-               simple_lock(&mntvnode_slock);
-               TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
-               TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
-               simple_unlock(&mntvnode_slock);
+       /*
+        * Skip over vnodes marked VSYSTEM.
+        */
+       if ((info->flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
+               lwkt_reltoken(vlock);
+               return(0);
        }
+
+       /*
+        * If WRITECLOSE is set, flush out unlinked but still open
+        * files (even if open only for reading) and regular file
+        * vnodes open for writing. 
+        */
+       if ((info->flags & WRITECLOSE) &&
+           (vp->v_type == VNON ||
+           (VOP_GETATTR(vp, &vattr, info->td) == 0 &&
+           vattr.va_nlink > 0)) &&
+           (vp->v_writecount == 0 || vp->v_type != VREG)) {
+               lwkt_reltoken(vlock);
+               return(0);
+       }
+
+       /*
+        * With v_usecount == 0, all we need to do is clear out the
+        * vnode data structures and we are done.
+        */
+       if (vp->v_usecount == 0) {
+               vgonel(vp, vlock, info->td);
+               return(0);
+       }
+
+       /*
+        * If FORCECLOSE is set, forcibly close the vnode. For block
+        * or character devices, revert to an anonymous device. For
+        * all other files, just kill them.
+        */
+       if (info->flags & FORCECLOSE) {
+               if (vp->v_type != VBLK && vp->v_type != VCHR) {
+                       vgonel(vp, vlock, info->td);
+               } else {
+                       vclean(vp, vlock, 0, info->td);
+                       vp->v_op = spec_vnodeop_p;
+                       insmntque(vp, (struct mount *) 0);
+               }
+               return(0);
+       }
+#ifdef DIAGNOSTIC
+       if (busyprt)
+               vprint("vflush: busy vnode", vp);
 #endif
+       lwkt_reltoken(vlock);
+       ++info->busy;
+       return(0);
 }
 
 /*
  * Disassociate the underlying file system from a vnode.
  */
 static void
-vclean(struct vnode *vp, int flags, struct thread *td)
+vclean(struct vnode *vp, lwkt_tokref_t vlock, int flags, struct thread *td)
 {
        int active;
 
@@ -1854,21 +2046,24 @@ vclean(struct vnode *vp, int flags, struct thread *td)
        if (vp->v_flag & VXLOCK)
                panic("vclean: deadlock");
        vp->v_flag |= VXLOCK;
-       vp->v_vxproc = curproc;
+       vp->v_vxthread = curthread;
+
        /*
         * Even if the count is zero, the VOP_INACTIVE routine may still
         * have the object locked while it cleans it out. The VOP_LOCK
         * ensures that the VOP_INACTIVE routine is done with its work.
         * For active vnodes, it ensures that no other activity can
         * occur while the underlying object is being cleaned out.
+        *
+        * NOTE: we continue to hold the vnode interlock through to the
+        * end of vclean().
         */
-       VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
+       VOP_LOCK(vp, NULL, LK_DRAIN, td);
 
        /*
         * Clean out any buffers associated with the vnode.
         */
-       vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0);
-
+       vinvalbuf(vp, V_SAVE, td, 0, 0);
        VOP_DESTROYVOBJECT(vp);
 
        /*
@@ -1878,14 +2073,14 @@ vclean(struct vnode *vp, int flags, struct thread *td)
         */
        if (active) {
                if (flags & DOCLOSE)
-                       VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
+                       VOP_CLOSE(vp, FNONBLOCK, td);
                VOP_INACTIVE(vp, td);
        } else {
                /*
                 * Any other processes trying to obtain this lock must first
                 * wait for VXLOCK to clear, then call the new lock operation.
                 */
-               VOP_UNLOCK(vp, 0, td);
+               VOP_UNLOCK(vp, NULL, 0, td);
        }
        /*
         * Reclaim the vnode.
@@ -1898,7 +2093,6 @@ vclean(struct vnode *vp, int flags, struct thread *td)
                 * Inline copy of vrele() since VOP_INACTIVE
                 * has already been called.
                 */
-               simple_lock(&vp->v_interlock);
                if (--vp->v_usecount <= 0) {
 #ifdef DIAGNOSTIC
                        if (vp->v_usecount < 0 || vp->v_writecount != 0) {
@@ -1908,14 +2102,11 @@ vclean(struct vnode *vp, int flags, struct thread *td)
 #endif
                        vfree(vp);
                }
-               simple_unlock(&vp->v_interlock);
        }
 
        cache_purge(vp);
        vp->v_vnlock = NULL;
-
-       if (VSHOULDFREE(vp))
-               vfree(vp);
+       vmaybefree(vp);
        
        /*
         * Done with purge, notify sleepers of the grim news.
@@ -1924,11 +2115,12 @@ vclean(struct vnode *vp, int flags, struct thread *td)
        vn_pollgone(vp);
        vp->v_tag = VT_NON;
        vp->v_flag &= ~VXLOCK;
-       vp->v_vxproc = NULL;
+       vp->v_vxthread = NULL;
        if (vp->v_flag & VXWANT) {
                vp->v_flag &= ~VXWANT;
                wakeup((caddr_t) vp);
        }
+       lwkt_reltoken(vlock);
 }
 
 /*
@@ -1943,6 +2135,7 @@ vop_revoke(ap)
        } */ *ap;
 {
        struct vnode *vp, *vq;
+       lwkt_tokref ilock;
        dev_t dev;
 
        KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
@@ -1954,15 +2147,15 @@ vop_revoke(ap)
         */
        if (vp->v_flag & VXLOCK) {
                vp->v_flag |= VXWANT;
-               simple_unlock(&vp->v_interlock);
-               tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
+               /*lwkt_reltoken(vlock); ZZZ */
+               tsleep((caddr_t)vp, 0, "vop_revokeall", 0);
                return (0);
        }
        dev = vp->v_rdev;
        for (;;) {
-               simple_lock(&spechash_slock);
+               lwkt_gettoken(&ilock, &spechash_token);
                vq = SLIST_FIRST(&dev->si_hlist);
-               simple_unlock(&spechash_slock);
+               lwkt_reltoken(&ilock);
                if (!vq)
                        break;
                vgone(vq);
@@ -1975,17 +2168,18 @@ vop_revoke(ap)
  * Release the passed interlock if the vnode will be recycled.
  */
 int
-vrecycle(struct vnode *vp, struct simplelock *inter_lkp, struct thread *td)
+vrecycle(struct vnode *vp, lwkt_tokref_t inter_lkp, struct thread *td)
 {
-       simple_lock(&vp->v_interlock);
+       lwkt_tokref vlock;
+
+       lwkt_gettoken(&vlock, vp->v_interlock);
        if (vp->v_usecount == 0) {
-               if (inter_lkp) {
-                       simple_unlock(inter_lkp);
-               }
-               vgonel(vp, td);
+               if (inter_lkp)
+                       lwkt_reltoken(inter_lkp);
+               vgonel(vp, &vlock, td);
                return (1);
        }
-       simple_unlock(&vp->v_interlock);
+       lwkt_reltoken(&vlock);
        return (0);
 }
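
For illustration only (not part of this commit), a typical caller of the retooled
vrecycle() is a filesystem's inactive routine recycling a dead vnode; the
example_* names below are hypothetical, and NULL is passed because the caller
holds no extra token that needs releasing:

/*
 * Hypothetical sketch: recycle a vnode whose backing object is gone.
 * vrecycle() acquires the vnode interlock itself; a caller holding
 * another token it wants dropped on recycle passes it as inter_lkp.
 */
static int
example_inactive(struct vnode *vp, struct thread *td)
{
	VOP_UNLOCK(vp, NULL, 0, td);		/* new four-argument form */
	if (example_node_is_dead(vp))		/* illustrative predicate */
		vrecycle(vp, NULL, td);
	return (0);
}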
 
@@ -1997,17 +2191,19 @@ void
 vgone(struct vnode *vp)
 {
        struct thread *td = curthread;  /* XXX */
+       lwkt_tokref vlock;
 
-       simple_lock(&vp->v_interlock);
-       vgonel(vp, td);
+       lwkt_gettoken(&vlock, vp->v_interlock);
+       vgonel(vp, &vlock, td);
 }
 
 /*
  * vgone, with the vp interlock held.
  */
 void
-vgonel(struct vnode *vp, struct thread *td)
+vgonel(struct vnode *vp, lwkt_tokref_t vlock, struct thread *td)
 {
+       lwkt_tokref ilock;
        int s;
 
        /*
@@ -2016,16 +2212,16 @@ vgonel(struct vnode *vp, struct thread *td)
         */
        if (vp->v_flag & VXLOCK) {
                vp->v_flag |= VXWANT;
-               simple_unlock(&vp->v_interlock);
-               tsleep((caddr_t)vp, PINOD, "vgone", 0);
+               lwkt_reltoken(vlock);
+               tsleep((caddr_t)vp, 0, "vgone", 0);
                return;
        }
 
        /*
         * Clean out the filesystem specific data.
         */
-       vclean(vp, DOCLOSE, td);
-       simple_lock(&vp->v_interlock);
+       vclean(vp, vlock, DOCLOSE, td);
+       lwkt_gettokref(vlock);
 
        /*
         * Delete from old mount point vnode list, if on one.
@@ -2037,10 +2233,10 @@ vgonel(struct vnode *vp, struct thread *td)
         * if it is on one.
         */
        if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
-               simple_lock(&spechash_slock);
+               lwkt_gettoken(&ilock, &spechash_token);
                SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
                freedev(vp->v_rdev);
-               simple_unlock(&spechash_slock);
+               lwkt_reltoken(&ilock);
                vp->v_rdev = NULL;
        }
 
@@ -2056,19 +2252,18 @@ vgonel(struct vnode *vp, struct thread *td)
         */
        if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
                s = splbio();
-               simple_lock(&vnode_free_list_slock);
+               lwkt_gettoken(&ilock, &vnode_free_list_token);
                if (vp->v_flag & VFREE)
                        TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
                else
                        freevnodes++;
                vp->v_flag |= VFREE;
                TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
-               simple_unlock(&vnode_free_list_slock);
+               lwkt_reltoken(&ilock);
                splx(s);
        }
-
        vp->v_type = VBAD;
-       simple_unlock(&vp->v_interlock);
+       lwkt_reltoken(vlock);
 }
 
 /*
@@ -2080,17 +2275,18 @@ vfinddev(dev, type, vpp)
        enum vtype type;
        struct vnode **vpp;
 {
+       lwkt_tokref ilock;
        struct vnode *vp;
 
-       simple_lock(&spechash_slock);
+       lwkt_gettoken(&ilock, &spechash_token);
        SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
                if (type == vp->v_type) {
                        *vpp = vp;
-                       simple_unlock(&spechash_slock);
+                       lwkt_reltoken(&ilock);
                        return (1);
                }
        }
-       simple_unlock(&spechash_slock);
+       lwkt_reltoken(&ilock);
        return (0);
 }
 
@@ -2101,14 +2297,15 @@ int
 vcount(vp)
        struct vnode *vp;
 {
+       lwkt_tokref ilock;
        struct vnode *vq;
        int count;
 
        count = 0;
-       simple_lock(&spechash_slock);
+       lwkt_gettoken(&ilock, &spechash_token);
        SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
                count += vq->v_usecount;
-       simple_unlock(&spechash_slock);
+       lwkt_reltoken(&ilock);
        return (count);
 }
 
@@ -2186,13 +2383,14 @@ vprint(label, vp)
 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
 {
        struct thread *td = curthread;  /* XXX */
+       lwkt_tokref ilock;
        struct mount *mp, *nmp;
        struct vnode *vp;
 
        printf("Locked vnodes\n");
-       simple_lock(&mountlist_slock);
+       lwkt_gettoken(&ilock, &mountlist_token);
        for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
-               if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, td)) {
+               if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) {
                        nmp = TAILQ_NEXT(mp, mnt_list);
                        continue;
                }
@@ -2200,18 +2398,18 @@ DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
                        if (VOP_ISLOCKED(vp, NULL))
                                vprint((char *)0, vp);
                }
-               simple_lock(&mountlist_slock);
+               lwkt_gettokref(&ilock);
                nmp = TAILQ_NEXT(mp, mnt_list);
                vfs_unbusy(mp, td);
        }
-       simple_unlock(&mountlist_slock);
+       lwkt_reltoken(&ilock);
 }
 #endif
 
 /*
  * Top level filesystem related information gathering.
  */
-static int     sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));
+static int     sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);
 
 static int
 vfs_sysctl(SYSCTL_HANDLER_ARGS)
@@ -2298,6 +2496,8 @@ sysctl_vnode(SYSCTL_HANDLER_ARGS)
        struct proc *p = curproc;       /* XXX */
        struct mount *mp, *nmp;
        struct vnode *nvp, *vp;
+       lwkt_tokref ilock;
+       lwkt_tokref jlock;
        int error;
 
 #define VPTRSZ sizeof (struct vnode *)
@@ -2308,14 +2508,14 @@ sysctl_vnode(SYSCTL_HANDLER_ARGS)
                return (SYSCTL_OUT(req, 0,
                        (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
 
-       simple_lock(&mountlist_slock);
+       lwkt_gettoken(&ilock, &mountlist_token);
        for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
-               if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+               if (vfs_busy(mp, LK_NOWAIT, &ilock, p)) {
                        nmp = TAILQ_NEXT(mp, mnt_list);
                        continue;
                }
+               lwkt_gettoken(&jlock, &mntvnode_token);
 again:
-               simple_lock(&mntvnode_slock);
                for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
                     vp != NULL;
                     vp = nvp) {
@@ -2324,23 +2524,21 @@ again:
                         * this filesystem.  RACE: could have been
                         * recycled onto the same filesystem.
                         */
-                       if (vp->v_mount != mp) {
-                               simple_unlock(&mntvnode_slock);
+                       if (vp->v_mount != mp)
                                goto again;
-                       }
                        nvp = TAILQ_NEXT(vp, v_nmntvnodes);
-                       simple_unlock(&mntvnode_slock);
                        if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
-                           (error = SYSCTL_OUT(req, vp, VNODESZ)))
+                           (error = SYSCTL_OUT(req, vp, VNODESZ))) {
+                               lwkt_reltoken(&jlock);
                                return (error);
-                       simple_lock(&mntvnode_slock);
+                       }
                }
-               simple_unlock(&mntvnode_slock);
-               simple_lock(&mountlist_slock);
-               nmp = TAILQ_NEXT(mp, mnt_list);
+               lwkt_reltoken(&jlock);
+               lwkt_gettokref(&ilock);
+               nmp = TAILQ_NEXT(mp, mnt_list); /* ZZZ */
                vfs_unbusy(mp, p);
        }
-       simple_unlock(&mountlist_slock);
+       lwkt_reltoken(&ilock);
 
        return (0);
 }
@@ -2413,9 +2611,9 @@ vfs_hang_addrlist(mp, nep, argp)
        struct netexport *nep;
        struct export_args *argp;
 {
-       register struct netcred *np;
-       register struct radix_node_head *rnh;
-       register int i;
+       struct netcred *np;
+       struct radix_node_head *rnh;
+       int i;
        struct radix_node *rn;
        struct sockaddr *saddr, *smask = 0;
        struct domain *dom;
@@ -2432,7 +2630,9 @@ vfs_hang_addrlist(mp, nep, argp)
                return (0);
        }
 
-       if (argp->ex_addrlen > MLEN)
+       if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN)
+               return (EINVAL);
+       if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN)
                return (EINVAL);
 
        i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
@@ -2489,7 +2689,7 @@ vfs_free_netcred(rn, w)
        struct radix_node *rn;
        void *w;
 {
-       register struct radix_node_head *rnh = (struct radix_node_head *) w;
+       struct radix_node_head *rnh = (struct radix_node_head *) w;
 
        (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
        free((caddr_t) rn, M_NETADDR);
@@ -2503,8 +2703,8 @@ static void
 vfs_free_addrlist(nep)
        struct netexport *nep;
 {
-       register int i;
-       register struct radix_node_head *rnh;
+       int i;
+       struct radix_node_head *rnh;
 
        for (i = 0; i <= AF_MAX; i++)
                if ((rnh = nep->ne_rtable[i])) {
@@ -2627,12 +2827,12 @@ vfs_setpublicfs(mp, nep, argp)
 
 struct netcred *
 vfs_export_lookup(mp, nep, nam)
-       register struct mount *mp;
+       struct mount *mp;
        struct netexport *nep;
        struct sockaddr *nam;
 {
-       register struct netcred *np;
-       register struct radix_node_head *rnh;
+       struct netcred *np;
+       struct radix_node_head *rnh;
        struct sockaddr *saddr;
 
        np = NULL;
@@ -2661,55 +2861,72 @@ vfs_export_lookup(mp, nep, nam)
 }
 
 /*
- * perform msync on all vnodes under a mount point
- * the mount point must be locked.
+ * perform msync on all vnodes under a mount point.  The mount point must
+ * be locked.  This code is also responsible for lazy-freeing unreferenced
+ * vnodes whose VM objects no longer contain pages.
+ *
+ * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.
  */
+static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data);
+static int vfs_msync_scan2(struct mount *mp, struct vnode *vp, 
+                               lwkt_tokref_t vlock, void *data);
+
 void
 vfs_msync(struct mount *mp, int flags) 
 {
-       struct thread *td = curthread;  /* XXX */
-       struct vnode *vp, *nvp;
-       struct vm_object *obj;
-       int tries;
-
-       tries = 5;
-       simple_lock(&mntvnode_slock);
-loop:
-       for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
-               if (vp->v_mount != mp) {
-                       if (--tries > 0)
-                               goto loop;
-                       break;
-               }
-               nvp = TAILQ_NEXT(vp, v_nmntvnodes);
+       vmntvnodescan(mp, vfs_msync_scan1, vfs_msync_scan2, (void *)flags);
+}
 
-               if (vp->v_flag & VXLOCK)        /* XXX: what if MNT_WAIT? */
-                       continue;
+/*
+ * scan1 is a fast pre-check.  There could be hundreds of thousands of
+ * vnodes, we cannot afford to do anything heavy weight until we have a
+ * fairly good indication that there is work to do.
+ */
+static
+int
+vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data)
+{
+       int flags = (int)data;
 
-               /*
-                * There could be hundreds of thousands of vnodes, we cannot
-                * afford to do anything heavy-weight until we have a fairly
-                * good indication that there is something to do.
-                */
-               if ((vp->v_flag & VOBJDIRTY) &&
+       if ((vp->v_flag & VXLOCK) == 0) {
+               if (VSHOULDFREE(vp))
+                       return(0);
+               if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
+                   (vp->v_flag & VOBJDIRTY) &&
                    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
-                       simple_unlock(&mntvnode_slock);
-                       if (!vget(vp,
-                           LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, td)) {
-                               if (VOP_GETVOBJECT(vp, &obj) == 0) {
-                                       vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
-                               }
-                               vput(vp);
-                       }
-                       simple_lock(&mntvnode_slock);
-                       if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
-                               if (--tries > 0)
-                                       goto loop;
-                               break;
+                       return(0);
+               }
+       }
+       return(-1);
+}
+
+static
+int
+vfs_msync_scan2(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data)
+{
+       vm_object_t obj;
+       int error;
+       int flags = (int)data;
+
+       if (vp->v_flag & VXLOCK)
+               return(0);
+
+       if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
+           (vp->v_flag & VOBJDIRTY) &&
+           (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
+               error = vget(vp, vlock, LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ | LK_INTERLOCK, curthread);
+               if (error == 0) {
+                       if (VOP_GETVOBJECT(vp, &obj) == 0) {
+                               vm_object_page_clean(obj, 0, 0, 
+                                flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
                        }
+                       vput(vp);
                }
+               return(0);
        }
-       simple_unlock(&mntvnode_slock);
+       vmaybefree(vp);
+       lwkt_reltoken(vlock);
+       return(0);
 }
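
To make the scan1/scan2 split above easier to follow, here is a minimal sketch
(not part of this commit) of a hypothetical vmntvnodescan() callback pair that
counts dirty vnodes, assuming the contract the code above implies: scan1 runs
without the vnode interlock and returns 0 to accept or -1 to skip, while scan2
is entered with the interlock held and must dispose of it before returning.

/*
 * Illustrative only: count vnodes with dirty VM objects on a mount.
 */
static int
example_scan1(struct mount *mp, struct vnode *vp, void *data)
{
	/* cheap, lock-free filter: 0 = look closer, -1 = skip */
	return ((vp->v_flag & VOBJDIRTY) ? 0 : -1);
}

static int
example_scan2(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data)
{
	int *countp = data;

	if (vp->v_flag & VOBJDIRTY)
		++*countp;
	lwkt_reltoken(vlock);		/* scan2 owns the interlock reference */
	return(0);
}

/* usage:  int n = 0;  vmntvnodescan(mp, example_scan1, example_scan2, &n); */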
 
 /*
@@ -2721,48 +2938,61 @@ loop:
  * vp must be locked when vfs_object_create is called.
  */
 int
-vfs_object_create(struct vnode *vp, struct thread *td, struct ucred *cred)
+vfs_object_create(struct vnode *vp, struct thread *td)
 {
-       return (VOP_CREATEVOBJECT(vp, cred, td));
+       return (VOP_CREATEVOBJECT(vp, td));
 }
 
-void
-vfree(vp)
-       struct vnode *vp;
+/*
+ * NOTE: the vnode interlock must be held during the call.  We have to recheck
+ * the VFREE flag since the vnode may have been removed from the free list
+ * while we were blocked on vnode_free_list_token.  The use or hold count
+ * must have already been bumped by the caller.
+ */
+static void
+vbusy(struct vnode *vp)
 {
-       int s;
+       lwkt_tokref ilock;
 
-       s = splbio();
-       simple_lock(&vnode_free_list_slock);
-       KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
-       if (vp->v_flag & VAGE) {
-               TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
-       } else {
-               TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+       lwkt_gettoken(&ilock, &vnode_free_list_token);
+       if ((vp->v_flag & VFREE) != 0) {
+           TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+           freevnodes--;
+           vp->v_flag &= ~(VFREE|VAGE);
        }
-       freevnodes++;
-       simple_unlock(&vnode_free_list_slock);
-       vp->v_flag &= ~VAGE;
-       vp->v_flag |= VFREE;
-       splx(s);
+       lwkt_reltoken(&ilock);
 }
 
-void
-vbusy(vp)
-       struct vnode *vp;
+/*
+ * NOTE: the vnode interlock must be held during the call.  The use or hold
+ * count must have already been bumped by the caller.  We use the VINFREE
+ * flag to interlock against other calls to vfree() which might occur while
+ * we are blocked.  The vnode cannot be reused until it has actually been
+ * placed on the free list, so there are no other races even though the
+ * use and hold counts are 0.
+ */
+static void
+vfree(struct vnode *vp)
 {
-       int s;
+       lwkt_tokref ilock;
 
-       s = splbio();
-       simple_lock(&vnode_free_list_slock);
-       KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
-       TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
-       freevnodes--;
-       simple_unlock(&vnode_free_list_slock);
-       vp->v_flag &= ~(VFREE|VAGE);
-       splx(s);
+       if ((vp->v_flag & VINFREE) == 0) {
+               vp->v_flag |= VINFREE;
+               lwkt_gettoken(&ilock, &vnode_free_list_token); /* can block */
+               KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
+               if (vp->v_flag & VAGE) {
+                       TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+               } else {
+                       TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+               }
+               freevnodes++;
+               vp->v_flag &= ~(VAGE|VINFREE);
+               vp->v_flag |= VFREE;
+               lwkt_reltoken(&ilock);  /* can block */
+       }
 }
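
As a sketch of the caller contract described in the notes above (illustrative
only, and necessarily local to this file now that vbusy()/vfree() are static):
the vnode interlock is held and the use or hold count has already been bumped
before vbusy() is called.

/*
 * Illustrative only: pull a vnode off the free list for reuse.
 */
static void
example_activate(struct vnode *vp)
{
	lwkt_tokref vlock;

	lwkt_gettoken(&vlock, vp->v_interlock);
	++vp->v_usecount;		/* bump before calling vbusy() */
	if (vp->v_flag & VFREE)
		vbusy(vp);		/* may block; vbusy() rechecks VFREE */
	lwkt_reltoken(&vlock);
}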
 
+
 /*
  * Record a process's interest in events which might happen to
  * a vnode.  Because poll uses the historic select-style interface
@@ -2774,7 +3004,9 @@ vbusy(vp)
 int
 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
 {
-       simple_lock(&vp->v_pollinfo.vpi_lock);
+       lwkt_tokref ilock;
+
+       lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
        if (vp->v_pollinfo.vpi_revents & events) {
                /*
                 * This leaves events we are not interested
@@ -2786,12 +3018,12 @@ vn_pollrecord(struct vnode *vp, struct thread *td, int events)
                events &= vp->v_pollinfo.vpi_revents;
                vp->v_pollinfo.vpi_revents &= ~events;
 
-               simple_unlock(&vp->v_pollinfo.vpi_lock);
+               lwkt_reltoken(&ilock);
                return events;
        }
        vp->v_pollinfo.vpi_events |= events;
        selrecord(td, &vp->v_pollinfo.vpi_selinfo);
-       simple_unlock(&vp->v_pollinfo.vpi_lock);
+       lwkt_reltoken(&ilock);
        return 0;
 }
 
@@ -2806,7 +3038,9 @@ vn_pollevent(vp, events)
        struct vnode *vp;
        short events;
 {
-       simple_lock(&vp->v_pollinfo.vpi_lock);
+       lwkt_tokref ilock;
+
+       lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
        if (vp->v_pollinfo.vpi_events & events) {
                /*
                 * We clear vpi_events so that we don't
@@ -2823,7 +3057,7 @@ vn_pollevent(vp, events)
                vp->v_pollinfo.vpi_revents |= events;
                selwakeup(&vp->v_pollinfo.vpi_selinfo);
        }
-       simple_unlock(&vp->v_pollinfo.vpi_lock);
+       lwkt_reltoken(&ilock);
 }
 
 /*
@@ -2835,12 +3069,14 @@ void
 vn_pollgone(vp)
        struct vnode *vp;
 {
-       simple_lock(&vp->v_pollinfo.vpi_lock);
+       lwkt_tokref ilock;
+
+       lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
        if (vp->v_pollinfo.vpi_events) {
                vp->v_pollinfo.vpi_events = 0;
                selwakeup(&vp->v_pollinfo.vpi_selinfo);
        }
-       simple_unlock(&vp->v_pollinfo.vpi_lock);
+       lwkt_reltoken(&ilock);
 }
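
For context (a hypothetical sketch, not part of this change), the record/notify
pairing these helpers implement is typically used as follows: a VOP_POLL
implementation records interest when nothing is ready, and the code path that
later generates activity wakes the pollers.  The example_* names are
hypothetical.

static int
example_poll(struct vnode *vp, int events, struct thread *td)
{
	int revents;

	revents = example_events_ready(vp, events);	/* hypothetical check */
	if (revents == 0)
		revents = vn_pollrecord(vp, td, events);
	return (revents);
}

static void
example_data_arrived(struct vnode *vp)
{
	vn_pollevent(vp, POLLIN | POLLRDNORM);
}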
 
 
@@ -2848,14 +3084,14 @@ vn_pollgone(vp)
 /*
  * Routine to create and manage a filesystem syncer vnode.
  */
-#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
-static int     sync_fsync __P((struct  vop_fsync_args *));
-static int     sync_inactive __P((struct  vop_inactive_args *));
-static int     sync_reclaim  __P((struct  vop_reclaim_args *));
-#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
-#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
-static int     sync_print __P((struct vop_print_args *));
-#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
+#define sync_close ((int (*) (struct  vop_close_args *))nullop)
+static int     sync_fsync (struct  vop_fsync_args *);
+static int     sync_inactive (struct  vop_inactive_args *);
+static int     sync_reclaim  (struct  vop_reclaim_args *);
+#define sync_lock ((int (*) (struct  vop_lock_args *))vop_nolock)
+#define sync_unlock ((int (*) (struct  vop_unlock_args *))vop_nounlock)
+static int     sync_print (struct vop_print_args *);
+#define sync_islocked ((int(*) (struct vop_islocked_args *))vop_noislocked)
 
 static vop_t **sync_vnodeop_p;
 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
@@ -2877,10 +3113,14 @@ VNODEOP_SET(sync_vnodeop_opv_desc);
 
 /*
  * Create a new filesystem syncer vnode for the specified mount point.
+ * This vnode is placed on the worklist and is responsible for sync'ing
+ * the filesystem.
+ *
+ * NOTE: read-only mounts are also placed on the worklist.  The filesystem
+ * sync code is also responsible for cleaning up vnodes.
  */
 int
-vfs_allocate_syncvnode(mp)
-       struct mount *mp;
+vfs_allocate_syncvnode(struct mount *mp)
 {
        struct vnode *vp;
        static long start, incr, next;
@@ -2928,6 +3168,7 @@ sync_fsync(ap)
        struct vnode *syncvp = ap->a_vp;
        struct mount *mp = syncvp->v_mount;
        struct thread *td = ap->a_td;
+       lwkt_tokref ilock;
        int asyncflag;
 
        /*
@@ -2943,19 +3184,26 @@ sync_fsync(ap)
 
        /*
         * Walk the list of vnodes pushing all that are dirty and
-        * not already on the sync list.
+        * not already on the sync list, and freeing vnodes which have
+        * no refs and whose VM objects are empty.  vfs_msync() handles
+        * the VM issues and must be called whether the mount is readonly
+        * or not.
         */
-       simple_lock(&mountlist_slock);
-       if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, td) != 0) {
-               simple_unlock(&mountlist_slock);
+       lwkt_gettoken(&ilock, &mountlist_token);
+       if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &ilock, td) != 0) {
+               lwkt_reltoken(&ilock);
                return (0);
        }
-       asyncflag = mp->mnt_flag & MNT_ASYNC;
-       mp->mnt_flag &= ~MNT_ASYNC;
-       vfs_msync(mp, MNT_NOWAIT);
-       VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
-       if (asyncflag)
-               mp->mnt_flag |= MNT_ASYNC;
+       if (mp->mnt_flag & MNT_RDONLY) {
+               vfs_msync(mp, MNT_NOWAIT);
+       } else {
+               asyncflag = mp->mnt_flag & MNT_ASYNC;
+               mp->mnt_flag &= ~MNT_ASYNC;     /* ZZZ hack */
+               vfs_msync(mp, MNT_NOWAIT);
+               VFS_SYNC(mp, MNT_LAZY, td);
+               if (asyncflag)
+                       mp->mnt_flag |= MNT_ASYNC;
+       }
        vfs_unbusy(mp, td);
        return (0);
 }
@@ -3048,12 +3296,12 @@ vn_isdisk(vp, errp)
                        *errp = ENXIO;
                return (0);
        }
-       if (!devsw(vp->v_rdev)) {
+       if (!dev_dport(vp->v_rdev)) {
                if (errp != NULL)
                        *errp = ENXIO;
                return (0);
        }
-       if (!(devsw(vp->v_rdev)->d_flags & D_DISK)) {
+       if (!(dev_dflags(vp->v_rdev) & D_DISK)) {
                if (errp != NULL)
                        *errp = ENOTBLK;
                return (0);
@@ -3069,30 +3317,68 @@ NDFREE(ndp, flags)
      const uint flags;
 {
        if (!(flags & NDF_NO_FREE_PNBUF) &&
-           (ndp->ni_cnd.cn_flags & HASBUF)) {
+           (ndp->ni_cnd.cn_flags & CNP_HASBUF)) {
                zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
-               ndp->ni_cnd.cn_flags &= ~HASBUF;
+               ndp->ni_cnd.cn_flags &= ~CNP_HASBUF;
+       }
+       if (!(flags & NDF_NO_DNCP_RELE) &&
+           (ndp->ni_cnd.cn_flags & CNP_WANTDNCP) &&
+           ndp->ni_dncp) {
+               cache_drop(ndp->ni_dncp);
+               ndp->ni_dncp = NULL;
+       }
+       if (!(flags & NDF_NO_NCP_RELE) &&
+           (ndp->ni_cnd.cn_flags & CNP_WANTNCP) &&
+           ndp->ni_ncp) {
+               cache_drop(ndp->ni_ncp);
+               ndp->ni_ncp = NULL;
        }
        if (!(flags & NDF_NO_DVP_UNLOCK) &&
-           (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
-           ndp->ni_dvp != ndp->ni_vp)
-               VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_td);
+           (ndp->ni_cnd.cn_flags & CNP_LOCKPARENT) &&
+           ndp->ni_dvp != ndp->ni_vp) {
+               VOP_UNLOCK(ndp->ni_dvp, NULL, 0, ndp->ni_cnd.cn_td);
+       }
        if (!(flags & NDF_NO_DVP_RELE) &&
-           (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
+           (ndp->ni_cnd.cn_flags & (CNP_LOCKPARENT|CNP_WANTPARENT))) {
                vrele(ndp->ni_dvp);
                ndp->ni_dvp = NULL;
        }
        if (!(flags & NDF_NO_VP_UNLOCK) &&
-           (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
-               VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_td);
+           (ndp->ni_cnd.cn_flags & CNP_LOCKLEAF) && ndp->ni_vp) {
+               VOP_UNLOCK(ndp->ni_vp, NULL, 0, ndp->ni_cnd.cn_td);
+       }
        if (!(flags & NDF_NO_VP_RELE) &&
            ndp->ni_vp) {
                vrele(ndp->ni_vp);
                ndp->ni_vp = NULL;
        }
        if (!(flags & NDF_NO_STARTDIR_RELE) &&
-           (ndp->ni_cnd.cn_flags & SAVESTART)) {
+           (ndp->ni_cnd.cn_flags & CNP_SAVESTART)) {
                vrele(ndp->ni_startdir);
                ndp->ni_startdir = NULL;
        }
 }
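
A brief usage note (illustrative, not from this commit): the NDF_NO_* flags
name the resources a caller wants NDFREE() to leave alone.  For example, a
caller that did its lookup with CNP_LOCKLEAF and wants to keep the leaf vnode
locked and referenced while releasing everything else might do:

	/*
	 * Hypothetical fragment: free namei() state but keep ni_vp.
	 */
	NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK);
	/* nd.ni_vp is still locked and referenced; vput() it when finished. */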
+
+#ifdef DEBUG_VFS_LOCKS
+
+void
+assert_vop_locked(struct vnode *vp, const char *str)
+{
+
+       if (vp && IS_LOCKING_VFS(vp) && !VOP_ISLOCKED(vp, NULL)) {
+               panic("%s: %p is not locked shared but should be", str, vp);
+       }
+}
+
+void
+assert_vop_unlocked(struct vnode *vp, const char *str)
+{
+
+       if (vp && IS_LOCKING_VFS(vp)) {
+               if (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) {
+                       panic("%s: %p is locked but should not be", str, vp);
+               }
+       }
+}
+
+#endif
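
Finally, a hypothetical usage sketch for the new lock assertions (illustrative
only; these are normally reached through wrapper macros when DEBUG_VFS_LOCKS
is configured, and the VOP name below is made up):

#ifdef DEBUG_VFS_LOCKS
	/* verify the caller delivered vp locked, as this VOP requires */
	assert_vop_locked(vp, "example_vop_write");
#endif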