From: Matthew Dillon
Date: Fri, 27 Aug 2010 04:18:06 +0000 (-0700)
Subject: kernel - Major MPSAFE Infrastructure
X-Git-Tag: v2.9.0~391
X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/77912481ac5f5d886b07c9f7038b03eba09b2bca

kernel - Major MPSAFE Infrastructure

* vm_page_lookup() now requires the vm_token to be held on call instead of
  the MP lock, and fix the few places where the routine was being called
  without the vm_token.  Various situations where a vm_page_lookup() is
  performed followed by vm_page_wire(), without busying the page, and other
  similar situations, require the vm_token to be held across the whole
  block of code (see the first sketch after this list).

* bio_done callbacks are now MPSAFE, but some drivers (ata, ccd, vinum,
  aio, nfs) are not MPSAFE yet, so get the mplock for those.  They will be
  converted to a generic driver-wide token later.

* Remove critical sections that used to protect VM system related
  interrupts and replace them with the vm_token.

* Spinlocks now bump thread->td_critcount in addition to
  mycpu->gd_spinlock*.  Note the ordering is important.  Then remove
  gd_spinlock* checks elsewhere that are covered by td_critcount and
  replace them with assertions.  Also use td_critcount in the kern_mutex.c
  code instead of gd_spinlock*.  This fixes situations where the last
  crit_exit() would call splx() without checking for spinlocks.  Adding the
  additional checks would have made the crit_*() inlines too complex, so
  instead we just fold it into td_critcount.

* lwkt_yield() no longer guarantees that lwkt_switch() will be called, so
  call lwkt_switch() instead in places where a switch is required, for
  example to unwind a preemption.  Otherwise the kernel could end up
  live-locking trying to yield, because the new switch code does not
  necessarily schedule a different kernel thread.

* Add the sysctl user_pri_sched (default 0).  Setting this will make the
  LWKT scheduler more aggressively schedule user threads when runnable
  kernel threads are unable to gain token/mplock resources.  For debugging
  only.

* Change the bufspin spinlock to bufqspin and bufcspin, and generally
  rework vfs_bio.c to lock numerous fields with bufcspin.  Also use
  bufcspin to interlock waitrunningbufspace() and friends.  Remove several
  mplocks in vfs_bio.c that are no longer needed.  Protect the page
  manipulation code in vfs_bio.c with vm_token instead of the mplock.

* Fix a deadlock with the FINDBLK_TEST/BUF_LOCK sequence which can occur
  because the buffer may change its (vp,loffset) during the BUF_LOCK call.
  Even though the code checks for this after the lock succeeds, there is
  still the problem of the locking operation itself potentially creating a
  deadlock between two threads by locking an unexpected buffer when the
  caller is already holding other buffers locked.  Fix this by adding an
  interlock refcounter, b_refs; getnewbuf() will avoid reusing such buffers
  (see the second sketch after this list).

* The syncer_token was not protecting all accesses to the syncer list.
  Fix that.

* Make HAMMER MPSAFE.  All major entry points now use a per-mount token,
  hmp->fs_token.  Backend callbacks (bioops, bio_done) use hmp->io_token.
  The cache-case for the read and getattr paths requires no tokens at all
  (as before).  The bitfield flags had to be separated into two groups to
  deal with SMP cache coherency races.  Certain flags in the hammer_record
  structure had to be separated for the same reason.  Certain interactions
  between the frontend and the backend must use the hmp->io_token.
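The two sketches below are illustrative only and are not part of the diff
that follows; the wrapper function names and local variables are assumptions
made for the examples.

First, the vm_token rule from the first item: the lookup and the wire must
sit under a single token hold, which is the shape the agp hunks below take
around their unwire loops.

    static void
    example_wire_page(vm_object_t obj, vm_pindex_t pindex)
    {
            vm_page_t m;

            lwkt_gettoken(&vm_token);
            m = vm_page_lookup(obj, pindex);    /* now requires vm_token */
            if (m)
                    vm_page_wire(m);            /* same hold, no window */
            lwkt_reltoken(&vm_token);
    }

Second, the b_refs interlock from the FINDBLK_TEST/BUF_LOCK item, simplified
from the getblk() hunk below: the ref keeps getnewbuf() from recycling the
buffer while we block in the lock, and the (vp,loffset) identity is
rechecked once the lock is held.

    static struct buf *
    example_lookup_and_lock(struct vnode *vp, off_t loffset)
    {
            struct buf *bp;

            bp = findblk(vp, loffset, FINDBLK_REF | FINDBLK_TEST);
            if (bp == NULL)
                    return (NULL);
            if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
                    unrefblk(bp);           /* could not lock; caller retries */
                    return (NULL);
            }
            unrefblk(bp);                   /* the lock itself now blocks reuse */
            if (bp->b_vp != vp || bp->b_loffset != loffset) {
                    BUF_UNLOCK(bp);         /* identity changed under us; retry */
                    return (NULL);
            }
            return (bp);                    /* locked and still (vp,loffset) */
    }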
It is important to note that for any given buffer there are two locking entities: (1) The hammer structure and (2) The buffer cache buffer. These interactions are very fragile. Do not allow the kernel to flush a dirty buffer if we are unable to obtain a norefs-interlock on the buffer, which fixes numerous frontend/backend MP races on the io structure. Add a write interlock in one of the recover_flush_buffer cases. --- diff --git a/sys/dev/agp/agp.c b/sys/dev/agp/agp.c index d7e70d257a..1f284c70e2 100644 --- a/sys/dev/agp/agp.c +++ b/sys/dev/agp/agp.c @@ -569,11 +569,13 @@ agp_generic_bind_memory(device_t dev, struct agp_memory *mem, vm_page_wakeup(m); for (k = 0; k < i + j; k += AGP_PAGE_SIZE) AGP_UNBIND_PAGE(dev, offset + k); + lwkt_gettoken(&vm_token); for (k = 0; k <= i; k += PAGE_SIZE) { m = vm_page_lookup(mem->am_obj, OFF_TO_IDX(k)); vm_page_unwire(m, 0); } + lwkt_reltoken(&vm_token); lockmgr(&sc->as_lock, LK_RELEASE); return error; } @@ -622,10 +624,12 @@ agp_generic_unbind_memory(device_t dev, struct agp_memory *mem) */ for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE) AGP_UNBIND_PAGE(dev, mem->am_offset + i); + lwkt_gettoken(&vm_token); for (i = 0; i < mem->am_size; i += PAGE_SIZE) { m = vm_page_lookup(mem->am_obj, atop(i)); vm_page_unwire(m, 0); } + lwkt_reltoken(&vm_token); agp_flush_cache(); AGP_FLUSH_TLB(dev); diff --git a/sys/dev/agp/agp_i810.c b/sys/dev/agp/agp_i810.c index 74383de9ba..a766c62904 100644 --- a/sys/dev/agp/agp_i810.c +++ b/sys/dev/agp/agp_i810.c @@ -1008,8 +1008,11 @@ agp_i810_free_memory(device_t dev, struct agp_memory *mem) /* * Unwire the page which we wired in alloc_memory. */ - vm_page_t m = vm_page_lookup(mem->am_obj, 0); + vm_page_t m; + lwkt_gettoken(&vm_token); + m = vm_page_lookup(mem->am_obj, 0); vm_page_unwire(m, 0); + lwkt_reltoken(&vm_token); } else { contigfree(sc->argb_cursor, mem->am_size, M_AGP); sc->argb_cursor = NULL; diff --git a/sys/dev/disk/ata/ata-raid.c b/sys/dev/disk/ata/ata-raid.c index 51d7fc1eba..bf986c0757 100644 --- a/sys/dev/disk/ata/ata-raid.c +++ b/sys/dev/disk/ata/ata-raid.c @@ -676,6 +676,8 @@ ar_done(struct bio *bio) struct ar_softc *rdp = (struct ar_softc *)bio->bio_caller_info1.ptr; struct ar_buf *buf = (struct ar_buf *)bio->bio_buf; + get_mplock(); + switch (rdp->flags & (AR_F_RAID0 | AR_F_RAID1 | AR_F_SPAN)) { case AR_F_SPAN: case AR_F_RAID0: @@ -708,6 +710,7 @@ ar_done(struct bio *bio) buf->bp.b_error = 0; dev_dstrategy(AD_SOFTC(rdp->disks[buf->drive])->dev, &buf->bp.b_bio1); + rel_mplock(); return; } else { @@ -743,6 +746,7 @@ ar_done(struct bio *bio) kprintf("ar%d: unknown array type in ar_done\n", rdp->lun); } kfree(buf, M_AR); + rel_mplock(); } static void diff --git a/sys/dev/disk/ccd/ccd.c b/sys/dev/disk/ccd/ccd.c index 20b16ac26c..bea11e18cf 100644 --- a/sys/dev/disk/ccd/ccd.c +++ b/sys/dev/disk/ccd/ccd.c @@ -141,7 +141,6 @@ #include #include #include -#include #include #include @@ -150,6 +149,8 @@ #include /* XXX used only to get BBSIZE and SBSIZE */ #include +#include +#include #if defined(CCDDEBUG) && !defined(DEBUG) #define DEBUG @@ -1184,6 +1185,7 @@ ccdintr(struct ccd_softc *cs, struct bio *bio) /* * Called at interrupt time. + * * Mark the component as done and if all components are done, * take a ccd interrupt. 
*/ @@ -1202,6 +1204,7 @@ ccdiodone(struct bio *bio) */ clearbiocache(bio->bio_next); + get_mplock(); crit_enter(); #ifdef DEBUG if (ccddebug & CCDB_FOLLOW) @@ -1272,6 +1275,7 @@ ccdiodone(struct bio *bio) cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE; putccdbuf(cbp); crit_exit(); + rel_mplock(); return; } } else { @@ -1290,6 +1294,7 @@ ccdiodone(struct bio *bio) ); putccdbuf(cbp); crit_exit(); + rel_mplock(); return; } else { putccdbuf(cbp->cb_mirror); @@ -1314,6 +1319,7 @@ ccdiodone(struct bio *bio) if (obp->b_resid == 0) ccdintr(&ccd_softc[unit], obio); crit_exit(); + rel_mplock(); } static int diff --git a/sys/dev/raid/vinum/vinumhdr.h b/sys/dev/raid/vinum/vinumhdr.h index a2dd60d243..adeb5239f4 100644 --- a/sys/dev/raid/vinum/vinumhdr.h +++ b/sys/dev/raid/vinum/vinumhdr.h @@ -69,6 +69,7 @@ #include #include #include +#include #else #include #include diff --git a/sys/dev/raid/vinum/vinuminterrupt.c b/sys/dev/raid/vinum/vinuminterrupt.c index 40fb0c45d7..a209d121d0 100644 --- a/sys/dev/raid/vinum/vinuminterrupt.c +++ b/sys/dev/raid/vinum/vinuminterrupt.c @@ -72,6 +72,8 @@ complete_rqe(struct bio *bio) struct sd *sd; char *gravity; /* for error messages */ + get_mplock(); + rqe = (struct rqelement *) bp; /* point to the element that completed */ rqg = rqe->rqg; /* and the request group */ rq = rqg->rq; /* and the complete request */ @@ -228,6 +230,7 @@ complete_rqe(struct bio *bio) freerq(rq); /* return the request storage */ } } + rel_mplock(); } /* Free a request block and anything hanging off it */ @@ -262,6 +265,8 @@ sdio_done(struct bio *bio) { struct sdbuf *sbp; + get_mplock(); + sbp = (struct sdbuf *) bio->bio_buf; if (sbp->b.b_flags & B_ERROR) { /* had an error */ sbp->bio->bio_buf->b_flags |= B_ERROR; /* propagate upwards */ @@ -289,6 +294,7 @@ sdio_done(struct bio *bio) BUF_UNLOCK(&sbp->b); BUF_LOCKFREE(&sbp->b); Free(sbp); + rel_mplock(); } /* Start the second phase of a RAID-4 or RAID-5 group write operation. */ diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 8e4078204d..0236c05d51 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -600,13 +600,8 @@ exec_map_page(struct image_params *imgp, vm_pindex_t pageno, if (pageno >= object->size) return (EIO); - /* - * We shouldn't need protection for vm_page_grab() but we certainly - * need it for the lookup loop below (lookup/busy race), since - * an interrupt can unbusy and free the page before our busy check. - */ m = vm_page_grab(object, pageno, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); - crit_enter(); + lwkt_gettoken(&vm_token); while ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { ma = m; @@ -627,14 +622,12 @@ exec_map_page(struct image_params *imgp, vm_pindex_t pageno, vnode_pager_freepage(m); } lwkt_reltoken(&vm_token); - crit_exit(); return EIO; } } vm_page_hold(m); /* requires vm_token to be held */ vm_page_wakeup(m); /* unbusy the page */ lwkt_reltoken(&vm_token); - crit_exit(); *plwb = lwbuf_alloc(m); *pdata = (void *)lwbuf_kva(*plwb); diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index d7895cae20..810014a29d 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -112,14 +112,17 @@ __mtx_lock_ex(mtx_t mtx, mtx_link_t link, const char *ident, int flags, int to) * Also set MTX_EXWANTED coincident with EXLINK, if * not already set. 
*/ + thread_t td; + if (lock & MTX_EXLINK) { cpu_pause(); ++mtx_collision_count; continue; } + td = curthread; /*lock &= ~MTX_EXLINK;*/ nlock = lock | MTX_EXWANTED | MTX_EXLINK; - ++mycpu->gd_spinlocks_wr; + ++td->td_critcount; if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { /* * Check for early abort @@ -127,7 +130,7 @@ __mtx_lock_ex(mtx_t mtx, mtx_link_t link, const char *ident, int flags, int to) if (link->state == MTX_LINK_ABORTED) { atomic_clear_int(&mtx->mtx_lock, MTX_EXLINK); - --mycpu->gd_spinlocks_wr; + --td->td_critcount; error = ENOLCK; if (mtx->mtx_link == NULL) { atomic_clear_int(&mtx->mtx_lock, @@ -140,7 +143,7 @@ __mtx_lock_ex(mtx_t mtx, mtx_link_t link, const char *ident, int flags, int to) * Success. Link in our structure then * release EXLINK and sleep. */ - link->owner = curthread; + link->owner = td; link->state = MTX_LINK_LINKED; if (mtx->mtx_link) { link->next = mtx->mtx_link; @@ -154,7 +157,7 @@ __mtx_lock_ex(mtx_t mtx, mtx_link_t link, const char *ident, int flags, int to) } tsleep_interlock(link, 0); atomic_clear_int(&mtx->mtx_lock, MTX_EXLINK); - --mycpu->gd_spinlocks_wr; + --td->td_critcount; error = tsleep(link, flags, ident, to); ++mtx_contention_count; @@ -185,7 +188,7 @@ __mtx_lock_ex(mtx_t mtx, mtx_link_t link, const char *ident, int flags, int to) if (error) break; } else { - --mycpu->gd_spinlocks_wr; + --td->td_critcount; } } ++mtx_collision_count; @@ -516,21 +519,24 @@ _mtx_unlock(mtx_t mtx) * Acquire an exclusive lock leaving the lockcount * set to 1, and get EXLINK for access to mtx_link. */ + thread_t td; + if (lock & MTX_EXLINK) { cpu_pause(); ++mtx_collision_count; continue; } + td = curthread; /*lock &= ~MTX_EXLINK;*/ nlock |= MTX_EXLINK | MTX_EXCLUSIVE; nlock |= (lock & MTX_SHWANTED); - ++mycpu->gd_spinlocks_wr; + ++td->td_critcount; if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { mtx_chain_link(mtx); - --mycpu->gd_spinlocks_wr; + --td->td_critcount; break; } - --mycpu->gd_spinlocks_wr; + --td->td_critcount; } else if (nlock == (MTX_EXCLUSIVE | MTX_EXWANTED | 1)) { /* * Last release, exclusive lock, with exclusive @@ -539,21 +545,24 @@ _mtx_unlock(mtx_t mtx) * leave the exclusive lock intact and the lockcount * set to 1, and get EXLINK for access to mtx_link. */ + thread_t td; + if (lock & MTX_EXLINK) { cpu_pause(); ++mtx_collision_count; continue; } + td = curthread; /*lock &= ~MTX_EXLINK;*/ nlock |= MTX_EXLINK; nlock |= (lock & MTX_SHWANTED); - ++mycpu->gd_spinlocks_wr; + ++td->td_critcount; if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { mtx_chain_link(mtx); - --mycpu->gd_spinlocks_wr; + --td->td_critcount; break; } - --mycpu->gd_spinlocks_wr; + --td->td_critcount; } else { /* * Not the last release (shared or exclusive) @@ -646,6 +655,7 @@ static void mtx_delete_link(mtx_t mtx, mtx_link_t link) { + thread_t td = curthread; u_int lock; u_int nlock; @@ -655,7 +665,7 @@ mtx_delete_link(mtx_t mtx, mtx_link_t link) * Do not use cmpxchg to wait for EXLINK to clear as this might * result in too much cpu cache traffic. 
*/ - ++mycpu->gd_spinlocks_wr; + ++td->td_critcount; for (;;) { lock = mtx->mtx_lock; if (lock & MTX_EXLINK) { @@ -685,7 +695,7 @@ mtx_delete_link(mtx_t mtx, mtx_link_t link) link->state = MTX_LINK_IDLE; } atomic_clear_int(&mtx->mtx_lock, MTX_EXLINK); - --mycpu->gd_spinlocks_wr; + --td->td_critcount; } /* @@ -697,13 +707,14 @@ mtx_delete_link(mtx_t mtx, mtx_link_t link) void mtx_abort_ex_link(mtx_t mtx, mtx_link_t link) { + thread_t td = curthread; u_int lock; u_int nlock; /* * Acquire MTX_EXLINK */ - ++mycpu->gd_spinlocks_wr; + ++td->td_critcount; for (;;) { lock = mtx->mtx_lock; if (lock & MTX_EXLINK) { @@ -755,5 +766,5 @@ mtx_abort_ex_link(mtx_t mtx, mtx_link_t link) break; } atomic_clear_int(&mtx->mtx_lock, MTX_EXLINK); - --mycpu->gd_spinlocks_wr; + --td->td_critcount; } diff --git a/sys/kern/kern_slaballoc.c b/sys/kern/kern_slaballoc.c index 522b785e75..7a3f8cb6ee 100644 --- a/sys/kern/kern_slaballoc.c +++ b/sys/kern/kern_slaballoc.c @@ -1176,8 +1176,10 @@ kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags) base_vmflags |= VM_ALLOC_SYSTEM; if (flags & M_USE_INTERRUPT_RESERVE) base_vmflags |= VM_ALLOC_INTERRUPT; - if ((flags & (M_RNOWAIT|M_WAITOK)) == 0) - panic("kmem_slab_alloc: bad flags %08x (%p)", flags, ((int **)&size)[-1]); + if ((flags & (M_RNOWAIT|M_WAITOK)) == 0) { + panic("kmem_slab_alloc: bad flags %08x (%p)", + flags, ((int **)&size)[-1]); + } /* @@ -1217,7 +1219,7 @@ kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags) if (flags & M_WAITOK) { if (td->td_preempted) { vm_map_unlock(&kernel_map); - lwkt_yield(); + lwkt_switch(); vm_map_lock(&kernel_map); } else { vm_map_unlock(&kernel_map); @@ -1233,9 +1235,11 @@ kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags) */ while (i != 0) { i -= PAGE_SIZE; + lwkt_gettoken(&vm_token); m = vm_page_lookup(&kernel_object, OFF_TO_IDX(addr + i)); /* page should already be busy */ vm_page_free(m); + lwkt_reltoken(&vm_token); } vm_map_delete(&kernel_map, addr, addr + size, &count); vm_map_unlock(&kernel_map); @@ -1260,6 +1264,7 @@ kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags) /* * Enter the pages into the pmap and deal with PG_ZERO and M_ZERO. 
*/ + lwkt_gettoken(&vm_token); for (i = 0; i < size; i += PAGE_SIZE) { vm_page_t m; @@ -1275,6 +1280,7 @@ kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags) KKASSERT(m->flags & (PG_WRITEABLE | PG_MAPPED)); vm_page_flag_set(m, PG_REFERENCED); } + lwkt_reltoken(&vm_token); vm_map_unlock(&kernel_map); vm_map_entry_release(count); lwkt_reltoken(&vm_token); diff --git a/sys/kern/kern_spinlock.c b/sys/kern/kern_spinlock.c index 96c57cb5a4..fbb1c4f06a 100644 --- a/sys/kern/kern_spinlock.c +++ b/sys/kern/kern_spinlock.c @@ -136,6 +136,7 @@ spin_trylock_wr_contested(globaldata_t gd, struct spinlock *mtx, int value) if (globaldata_find(bit)->gd_spinlock_rd == mtx) { atomic_swap_int(&mtx->lock, value); --gd->gd_spinlocks_wr; + --gd->gd_curthread->td_critcount; return (FALSE); } value &= ~(1 << bit); @@ -143,6 +144,7 @@ spin_trylock_wr_contested(globaldata_t gd, struct spinlock *mtx, int value) return (TRUE); } --gd->gd_spinlocks_wr; + --gd->gd_curthread->td_critcount; return (FALSE); } diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index 8deb893819..eb6d5530b1 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -151,6 +151,8 @@ SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW, #endif static int fairq_enable = 1; SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW, &fairq_enable, 0, ""); +static int user_pri_sched = 0; +SYSCTL_INT(_lwkt, OID_AUTO, user_pri_sched, CTLFLAG_RW, &user_pri_sched, 0, ""); /* * These helper procedures handle the runq, they can only be called from @@ -777,8 +779,8 @@ lwkt_switch(void) * NOTE: For UP there is no mplock and lwkt_getalltokens() * always succeeds. */ - if ((ntd->td_pri >= TDPRI_KERN_LPSCHED || nquserok) && - ntd->td_fairq_accum >= 0 && + if ((ntd->td_pri >= TDPRI_KERN_LPSCHED || nquserok || + user_pri_sched) && ntd->td_fairq_accum >= 0 && #ifdef SMP (ntd->td_mpcount == 0 || mpheld || cpu_try_mplock()) && #endif @@ -933,23 +935,16 @@ lwkt_preempt(thread_t ntd, int critcount) } #endif /* - * Take the easy way out and do not preempt if we are holding - * any spinlocks. We could test whether the thread(s) being - * preempted interlock against the target thread's tokens and whether - * we can get all the target thread's tokens, but this situation - * should not occur very often so its easier to simply not preempt. - * Also, plain spinlocks are impossible to figure out at this point so - * just don't preempt. + * We don't have to check spinlocks here as they will also bump + * td_critcount. * * Do not try to preempt if the target thread is holding any tokens. * We could try to acquire the tokens but this case is so rare there * is no need to support it. */ - if (gd->gd_spinlock_rd || gd->gd_spinlocks_wr) { - ++preempt_miss; - need_lwkt_resched(); - return; - } + KKASSERT(gd->gd_spinlock_rd == NULL); + KKASSERT(gd->gd_spinlocks_wr == 0); + if (TD_TOKS_HELD(ntd)) { ++preempt_miss; need_lwkt_resched(); diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 5591c8ded2..e378300338 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -1635,20 +1635,24 @@ retry_lookup: * vm_page_wire() call. 
*/ crit_enter(); + lwkt_gettoken(&vm_token); pg = vm_page_lookup(obj, pindex); if (pg == NULL) { pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); if (pg == NULL) { vm_wait(0); + lwkt_reltoken(&vm_token); crit_exit(); goto retry_lookup; } vm_page_wakeup(pg); } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { + lwkt_reltoken(&vm_token); crit_exit(); goto retry_lookup; } vm_page_wire(pg); + lwkt_reltoken(&vm_token); crit_exit(); /* diff --git a/sys/kern/usched_bsd4.c b/sys/kern/usched_bsd4.c index 048270c73c..ede271e278 100644 --- a/sys/kern/usched_bsd4.c +++ b/sys/kern/usched_bsd4.c @@ -291,7 +291,7 @@ bsd4_acquire_curproc(struct lwp *lp) * the run queue. When we are reactivated we will have * another chance. */ - lwkt_yield(); + lwkt_switch(); } while (dd->uschedcp != lp); crit_exit(); @@ -518,11 +518,6 @@ bsd4_setrunqueue(struct lwp *lp) * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on * each cpu. * - * Because this is effectively a 'fast' interrupt, we cannot safely - * use spinlocks unless gd_spinlock_rd is NULL and gd_spinlocks_wr is 0, - * even if the spinlocks are 'non conflicting'. This is due to the way - * spinlock conflicts against cached read locks are handled. - * * MPSAFE */ static @@ -557,15 +552,20 @@ bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) --lp->lwp_origcpu; /* - * We can only safely call bsd4_resetpriority(), which uses spinlocks, - * if we aren't interrupting a thread that is using spinlocks. - * Otherwise we can deadlock with another cpu waiting for our read - * spinlocks to clear. + * Spinlocks also hold a critical section so there should not be + * any active. */ - if (gd->gd_spinlock_rd == NULL && gd->gd_spinlocks_wr == 0) - bsd4_resetpriority(lp); - else - need_user_resched(); + KKASSERT(gd->gd_spinlock_rd == NULL); + KKASSERT(gd->gd_spinlocks_wr == 0); + + bsd4_resetpriority(lp); +#if 0 + /* + * if we can't call bsd4_resetpriority for some reason we must call + * need user_resched(). + */ + need_user_resched(); +#endif } /* diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index e87558a3e9..16d8c3ea94 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -2026,6 +2026,7 @@ aio_physwakeup(struct bio *bio) struct aio_liojob *lj; aiocbe = bio->bio_caller_info2.ptr; + get_mplock(); if (aiocbe) { p = bio->bio_caller_info1.ptr; @@ -2081,6 +2082,7 @@ aio_physwakeup(struct bio *bio) } } biodone_sync(bio); + rel_mplock(); } #endif /* VFS_AIO */ diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 2ff05a5360..a0d75331a9 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -89,7 +89,8 @@ typedef enum bufq_type bufq_type_t; #define BD_WAKE_MASK (BD_WAKE_SIZE - 1) TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; -struct spinlock bufspin = SPINLOCK_INITIALIZER(&bufspin); +static struct spinlock bufqspin = SPINLOCK_INITIALIZER(&bufqspin); +static struct spinlock bufcspin = SPINLOCK_INITIALIZER(&bufcspin); static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); @@ -119,23 +120,31 @@ vm_page_t bogus_page; * These are all static, but make the ones we export globals so we do * not need to use compiler magic. 
*/ -int bufspace, maxbufspace, - bufmallocspace, maxbufmallocspace, lobufspace, hibufspace; +int bufspace; /* locked by buffer_map */ +int maxbufspace; +static int bufmallocspace; /* atomic ops */ +int maxbufmallocspace, lobufspace, hibufspace; static int bufreusecnt, bufdefragcnt, buffreekvacnt; -static int lorunningspace, hirunningspace, runningbufreq; -int dirtybufspace, dirtybufspacehw, lodirtybufspace, hidirtybufspace; -int dirtybufcount, dirtybufcounthw; -int runningbufspace, runningbufcount; +static int lorunningspace; +static int hirunningspace; +static int runningbufreq; /* locked by bufcspin */ +static int dirtybufspace; /* locked by bufcspin */ +static int dirtybufcount; /* locked by bufcspin */ +static int dirtybufspacehw; /* locked by bufcspin */ +static int dirtybufcounthw; /* locked by bufcspin */ +static int runningbufspace; /* locked by bufcspin */ +static int runningbufcount; /* locked by bufcspin */ +int lodirtybufspace; +int hidirtybufspace; static int getnewbufcalls; static int getnewbufrestarts; static int recoverbufcalls; -static int needsbuffer; /* locked by needsbuffer_spin */ -static int bd_request; /* locked by needsbuffer_spin */ -static int bd_request_hw; /* locked by needsbuffer_spin */ +static int needsbuffer; /* locked by bufcspin */ +static int bd_request; /* locked by bufcspin */ +static int bd_request_hw; /* locked by bufcspin */ static u_int bd_wake_ary[BD_WAKE_SIZE]; static u_int bd_wake_index; static u_int vm_cycle_point = 40; /* 23-36 will migrate more act->inact */ -static struct spinlock needsbuffer_spin; static int debug_commit; static struct thread *bufdaemon_td; @@ -215,7 +224,6 @@ char *buf_wmesg = BUF_WMESG; * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ - static __inline void bufspacewakeup(void) { @@ -224,11 +232,13 @@ bufspacewakeup(void) * though we haven't freed the kva space yet, the waiting * process will be able to now. */ + spin_lock_wr(&bufcspin); if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { - spin_lock_wr(&needsbuffer_spin); needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; - spin_unlock_wr(&needsbuffer_spin); + spin_unlock_wr(&bufcspin); wakeup(&needsbuffer); + } else { + spin_unlock_wr(&bufcspin); } } @@ -245,17 +255,21 @@ runningbufwakeup(struct buf *bp) int limit; if ((totalspace = bp->b_runningbufspace) != 0) { - atomic_subtract_int(&runningbufspace, totalspace); - atomic_subtract_int(&runningbufcount, 1); + spin_lock_wr(&bufcspin); + runningbufspace -= totalspace; + --runningbufcount; bp->b_runningbufspace = 0; /* * see waitrunningbufspace() for limit test. */ - limit = hirunningspace * 2 / 3; + limit = hirunningspace * 4 / 6; if (runningbufreq && runningbufspace <= limit) { runningbufreq = 0; + spin_unlock_wr(&bufcspin); wakeup(&runningbufreq); + } else { + spin_unlock_wr(&bufcspin); } bd_signal(totalspace); } @@ -274,25 +288,27 @@ runningbufwakeup(struct buf *bp) static __inline void bufcountwakeup(void) { + spin_lock_wr(&bufcspin); if (needsbuffer) { - spin_lock_wr(&needsbuffer_spin); needsbuffer &= ~VFS_BIO_NEED_ANY; - spin_unlock_wr(&needsbuffer_spin); + spin_unlock_wr(&bufcspin); wakeup(&needsbuffer); + } else { + spin_unlock_wr(&bufcspin); } } /* * waitrunningbufspace() * - * Wait for the amount of running I/O to drop to hirunningspace * 2 / 3. + * Wait for the amount of running I/O to drop to hirunningspace * 4 / 6. 
* This is the point where write bursting stops so we don't want to wait * for the running amount to drop below it (at least if we still want bioq * to burst writes). * * The caller may be using this function to block in a tight loop, we * must block while runningbufspace is greater then or equal to - * hirunningspace * 2 / 3. + * hirunningspace * 4 / 6. * * And even with that it may not be enough, due to the presence of * B_LOCKED dirty buffers, so also wait for at least one running buffer @@ -301,19 +317,23 @@ bufcountwakeup(void) void waitrunningbufspace(void) { - int limit = hirunningspace * 2 / 3; + int limit = hirunningspace * 4 / 6; + int dummy; - crit_enter(); + spin_lock_wr(&bufcspin); if (runningbufspace > limit) { while (runningbufspace > limit) { ++runningbufreq; - tsleep(&runningbufreq, 0, "wdrn1", 0); + ssleep(&runningbufreq, &bufcspin, 0, "wdrn1", 0); } - } else if (runningbufspace) { + spin_unlock_wr(&bufcspin); + } else if (runningbufspace > limit / 2) { ++runningbufreq; - tsleep(&runningbufreq, 0, "wdrn2", 1); + spin_unlock_wr(&bufcspin); + tsleep(&dummy, 0, "wdrn2", 1); + } else { + spin_unlock_wr(&bufcspin); } - crit_exit(); } /* @@ -335,7 +355,7 @@ buf_dirty_count_severe(void) int buf_runningbufspace_severe(void) { - return (runningbufspace >= hirunningspace * 2 / 3); + return (runningbufspace >= hirunningspace * 4 / 6); } /* @@ -380,17 +400,17 @@ bd_speedup(void) if (bd_request == 0 && (dirtybufspace - dirtybufspacehw > lodirtybufspace / 2 || dirtybufcount - dirtybufcounthw >= nbuf / 2)) { - spin_lock_wr(&needsbuffer_spin); + spin_lock_wr(&bufcspin); bd_request = 1; - spin_unlock_wr(&needsbuffer_spin); + spin_unlock_wr(&bufcspin); wakeup(&bd_request); } if (bd_request_hw == 0 && (dirtybufspacehw > lodirtybufspace / 2 || dirtybufcounthw >= nbuf / 2)) { - spin_lock_wr(&needsbuffer_spin); + spin_lock_wr(&bufcspin); bd_request_hw = 1; - spin_unlock_wr(&needsbuffer_spin); + spin_unlock_wr(&bufcspin); wakeup(&bd_request_hw); } } @@ -453,11 +473,16 @@ bd_wait(int totalspace) if (count >= BD_WAKE_SIZE) count = BD_WAKE_SIZE - 1; - spin_lock_wr(&needsbuffer_spin); + spin_lock_wr(&bufcspin); i = (bd_wake_index + count) & BD_WAKE_MASK; ++bd_wake_ary[i]; + + /* + * This is not a strict interlock, so we play a bit loose + * with locking access to dirtybufspace* + */ tsleep_interlock(&bd_wake_ary[i], 0); - spin_unlock_wr(&needsbuffer_spin); + spin_unlock_wr(&bufcspin); tsleep(&bd_wake_ary[i], PINTERLOCKED, "flstik", hz); totalspace = runningbufspace + dirtybufspace - hidirtybufspace; @@ -481,19 +506,19 @@ bd_signal(int totalspace) if (totalspace > 0) { if (totalspace > BKVASIZE * BD_WAKE_SIZE) totalspace = BKVASIZE * BD_WAKE_SIZE; - spin_lock_wr(&needsbuffer_spin); + spin_lock_wr(&bufcspin); while (totalspace > 0) { i = bd_wake_index++; i &= BD_WAKE_MASK; if (bd_wake_ary[i]) { bd_wake_ary[i] = 0; - spin_unlock_wr(&needsbuffer_spin); + spin_unlock_wr(&bufcspin); wakeup(&bd_wake_ary[i]); - spin_lock_wr(&needsbuffer_spin); + spin_lock_wr(&bufcspin); } totalspace -= BKVASIZE; } - spin_unlock_wr(&needsbuffer_spin); + spin_unlock_wr(&bufcspin); } } @@ -595,8 +620,6 @@ bufinit(void) vm_offset_t bogus_offset; int i; - spin_init(&needsbuffer_spin); - /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); @@ -777,7 +800,6 @@ bfreekva(struct buf *bp) int count; if (bp->b_kvasize) { - get_mplock(); ++buffreekvacnt; count = vm_map_entry_reserve(MAP_RESERVE_COUNT); vm_map_lock(&buffer_map); @@ -792,7 +814,6 @@ bfreekva(struct buf *bp) bp->b_kvasize 
= 0; bp->b_kvabase = NULL; bufspacewakeup(); - rel_mplock(); } } @@ -818,9 +839,9 @@ _bremfree(struct buf *bp) void bremfree(struct buf *bp) { - spin_lock_wr(&bufspin); + spin_lock_wr(&bufqspin); _bremfree(bp); - spin_unlock_wr(&bufspin); + spin_unlock_wr(&bufqspin); } static void @@ -849,14 +870,12 @@ bread(struct vnode *vp, off_t loffset, int size, struct buf **bpp) /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { - get_mplock(); bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); bp->b_cmd = BUF_CMD_READ; bp->b_bio1.bio_done = biodone_sync; bp->b_bio1.bio_flags |= BIO_SYNC; vfs_busy_pages(vp, bp); vn_strategy(vp, &bp->b_bio1); - rel_mplock(); return (biowait(&bp->b_bio1, "biord")); } return (0); @@ -884,7 +903,6 @@ breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset, /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { - get_mplock(); bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); bp->b_cmd = BUF_CMD_READ; bp->b_bio1.bio_done = biodone_sync; @@ -892,7 +910,6 @@ breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset, vfs_busy_pages(vp, bp); vn_strategy(vp, &bp->b_bio1); ++readwait; - rel_mplock(); } for (i = 0; i < cnt; i++, raoffset++, rabsize++) { @@ -901,13 +918,11 @@ breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset, rabp = getblk(vp, *raoffset, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { - get_mplock(); rabp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); rabp->b_cmd = BUF_CMD_READ; vfs_busy_pages(vp, rabp); BUF_KERNPROC(rabp); vn_strategy(vp, &rabp->b_bio1); - rel_mplock(); } else { brelse(rabp); } @@ -958,15 +973,11 @@ bwrite(struct buf *bp) * Normal bwrites pipeline writes. NOTE: b_bufsize is only * valid for vnode-backed buffers. */ - bp->b_runningbufspace = bp->b_bufsize; - if (bp->b_runningbufspace) { - runningbufspace += bp->b_runningbufspace; - ++runningbufcount; - } - + bsetrunningbufspace(bp, bp->b_bufsize); vn_strategy(bp->b_vp, &bp->b_bio1); error = biowait(&bp->b_bio1, "biows"); brelse(bp); + return (error); } @@ -1002,12 +1013,7 @@ bawrite(struct buf *bp) * Normal bwrites pipeline writes. NOTE: b_bufsize is only * valid for vnode-backed buffers. 
*/ - bp->b_runningbufspace = bp->b_bufsize; - if (bp->b_runningbufspace) { - runningbufspace += bp->b_runningbufspace; - ++runningbufcount; - } - + bsetrunningbufspace(bp, bp->b_bufsize); BUF_KERNPROC(bp); vn_strategy(bp->b_vp, &bp->b_bio1); } @@ -1159,12 +1165,16 @@ bdirty(struct buf *bp) if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DELWRI; reassignbuf(bp); - atomic_add_int(&dirtybufcount, 1); + + spin_lock_wr(&bufcspin); + ++dirtybufcount; dirtybufspace += bp->b_bufsize; if (bp->b_flags & B_HEAVY) { - atomic_add_int(&dirtybufcounthw, 1); - atomic_add_int(&dirtybufspacehw, bp->b_bufsize); + ++dirtybufcounthw; + dirtybufspacehw += bp->b_bufsize; } + spin_unlock_wr(&bufcspin); + bd_heatup(); } } @@ -1180,8 +1190,10 @@ bheavy(struct buf *bp) if ((bp->b_flags & B_HEAVY) == 0) { bp->b_flags |= B_HEAVY; if (bp->b_flags & B_DELWRI) { - atomic_add_int(&dirtybufcounthw, 1); - atomic_add_int(&dirtybufspacehw, bp->b_bufsize); + spin_lock_wr(&bufcspin); + ++dirtybufcounthw; + dirtybufspacehw += bp->b_bufsize; + spin_unlock_wr(&bufcspin); } } } @@ -1205,12 +1217,16 @@ bundirty(struct buf *bp) if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); - atomic_subtract_int(&dirtybufcount, 1); - atomic_subtract_int(&dirtybufspace, bp->b_bufsize); + + spin_lock_wr(&bufcspin); + --dirtybufcount; + dirtybufspace -= bp->b_bufsize; if (bp->b_flags & B_HEAVY) { - atomic_subtract_int(&dirtybufcounthw, 1); - atomic_subtract_int(&dirtybufspacehw, bp->b_bufsize); + --dirtybufcounthw; + dirtybufspacehw -= bp->b_bufsize; } + spin_unlock_wr(&bufcspin); + bd_signal(bp->b_bufsize); } /* @@ -1219,6 +1235,22 @@ bundirty(struct buf *bp) bp->b_flags &= ~B_DEFERRED; } +/* + * Set the b_runningbufspace field, used to track how much I/O is + * in progress at any given moment. + */ +void +bsetrunningbufspace(struct buf *bp, int bytes) +{ + bp->b_runningbufspace = bytes; + if (bytes) { + spin_lock_wr(&bufcspin); + runningbufspace += bytes; + ++runningbufcount; + spin_unlock_wr(&bufcspin); + } +} + /* * brelse: * @@ -1268,18 +1300,18 @@ brelse(struct buf *bp) * buffer cannot be immediately freed. 
*/ bp->b_flags |= B_INVAL; - if (LIST_FIRST(&bp->b_dep) != NULL) { - get_mplock(); + if (LIST_FIRST(&bp->b_dep) != NULL) buf_deallocate(bp); - rel_mplock(); - } if (bp->b_flags & B_DELWRI) { - atomic_subtract_int(&dirtybufcount, 1); - atomic_subtract_int(&dirtybufspace, bp->b_bufsize); + spin_lock_wr(&bufcspin); + --dirtybufcount; + dirtybufspace -= bp->b_bufsize; if (bp->b_flags & B_HEAVY) { - atomic_subtract_int(&dirtybufcounthw, 1); - atomic_subtract_int(&dirtybufspacehw, bp->b_bufsize); + --dirtybufcounthw; + dirtybufspacehw -= bp->b_bufsize; } + spin_unlock_wr(&bufcspin); + bd_signal(bp->b_bufsize); } bp->b_flags &= ~(B_DELWRI | B_CACHE); @@ -1306,11 +1338,8 @@ brelse(struct buf *bp) if (bp->b_flags & (B_DELWRI | B_LOCKED)) { bp->b_flags &= ~B_RELBUF; } else if (vm_page_count_severe()) { - if (LIST_FIRST(&bp->b_dep) != NULL) { - get_mplock(); + if (LIST_FIRST(&bp->b_dep) != NULL) buf_deallocate(bp); /* can set B_LOCKED */ - rel_mplock(); - } if (bp->b_flags & (B_DELWRI | B_LOCKED)) bp->b_flags &= ~B_RELBUF; else @@ -1373,7 +1402,7 @@ brelse(struct buf *bp) resid = bp->b_bufsize; foff = bp->b_loffset; - get_mplock(); + lwkt_gettoken(&vm_token); for (i = 0; i < bp->b_xio.xio_npages; i++) { m = bp->b_xio.xio_pages[i]; vm_page_flag_clear(m, PG_ZERO); @@ -1462,19 +1491,17 @@ brelse(struct buf *bp) } if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); - rel_mplock(); + lwkt_reltoken(&vm_token); } else { /* * Rundown for non-VMIO buffers. */ if (bp->b_flags & (B_INVAL | B_RELBUF)) { - get_mplock(); if (bp->b_bufsize) allocbuf(bp, 0); KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); if (bp->b_vp) brelvp(bp); - rel_mplock(); } } @@ -1493,7 +1520,7 @@ brelse(struct buf *bp) * Buffers placed in the EMPTY or EMPTYKVA had better already be * disassociated from their vnode. */ - spin_lock_wr(&bufspin); + spin_lock_wr(&bufqspin); if (bp->b_flags & B_LOCKED) { /* * Buffers that are locked are placed in the locked queue @@ -1552,7 +1579,7 @@ brelse(struct buf *bp) break; } } - spin_unlock_wr(&bufspin); + spin_unlock_wr(&bufqspin); /* * If B_INVAL, clear B_DELWRI. We've already placed the buffer @@ -1614,7 +1641,7 @@ bqrelse(struct buf *bp) buf_act_advance(bp); - spin_lock_wr(&bufspin); + spin_lock_wr(&bufqspin); if (bp->b_flags & B_LOCKED) { /* * Locked buffers are released to the locked queue. However, @@ -1634,14 +1661,14 @@ bqrelse(struct buf *bp) * buffer (most importantly: the wired pages making up its * backing store) *now*. 
*/ - spin_unlock_wr(&bufspin); + spin_unlock_wr(&bufqspin); brelse(bp); return; } else { bp->b_qindex = BQUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); } - spin_unlock_wr(&bufspin); + spin_unlock_wr(&bufqspin); if ((bp->b_flags & B_LOCKED) == 0 && ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0)) { @@ -1683,7 +1710,6 @@ vfs_vmio_release(struct buf *bp) vm_page_t m; lwkt_gettoken(&vm_token); - crit_enter(); for (i = 0; i < bp->b_xio.xio_npages; i++) { m = bp->b_xio.xio_pages[i]; bp->b_xio.xio_pages[i] = NULL; @@ -1745,9 +1771,10 @@ vfs_vmio_release(struct buf *bp) } } } - crit_exit(); lwkt_reltoken(&vm_token); - pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages); + + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), + bp->b_xio.xio_npages); if (bp->b_bufsize) { bufspacewakeup(); bp->b_bufsize = 0; @@ -1755,11 +1782,8 @@ vfs_vmio_release(struct buf *bp) bp->b_xio.xio_npages = 0; bp->b_flags &= ~B_VMIO; KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); - if (bp->b_vp) { - get_mplock(); + if (bp->b_vp) brelvp(bp); - rel_mplock(); - } } /* @@ -1908,7 +1932,7 @@ restart: * where we cannot backup. */ nqindex = BQUEUE_EMPTYKVA; - spin_lock_wr(&bufspin); + spin_lock_wr(&bufqspin); nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA]); if (nbp == NULL) { @@ -1940,7 +1964,7 @@ restart: * Run scan, possibly freeing data and/or kva mappings on the fly * depending. * - * WARNING! bufspin is held! + * WARNING! bufqspin is held! */ while ((bp = nbp) != NULL) { int qindex = nqindex; @@ -1992,8 +2016,16 @@ restart: * Note: we no longer distinguish between VMIO and non-VMIO * buffers. */ + KASSERT((bp->b_flags & B_DELWRI) == 0, + ("delwri buffer %p found in queue %d", bp, qindex)); - KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); + /* + * Do not try to reuse a buffer with a non-zero b_refs. + * This is an unsynchronized test. A synchronized test + * is also performed after we lock the buffer. + */ + if (bp->b_refs) + continue; /* * If we are defragging then we need a buffer with @@ -2015,18 +2047,21 @@ restart: */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { - spin_unlock_wr(&bufspin); + spin_unlock_wr(&bufqspin); tsleep(&bd_request, 0, "gnbxxx", hz / 100); goto restart; } if (bp->b_qindex != qindex) { - spin_unlock_wr(&bufspin); - kprintf("getnewbuf: warning, BUF_LOCK blocked unexpectedly on buf %p index %d->%d, race corrected\n", bp, qindex, bp->b_qindex); + spin_unlock_wr(&bufqspin); + kprintf("getnewbuf: warning, BUF_LOCK blocked " + "unexpectedly on buf %p index %d->%d, " + "race corrected\n", + bp, qindex, bp->b_qindex); BUF_UNLOCK(bp); goto restart; } bremfree_locked(bp); - spin_unlock_wr(&bufspin); + spin_unlock_wr(&bufqspin); /* * Dependancies must be handled before we disassociate the @@ -2036,12 +2071,10 @@ restart: * be immediately disassociated. HAMMER then becomes * responsible for releasing the buffer. * - * NOTE: bufspin is UNLOCKED now. + * NOTE: bufqspin is UNLOCKED now. */ if (LIST_FIRST(&bp->b_dep) != NULL) { - get_mplock(); buf_deallocate(bp); - rel_mplock(); if (bp->b_flags & B_LOCKED) { bqrelse(bp); goto restart; @@ -2050,15 +2083,10 @@ restart: } if (qindex == BQUEUE_CLEAN) { - get_mplock(); - if (bp->b_flags & B_VMIO) { - get_mplock(); + if (bp->b_flags & B_VMIO) vfs_vmio_release(bp); - rel_mplock(); - } if (bp->b_vp) brelvp(bp); - rel_mplock(); } /* @@ -2077,11 +2105,8 @@ restart: * scrapping a buffer's contents because it is already * wired. 
*/ - if (bp->b_bufsize) { - get_mplock(); + if (bp->b_bufsize) allocbuf(bp, 0); - rel_mplock(); - } bp->b_flags = B_BNOCLIP; bp->b_cmd = BUF_CMD_DONE; @@ -2124,8 +2149,23 @@ restart: } if (bufspace < lobufspace) flushingbufs = 0; + + /* + * The brelvp() above interlocked the buffer, test b_refs + * to determine if the buffer can be reused. b_refs + * interlocks lookup/blocking-lock operations and allowing + * buffer reuse can create deadlocks depending on what + * (vp,loffset) is assigned to the reused buffer (see getblk). + */ + if (bp->b_refs) { + bp->b_flags |= B_INVAL; + bfreekva(bp); + brelse(bp); + goto restart; + } + break; - /* NOT REACHED, bufspin not held */ + /* NOT REACHED, bufqspin not held */ } /* @@ -2134,13 +2174,13 @@ restart: * * Generally we are sleeping due to insufficient buffer space. * - * NOTE: bufspin is held if bp is NULL, else it is not held. + * NOTE: bufqspin is held if bp is NULL, else it is not held. */ if (bp == NULL) { int flags; char *waitmsg; - spin_unlock_wr(&bufspin); + spin_unlock_wr(&bufqspin); if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; waitmsg = "nbufkv"; @@ -2152,12 +2192,17 @@ restart: flags = VFS_BIO_NEED_ANY; } - needsbuffer |= flags; bd_speedup(); /* heeeelp */ + spin_lock_wr(&bufcspin); + needsbuffer |= flags; while (needsbuffer & flags) { - if (tsleep(&needsbuffer, slpflags, waitmsg, slptimeo)) + if (ssleep(&needsbuffer, &bufcspin, + slpflags, waitmsg, slptimeo)) { + spin_unlock_wr(&bufcspin); return (NULL); + } } + spin_unlock_wr(&bufcspin); } else { /* * We finally have a valid bp. We aren't quite out of the @@ -2165,7 +2210,7 @@ restart: * to keep fragmentation sane we only allocate kva in * BKVASIZE chunks. * - * (bufspin is not held) + * (bufqspin is not held) */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; @@ -2175,7 +2220,6 @@ restart: bfreekva(bp); - get_mplock(); count = vm_map_entry_reserve(MAP_RESERVE_COUNT); vm_map_lock(&buffer_map); @@ -2191,7 +2235,6 @@ restart: ++bufdefragcnt; defrag = 1; bp->b_flags |= B_INVAL; - rel_mplock(); brelse(bp); goto restart; } @@ -2210,7 +2253,6 @@ restart: } vm_map_unlock(&buffer_map); vm_map_entry_release(count); - rel_mplock(); } bp->b_data = bp->b_kvabase; } @@ -2221,6 +2263,8 @@ restart: * This routine is called in an emergency to recover VM pages from the * buffer cache by cashing in clean buffers. The idea is to recover * enough pages to be able to satisfy a stuck bio_page_alloc(). 
+ * + * MPSAFE */ static int recoverbufpages(void) @@ -2230,7 +2274,7 @@ recoverbufpages(void) ++recoverbufcalls; - spin_lock_wr(&bufspin); + spin_lock_wr(&bufqspin); while (bytes < MAXBSIZE) { bp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]); if (bp == NULL) @@ -2262,17 +2306,22 @@ recoverbufpages(void) */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { - kprintf("recoverbufpages: warning, locked buf %p, race corrected\n", bp); - tsleep(&bd_request, 0, "gnbxxx", hz / 100); + kprintf("recoverbufpages: warning, locked buf %p, " + "race corrected\n", + bp); + ssleep(&bd_request, &bufqspin, 0, "gnbxxx", hz / 100); continue; } if (bp->b_qindex != BQUEUE_CLEAN) { - kprintf("recoverbufpages: warning, BUF_LOCK blocked unexpectedly on buf %p index %d, race corrected\n", bp, bp->b_qindex); + kprintf("recoverbufpages: warning, BUF_LOCK blocked " + "unexpectedly on buf %p index %d, race " + "corrected\n", + bp, bp->b_qindex); BUF_UNLOCK(bp); continue; } bremfree_locked(bp); - spin_unlock_wr(&bufspin); + spin_unlock_wr(&bufqspin); /* * Dependancies must be handled before we disassociate the @@ -2286,7 +2335,7 @@ recoverbufpages(void) buf_deallocate(bp); if (bp->b_flags & B_LOCKED) { bqrelse(bp); - spin_lock_wr(&bufspin); + spin_lock_wr(&bufqspin); continue; } KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); @@ -2294,7 +2343,6 @@ recoverbufpages(void) bytes += bp->b_bufsize; - get_mplock(); if (bp->b_flags & B_VMIO) { bp->b_flags |= B_DIRECT; /* try to free pages */ vfs_vmio_release(bp); @@ -2312,7 +2360,6 @@ recoverbufpages(void) */ if (bp->b_bufsize) allocbuf(bp, 0); - rel_mplock(); bp->b_flags = B_BNOCLIP; bp->b_cmd = BUF_CMD_DONE; @@ -2328,9 +2375,9 @@ recoverbufpages(void) bp->b_flags |= B_INVAL; /* bfreekva(bp); */ brelse(bp); - spin_lock_wr(&bufspin); + spin_lock_wr(&bufqspin); } - spin_unlock_wr(&bufspin); + spin_unlock_wr(&bufqspin); return(bytes); } @@ -2374,11 +2421,11 @@ buf_daemon(void) bufdaemon_td, SHUTDOWN_PRI_LAST); curthread->td_flags |= TDF_SYSTHREAD; + rel_mplock(); + /* * This process is allowed to take the buffer cache to the limit */ - crit_enter(); - for (;;) { kproc_suspend_loop(); @@ -2412,13 +2459,11 @@ buf_daemon(void) * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ - spin_lock_wr(&needsbuffer_spin); - if (bd_request == 0) { - ssleep(&bd_request, &needsbuffer_spin, 0, - "psleep", hz); - } + spin_lock_wr(&bufcspin); + if (bd_request == 0) + ssleep(&bd_request, &bufcspin, 0, "psleep", hz); bd_request = 0; - spin_unlock_wr(&needsbuffer_spin); + spin_unlock_wr(&bufcspin); } } @@ -2434,11 +2479,11 @@ buf_daemon_hw(void) bufdaemonhw_td, SHUTDOWN_PRI_LAST); curthread->td_flags |= TDF_SYSTHREAD; + rel_mplock(); + /* * This process is allowed to take the buffer cache to the limit */ - crit_enter(); - for (;;) { kproc_suspend_loop(); @@ -2472,13 +2517,11 @@ buf_daemon_hw(void) * request and sleep until we are needed again. * The sleep is just so the suspend code works. 
*/ - spin_lock_wr(&needsbuffer_spin); - if (bd_request_hw == 0) { - ssleep(&bd_request_hw, &needsbuffer_spin, 0, - "psleep", hz); - } + spin_lock_wr(&bufcspin); + if (bd_request_hw == 0) + ssleep(&bd_request_hw, &bufcspin, 0, "psleep", hz); bd_request_hw = 0; - spin_unlock_wr(&needsbuffer_spin); + spin_unlock_wr(&bufcspin); } } @@ -2500,7 +2543,7 @@ flushbufqueues(bufq_type_t q) int r = 0; int spun; - spin_lock_wr(&bufspin); + spin_lock_wr(&bufqspin); spun = 1; bp = TAILQ_FIRST(&bufqueues[q]); @@ -2510,11 +2553,13 @@ flushbufqueues(bufq_type_t q) if (bp->b_flags & B_DELWRI) { if (bp->b_flags & B_INVAL) { - spin_unlock_wr(&bufspin); + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { + bp = TAILQ_NEXT(bp, b_freelist); + continue; + } + _bremfree(bp); + spin_unlock_wr(&bufqspin); spun = 0; - if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) - panic("flushbufqueues: locked buf"); - bremfree(bp); brelse(bp); ++r; break; @@ -2543,7 +2588,7 @@ flushbufqueues(bufq_type_t q) * NOTE: buf_checkwrite is MPSAFE. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { - spin_unlock_wr(&bufspin); + spin_unlock_wr(&bufqspin); spun = 0; if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) { @@ -2564,7 +2609,7 @@ flushbufqueues(bufq_type_t q) bp = TAILQ_NEXT(bp, b_freelist); } if (spun) - spin_unlock_wr(&bufspin); + spin_unlock_wr(&bufqspin); return (r); } @@ -2598,7 +2643,9 @@ inmem(struct vnode *vp, off_t loffset) size = vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { + lwkt_gettoken(&vm_token); m = vm_page_lookup(obj, OFF_TO_IDX(loffset + toff)); + lwkt_reltoken(&vm_token); if (m == NULL) return 0; tinc = size; @@ -2629,6 +2676,11 @@ inmem(struct vnode *vp, off_t loffset) * to acquire the lock we return NULL, even if the * buffer exists. * + * FINDBLK_REF - Returns the buffer ref'd, which prevents reuse + * by getnewbuf() but does not prevent disassociation + * while we are locked. Used to avoid deadlocks + * against random (vp,loffset)s due to reassignment. + * * (0) - Lock the buffer blocking. * * MPSAFE @@ -2644,22 +2696,59 @@ findblk(struct vnode *vp, off_t loffset, int flags) lkflags |= LK_NOWAIT; for (;;) { + /* + * Lookup. Ref the buf while holding v_token to prevent + * reuse (but does not prevent diassociation). + */ lwkt_gettoken(&vp->v_token); bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset); + if (bp == NULL) { + lwkt_reltoken(&vp->v_token); + return(NULL); + } + atomic_add_int(&bp->b_refs, 1); lwkt_reltoken(&vp->v_token); - if (bp == NULL || (flags & FINDBLK_TEST)) + + /* + * If testing only break and return bp, do not lock. + */ + if (flags & FINDBLK_TEST) break; + + /* + * Lock the buffer, return an error if the lock fails. + * (only FINDBLK_NBLOCK can cause the lock to fail). + */ if (BUF_LOCK(bp, lkflags)) { - bp = NULL; - break; + atomic_subtract_int(&bp->b_refs, 1); + /* bp = NULL; not needed */ + return(NULL); } + + /* + * Revalidate the locked buf before allowing it to be + * returned. 
+ */ if (bp->b_vp == vp && bp->b_loffset == loffset) break; + atomic_subtract_int(&bp->b_refs, 1); BUF_UNLOCK(bp); } + + /* + * Success + */ + if ((flags & FINDBLK_REF) == 0) + atomic_subtract_int(&bp->b_refs, 1); return(bp); } +void +unrefblk(struct buf *bp) +{ + atomic_subtract_int(&bp->b_refs, 1); +} + /* * getcacheblk: * @@ -2751,27 +2840,37 @@ getblk(struct vnode *vp, off_t loffset, int size, int blkflags, int slptimeo) panic("getblk: vnode %p has no object!", vp); loop: - if ((bp = findblk(vp, loffset, FINDBLK_TEST)) != NULL) { + if ((bp = findblk(vp, loffset, FINDBLK_REF | FINDBLK_TEST)) != NULL) { /* * The buffer was found in the cache, but we need to lock it. - * Even with LK_NOWAIT the lockmgr may break our critical - * section, so double-check the validity of the buffer - * once the lock has been obtained. + * We must acquire a ref on the bp to prevent reuse, but + * this will not prevent disassociation (brelvp()) so we + * must recheck (vp,loffset) after acquiring the lock. + * + * Without the ref the buffer could potentially be reused + * before we acquire the lock and create a deadlock + * situation between the thread trying to reuse the buffer + * and us due to the fact that we would wind up blocking + * on a random (vp,loffset). */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { - if (blkflags & GETBLK_NOWAIT) + if (blkflags & GETBLK_NOWAIT) { + unrefblk(bp); return(NULL); + } lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL; if (blkflags & GETBLK_PCATCH) lkflags |= LK_PCATCH; error = BUF_TIMELOCK(bp, lkflags, "getblk", slptimeo); if (error) { + unrefblk(bp); if (error == ENOLCK) goto loop; return (NULL); } /* buffer may have changed on us */ } + unrefblk(bp); /* * Once the buffer has been locked, make sure we didn't race @@ -2839,7 +2938,6 @@ loop: * definitely do not want to B_NOCACHE the backing store. */ if (size != bp->b_bcount) { - get_mplock(); if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_RELBUF; bwrite(bp); @@ -2850,7 +2948,6 @@ loop: bp->b_flags |= B_RELBUF; brelse(bp); } - rel_mplock(); goto loop; } KKASSERT(size <= bp->b_kvasize); @@ -2891,13 +2988,11 @@ loop: */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { - get_mplock(); kprintf("getblk: Warning, bp %p loff=%jx DELWRI set " "and CACHE clear, b_flags %08x\n", bp, (intmax_t)bp->b_loffset, bp->b_flags); bp->b_flags |= B_NOCACHE; bwrite(bp); - rel_mplock(); goto loop; } } else { @@ -2966,9 +3061,7 @@ loop: bp->b_flags |= B_VMIO; KKASSERT(bp->b_cmd == BUF_CMD_DONE); - get_mplock(); allocbuf(bp, size); - rel_mplock(); } KKASSERT(dsched_is_clear_buf_priv(bp)); return (bp); @@ -3015,9 +3108,7 @@ geteblk(int size) while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) ; - get_mplock(); allocbuf(bp, size); - rel_mplock(); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ KKASSERT(dsched_is_clear_buf_priv(bp)); return (bp); @@ -3035,7 +3126,8 @@ geteblk(int size) * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by - * the caller. Calling this code willy nilly can result in the loss of data. + * the caller. Calling this code willy nilly can result in the loss of + * data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. @@ -3043,7 +3135,7 @@ geteblk(int size) * This routine does not need to be called from a critical section but you * must own the buffer. 
* - * NOTMPSAFE + * MPSAFE */ int allocbuf(struct buf *bp, int size) @@ -3080,7 +3172,7 @@ allocbuf(struct buf *bp, int size) } else { kfree(bp->b_data, M_BIOBUF); if (bp->b_bufsize) { - bufmallocspace -= bp->b_bufsize; + atomic_subtract_int(&bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } @@ -3108,7 +3200,7 @@ allocbuf(struct buf *bp, int size) bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; - bufmallocspace += mbsize; + atomic_add_int(&bufmallocspace, mbsize); return 1; } origbuf = NULL; @@ -3123,7 +3215,8 @@ allocbuf(struct buf *bp, int size) origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; if (bp->b_bufsize) { - bufmallocspace -= bp->b_bufsize; + atomic_subtract_int(&bufmallocspace, + bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } @@ -3207,7 +3300,7 @@ allocbuf(struct buf *bp, int size) vp = bp->b_vp; obj = vp->v_object; - crit_enter(); + lwkt_gettoken(&vm_token); while (bp->b_xio.xio_npages < desiredpages) { vm_page_t m; vm_pindex_t pi; @@ -3253,7 +3346,7 @@ allocbuf(struct buf *bp, int size) if (bp->b_act_count < m->act_count) bp->b_act_count = m->act_count; } - crit_exit(); + lwkt_reltoken(&vm_token); /* * Step 2. We've loaded the pages into the buffer, @@ -3313,9 +3406,11 @@ allocbuf(struct buf *bp, int size) /* adjust space use on already-dirty buffer */ if (bp->b_flags & B_DELWRI) { + spin_lock_wr(&bufcspin); dirtybufspace += newbsize - bp->b_bufsize; if (bp->b_flags & B_HEAVY) dirtybufspacehw += newbsize - bp->b_bufsize; + spin_unlock_wr(&bufcspin); } if (newbsize < bp->b_bufsize) bufspacewakeup(); @@ -3640,8 +3735,7 @@ bpdone(struct buf *bp, int elseit) bp->b_flags |= B_CACHE; } - crit_enter(); - get_mplock(); + lwkt_gettoken(&vm_token); for (i = 0; i < bp->b_xio.xio_npages; i++) { int bogusflag = 0; int resid; @@ -3718,8 +3812,7 @@ bpdone(struct buf *bp, int elseit) bp->b_flags &= ~B_HASBOGUS; if (obj) vm_object_pip_wakeupn(obj, 0); - rel_mplock(); - crit_exit(); + lwkt_reltoken(&vm_token); } /* @@ -3823,6 +3916,8 @@ vfs_unbusy_pages(struct buf *bp) int i; runningbufwakeup(bp); + + lwkt_gettoken(&vm_token); if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj; @@ -3854,6 +3949,7 @@ vfs_unbusy_pages(struct buf *bp) bp->b_flags &= ~B_HASBOGUS; vm_object_pip_wakeupn(obj, 0); } + lwkt_reltoken(&vm_token); } /* @@ -3869,6 +3965,8 @@ vfs_unbusy_pages(struct buf *bp) * Since I/O has not been initiated yet, certain buffer flags * such as B_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. + * + * MPSAFE */ void vfs_busy_pages(struct vnode *vp, struct buf *bp) @@ -3887,6 +3985,8 @@ vfs_busy_pages(struct vnode *vp, struct buf *bp) if (bp->b_flags & B_VMIO) { vm_object_t obj; + lwkt_gettoken(&vm_token); + obj = vp->v_object; KASSERT(bp->b_loffset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); @@ -4002,6 +4102,7 @@ retry: pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_xio.xio_pages, bp->b_xio.xio_npages); } + lwkt_reltoken(&vm_token); } /* @@ -4261,6 +4362,7 @@ vfs_bio_clrbuf(struct buf *bp) * If a page cannot be allocated, the 'pagedaemon' is woken up to * retrieve the full range (to - from) of pages. * + * MPSAFE */ void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) @@ -4307,6 +4409,8 @@ vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) * The pageout daemon can run (putpages -> VOP_WRITE -> getblk -> allocbuf). 
* If the buffer cache's vm_page_alloc() fails a vm_wait() can deadlock * against the pageout daemon if pages are not freed from other sources. + * + * MPSAFE */ static vm_page_t @@ -4317,9 +4421,12 @@ bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit) /* * Try a normal allocation, allow use of system reserve. */ + lwkt_gettoken(&vm_token); p = vm_page_alloc(obj, pg, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM); - if (p) + if (p) { + lwkt_reltoken(&vm_token); return(p); + } /* * The normal allocation failed and we clearly have a page @@ -4333,8 +4440,10 @@ bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit) * We may have blocked, the caller will know what to do if the * page now exists. */ - if (vm_page_lookup(obj, pg)) + if (vm_page_lookup(obj, pg)) { + lwkt_reltoken(&vm_token); return(NULL); + } /* * Allocate and allow use of the interrupt reserve. @@ -4356,6 +4465,7 @@ bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit) "allocation failed\n"); vm_wait(hz * 5); } + lwkt_reltoken(&vm_token); return(p); } @@ -4366,6 +4476,8 @@ bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit) * * The range of pages underlying the buffer's address space will * be unmapped and un-wired. + * + * MPSAFE */ void vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) @@ -4682,12 +4794,14 @@ vfs_bufstats(void) count = 0; for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++) counts[j] = 0; - crit_enter(); + + spin_lock_wr(&bufqspin); TAILQ_FOREACH(bp, dp, b_freelist) { counts[bp->b_bufsize/PAGE_SIZE]++; count++; } - crit_exit(); + spin_unlock_wr(&bufqspin); + kprintf("%s: total-%d", bname[i], count); for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++) if (counts[j] != 0) diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 7eff091e2b..4c6dc3e4d1 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -1015,11 +1015,7 @@ cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes) bp->b_cmd = BUF_CMD_WRITE; vfs_busy_pages(vp, bp); - bp->b_runningbufspace = bp->b_bufsize; - if (bp->b_runningbufspace) { - runningbufspace += bp->b_runningbufspace; - ++runningbufcount; - } + bsetrunningbufspace(bp, bp->b_bufsize); BUF_KERNPROC(bp); vn_strategy(vp, &bp->b_bio1); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 635355bbac..6694c92025 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -657,7 +657,7 @@ vfsync(struct vnode *vp, int waitfor, int passes, if (error == 0) vp->v_lazyw = 0; else if (!RB_EMPTY(&vp->v_rbdirty_tree)) - vn_syncer_add_to_worklist(vp, 1); + vn_syncer_add(vp, 1); error = 0; break; case MNT_NOWAIT: @@ -887,6 +887,8 @@ bgetvp(struct vnode *vp, struct buf *bp, int testsize) /* * Disassociate a buffer from a vnode. 
+ * + * MPSAFE */ void brelvp(struct buf *bp) @@ -911,11 +913,10 @@ brelvp(struct buf *bp) buf_rb_hash_RB_REMOVE(&vp->v_rbhash_tree, bp); bp->b_flags &= ~B_HASHED; } - if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) { - vclrflags(vp, VONWORKLST); - LIST_REMOVE(vp, v_synclist); - } + if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) + vn_syncer_remove(vp); bp->b_vp = NULL; + lwkt_reltoken(&vp->v_token); vdrop(vp); @@ -975,7 +976,7 @@ reassignbuf(struct buf *bp) default: delay = filedelay; } - vn_syncer_add_to_worklist(vp, delay); + vn_syncer_add(vp, delay); } } else { /* @@ -995,8 +996,7 @@ reassignbuf(struct buf *bp) } if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) { - vclrflags(vp, VONWORKLST); - LIST_REMOVE(vp, v_synclist); + vn_syncer_remove(vp); } } lwkt_reltoken(&vp->v_token); diff --git a/sys/kern/vfs_sync.c b/sys/kern/vfs_sync.c index 1f02e390d8..c094e9ace4 100644 --- a/sys/kern/vfs_sync.c +++ b/sys/kern/vfs_sync.c @@ -152,10 +152,14 @@ vfs_sync_init(void) /* * Add an item to the syncer work queue. * + * WARNING: Cannot get vp->v_token here if not already held, we must + * depend on the syncer_token (which might already be held by + * the caller) to protect v_synclist and VONWORKLST. + * * MPSAFE */ void -vn_syncer_add_to_worklist(struct vnode *vp, int delay) +vn_syncer_add(struct vnode *vp, int delay) { int slot; @@ -173,6 +177,25 @@ vn_syncer_add_to_worklist(struct vnode *vp, int delay) lwkt_reltoken(&syncer_token); } +/* + * Removes the vnode from the syncer list. Since we might block while + * acquiring the syncer_token we have to recheck conditions. + * + * vp->v_token held on call + */ +void +vn_syncer_remove(struct vnode *vp) +{ + lwkt_gettoken(&syncer_token); + + if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) { + vclrflags(vp, VONWORKLST); + LIST_REMOVE(vp, v_synclist); + } + + lwkt_reltoken(&syncer_token); +} + struct thread *updatethread; static void sched_sync (void); static struct kproc_desc up_kp = { @@ -218,29 +241,28 @@ sched_sync(void) } /* - * If the vnode is still at the head of the list - * we were not able to completely flush it. To - * give other vnodes a fair shake we move it to - * a later slot. + * vp is stale but can still be used if we can + * verify that it remains at the head of the list. + * Be careful not to try to get vp->v_token as + * vp can become stale if this blocks. + * + * If the vp is still at the head of the list were + * unable to completely flush it and move it to + * a later slot to give other vnodes a fair shot. * * Note that v_tag VT_VFS vnodes can remain on the * worklist with no dirty blocks, but sync_fsync() * moves it to a later slot so we will never see it * here. + * + * It is possible to race a vnode with no dirty + * buffers being removed from the list. If this + * occurs we will move the vnode in the synclist + * and then the other thread will remove it. Do + * not try to remove it here. */ - if (LIST_FIRST(slp) == vp) { - lwkt_gettoken(&vp->v_token); - if (LIST_FIRST(slp) == vp) { - if (RB_EMPTY(&vp->v_rbdirty_tree) && - !vn_isdisk(vp, NULL)) { - panic("sched_sync: fsync " - "failed vp %p tag %d", - vp, vp->v_tag); - } - vn_syncer_add_to_worklist(vp, syncdelay); - } - lwkt_reltoken(&vp->v_token); - } + if (LIST_FIRST(slp) == vp) + vn_syncer_add(vp, syncdelay); } lwkt_reltoken(&syncer_token); @@ -359,7 +381,7 @@ vfs_allocate_syncvnode(struct mount *mp) } next = start; } - vn_syncer_add_to_worklist(vp, syncdelay > 0 ? 
next % syncdelay : 0); + vn_syncer_add(vp, syncdelay > 0 ? next % syncdelay : 0); /* * The mnt_syncer field inherits the vnode reference, which is @@ -397,7 +419,7 @@ sync_fsync(struct vop_fsync_args *ap) /* * Move ourselves to the back of the sync list. */ - vn_syncer_add_to_worklist(syncvp, syncdelay); + vn_syncer_add(syncvp, syncdelay); /* * Walk the list of vnodes pushing all that are dirty and diff --git a/sys/platform/pc32/isa/clock.c b/sys/platform/pc32/isa/clock.c index ea0cee2a4b..697fcc8da5 100644 --- a/sys/platform/pc32/isa/clock.c +++ b/sys/platform/pc32/isa/clock.c @@ -450,20 +450,26 @@ DODELAY(int n, int doswitch) #endif } +/* + * DELAY() never switches + */ void DELAY(int n) { DODELAY(n, 0); } +/* + * DRIVERSLEEP() does not switch if called with a spinlock held or + * from a hard interrupt. + */ void DRIVERSLEEP(int usec) { globaldata_t gd = mycpu; if (gd->gd_intr_nesting_level || - gd->gd_spinlock_rd || - gd->gd_spinlocks_wr) { + gd->gd_spinlock_rd != NULL || gd->gd_spinlocks_wr) { DODELAY(usec, 0); } else { DODELAY(usec, 1); diff --git a/sys/platform/pc64/isa/clock.c b/sys/platform/pc64/isa/clock.c index 41941d3591..6b78a7d9b6 100644 --- a/sys/platform/pc64/isa/clock.c +++ b/sys/platform/pc64/isa/clock.c @@ -453,20 +453,26 @@ DODELAY(int n, int doswitch) #endif } +/* + * DELAY() never switches. + */ void DELAY(int n) { DODELAY(n, 0); } +/* + * DRIVERSLEEP() does not switch if called with a spinlock held or + * from a hard interrupt. + */ void DRIVERSLEEP(int usec) { globaldata_t gd = mycpu; if (gd->gd_intr_nesting_level || - gd->gd_spinlock_rd || - gd->gd_spinlocks_wr) { + gd->gd_spinlock_rd || gd->gd_spinlocks_wr) { DODELAY(usec, 0); } else { DODELAY(usec, 1); diff --git a/sys/sys/bio.h b/sys/sys/bio.h index b01491a8e1..bb502dc22b 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -66,7 +66,7 @@ struct bio { struct bio *bio_prev; /* BIO stack */ struct bio *bio_next; /* BIO stack / cached translations */ struct buf *bio_buf; /* High-level buffer back-pointer. */ - biodone_t *bio_done; /* Caller completion function */ + biodone_t *bio_done; /* MPSAFE caller completion function */ off_t bio_offset; /* Logical offset relative to device */ void *bio_driver_info; int bio_flags; diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 3064ed0470..21b69bb568 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -177,6 +177,7 @@ struct buf { int b_kvasize; /* size of kva for buffer */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ + int b_refs; /* FINDBLK_REF / unrefblk() */ struct xio b_xio; /* data buffer page list management */ struct bio_ops *b_ops; /* bio_ops used w/ b_dep */ struct workhead b_dep; /* List of filesystem dependencies. */ @@ -211,6 +212,7 @@ struct buf { #define FINDBLK_TEST 0x0010 /* test only, do not lock */ #define FINDBLK_NBLOCK 0x0020 /* use non-blocking lock, can return NULL */ +#define FINDBLK_REF 0x0040 /* ref the buf to prevent reuse */ /* * These flags are kept in b_flags. @@ -375,8 +377,6 @@ struct cluster_save { extern int nbuf; /* The number of buffer headers */ extern long maxswzone; /* Max KVA for swap structures */ extern long maxbcache; /* Max KVA for buffer cache */ -extern int runningbufspace; -extern int runningbufcount; extern int hidirtybufspace; extern int buf_maxio; /* nominal maximum I/O for buffer */ extern struct buf *buf; /* The buffer headers. 
*/ @@ -422,6 +422,7 @@ struct buf *findblk (struct vnode *, off_t, int); struct buf *getblk (struct vnode *, off_t, int, int, int); struct buf *getcacheblk (struct vnode *, off_t); struct buf *geteblk (int); +void unrefblk(struct buf *bp); void regetblk(struct buf *bp); struct bio *push_bio(struct bio *); struct bio *pop_bio(struct bio *); @@ -446,6 +447,7 @@ void vunmapbuf (struct buf *); void relpbuf (struct buf *, int *); void brelvp (struct buf *); int bgetvp (struct vnode *, struct buf *, int); +void bsetrunningbufspace(struct buf *, int); int allocbuf (struct buf *bp, int size); int scan_all_buffers (int (*)(struct buf *, void *), void *); void reassignbuf (struct buf *); diff --git a/sys/sys/spinlock2.h b/sys/sys/spinlock2.h index 0c974f708b..7fb1e8be33 100644 --- a/sys/sys/spinlock2.h +++ b/sys/sys/spinlock2.h @@ -86,6 +86,8 @@ spin_trylock_wr(struct spinlock *mtx) globaldata_t gd = mycpu; int value; + ++gd->gd_curthread->td_critcount; + cpu_ccfence(); ++gd->gd_spinlocks_wr; if ((value = atomic_swap_int(&mtx->lock, SPINLOCK_EXCLUSIVE)) != 0) return (spin_trylock_wr_contested(gd, mtx, value)); @@ -99,6 +101,8 @@ spin_trylock_wr(struct spinlock *mtx) { globaldata_t gd = mycpu; + ++gd->gd_curthread->td_critcount; + cpu_ccfence(); ++gd->gd_spinlocks_wr; return (TRUE); } @@ -116,6 +120,8 @@ spin_lock_wr_quick(globaldata_t gd, struct spinlock *mtx) int value; #endif + ++gd->gd_curthread->td_critcount; + cpu_ccfence(); ++gd->gd_spinlocks_wr; #ifdef SMP if ((value = atomic_swap_int(&mtx->lock, SPINLOCK_EXCLUSIVE)) != 0) { @@ -132,41 +138,6 @@ spin_lock_wr(struct spinlock *mtx) spin_lock_wr_quick(mycpu, mtx); } -#if 0 - -/* - * Upgrade a shared spinlock to exclusive. Return TRUE if we were - * able to upgrade without another exclusive holder getting in before - * us, FALSE otherwise. - */ -static __inline int -spin_lock_upgrade(struct spinlock *mtx) -{ - globaldata_t gd = mycpu; -#ifdef SMP - int value; -#endif - - ++gd->gd_spinlocks_wr; -#ifdef SMP - value = atomic_swap_int(&mtx->lock, SPINLOCK_EXCLUSIVE); - cpu_sfence(); -#endif - gd->gd_spinlock_rd = NULL; -#ifdef SMP - value &= ~gd->gd_cpumask; - if (value) { - spin_lock_wr_contested(mtx, value); - if (value & SPINLOCK_EXCLUSIVE) - return (FALSE); - XXX regain original shared lock? - } - return (TRUE); -#endif -} - -#endif - /* * Obtain a shared spinlock and return. This is a critical code path. 
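The sys/buf.h hunks above add b_refs, FINDBLK_REF and unrefblk() to close the FINDBLK_TEST/BUF_LOCK deadlock window. A hedged usage sketch of the ref interlock; example_pin_blk() and the tsleep() stand-in are illustrative, only findblk(), unrefblk() and the FINDBLK_* flags come from the header:

	/*
	 * Illustrative only: take a reuse-interlock ref on a buffer before
	 * doing something that can block.  getnewbuf() skips buffers with
	 * b_refs != 0, so the buffer cannot be recycled for an unrelated
	 * (vp,loffset) while we sleep, even though we have not locked it.
	 */
	static void
	example_pin_blk(struct vnode *vp, off_t loffset)
	{
		struct buf *bp;

		bp = findblk(vp, loffset, FINDBLK_TEST | FINDBLK_REF);
		if (bp == NULL)
			return;
		tsleep(bp, 0, "expin", 1);	/* stand-in for a blocking step */
		unrefblk(bp);
	}
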
* @@ -188,6 +159,8 @@ spin_lock_upgrade(struct spinlock *mtx) static __inline void spin_lock_rd_quick(globaldata_t gd, struct spinlock *mtx) { + ++gd->gd_curthread->td_critcount; + cpu_ccfence(); gd->gd_spinlock_rd = mtx; #ifdef SMP cpu_mfence(); @@ -215,6 +188,8 @@ spin_unlock_wr_quick(globaldata_t gd, struct spinlock *mtx) #endif KKASSERT(gd->gd_spinlocks_wr > 0); --gd->gd_spinlocks_wr; + cpu_ccfence(); + --gd->gd_curthread->td_critcount; } static __inline void @@ -235,6 +210,8 @@ spin_unlock_rd_quick(globaldata_t gd, struct spinlock *mtx) { KKASSERT(gd->gd_spinlock_rd == mtx); gd->gd_spinlock_rd = NULL; + cpu_ccfence(); + --gd->gd_curthread->td_critcount; } static __inline void diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 98f287298d..e9c6cedbdd 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -595,7 +595,8 @@ void vfs_lock_init(void); void vfs_sync_init(void); void mount_init(struct mount *mp); -void vn_syncer_add_to_worklist(struct vnode *, int); +void vn_syncer_add(struct vnode *, int); +void vn_syncer_remove(struct vnode *); void vnlru_proc_wait(void); extern struct vop_ops default_vnode_vops; diff --git a/sys/vfs/devfs/devfs_vnops.c b/sys/vfs/devfs/devfs_vnops.c index bce2879b6b..7e601c6e6d 100644 --- a/sys/vfs/devfs/devfs_vnops.c +++ b/sys/vfs/devfs/devfs_vnops.c @@ -1723,6 +1723,8 @@ devfs_spec_strategy(struct vop_strategy_args *ap) /* * Chunked up transfer completion routine - chain transfers until done + * + * NOTE: MPSAFE callback. */ static void @@ -1877,6 +1879,9 @@ devfs_spec_advlock(struct vop_advlock_args *ap) return ((ap->a_flags & F_POSIX) ? EINVAL : EOPNOTSUPP); } +/* + * NOTE: MPSAFE callback. + */ static void devfs_spec_getpages_iodone(struct bio *bio) { @@ -1940,11 +1945,7 @@ devfs_spec_getpages(struct vop_getpages_args *ap) bp->b_cmd = BUF_CMD_READ; bp->b_bcount = size; bp->b_resid = 0; - bp->b_runningbufspace = size; - if (size) { - runningbufspace += bp->b_runningbufspace; - ++runningbufcount; - } + bsetrunningbufspace(bp, size); bp->b_bio1.bio_offset = offset; bp->b_bio1.bio_done = devfs_spec_getpages_iodone; diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index 22fa93dd23..ea410022b6 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -508,6 +508,7 @@ struct hammer_record { struct hammer_btree_leaf_elm leaf; union hammer_data_ondisk *data; int flags; + int gflags; hammer_off_t zone2_offset; /* direct-write only */ }; @@ -525,11 +526,14 @@ typedef struct hammer_record *hammer_record_t; #define HAMMER_RECF_INTERLOCK_BE 0x0020 /* backend interlock */ #define HAMMER_RECF_WANTED 0x0040 /* wanted by the frontend */ #define HAMMER_RECF_CONVERT_DELETE 0x0100 /* special case */ -#define HAMMER_RECF_DIRECT_IO 0x0200 /* related direct I/O running*/ -#define HAMMER_RECF_DIRECT_WAIT 0x0400 /* related direct I/O running*/ -#define HAMMER_RECF_DIRECT_INVAL 0x0800 /* buffer alias invalidation */ #define HAMMER_RECF_REDO 0x1000 /* REDO was laid down */ +/* + * These flags must be separate to deal with SMP races + */ +#define HAMMER_RECG_DIRECT_IO 0x0001 /* related direct I/O running*/ +#define HAMMER_RECG_DIRECT_WAIT 0x0002 /* related direct I/O running*/ +#define HAMMER_RECG_DIRECT_INVAL 0x0004 /* buffer alias invalidation */ /* * hammer_create_at_cursor() and hammer_delete_at_cursor() flags. 
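The RECF/RECG split above, and the hammer_io bitfield split in the next hunk, exist because the frontend and the backend update their bit groups under different tokens; if both groups shared one word the two read-modify-write streams could silently drop each other's bits. A sketch of the resulting rule, using hypothetical helper names (only the fields and tokens come from the patch):

	/*
	 * Hypothetical helpers stating the rule, not code from the patch:
	 * record->flags belongs to the frontend (hmp->fs_token held),
	 * record->gflags is shared with bio_done/bioops callbacks
	 * (hmp->io_token held).  Keeping the groups in separate ints keeps
	 * the |= / &= updates on the two sides from clobbering each other.
	 */
	static __inline void
	example_set_frontend_flag(hammer_record_t record, int bits)
	{
		/* caller holds hmp->fs_token */
		record->flags |= bits;
	}

	static __inline void
	example_set_backend_flag(hammer_record_t record, int bits)
	{
		/* caller holds hmp->io_token (I/O completion context) */
		record->gflags |= bits;
	}
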
*/ @@ -608,17 +612,33 @@ struct hammer_io { int bytes; /* buffer cache buffer size */ int modify_refs; - u_int modified : 1; /* bp's data was modified */ - u_int released : 1; /* bp released (w/ B_LOCKED set) */ + /* + * These can be modified at any time by the backend while holding + * io_token, due to bio_done and hammer_io_complete() callbacks. + */ u_int running : 1; /* bp write IO in progress */ u_int waiting : 1; /* someone is waiting on us */ + u_int ioerror : 1; /* abort on io-error */ + u_int unusedA : 29; + + /* + * These can only be modified by the frontend while holding + * fs_token, or by the backend while holding the io interlocked + * with no references (which will block the frontend when it + * tries to reference it). + * + * WARNING! SMP RACES will create havoc if the callbacks ever tried + * to modify any of these outside the above restrictions. + */ + u_int modified : 1; /* bp's data was modified */ + u_int released : 1; /* bp released (w/ B_LOCKED set) */ u_int validated : 1; /* ondisk has been validated */ u_int waitdep : 1; /* flush waits for dependancies */ u_int recovered : 1; /* has recovery ref */ u_int waitmod : 1; /* waiting for modify_refs */ u_int reclaim : 1; /* reclaim requested */ u_int gencrc : 1; /* crc needs to be generated */ - u_int ioerror : 1; /* abort on io-error */ + u_int unusedB : 24; }; typedef struct hammer_io *hammer_io_t; @@ -1353,6 +1373,7 @@ void hammer_io_clear_modify(struct hammer_io *io, int inval); void hammer_io_clear_modlist(struct hammer_io *io); void hammer_io_flush_sync(hammer_mount_t hmp); void hammer_io_clear_error(struct hammer_io *io); +void hammer_io_clear_error_noassert(struct hammer_io *io); void hammer_io_notmeta(hammer_buffer_t buffer); void hammer_io_limit_backlog(hammer_mount_t hmp); diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c index 45a2b4fc1f..45bdfa3a37 100644 --- a/sys/vfs/hammer/hammer_flusher.c +++ b/sys/vfs/hammer/hammer_flusher.c @@ -751,6 +751,9 @@ hammer_flusher_finalize(hammer_transaction_t trans, int final) * by a crash up until the next flush cycle due to the first_offset * in the volume header for the UNDO FIFO not being adjusted until * the following flush cycle. + * + * No io interlock is needed, bioops callbacks will not mess with + * meta data buffers. */ count = 0; while ((io = TAILQ_FIRST(&hmp->meta_list)) != NULL) { diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index ab659bddf1..457346544d 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -44,6 +44,10 @@ * If the kernel tries to destroy a passively associated buf which we cannot * yet let go we set B_LOCKED in the buffer and then actively released it * later when we can. + * + * The io_token is required for anything which might race bioops and bio_done + * callbacks, with one exception: A successful hammer_try_interlock_norefs(). + * the fs_token will be held in all other cases. */ #include "hammer.h" @@ -62,7 +66,6 @@ static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data); static void hammer_io_set_modlist(struct hammer_io *io); static void hammer_io_flush_mark(hammer_volume_t volume); - /* * Initialize a new, already-zero'd hammer_io structure, or reinitialize * an existing hammer_io structure which may have switched to another type. @@ -77,21 +80,24 @@ hammer_io_init(hammer_io_t io, hammer_volume_t volume, enum hammer_io_type type) /* * Helper routine to disassociate a buffer cache buffer from an I/O - * structure. 
The io must be interlocked marked appropriately for + * structure. The io must be interlocked and marked appropriately for * reclamation. * - * The io may have 0 or 1 references depending on who called us. The - * caller is responsible for dealing with the refs. - * * The io must be in a released state with the io->bp owned and * locked by the caller of this function. When not called from an * io_deallocate() this cannot race an io_deallocate() since the * kernel would be unable to get the buffer lock in that case. + * (The released state in this case means we own the bp, not the + * hammer_io structure). + * + * The io may have 0 or 1 references depending on who called us. The + * caller is responsible for dealing with the refs. * * This call can only be made when no action is required on the buffer. * - * The caller must own the buffer and the IO must indicate that the - * structure no longer owns it (io.released != 0). + * This function is guaranteed not to race against anything because we + * own both the io lock and the bp lock and are interlocked with no + * references. */ static void hammer_io_disassociate(hammer_io_structure_t iou) @@ -208,11 +214,28 @@ hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush) void hammer_io_clear_error(struct hammer_io *io) { + hammer_mount_t hmp = io->hmp; + + lwkt_gettoken(&hmp->io_token); if (io->ioerror) { io->ioerror = 0; hammer_rel(&io->lock); KKASSERT(hammer_isactive(&io->lock)); } + lwkt_reltoken(&hmp->io_token); +} + +void +hammer_io_clear_error_noassert(struct hammer_io *io) +{ + hammer_mount_t hmp = io->hmp; + + lwkt_gettoken(&hmp->io_token); + if (io->ioerror) { + io->ioerror = 0; + hammer_rel(&io->lock); + } + lwkt_reltoken(&hmp->io_token); } /* @@ -239,9 +262,6 @@ hammer_io_notmeta(hammer_buffer_t buffer) } } - -#define HAMMER_MAXRA 4 - /* * Load bp for a HAMMER structure. The io must be exclusively locked by * the caller. @@ -435,7 +455,7 @@ hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset) iou->io.released = 0; BUF_KERNPROC(bp); iou->io.reclaim = 1; - iou->io.waitdep = 1; + iou->io.waitdep = 1; /* XXX this is a fs_token field */ KKASSERT(hammer_isactive(&iou->io.lock) == 1); hammer_rel_buffer(&iou->buffer, 0); /*hammer_io_deallocate(bp);*/ @@ -618,11 +638,15 @@ hammer_io_release(struct hammer_io *io, int flush) * no other references to the structure exists other then ours. This * routine is ONLY called when HAMMER believes it is safe to flush a * potentially modified buffer out. + * + * The locked io or io reference prevents a flush from being initiated + * by the kernel. */ void hammer_io_flush(struct hammer_io *io, int reclaim) { struct buf *bp; + hammer_mount_t hmp; /* * Degenerate case - nothing to flush if nothing is dirty. @@ -643,6 +667,7 @@ hammer_io_flush(struct hammer_io *io, int reclaim) * * The io_token should not be required here as only */ + hmp = io->hmp; bp = io->bp; if (io->released) { regetblk(bp); @@ -692,9 +717,11 @@ hammer_io_flush(struct hammer_io *io, int reclaim) * update io_running_space. 
*/ io->running = 1; - atomic_add_int(&io->hmp->io_running_space, io->bytes); + atomic_add_int(&hmp->io_running_space, io->bytes); atomic_add_int(&hammer_count_io_running_write, io->bytes); - TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry); + lwkt_gettoken(&hmp->io_token); + TAILQ_INSERT_TAIL(&hmp->iorun_list, io, iorun_entry); + lwkt_reltoken(&hmp->io_token); bawrite(bp); hammer_io_flush_mark(io->volume); } @@ -715,6 +742,8 @@ hammer_io_flush(struct hammer_io *io, int reclaim) * Mark a HAMMER structure as undergoing modification. Meta-data buffers * are locked until the flusher can deal with them, pure data buffers * can be written out. + * + * The referenced io prevents races. */ static void @@ -736,16 +765,20 @@ hammer_io_modify(hammer_io_t io, int count) if (io->modified && io->released == 0) return; + /* + * NOTE: It is important not to set the modified bit + * until after we have acquired the bp or we risk + * racing against checkwrite. + */ hammer_lock_ex(&io->lock); - if (io->modified == 0) { - hammer_io_set_modlist(io); - io->modified = 1; - } if (io->released) { regetblk(io->bp); BUF_KERNPROC(io->bp); io->released = 0; - KKASSERT(io->modified != 0); + } + if (io->modified == 0) { + hammer_io_set_modlist(io); + io->modified = 1; } hammer_unlock(&io->lock); } @@ -762,14 +795,28 @@ hammer_io_modify_done(hammer_io_t io) } } +/* + * The write interlock blocks other threads trying to modify a buffer + * (they block in hammer_io_modify()) after us, or blocks us while other + * threads are in the middle of modifying a buffer. + * + * The caller also has a ref on the io, however if we are not careful + * we will race bioops callbacks (checkwrite). To deal with this + * we must at least acquire and release the io_token, and it is probably + * better to hold it through the setting of modify_refs. + */ void hammer_io_write_interlock(hammer_io_t io) { + hammer_mount_t hmp = io->hmp; + + lwkt_gettoken(&hmp->io_token); while (io->modify_refs != 0) { io->waitmod = 1; tsleep(io, 0, "hmrmod", 0); } io->modify_refs = -1; + lwkt_reltoken(&hmp->io_token); } void @@ -855,8 +902,19 @@ hammer_modify_buffer_done(hammer_buffer_t buffer) void hammer_io_clear_modify(struct hammer_io *io, int inval) { + hammer_mount_t hmp; + + /* + * io_token is needed to avoid races on mod_list + */ if (io->modified == 0) return; + hmp = io->hmp; + lwkt_gettoken(&hmp->io_token); + if (io->modified == 0) { + lwkt_reltoken(&hmp->io_token); + return; + } /* * Take us off the mod-list and clear the modified bit. @@ -871,6 +929,8 @@ hammer_io_clear_modify(struct hammer_io *io, int inval) io->mod_list = NULL; io->modified = 0; + lwkt_reltoken(&hmp->io_token); + /* * If this bit is not set there are no delayed adjustments. */ @@ -932,6 +992,7 @@ hammer_io_set_modlist(struct hammer_io *io) { struct hammer_mount *hmp = io->hmp; + lwkt_gettoken(&hmp->io_token); KKASSERT(io->mod_list == NULL); switch(io->type) { @@ -956,6 +1017,7 @@ hammer_io_set_modlist(struct hammer_io *io) break; } TAILQ_INSERT_TAIL(io->mod_list, io, mod_entry); + lwkt_reltoken(&hmp->io_token); } /************************************************************************ @@ -978,9 +1040,10 @@ hammer_io_start(struct buf *bp) /* * Post-IO completion kernel callback - MAY BE CALLED FROM INTERRUPT! * - * NOTE: HAMMER may modify a buffer after initiating I/O. The modified bit - * may also be set if we were marking a cluster header open. Only remove - * our dependancy if the modified bit is clear. 
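hammer_io_write_interlock() above gives the frontend a way to fence off checkwrite and other modifiers while it works on a buffer. A minimal usage sketch, mirroring the hammer_recover_flush_buffer_callback() change later in this diff; the wrapper name is illustrative:

	/*
	 * Bracket a flush with the write interlock so bioops callbacks and
	 * other modifiers cannot race the io structure.
	 */
	static void
	example_flush_buffer(hammer_buffer_t buffer)
	{
		hammer_io_write_interlock(&buffer->io);
		hammer_io_flush(&buffer->io, 0);
		hammer_io_done_interlock(&buffer->io);
	}
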
+ * NOTE: HAMMER may modify a data buffer after we have initiated write + * I/O. + * + * NOTE: MPSAFE callback * * bioops callback - hold io_token */ @@ -1013,8 +1076,11 @@ hammer_io_complete(struct buf *bp) * away. */ if (bp->b_flags & B_ERROR) { + lwkt_gettoken(&hmp->fs_token); hammer_critical_error(hmp, NULL, bp->b_error, "while flushing meta-data"); + lwkt_reltoken(&hmp->fs_token); + switch(iou->io.type) { case HAMMER_STRUCTURE_UNDO_BUFFER: break; @@ -1194,6 +1260,8 @@ hammer_io_checkread(struct buf *bp) } /* + * The kernel is asking us whether it can write out a dirty buffer or not. + * * bioops callback - hold io_token */ static int @@ -1219,8 +1287,30 @@ hammer_io_checkwrite(struct buf *bp) } /* - * We can only clear the modified bit if the IO is not currently - * undergoing modification. Otherwise we may miss changes. + * We have to be able to interlock the IO to safely modify any + * of its fields without holding the fs_token. If we can't lock + * it then we are racing someone. + * + * Our ownership of the bp lock prevents the io from being ripped + * out from under us. + */ + if (hammer_try_interlock_norefs(&io->lock) == 0) { + bp->b_flags |= B_LOCKED; + atomic_add_int(&hammer_count_io_locked, 1); + lwkt_reltoken(&hmp->io_token); + return(1); + } + + /* + * The modified bit must be cleared prior to the initiation of + * any IO (returning 0 initiates the IO). Because this is a + * normal data buffer hammer_io_clear_modify() runs through a + * simple degenerate case. + * + * Return 0 will cause the kernel to initiate the IO, and we + * must normally clear the modified bit before we begin. If + * the io has modify_refs we do not clear the modified bit, + * otherwise we may miss changes. * * Only data and undo buffers can reach here. These buffers do * not have terminal crc functions but we temporarily reference @@ -1243,6 +1333,7 @@ hammer_io_checkwrite(struct buf *bp) atomic_add_int(&hammer_count_io_running_write, io->bytes); TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry); + hammer_put_interlock(&io->lock, 1); lwkt_reltoken(&hmp->io_token); return(0); @@ -1374,6 +1465,8 @@ done: * * MPSAFE - since we do not modify and hammer_records we do not need * io_token. + * + * NOTE: MPSAFE callback */ static void @@ -1401,7 +1494,7 @@ hammer_io_direct_read_complete(struct bio *nbio) * Write a buffer associated with a front-end vnode directly to the * disk media. The bio may be issued asynchronously. * - * The BIO is associated with the specified record and RECF_DIRECT_IO + * The BIO is associated with the specified record and RECG_DIRECT_IO * is set. The recorded is added to its object. */ int @@ -1461,8 +1554,8 @@ hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio, nbio->bio_done = hammer_io_direct_write_complete; nbio->bio_caller_info1.ptr = record; record->zone2_offset = zone2_offset; - record->flags |= HAMMER_RECF_DIRECT_IO | - HAMMER_RECF_DIRECT_INVAL; + record->gflags |= HAMMER_RECG_DIRECT_IO | + HAMMER_RECG_DIRECT_INVAL; /* * Third level bio - raw offset specific to the @@ -1481,7 +1574,7 @@ hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio, } else { /* * Must fit in a standard HAMMER buffer. In this case all - * consumers use the HAMMER buffer system and RECF_DIRECT_IO + * consumers use the HAMMER buffer system and RECG_DIRECT_IO * does not need to be set-up. 
*/ KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) & ~HAMMER_BUFMASK64) == 0); @@ -1525,6 +1618,9 @@ hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio, * An I/O error forces the mount to read-only. Data buffers * are not B_LOCKED like meta-data buffers are, so we have to * throw the buffer away to prevent the kernel from retrying. + * + * NOTE: MPSAFE callback, only modify fields we have explicit + * access to (the bp and the record->gflags). */ static void @@ -1544,21 +1640,23 @@ hammer_io_direct_write_complete(struct bio *nbio) bp = nbio->bio_buf; obio = pop_bio(nbio); if (bp->b_flags & B_ERROR) { + lwkt_gettoken(&hmp->fs_token); hammer_critical_error(hmp, record->ip, bp->b_error, "while writing bulk data"); + lwkt_reltoken(&hmp->fs_token); bp->b_flags |= B_INVAL; } biodone(obio); - KKASSERT(record->flags & HAMMER_RECF_DIRECT_IO); - if (record->flags & HAMMER_RECF_DIRECT_WAIT) { - record->flags &= ~(HAMMER_RECF_DIRECT_IO | - HAMMER_RECF_DIRECT_WAIT); + KKASSERT(record->gflags & HAMMER_RECG_DIRECT_IO); + if (record->gflags & HAMMER_RECG_DIRECT_WAIT) { + record->gflags &= ~(HAMMER_RECG_DIRECT_IO | + HAMMER_RECG_DIRECT_WAIT); /* record can disappear once DIRECT_IO flag is cleared */ wakeup(&record->flags); } else { - record->flags &= ~HAMMER_RECF_DIRECT_IO; + record->gflags &= ~HAMMER_RECG_DIRECT_IO; /* record can disappear once DIRECT_IO flag is cleared */ } lwkt_reltoken(&hmp->io_token); @@ -1583,10 +1681,10 @@ hammer_io_direct_wait(hammer_record_t record) /* * Wait for I/O to complete */ - if (record->flags & HAMMER_RECF_DIRECT_IO) { + if (record->gflags & HAMMER_RECG_DIRECT_IO) { lwkt_gettoken(&hmp->io_token); - while (record->flags & HAMMER_RECF_DIRECT_IO) { - record->flags |= HAMMER_RECF_DIRECT_WAIT; + while (record->gflags & HAMMER_RECG_DIRECT_IO) { + record->gflags |= HAMMER_RECG_DIRECT_WAIT; tsleep(&record->flags, 0, "hmdiow", 0); } lwkt_reltoken(&hmp->io_token); @@ -1602,12 +1700,12 @@ hammer_io_direct_wait(hammer_record_t record) * reservations ensure that all such buffers are removed before * an area can be reused. */ - if (record->flags & HAMMER_RECF_DIRECT_INVAL) { + if (record->gflags & HAMMER_RECG_DIRECT_INVAL) { KKASSERT(record->leaf.data_offset); hammer_del_buffers(hmp, record->leaf.data_offset, record->zone2_offset, record->leaf.data_len, 1); - record->flags &= ~HAMMER_RECF_DIRECT_INVAL; + record->gflags &= ~HAMMER_RECG_DIRECT_INVAL; } } @@ -1692,7 +1790,7 @@ hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data) static void hammer_io_flush_mark(hammer_volume_t volume) { - volume->vol_flags |= HAMMER_VOLF_NEEDFLUSH; + atomic_set_int(&volume->vol_flags, HAMMER_VOLF_NEEDFLUSH); } /* @@ -1707,7 +1805,8 @@ hammer_io_flush_sync(hammer_mount_t hmp) RB_FOREACH(volume, hammer_vol_rb_tree, &hmp->rb_vols_root) { if (volume->vol_flags & HAMMER_VOLF_NEEDFLUSH) { - volume->vol_flags &= ~HAMMER_VOLF_NEEDFLUSH; + atomic_clear_int(&volume->vol_flags, + HAMMER_VOLF_NEEDFLUSH); bp = getpbuf(NULL); bp->b_bio1.bio_offset = 0; bp->b_bufsize = 0; diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index 1d7f19fed7..00c0dcd699 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -425,8 +425,8 @@ hammer_rel_mem_record(struct hammer_record *record) * we can destroy the record because the bio may * have a reference to it. 
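The HAMMER_RECG_DIRECT_IO / _WAIT handshake above relies on the io_token to make the flag test and the sleep atomic with respect to the completion side: the waiter sets the WAIT bit and calls tsleep() while holding the token, the token is only given up once the thread is queued to sleep, and the completion path must take the same token before it can test the bit and call wakeup(). A generic sketch of that pattern; the example_* names, EX_* flags and struct are illustrative, only the token/tsleep/wakeup discipline comes from the code above:

	struct example_obj {
		int	gflags;			/* backend-shared flag word */
	};
	#define EX_BUSY	0x0001			/* I/O in progress */
	#define EX_WAIT	0x0002			/* someone wants a wakeup */

	static void
	example_wait(hammer_mount_t hmp, struct example_obj *obj)
	{
		lwkt_gettoken(&hmp->io_token);
		while (obj->gflags & EX_BUSY) {
			obj->gflags |= EX_WAIT;
			tsleep(obj, 0, "exwait", 0);	/* token dropped while asleep */
		}
		lwkt_reltoken(&hmp->io_token);
	}

	static void
	example_complete(hammer_mount_t hmp, struct example_obj *obj)
	{
		lwkt_gettoken(&hmp->io_token);
		if (obj->gflags & EX_WAIT) {
			obj->gflags &= ~(EX_BUSY | EX_WAIT);
			wakeup(obj);
		} else {
			obj->gflags &= ~EX_BUSY;
		}
		lwkt_reltoken(&hmp->io_token);
	}
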
*/ - if (record->flags & - (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL)) { + if (record->gflags & + (HAMMER_RECG_DIRECT_IO | HAMMER_RECG_DIRECT_INVAL)) { hammer_io_direct_wait(record); } @@ -1129,7 +1129,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) * Any direct-write related to the record must complete before we * can sync the record to the on-disk media. */ - if (record->flags & (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL)) + if (record->gflags & (HAMMER_RECG_DIRECT_IO | HAMMER_RECG_DIRECT_INVAL)) hammer_io_direct_wait(record); /* diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index b9d1eece17..5cfeab43b9 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ b/sys/vfs/hammer/hammer_ondisk.c @@ -289,10 +289,8 @@ hammer_unload_volume(hammer_volume_t volume, void *data __unused) /* * Clean up the persistent ref ioerror might have on the volume */ - if (volume->io.ioerror) { - volume->io.ioerror = 0; - hammer_rel(&volume->io.lock); - } + if (volume->io.ioerror) + hammer_io_clear_error_noassert(&volume->io); /* * This should release the bp. Releasing the volume with flush set @@ -877,8 +875,7 @@ hammer_unload_buffer(hammer_buffer_t buffer, void *data) * and acquire a ref. Expect a 0->1 transition. */ if (buffer->io.ioerror) { - buffer->io.ioerror = 0; - hammer_rel(&buffer->io.lock); + hammer_io_clear_error_noassert(&buffer->io); --hammer_count_refedbufs; } hammer_ref_interlock_true(&buffer->io.lock); diff --git a/sys/vfs/hammer/hammer_recover.c b/sys/vfs/hammer/hammer_recover.c index 77cf9be207..041d821a2c 100644 --- a/sys/vfs/hammer/hammer_recover.c +++ b/sys/vfs/hammer/hammer_recover.c @@ -1441,6 +1441,9 @@ hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume, /* * Finalize the root volume header. + * + * No interlock is needed, volume buffers are not + * messed with by bioops. */ if (root_volume && root_volume->io.recovered && final > 0) { hammer_io_wait_all(hmp, "hmrflx", 1); @@ -1470,6 +1473,10 @@ hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data) if (volume->io.recovered && volume != root_volume) { volume->io.recovered = 0; if (root_volume != NULL) { + /* + * No interlock is needed, volume buffers are not + * messed with by bioops. + */ hammer_io_flush(&volume->io, 0); } else { hammer_io_clear_error(&volume->io); @@ -1501,7 +1508,9 @@ hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data) hammer_io_clear_error(&buffer->io); hammer_io_clear_modify(&buffer->io, 1); } else { + hammer_io_write_interlock(&buffer->io); hammer_io_flush(&buffer->io, 0); + hammer_io_done_interlock(&buffer->io); } hammer_rel_buffer(buffer, 0); } else { diff --git a/sys/vfs/hammer/hammer_volume.c b/sys/vfs/hammer/hammer_volume.c index a97547a2e3..212235f9f5 100644 --- a/sys/vfs/hammer/hammer_volume.c +++ b/sys/vfs/hammer/hammer_volume.c @@ -153,6 +153,9 @@ hammer_ioc_volume_add(hammer_transaction_t trans, hammer_inode_t ip, * Only changes to the header of the root volume * are automatically flushed to disk. For all * other volumes that we modify we do it here. + * + * No interlock is needed, volume buffers are not + * messed with by bioops. */ if (volume != trans->rootvol && volume->io.modified) { hammer_crc_set_volume(volume->ondisk); @@ -373,6 +376,9 @@ hammer_ioc_volume_del(hammer_transaction_t trans, hammer_inode_t ip, * Only changes to the header of the root volume * are automatically flushed to disk. For all * other volumes that we modify we do it here. 
+ * + * No interlock is needed, volume buffers are not + * messed with by bioops. */ if (volume != trans->rootvol && volume->io.modified) { hammer_crc_set_volume(volume->ondisk); diff --git a/sys/vfs/nfs/nfs_bio.c b/sys/vfs/nfs/nfs_bio.c index e356cbd509..0de9d3667a 100644 --- a/sys/vfs/nfs/nfs_bio.c +++ b/sys/vfs/nfs/nfs_bio.c @@ -59,6 +59,7 @@ #include #include +#include #include #include "rpcv2.h" @@ -1359,6 +1360,8 @@ nfs_readrpc_bio_done(nfsm_info_t info) KKASSERT(info->state == NFSM_STATE_DONE); + get_mplock(); + if (info->v3) { ERROROUT(nfsm_postop_attr(info, info->vp, &attrflag, NFS_LATTR_NOSHRINK)); @@ -1397,6 +1400,7 @@ nfs_readrpc_bio_done(nfsm_info_t info) bp->b_resid = 0; /* bp->b_resid = bp->b_bcount - retlen; */ nfsmout: + rel_mplock(); kfree(info, M_NFSREQ); if (error) { bp->b_error = error; @@ -1509,6 +1513,8 @@ nfs_writerpc_bio_done(nfsm_info_t info) int len = bp->b_resid; /* b_resid was set to shortened length */ u_int32_t *tl; + get_mplock(); + if (info->v3) { /* * The write RPC returns a before and after mtime. The @@ -1624,6 +1630,7 @@ nfsmout: } if (info->info_writerpc.must_commit) nfs_clearcommit(info->vp->v_mount); + rel_mplock(); kfree(info, M_NFSREQ); if (error) { bp->b_flags |= B_ERROR; @@ -1689,6 +1696,8 @@ nfs_commitrpc_bio_done(nfsm_info_t info) int wccflag = NFSV3_WCCRATTR; int error = 0; + get_mplock(); + ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag)); if (error == 0) { NULLOUT(tl = nfsm_dissect(info, NFSX_V3WRITEVERF)); @@ -1714,5 +1723,6 @@ nfsmout: } else { nfs_writerpc_bio(info->vp, bio); } + rel_mplock(); } diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index dad5e4d2d2..4d7b557fc0 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1125,10 +1125,10 @@ swap_chain_iodone(struct bio *biox) KKASSERT(bp != NULL); if (bufx->b_flags & B_ERROR) { atomic_set_int(&bufx->b_flags, B_ERROR); - bp->b_error = bufx->b_error; + bp->b_error = bufx->b_error; /* race ok */ } else if (bufx->b_resid != 0) { atomic_set_int(&bufx->b_flags, B_ERROR); - bp->b_error = EINVAL; + bp->b_error = EINVAL; /* race ok */ } else { atomic_subtract_int(&bp->b_resid, bufx->b_bcount); } diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 048fde07ee..f0133b6c18 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -422,7 +422,6 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) /* * Insert it into the object. */ - ASSERT_MP_LOCK_HELD(curthread); vm_page_rb_tree_RB_INSERT(&object->rb_memq, m); object->generation++; @@ -478,7 +477,6 @@ vm_page_remove(vm_page_t m) /* * Remove the page from the object and update the object. */ - ASSERT_MP_LOCK_HELD(curthread); vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m); object->resident_page_count--; object->generation++; @@ -492,7 +490,7 @@ vm_page_remove(vm_page_t m) * Locate and return the page at (object, pindex), or NULL if the * page could not be found. * - * The caller must hold vm_token if non-blocking operation is desired. + * The caller must hold vm_token. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) @@ -502,11 +500,9 @@ vm_page_lookup(vm_object_t object, vm_pindex_t pindex) /* * Search the hash table for this object/offset pair */ - ASSERT_MP_LOCK_HELD(curthread); + ASSERT_LWKT_TOKEN_HELD(&vm_token); crit_enter(); - lwkt_gettoken(&vm_token); m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex); - lwkt_reltoken(&vm_token); crit_exit(); KKASSERT(m == NULL || (m->object == object && m->pindex == pindex)); return(m);
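vm_page_lookup() in the final hunk now asserts vm_token rather than the MP lock, so callers have to bracket the lookup and any follow-on page manipulation with the token, the way the bio_page_alloc() hunk earlier in this section does. A minimal caller-side sketch; the helper name is illustrative:

	/*
	 * Illustrative caller: hold vm_token across both the lookup and the
	 * wire so the page cannot be removed from the object in between.
	 */
	static vm_page_t
	example_lookup_and_wire(vm_object_t object, vm_pindex_t pindex)
	{
		vm_page_t m;

		lwkt_gettoken(&vm_token);
		m = vm_page_lookup(object, pindex);
		if (m)
			vm_page_wire(m);
		lwkt_reltoken(&vm_token);
		return (m);
	}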