From f9235b6d9cd4b6ef2a6f977a1e659de0ac635e32 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Mon, 23 Aug 2010 18:39:45 -0700
Subject: [PATCH] kernel - rewrite the LWKT scheduler's priority mechanism

The purpose of these changes is to begin to address the issue of cpu-bound
kernel threads. For example, the crypto threads, or a HAMMER prune cycle
that operates entirely out of the buffer cache. These threads tend to
hiccup the system, creating temporary lockups, because they never switch
away due to their nature as kernel threads.

* Change the LWKT scheduler from a strict hard priority model to a
  fair-share with hard priority queueing model. A kernel thread will be
  queued with a hard priority, giving it dibs on the cpu earlier if it
  has a higher priority. However, if the thread runs past its fair-share
  quantum it will then become limited by that quantum and other
  lower-priority threads will be allowed to run.

* Rewrite lwkt_yield() and lwkt_user_yield(), remove uio_yield(). Both
  yield functions are now very fast and can be called without further
  timing conditionals, simplifying numerous callers. lwkt_user_yield()
  now uses the fair-share quantum to determine when to yield the cpu for
  a cpu-bound kernel thread.

* Implement the new yield in the crypto kernel threads, HAMMER, and other
  places (many of which already used the old yield functions, which
  didn't work very well).

* lwkt_switch() now only round-robins after the fair share quantum is
  exhausted. It does not necessarily always round robin.

* Separate the critical section count from td_pri. Add td_critcount.
---
 sys/ddb/db_ps.c | 47 +-
 sys/kern/kern_clock.c | 5 +
 sys/kern/kern_intr.c | 2 +-
 sys/kern/kern_kinfo.c | 2 +-
 sys/kern/kern_subr.c | 6 +-
 sys/kern/kern_synch.c | 37 +-
 sys/kern/kern_threads.c | 3 +-
 sys/kern/kern_time.c | 7 +-
 sys/kern/lwkt_ipiq.c | 8 +-
 sys/kern/lwkt_thread.c | 612 ++++++++++++---------
 sys/kern/lwkt_token.c | 4 +-
 sys/kern/usched_bsd4.c | 5 +-
 sys/kern/vfs_vnops.c | 4 +-
 sys/opencrypto/crypto.c | 2 +
 sys/platform/pc32/apic/apic_vector.s | 24 +-
 sys/platform/pc32/i386/bcopy.s | 8 +-
 sys/platform/pc32/i386/exception.s | 2 +-
 sys/platform/pc32/i386/genassym.c | 2 +-
 sys/platform/pc32/i386/machdep.c | 6 +-
 sys/platform/pc32/i386/swtch.s | 2 +-
 sys/platform/pc32/i386/trap.c | 14 +-
 sys/platform/pc32/icu/icu_vector.s | 8 +-
 sys/platform/pc32/isa/ipl.s | 26 +-
 sys/platform/pc64/apic/apic_vector.s | 24 +-
 sys/platform/pc64/icu/icu_vector.s | 8 +-
 sys/platform/pc64/x86_64/exception.S | 2 +-
 sys/platform/pc64/x86_64/genassym.c | 2 +-
 sys/platform/pc64/x86_64/ipl.s | 26 +-
 sys/platform/pc64/x86_64/machdep.c | 6 +-
 sys/platform/pc64/x86_64/swtch.s | 2 +-
 sys/platform/pc64/x86_64/trap.c | 14 +-
 sys/platform/vkernel/i386/cpu_regs.c | 10 +-
 sys/platform/vkernel/i386/exception.c | 10 +-
 sys/platform/vkernel/i386/fork_tramp.s | 2 +-
 sys/platform/vkernel/i386/genassym.c | 2 +-
 sys/platform/vkernel/i386/swtch.s | 2 +-
 sys/platform/vkernel/i386/trap.c | 20 +-
 sys/platform/vkernel/platform/machintr.c | 2 +-
 sys/platform/vkernel64/platform/machintr.c | 2 +-
 sys/platform/vkernel64/x86_64/cpu_regs.c | 10 +-
 sys/platform/vkernel64/x86_64/exception.c | 25 +-
 sys/platform/vkernel64/x86_64/fork_tramp.s | 2 +-
 sys/platform/vkernel64/x86_64/genassym.c | 3 +-
 sys/platform/vkernel64/x86_64/swtch.s | 2 +-
 sys/platform/vkernel64/x86_64/trap.c | 20 +-
 sys/sys/globaldata.h | 10 +-
 sys/sys/thread.h | 29 +-
 sys/sys/thread2.h | 22 +-
 sys/sys/uio.h | 1 -
 sys/sys/upcall.h | 1 -
 sys/vfs/hammer/hammer_flusher.c | 2 +-
 sys/vfs/ufs/ffs_rawread.c | 23 +-
sys/vm/vm_zeroidle.c | 6 +- 53 files changed, 586 insertions(+), 540 deletions(-) diff --git a/sys/ddb/db_ps.c b/sys/ddb/db_ps.c index ef128a187e..ddc849838b 100644 --- a/sys/ddb/db_ps.c +++ b/sys/ddb/db_ps.c @@ -113,9 +113,8 @@ db_ps(db_expr_t dummy1, boolean_t dummy2, db_expr_t dummy3, char *dummy4) if (db_more(&nl) < 0) return; - db_printf("cpu %d tdrunqmask %08x curthread %p reqflags %04x\n", - gd->gd_cpuid, gd->gd_runqmask, - gd->gd_curthread, gd->gd_reqflags); + db_printf("cpu %d curthread %p reqflags %04x\n", + gd->gd_cpuid, gd->gd_curthread, gd->gd_reqflags); if (gd->gd_curthread && gd->gd_curthread->td_preempted) { db_printf(" PREEMPTING THREAD %p\n", gd->gd_curthread->td_preempted); @@ -137,28 +136,26 @@ db_ps(db_expr_t dummy1, boolean_t dummy2, db_expr_t dummy3, char *dummy4) if (db_more(&nl) < 0) return; db_printf(" tdq thread pid flags pri/cs/mp sp wmesg comm\n"); - for (np = 0; np < 32; ++np) { - TAILQ_FOREACH(td, &gd->gd_tdrunq[np], td_threadq) { - if (db_more(&nl) < 0) - return; - db_printf(" %3d %p %3d %08x %2d/%02d/%02d %p %8.8s %s\n", - np, td, - (td->td_proc ? td->td_proc->p_pid : -1), - td->td_flags, - td->td_pri & TDPRI_MASK, - td->td_pri / TDPRI_CRIT, + TAILQ_FOREACH(td, &gd->gd_tdrunq, td_threadq) { + if (db_more(&nl) < 0) + return; + db_printf(" %p %3d %08x %2d/%02d/%02d %p %8.8s %s\n", + td, + (td->td_proc ? td->td_proc->p_pid : -1), + td->td_flags, + td->td_pri, + td->td_critcount, #ifdef SMP - td->td_mpcount, + td->td_mpcount, #else - 0, + 0, #endif - td->td_sp, - td->td_wmesg ? td->td_wmesg : "-", - td->td_proc ? td->td_proc->p_comm : td->td_comm); - if (td->td_preempted) - db_printf(" PREEMPTING THREAD %p\n", td->td_preempted); - db_dump_td_tokens(td); - } + td->td_sp, + td->td_wmesg ? td->td_wmesg : "-", + td->td_proc ? td->td_proc->p_comm : td->td_comm); + if (td->td_preempted) + db_printf(" PREEMPTING THREAD %p\n", td->td_preempted); + db_dump_td_tokens(td); } if (db_more(&nl) < 0) return; @@ -173,8 +170,8 @@ db_ps(db_expr_t dummy1, boolean_t dummy2, db_expr_t dummy3, char *dummy4) np, td, (td->td_proc ? td->td_proc->p_pid : -1), td->td_flags, - td->td_pri & TDPRI_MASK, - td->td_pri / TDPRI_CRIT, + td->td_pri, + td->td_critcount, #ifdef SMP td->td_mpcount, #else @@ -209,7 +206,7 @@ db_dump_td_tokens(thread_t td) db_printf(" %p[tok=%p", ref, ref->tr_tok); #ifdef SMP - if (td == tok->t_ref->tr_owner) + if (tok->t_ref && td == tok->t_ref->tr_owner) db_printf(",held"); #endif db_printf("]"); diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 83818c2276..6a4660bde8 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -518,6 +518,11 @@ hardclock(systimer_t info, struct intrframe *frame) vm_fault_ratecheck(); } + /* + * lwkt thread scheduler fair queueing + */ + lwkt_fairq_schedulerclock(curthread); + /* * softticks are handled for all cpus */ diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c index 5bc5b59f90..a63bcc7127 100644 --- a/sys/kern/kern_intr.c +++ b/sys/kern/kern_intr.c @@ -633,7 +633,7 @@ ithread_fast_handler(struct intrframe *frame) td = curthread; /* We must be in critical section. 
*/ - KKASSERT(td->td_pri >= TDPRI_CRIT); + KKASSERT(td->td_critcount); info = &intr_info_ary[intr]; diff --git a/sys/kern/kern_kinfo.c b/sys/kern/kern_kinfo.c index 94b96dc930..63de1d232d 100644 --- a/sys/kern/kern_kinfo.c +++ b/sys/kern/kern_kinfo.c @@ -250,7 +250,7 @@ fill_kinfo_proc_kthread(struct thread *td, struct kinfo_proc *kp) kp->kp_lwp.kl_tdprio = td->td_pri; kp->kp_lwp.kl_rtprio.type = RTP_PRIO_THREAD; - kp->kp_lwp.kl_rtprio.prio = td->td_pri & TDPRI_MASK; + kp->kp_lwp.kl_rtprio.prio = td->td_pri; kp->kp_lwp.kl_uticks = td->td_uticks; kp->kp_lwp.kl_sticks = td->td_sticks; diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index 4abceec919..7991ff5031 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -82,7 +82,6 @@ uiomove(caddr_t cp, size_t n, struct uio *uio) size_t cnt; int error = 0; int save = 0; - int baseticks = ticks; KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, ("uiomove: mode")); @@ -108,10 +107,7 @@ uiomove(caddr_t cp, size_t n, struct uio *uio) switch (uio->uio_segflg) { case UIO_USERSPACE: - if (ticks - baseticks >= hogticks) { - uio_yield(); - baseticks = ticks; - } + lwkt_user_yield(); if (uio->uio_rw == UIO_READ) error = copyout(cp, iov->iov_base, cnt); else diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index bf645d5294..6230739044 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -486,7 +486,7 @@ tsleep(const volatile void *ident, int flags, const char *wmesg, int timo) * in case this is the idle process and already asleep. */ splz(); - oldpri = td->td_pri & TDPRI_MASK; + oldpri = td->td_pri; lwkt_setpri_self(safepri); lwkt_switch(); lwkt_setpri_self(oldpri); @@ -1102,41 +1102,6 @@ tstop(void) crit_exit(); } -/* - * Yield / synchronous reschedule. This is a bit tricky because the trap - * code might have set a lazy release on the switch function. Setting - * P_PASSIVE_ACQ will ensure that the lazy release executes when we call - * switch, and that we are given a greater chance of affinity with our - * current cpu. - * - * We call lwkt_setpri_self() to rotate our thread to the end of the lwkt - * run queue. lwkt_switch() will also execute any assigned passive release - * (which usually calls release_curproc()), allowing a same/higher priority - * process to be designated as the current process. - * - * While it is possible for a lower priority process to be designated, - * it's call to lwkt_maybe_switch() in acquire_curproc() will likely - * round-robin back to us and we will be able to re-acquire the current - * process designation. - * - * MPSAFE - */ -void -uio_yield(void) -{ - struct thread *td = curthread; - struct proc *p = td->td_proc; - - lwkt_setpri_self(td->td_pri & TDPRI_MASK); - if (p) { - p->p_flag |= P_PASSIVE_ACQ; - lwkt_switch(); - p->p_flag &= ~P_PASSIVE_ACQ; - } else { - lwkt_switch(); - } -} - /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. 
diff --git a/sys/kern/kern_threads.c b/sys/kern/kern_threads.c index 52df714162..581cace982 100644 --- a/sys/kern/kern_threads.c +++ b/sys/kern/kern_threads.c @@ -56,7 +56,6 @@ #include #include #include -#include /* uio_yield() fixme */ #if 0 @@ -164,7 +163,7 @@ int sys_yield(struct yield_args *uap) { uap->sysmsg_result = 0; - uio_yield(); + lwkt_user_yield(); return(0); } diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c index 6621c81810..65ea66d3d3 100644 --- a/sys/kern/kern_time.c +++ b/sys/kern/kern_time.c @@ -296,7 +296,6 @@ nanosleep1(struct timespec *rqt, struct timespec *rmt) struct timespec ts, ts2, ts3; struct timeval tv; int error; - int tried_yield; if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) return (EINVAL); @@ -306,7 +305,6 @@ nanosleep1(struct timespec *rqt, struct timespec *rmt) nanouptime(&ts); timespecadd(&ts, rqt); /* ts = target timestamp compare */ TIMESPEC_TO_TIMEVAL(&tv, rqt); /* tv = sleep interval */ - tried_yield = 0; for (;;) { int ticks; @@ -316,9 +314,8 @@ nanosleep1(struct timespec *rqt, struct timespec *rmt) if (tv.tv_sec == 0 && ticks == 0) { thread_t td = curthread; - if (tried_yield || tv.tv_usec < sleep_hard_us) { - tried_yield = 0; - uio_yield(); + if (tv.tv_usec < sleep_hard_us) { + lwkt_user_yield(); } else { crit_enter_quick(td); systimer_init_oneshot(&info, ns1_systimer, diff --git a/sys/kern/lwkt_ipiq.c b/sys/kern/lwkt_ipiq.c index c4533af694..9be618a85e 100644 --- a/sys/kern/lwkt_ipiq.c +++ b/sys/kern/lwkt_ipiq.c @@ -163,7 +163,7 @@ lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2) if (gd->gd_intr_nesting_level > 20) panic("lwkt_send_ipiq: TOO HEAVILY NESTED!"); #endif - KKASSERT(curthread->td_pri >= TDPRI_CRIT); + KKASSERT(curthread->td_critcount); ++ipiq_count; ip = &gd->gd_ipiq[target->gd_cpuid]; @@ -253,7 +253,7 @@ lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func, if (gd->gd_intr_nesting_level > 20) panic("lwkt_send_ipiq: TOO HEAVILY NESTED!"); #endif - KKASSERT(curthread->td_pri >= TDPRI_CRIT); + KKASSERT(curthread->td_critcount); ++ipiq_count; ++ipiq_passive; ip = &gd->gd_ipiq[target->gd_cpuid]; @@ -322,7 +322,7 @@ lwkt_send_ipiq3_nowait(globaldata_t target, ipifunc3_t func, struct globaldata *gd = mycpu; logipiq(send_nbio, func, arg1, arg2, gd, target); - KKASSERT(curthread->td_pri >= TDPRI_CRIT); + KKASSERT(curthread->td_critcount); if (target == gd) { func(arg1, arg2, NULL); logipiq(send_end, func, arg1, arg2, gd, target); @@ -530,7 +530,7 @@ lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip, * Issue a load fence to prevent speculative reads of e.g. data written * by the other cpu prior to it updating the index. 
*/ - KKASSERT(curthread->td_pri >= TDPRI_CRIT); + KKASSERT(curthread->td_critcount); wi = ip->ip_windex; cpu_lfence(); diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index e6b75897e1..48c498c73d 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -99,6 +99,7 @@ static struct objcache *thread_cache; #ifdef SMP static void lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame); #endif +static void lwkt_fairq_accumulate(globaldata_t gd, thread_t td); extern void cpu_heavy_restore(void); extern void cpu_lwkt_restore(void); @@ -147,6 +148,8 @@ SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, ""); SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW, &token_contention_count, 0, "spinning due to token contention"); #endif +static int fairq_enable = 1; +SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW, &fairq_enable, 0, ""); /* * These helper procedures handle the runq, they can only be called from @@ -162,26 +165,45 @@ void _lwkt_dequeue(thread_t td) { if (td->td_flags & TDF_RUNQ) { - int nq = td->td_pri & TDPRI_MASK; struct globaldata *gd = td->td_gd; td->td_flags &= ~TDF_RUNQ; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq); - /* runqmask is passively cleaned up by the switcher */ + TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq); + gd->gd_fairq_total_pri -= td->td_pri; + if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL) + atomic_clear_int_nonlocked(&gd->gd_reqflags, RQF_RUNNING); } } +/* + * Priority enqueue. + * + * NOTE: There are a limited number of lwkt threads runnable since user + * processes only schedule one at a time per cpu. + */ static __inline void _lwkt_enqueue(thread_t td) { + thread_t xtd; + if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_BLOCKQ)) == 0) { - int nq = td->td_pri & TDPRI_MASK; struct globaldata *gd = td->td_gd; td->td_flags |= TDF_RUNQ; - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq); - gd->gd_runqmask |= 1 << nq; + xtd = TAILQ_FIRST(&gd->gd_tdrunq); + if (xtd == NULL) { + TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq); + atomic_set_int_nonlocked(&gd->gd_reqflags, RQF_RUNNING); + } else { + while (xtd && xtd->td_pri > td->td_pri) + xtd = TAILQ_NEXT(xtd, td_threadq); + if (xtd) + TAILQ_INSERT_BEFORE(xtd, td, td_threadq); + else + TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq); + } + gd->gd_fairq_total_pri += td->td_pri; } } @@ -232,7 +254,8 @@ void lwkt_schedule_self(thread_t td) { crit_enter_quick(td); - KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!")); + KASSERT(td != &td->td_gd->gd_idlethread, + ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!")); KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); _lwkt_enqueue(td); crit_exit_quick(td); @@ -259,11 +282,7 @@ lwkt_deschedule_self(thread_t td) void lwkt_gdinit(struct globaldata *gd) { - int i; - - for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i) - TAILQ_INIT(&gd->gd_tdrunq[i]); - gd->gd_runqmask = 0; + TAILQ_INIT(&gd->gd_tdrunq); TAILQ_INIT(&gd->gd_tdallq); } @@ -357,7 +376,8 @@ lwkt_init_thread(thread_t td, void *stack, int stksize, int flags, td->td_kstack_size = stksize; td->td_flags = flags; td->td_gd = gd; - td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT; + td->td_pri = TDPRI_KERN_DAEMON; + td->td_critcount = 1; td->td_toks_stop = &td->td_toks_base; #ifdef SMP if ((flags & TDF_MPSAFE) == 0) @@ -474,9 +494,13 @@ lwkt_switch(void) globaldata_t gd = mycpu; thread_t td = gd->gd_curthread; thread_t ntd; + thread_t xtd; + 
thread_t nlast; #ifdef SMP + int nquserok; int mpheld; #endif + int didaccumulate; /* * Switching from within a 'fast' (non thread switched) interrupt or IPI @@ -557,17 +581,18 @@ lwkt_switch(void) } #endif #endif + + /* + * If we had preempted another thread on this cpu, resume the preempted + * thread. This occurs transparently, whether the preempted thread + * was scheduled or not (it may have been preempted after descheduling + * itself). + * + * We have to setup the MP lock for the original thread after backing + * out the adjustment that was made to curthread when the original + * was preempted. + */ if ((ntd = td->td_preempted) != NULL) { - /* - * We had preempted another thread on this cpu, resume the preempted - * thread. This occurs transparently, whether the preempted thread - * was scheduled or not (it may have been preempted after descheduling - * itself). - * - * We have to setup the MP lock for the original thread after backing - * out the adjustment that was made to curthread when the original - * was preempted. - */ KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK); #ifdef SMP if (ntd->td_mpcount && mpheld == 0) { @@ -586,181 +611,197 @@ lwkt_switch(void) * set the reschedule flag if the originally interrupted thread is * at a lower priority. */ - if (gd->gd_runqmask > (2 << (ntd->td_pri & TDPRI_MASK)) - 1) + if (TAILQ_FIRST(&gd->gd_tdrunq) && + TAILQ_FIRST(&gd->gd_tdrunq)->td_pri > ntd->td_pri) { need_lwkt_resched(); + } /* YYY release mp lock on switchback if original doesn't need it */ - } else { + goto havethread_preempted; + } + + /* + * Implement round-robin fairq with priority insertion. The priority + * insertion is handled by _lwkt_enqueue() + * + * We have to adjust the MP lock for the target thread. If we + * need the MP lock and cannot obtain it we try to locate a + * thread that does not need the MP lock. If we cannot, we spin + * instead of HLT. + * + * A similar issue exists for the tokens held by the target thread. + * If we cannot obtain ownership of the tokens we cannot immediately + * schedule the thread. + */ + for (;;) { + clear_lwkt_resched(); + didaccumulate = 0; + ntd = TAILQ_FIRST(&gd->gd_tdrunq); + /* - * Priority queue / round-robin at each priority. Note that user - * processes run at a fixed, low priority and the user process - * scheduler deals with interactions between user processes - * by scheduling and descheduling them from the LWKT queue as - * necessary. - * - * We have to adjust the MP lock for the target thread. If we - * need the MP lock and cannot obtain it we try to locate a - * thread that does not need the MP lock. If we cannot, we spin - * instead of HLT. + * Hotpath if we can get all necessary resources. * - * A similar issue exists for the tokens held by the target thread. - * If we cannot obtain ownership of the tokens we cannot immediately - * schedule the thread. + * If nothing is runnable switch to the idle thread */ + if (ntd == NULL) { + ntd = &gd->gd_idlethread; + if (gd->gd_reqflags & RQF_IDLECHECK_MASK) + ntd->td_flags |= TDF_IDLE_NOHLT; + if (ntd->td_mpcount) { + if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) + panic("Idle thread %p was holding the BGL!", ntd); + if (mpheld == 0) { + cpu_pause(); + continue; + } + } + goto haveidle; + } /* - * If an LWKT reschedule was requested, well that is what we are - * doing now so clear it. 
+ * Hotpath schedule + */ + if (ntd->td_fairq_accum >= 0 && +#ifdef SMP + (ntd->td_mpcount == 0 || mpheld || cpu_try_mplock()) && +#endif + (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd)) + ) { +#ifdef SMP + clr_mplock_contention_mask(gd); +#endif + goto havethread; + } + +#ifdef SMP + /* Reload mpheld (it become stale after mplock/token ops) */ + mpheld = MP_LOCK_HELD(); +#endif + + /* + * Coldpath - unable to schedule ntd, continue looking for threads + * to schedule. This is only allowed of the (presumably) kernel + * thread exhausted its fair share. A kernel thread stuck on + * resources does not currently allow a user thread to get in + * front of it. */ - clear_lwkt_resched(); -again: - if (gd->gd_runqmask) { - int nq = bsrl(gd->gd_runqmask); - if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) { - gd->gd_runqmask &= ~(1 << nq); - goto again; - } #ifdef SMP + nquserok = ((ntd->td_pri < TDPRI_KERN_LPSCHED) || + (ntd->td_fairq_accum < 0)); +#endif + nlast = NULL; + + for (;;) { /* - * THREAD SELECTION FOR AN SMP MACHINE BUILD + * If the fair-share scheduler ran out ntd gets moved to the + * end and its accumulator will be bumped, if it didn't we + * maintain the same queue position. * - * If the target needs the MP lock and we couldn't get it, - * or if the target is holding tokens and we could not - * gain ownership of the tokens, continue looking for a - * thread to schedule and spin instead of HLT if we can't. - * - * NOTE: the mpheld variable invalid after this conditional, it - * can change due to both cpu_try_mplock() returning success - * AND interactions in lwkt_getalltokens() due to the fact that - * we are trying to check the mpcount of a thread other then - * the current thread. Because of this, if the current thread - * is not holding td_mpcount, an IPI indirectly run via - * lwkt_getalltokens() can obtain and release the MP lock and - * cause the core MP lock to be released. + * nlast keeps track of the last element prior to any moves. */ - if ((ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) || - (TD_TOKS_HELD(ntd) && lwkt_getalltokens(ntd) == 0) - ) { - u_int32_t rqmask = gd->gd_runqmask; + if (ntd->td_fairq_accum < 0) { + xtd = TAILQ_NEXT(ntd, td_threadq); + lwkt_fairq_accumulate(gd, ntd); + didaccumulate = 1; + TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq); + TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq); + if (nlast == NULL) { + nlast = ntd; + if (xtd == NULL) + xtd = ntd; + } + ntd = xtd; + } else { + ntd = TAILQ_NEXT(ntd, td_threadq); + } + /* + * If we exhausted the run list switch to the idle thread. + * Since one or more threads had resource acquisition issues + * we do not allow the idle thread to halt. + * + * NOTE: nlast can be NULL. + */ + if (ntd == nlast) { cpu_pause(); - - mpheld = MP_LOCK_HELD(); - ntd = NULL; - while (rqmask) { - TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) { - if (ntd->td_mpcount && !mpheld && !cpu_try_mplock()) { - /* spinning due to MP lock being held */ - continue; - } - - /* - * mpheld state invalid after getalltokens call returns - * failure, but the variable is only needed for - * the loop. - */ - if (TD_TOKS_HELD(ntd) && !lwkt_getalltokens(ntd)) { - /* spinning due to token contention */ -#ifdef INVARIANTS - ++token_contention_count; -#endif - mpheld = MP_LOCK_HELD(); - continue; - } - break; - } - if (ntd) - break; - rqmask &= ~(1 << nq); - nq = bsrl(rqmask); - - /* - * We have two choices. 
We can either refuse to run a - * user thread when a kernel thread needs the MP lock - * but could not get it, or we can allow it to run but - * then expect an IPI (hopefully) later on to force a - * reschedule when the MP lock might become available. - */ - if (nq < TDPRI_KERN_LPSCHED) { - break; /* for now refuse to run */ -#if 0 - if (chain_mplock == 0) - break; - /* continue loop, allow user threads to be scheduled */ -#endif + ntd = &gd->gd_idlethread; + ntd->td_flags |= TDF_IDLE_NOHLT; + set_mplock_contention_mask(gd); + cpu_mplock_contested(); + if (ntd->td_mpcount) { + mpheld = MP_LOCK_HELD(); + if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) + panic("Idle thread %p was holding the BGL!", ntd); + if (mpheld == 0) { + cpu_pause(); + break; /* try again from the top, almost */ } } /* - * Case where a (kernel) thread needed the MP lock and could - * not get one, and we may or may not have found another - * thread which does not need the MP lock to run while - * we wait (ntd). + * If fairq accumulations occured we do not schedule the + * idle thread. This will cause us to try again from + * the (almost) top. */ - if (ntd == NULL) { - ntd = &gd->gd_idlethread; - ntd->td_flags |= TDF_IDLE_NOHLT; - set_mplock_contention_mask(gd); - cpu_mplock_contested(); - goto using_idle_thread; - } else { - clr_mplock_contention_mask(gd); - ++gd->gd_cnt.v_swtch; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); - } - } else { - clr_mplock_contention_mask(gd); - ++gd->gd_cnt.v_swtch; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); + if (didaccumulate) + break; + goto haveidle; } -#else + /* - * THREAD SELECTION FOR A UP MACHINE BUILD. We don't have to - * worry about tokens or the BGL. However, we still have - * to call lwkt_getalltokens() in order to properly detect - * stale tokens. This call cannot fail for a UP build! + * Try to switch to this thread. */ - lwkt_getalltokens(ntd); - ++gd->gd_cnt.v_swtch; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); + if ((ntd->td_pri >= TDPRI_KERN_LPSCHED || nquserok) && + ntd->td_fairq_accum >= 0 && +#ifdef SMP + (ntd->td_mpcount == 0 || mpheld || cpu_try_mplock()) && #endif - } else { - /* - * We have nothing to run but only let the idle loop halt - * the cpu if there are no pending interrupts. - */ - ntd = &gd->gd_idlethread; - if (gd->gd_reqflags & RQF_IDLECHECK_MASK) - ntd->td_flags |= TDF_IDLE_NOHLT; + (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd)) + ) { #ifdef SMP -using_idle_thread: - /* - * The idle thread should not be holding the MP lock unless we - * are trapping in the kernel or in a panic. Since we select the - * idle thread unconditionally when no other thread is available, - * if the MP lock is desired during a panic or kernel trap, we - * have to loop in the scheduler until we get it. 
- */ - if (ntd->td_mpcount) { - mpheld = MP_LOCK_HELD(); - if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) - panic("Idle thread %p was holding the BGL!", ntd); - if (mpheld == 0) - goto again; + clr_mplock_contention_mask(gd); +#endif + goto havethread; } +#ifdef SMP + /* Reload mpheld (it become stale after mplock/token ops) */ + mpheld = MP_LOCK_HELD(); + if (ntd->td_pri >= TDPRI_KERN_LPSCHED && ntd->td_fairq_accum >= 0) + nquserok = 0; #endif } } - KASSERT(ntd->td_pri >= TDPRI_CRIT, - ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri)); /* - * Do the actual switch. If the new target does not need the MP lock - * and we are holding it, release the MP lock. If the new target requires - * the MP lock we have already acquired it for the target. + * Do the actual switch. WARNING: mpheld is stale here. + * + * We must always decrement td_fairq_accum on non-idle threads just + * in case a thread never gets a tick due to being in a continuous + * critical section. The page-zeroing code does that. + * + * If the thread we came up with is a higher or equal priority verses + * the thread at the head of the queue we move our thread to the + * front. This way we can always check the front of the queue. */ +havethread: + ++gd->gd_cnt.v_swtch; + --ntd->td_fairq_accum; + xtd = TAILQ_FIRST(&gd->gd_tdrunq); + if (ntd != xtd && ntd->td_pri >= xtd->td_pri) { + TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq); + TAILQ_INSERT_HEAD(&gd->gd_tdrunq, ntd, td_threadq); + } +havethread_preempted: + ; + /* + * If the new target does not need the MP lock and we are holding it, + * release the MP lock. If the new target requires the MP lock we have + * already acquired it for the target. + * + * WARNING: mpheld is stale here. + */ +haveidle: + KASSERT(ntd->td_critcount, + ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri)); #ifdef SMP if (ntd->td_mpcount == 0 ) { if (MP_LOCK_HELD()) @@ -772,10 +813,10 @@ using_idle_thread: if (td != ntd) { ++switch_count; #ifdef __x86_64__ - { - int tos_ok __debugvar = jg_tos_ok(ntd); - KKASSERT(tos_ok); - } + { + int tos_ok __debugvar = jg_tos_ok(ntd); + KKASSERT(tos_ok); + } #endif KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd); td->td_switch(ntd); @@ -798,7 +839,7 @@ using_idle_thread: * * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically * this is called via lwkt_schedule() through the td_preemptable callback. - * critpri is the managed critical priority that we should ignore in order + * critcount is the managed critical priority that we should ignore in order * to determine whether preemption is possible (aka usually just the crit * priority of lwkt_schedule() itself). * @@ -819,7 +860,7 @@ using_idle_thread: * can leave it synchronized on return). */ void -lwkt_preempt(thread_t ntd, int critpri) +lwkt_preempt(thread_t ntd, int critcount) { struct globaldata *gd = mycpu; thread_t td; @@ -831,7 +872,7 @@ lwkt_preempt(thread_t ntd, int critpri) /* * The caller has put us in a critical section. We can only preempt * if the caller of the caller was not in a critical section (basically - * a local interrupt), as determined by the 'critpri' parameter. We + * a local interrupt), as determined by the 'critcount' parameter. We * also can't preempt if the caller is holding any spinlocks (even if * he isn't in a critical section). This also handles the tokens test. * @@ -840,14 +881,14 @@ lwkt_preempt(thread_t ntd, int critpri) * * Set need_lwkt_resched() unconditionally for now YYY. 
*/ - KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri)); + KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri)); td = gd->gd_curthread; - if ((ntd->td_pri & TDPRI_MASK) <= (td->td_pri & TDPRI_MASK)) { + if (ntd->td_pri <= td->td_pri) { ++preempt_miss; return; } - if ((td->td_pri & ~TDPRI_MASK) > critpri) { + if (td->td_critcount > critcount) { ++preempt_miss; need_lwkt_resched(); return; @@ -952,43 +993,63 @@ splz_check(void) globaldata_t gd = mycpu; thread_t td = gd->gd_curthread; - if (gd->gd_reqflags && td->td_nest_count < 2) + if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) splz(); } /* - * This implements a normal yield which will yield to equal priority - * threads as well as higher priority threads. Note that gd_reqflags - * tests will be handled by the crit_exit() call in lwkt_switch(). - * - * (self contained on a per cpu basis) + * This function is used to negotiate a passive release of the current + * process/lwp designation with the user scheduler, allowing the user + * scheduler to schedule another user thread. The related kernel thread + * (curthread) continues running in the released state. */ void -lwkt_yield(void) +lwkt_passive_release(struct thread *td) { - lwkt_schedule_self(curthread); - lwkt_switch(); + struct lwp *lp = td->td_lwp; + + td->td_release = NULL; + lwkt_setpri_self(TDPRI_KERN_USER); + lp->lwp_proc->p_usched->release_curproc(lp); } + /* - * This function is used along with the lwkt_passive_recover() inline - * by the trap code to negotiate a passive release of the current - * process/lwp designation with the user scheduler. + * This implements a normal yield. This routine is virtually a nop if + * there is nothing to yield to but it will always run any pending interrupts + * if called from a critical section. + * + * This yield is designed for kernel threads without a user context. + * + * (self contained on a per cpu basis) */ void -lwkt_passive_release(struct thread *td) +lwkt_yield(void) { - struct lwp *lp = td->td_lwp; + globaldata_t gd = mycpu; + thread_t td = gd->gd_curthread; + thread_t xtd; - td->td_release = NULL; - lwkt_setpri_self(TDPRI_KERN_USER); - lp->lwp_proc->p_usched->release_curproc(lp); + if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) + splz(); + if (td->td_fairq_accum < 0) { + lwkt_schedule_self(curthread); + lwkt_switch(); + } else { + xtd = TAILQ_FIRST(&gd->gd_tdrunq); + if (xtd && xtd->td_pri > td->td_pri) { + lwkt_schedule_self(curthread); + lwkt_switch(); + } + } } /* - * Make a kernel thread act as if it were in user mode with regards - * to scheduling, to avoid becoming cpu-bound in the kernel. Kernel - * loops which may be potentially cpu-bound can call lwkt_user_yield(). + * This yield is designed for kernel threads with a user context. + * + * The kernel acting on behalf of the user is potentially cpu-bound, + * this function will efficiently allow other threads to run and also + * switch to other processes by releasing. * * The lwkt_user_yield() function is designed to have very low overhead * if no yield is determined to be needed. @@ -996,8 +1057,15 @@ lwkt_passive_release(struct thread *td) void lwkt_user_yield(void) { - thread_t td = curthread; - struct lwp *lp = td->td_lwp; + globaldata_t gd = mycpu; + thread_t td = gd->gd_curthread; + + /* + * Always run any pending interrupts in case we are in a critical + * section. 
+ */ + if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) + splz(); #ifdef SMP /* @@ -1013,53 +1081,30 @@ lwkt_user_yield(void) #endif /* - * Another kernel thread wants the cpu + * Switch (which forces a release) if another kernel thread needs + * the cpu, if userland wants us to resched, or if our kernel + * quantum has run out. */ - if (lwkt_resched_wanted()) + if (lwkt_resched_wanted() || + user_resched_wanted() || + td->td_fairq_accum < 0) + { lwkt_switch(); - - /* - * If the user scheduler has asynchronously determined that the current - * process (when running in user mode) needs to lose the cpu then make - * sure we are released. - */ - if (user_resched_wanted()) { - if (td->td_release) - td->td_release(td); } +#if 0 /* - * If we are released reduce our priority + * Reacquire the current process if we are released. + * + * XXX not implemented atm. The kernel may be holding locks and such, + * so we want the thread to continue to receive cpu. */ - if (td->td_release == NULL) { - if (lwkt_check_resched(td) > 0) - lwkt_switch(); - if (lp) { - lp->lwp_proc->p_usched->acquire_curproc(lp); - td->td_release = lwkt_passive_release; - lwkt_setpri_self(TDPRI_USER_NORM); - } + if (td->td_release == NULL && lp) { + lp->lwp_proc->p_usched->acquire_curproc(lp); + td->td_release = lwkt_passive_release; + lwkt_setpri_self(TDPRI_USER_NORM); } -} - -/* - * Return 0 if no runnable threads are pending at the same or higher - * priority as the passed thread. - * - * Return 1 if runnable threads are pending at the same priority. - * - * Return 2 if runnable threads are pending at a higher priority. - */ -int -lwkt_check_resched(thread_t td) -{ - int pri = td->td_pri & TDPRI_MASK; - - if (td->td_gd->gd_runqmask > (2 << pri) - 1) - return(2); - if (TAILQ_NEXT(td, td_threadq)) - return(1); - return(0); +#endif } /* @@ -1083,18 +1128,31 @@ lwkt_check_resched(thread_t td) */ static __inline void -_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri, int reschedok) +_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount, int reschedok) { thread_t otd; if (ntd->td_flags & TDF_RUNQ) { if (ntd->td_preemptable && reschedok) { - ntd->td_preemptable(ntd, cpri); /* YYY +token */ + ntd->td_preemptable(ntd, ccount); /* YYY +token */ } else if (reschedok) { otd = curthread; - if ((ntd->td_pri & TDPRI_MASK) > (otd->td_pri & TDPRI_MASK)) + if (ntd->td_pri > otd->td_pri) need_lwkt_resched(); } + + /* + * Give the thread a little fair share scheduler bump if it + * has been asleep for a while. This is primarily to avoid + * a degenerate case for interrupt threads where accumulator + * crosses into negative territory unnecessarily. + */ + if (ntd->td_fairq_lticks != ticks) { + ntd->td_fairq_lticks = ticks; + ntd->td_fairq_accum += gd->gd_fairq_total_pri; + if (ntd->td_fairq_accum > TDFAIRQ_MAX(gd)) + ntd->td_fairq_accum = TDFAIRQ_MAX(gd); + } } } @@ -1118,13 +1176,13 @@ _lwkt_schedule(thread_t td, int reschedok) #ifdef SMP if (td->td_gd == mygd) { _lwkt_enqueue(td); - _lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok); + _lwkt_schedule_post(mygd, td, 1, reschedok); } else { lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0); } #else _lwkt_enqueue(td); - _lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok); + _lwkt_schedule_post(mygd, td, 1, reschedok); #endif } crit_exit_gd(mygd); @@ -1257,29 +1315,23 @@ lwkt_deschedule(thread_t td) * Set the target thread's priority. 
This routine does not automatically * switch to a higher priority thread, LWKT threads are not designed for * continuous priority changes. Yield if you want to switch. - * - * We have to retain the critical section count which uses the high bits - * of the td_pri field. The specified priority may also indicate zero or - * more critical sections by adding TDPRI_CRIT*N. - * - * Note that we requeue the thread whether it winds up on a different runq - * or not. uio_yield() depends on this and the routine is not normally - * called with the same priority otherwise. */ void lwkt_setpri(thread_t td, int pri) { - KKASSERT(pri >= 0); KKASSERT(td->td_gd == mycpu); - crit_enter(); - if (td->td_flags & TDF_RUNQ) { - _lwkt_dequeue(td); - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; - _lwkt_enqueue(td); - } else { - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + if (td->td_pri != pri) { + KKASSERT(pri >= 0); + crit_enter(); + if (td->td_flags & TDF_RUNQ) { + _lwkt_dequeue(td); + td->td_pri = pri; + _lwkt_enqueue(td); + } else { + td->td_pri = pri; + } + crit_exit(); } - crit_exit(); } /* @@ -1296,7 +1348,7 @@ lwkt_setpri_initial(thread_t td, int pri) { KKASSERT(pri >= 0); KKASSERT((td->td_flags & TDF_RUNQ) == 0); - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + td->td_pri = pri; } void @@ -1308,14 +1360,46 @@ lwkt_setpri_self(int pri) crit_enter(); if (td->td_flags & TDF_RUNQ) { _lwkt_dequeue(td); - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + td->td_pri = pri; _lwkt_enqueue(td); } else { - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + td->td_pri = pri; } crit_exit(); } +/* + * 1/hz tick (typically 10ms) x TDFAIRQ_SCALE (typ 8) = 80ms full cycle. + * + * Example: two competing threads, same priority N. decrement by (2*N) + * increment by N*8, each thread will get 4 ticks. + */ +void +lwkt_fairq_schedulerclock(thread_t td) +{ + if (fairq_enable) { + while (td) { + if (td != &td->td_gd->gd_idlethread) { + td->td_fairq_accum -= td->td_gd->gd_fairq_total_pri; + if (td->td_fairq_accum < -TDFAIRQ_MAX(td->td_gd)) + td->td_fairq_accum = -TDFAIRQ_MAX(td->td_gd); + if (td->td_fairq_accum < 0) + need_lwkt_resched(); + td->td_fairq_lticks = ticks; + } + td = td->td_preempted; + } + } +} + +static void +lwkt_fairq_accumulate(globaldata_t gd, thread_t td) +{ + td->td_fairq_accum += td->td_pri * TDFAIRQ_SCALE; + if (td->td_fairq_accum > TDFAIRQ_MAX(td->td_gd)) + td->td_fairq_accum = TDFAIRQ_MAX(td->td_gd); +} + /* * Migrate the current thread to the specified cpu. * diff --git a/sys/kern/lwkt_token.c b/sys/kern/lwkt_token.c index eaeb554637..ccb251f017 100644 --- a/sys/kern/lwkt_token.c +++ b/sys/kern/lwkt_token.c @@ -401,7 +401,7 @@ _lwkt_gettokref(lwkt_tokref_t ref, thread_t td) * * Since the tokref is already active the scheduler now * takes care of acquisition, so we need only call - * lwkt_yield(). + * lwkt_switch(). * * Since we failed this was not a recursive token so upon * return tr_tok->t_ref should be assigned to this specific @@ -409,7 +409,7 @@ _lwkt_gettokref(lwkt_tokref_t ref, thread_t td) */ atomic_add_long(&ref->tr_tok->t_collisions, 1); logtoken(fail, ref); - lwkt_yield(); + lwkt_switch(); logtoken(succ, ref); KKASSERT(ref->tr_tok->t_ref == ref); } diff --git a/sys/kern/usched_bsd4.c b/sys/kern/usched_bsd4.c index 8737423ccd..048270c73c 100644 --- a/sys/kern/usched_bsd4.c +++ b/sys/kern/usched_bsd4.c @@ -291,10 +291,7 @@ bsd4_acquire_curproc(struct lwp *lp) * the run queue. When we are reactivated we will have * another chance. 
*/ - if (lwkt_check_resched(lp->lwp_thread) > 1) { - lwkt_switch(); - continue; - } + lwkt_yield(); } while (dd->uschedcp != lp); crit_exit(); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 7e5635699e..5acce8e6bb 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -573,7 +573,7 @@ vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len, * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we - * check bwillwrite() before calling vn_rdwr(). We also call uio_yield() + * check bwillwrite() before calling vn_rdwr(). We also call lwkt_user_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). * @@ -616,7 +616,7 @@ vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len, break; offset += chunk; base += chunk; - uio_yield(); + lwkt_user_yield(); } while (len); if (aresid) *aresid += len; diff --git a/sys/opencrypto/crypto.c b/sys/opencrypto/crypto.c index 7e34357c80..cbc6bee690 100644 --- a/sys/opencrypto/crypto.c +++ b/sys/opencrypto/crypto.c @@ -830,6 +830,7 @@ crypto_dispatch(struct cryptop *crp) KASSERT(cap != NULL, ("%s: Driver disappeared.", __func__)); if (!cap->cc_qblocked) { result = crypto_invoke(cap, crp, 0); + lwkt_yield(); if (result != ERESTART) return (result); /* @@ -1362,6 +1363,7 @@ crypto_proc(void *arg) CRYPTO_Q_UNLOCK(tdinfo); result = crypto_invoke(cap, submit, hint); + lwkt_yield(); CRYPTO_Q_LOCK(tdinfo); if (result == ERESTART) { diff --git a/sys/platform/pc32/apic/apic_vector.s b/sys/platform/pc32/apic/apic_vector.s index a536fbcf57..99add4d95c 100644 --- a/sys/platform/pc32/apic/apic_vector.s +++ b/sys/platform/pc32/apic/apic_vector.s @@ -147,8 +147,8 @@ IDTVEC(vec_name) ; \ pushl %eax ; \ testl $-1,TD_NEST_COUNT(%ebx) ; \ jne 1f ; \ - cmpl $TDPRI_CRIT,TD_PRI(%ebx) ; \ - jl 2f ; \ + testl $-1,TD_CRITCOUNT(%ebx) ; \ + je 2f ; \ 1: ; \ /* in critical section, make interrupt pending */ \ /* set the pending bit and return, leave interrupt masked */ \ @@ -160,9 +160,9 @@ IDTVEC(vec_name) ; \ andl $~IRQ_LBIT(irq_num),PCPU(fpending) ; \ pushl $irq_num ; \ pushl %esp ; /* pass frame by reference */ \ - addl $TDPRI_CRIT,TD_PRI(%ebx) ; \ + incl TD_CRITCOUNT(%ebx) ; \ call ithread_fast_handler ; /* returns 0 to unmask */ \ - subl $TDPRI_CRIT,TD_PRI(%ebx) ; \ + decl TD_CRITCOUNT(%ebx) ; \ addl $8, %esp ; \ UNMASK_IRQ(irq_num) ; \ 5: ; \ @@ -299,14 +299,14 @@ Xipiq: incl PCPU(cnt) + V_IPI movl PCPU(curthread),%ebx - cmpl $TDPRI_CRIT,TD_PRI(%ebx) - jge 1f + testl $-1,TD_CRITCOUNT(%ebx) + jne 1f subl $8,%esp /* make same as interrupt frame */ pushl %esp /* pass frame by reference */ incl PCPU(intr_nesting_level) - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) call lwkt_process_ipiq_frame - subl $TDPRI_CRIT,TD_PRI(%ebx) + decl TD_CRITCOUNT(%ebx) decl PCPU(intr_nesting_level) addl $12,%esp pushl $0 /* CPL for frame (REMOVED) */ @@ -328,16 +328,16 @@ Xtimer: incl PCPU(cnt) + V_TIMER movl PCPU(curthread),%ebx - cmpl $TDPRI_CRIT,TD_PRI(%ebx) - jge 1f + testl $-1,TD_CRITCOUNT(%ebx) + jne 1f testl $-1,TD_NEST_COUNT(%ebx) jne 1f subl $8,%esp /* make same as interrupt frame */ pushl %esp /* pass frame by reference */ incl PCPU(intr_nesting_level) - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) call lapic_timer_process_frame - subl $TDPRI_CRIT,TD_PRI(%ebx) + decl 
TD_CRITCOUNT(%ebx) decl PCPU(intr_nesting_level) addl $12,%esp pushl $0 /* CPL for frame (REMOVED) */ diff --git a/sys/platform/pc32/i386/bcopy.s b/sys/platform/pc32/i386/bcopy.s index 131ab0da54..b68f55c5d6 100644 --- a/sys/platform/pc32/i386/bcopy.s +++ b/sys/platform/pc32/i386/bcopy.s @@ -302,7 +302,7 @@ ENTRY(asm_generic_bcopy) pushl %ecx ; \ movl GD_CURTHREAD(%eax),%edx ; /* EDX = CURTHREAD */ \ movl TD_SAVEFPU(%edx),%ebx ; /* save app save area */\ - addl $TDPRI_CRIT,TD_PRI(%edx) ; \ + incl TD_CRITCOUNT(%edx) ; \ cmpl $0,GD_NPXTHREAD(%eax) ; \ je 100f ; \ fxsave 0(%ebx) ; /* race(1) */ \ @@ -315,11 +315,11 @@ ENTRY(asm_generic_bcopy) orl $TDF_KERNELFP,TD_FLAGS(%edx) ; \ clts ; \ movl %edx,GD_NPXTHREAD(%eax) ; /* race(3) */ \ - subl $TDPRI_CRIT,TD_PRI(%edx) ; /* crit_exit() */ \ + decl TD_CRITCOUNT(%edx) ; /* crit_exit() */ \ cmpl $0,GD_REQFLAGS(%eax) ; \ je 101f ; \ - cmpl $TDPRI_CRIT,TD_PRI(%edx) ; \ - jge 101f ; \ + testl $-1,TD_CRITCOUNT(%edx) ; \ + jne 101f ; \ call splz_check ; \ /* note: eax,ecx,edx destroyed */ \ 101: ; \ diff --git a/sys/platform/pc32/i386/exception.s b/sys/platform/pc32/i386/exception.s index c65f37d182..27da25b3e6 100644 --- a/sys/platform/pc32/i386/exception.s +++ b/sys/platform/pc32/i386/exception.s @@ -895,7 +895,7 @@ IDTVEC(int0x80_syscall) */ ENTRY(fork_trampoline) movl PCPU(curthread),%eax - subl $TDPRI_CRIT,TD_PRI(%eax) + decl TD_CRITCOUNT(%eax) /* * cpu_set_fork_handler intercepts this function call to diff --git a/sys/platform/pc32/i386/genassym.c b/sys/platform/pc32/i386/genassym.c index 65e5c4daad..0176996cfd 100644 --- a/sys/platform/pc32/i386/genassym.c +++ b/sys/platform/pc32/i386/genassym.c @@ -92,6 +92,7 @@ ASSYM(TD_NEST_COUNT, offsetof(struct thread, td_nest_count)); #ifdef SMP ASSYM(TD_MPCOUNT, offsetof(struct thread, td_mpcount)); #endif +ASSYM(TD_CRITCOUNT, offsetof(struct thread, td_critcount)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TDF_RUNNING, TDF_RUNNING); ASSYM(TDF_USINGFP, TDF_USINGFP); @@ -100,7 +101,6 @@ ASSYM(MACHINTR_INTREN, offsetof(struct machintr_abi, intren)); ASSYM(TD_SAVEFPU, offsetof(struct thread, td_mach) + offsetof(struct md_thread, mtd_savefpu)); -ASSYM(TDPRI_CRIT, TDPRI_CRIT); ASSYM(TDPRI_INT_SUPPORT, TDPRI_INT_SUPPORT); #ifdef SMP ASSYM(CPUMASK_LOCK, CPUMASK_LOCK); diff --git a/sys/platform/pc32/i386/machdep.c b/sys/platform/pc32/i386/machdep.c index 171b029b7c..4d09c15ff0 100644 --- a/sys/platform/pc32/i386/machdep.c +++ b/sys/platform/pc32/i386/machdep.c @@ -753,7 +753,7 @@ sendupcall(struct vmupcall *vu, int morepending) */ vu->vu_pending = 0; upcall.upc_pending = morepending; - crit_count += TDPRI_CRIT; + ++crit_count; copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, sizeof(upcall.upc_pending)); copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, @@ -811,7 +811,7 @@ fetchupcall(struct vmupcall *vu, int morepending, void *rsp) crit_count = 0; if (error == 0) error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); - crit_count += TDPRI_CRIT; + ++crit_count; if (error == 0) error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); regs->tf_eax = (register_t)vu->vu_func; @@ -903,7 +903,7 @@ cpu_idle(void) struct thread *td = curthread; crit_exit(); - KKASSERT(td->td_pri < TDPRI_CRIT); + KKASSERT(td->td_critcount == 0); for (;;) { /* * See if there are any LWKTs ready to go. 
diff --git a/sys/platform/pc32/i386/swtch.s b/sys/platform/pc32/i386/swtch.s index 91a66e0932..e19990991a 100644 --- a/sys/platform/pc32/i386/swtch.s +++ b/sys/platform/pc32/i386/swtch.s @@ -546,7 +546,7 @@ ENTRY(cpu_kthread_restore) movl %ecx,%cr3 andl $~TDF_RUNNING,TD_FLAGS(%ebx) orl $TDF_RUNNING,TD_FLAGS(%eax) - subl $TDPRI_CRIT,TD_PRI(%eax) + decl TD_CRITCOUNT(%eax) popl %eax /* kthread exit function */ pushl PCB_EBX(%edx) /* argument to ESI function */ pushl %eax /* set exit func as return address */ diff --git a/sys/platform/pc32/i386/trap.c b/sys/platform/pc32/i386/trap.c index e9b125d669..fed82300c1 100644 --- a/sys/platform/pc32/i386/trap.c +++ b/sys/platform/pc32/i386/trap.c @@ -406,7 +406,7 @@ trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -929,9 +929,9 @@ out2: ; if (p != NULL && lp != NULL) KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -1099,7 +1099,7 @@ trap_fatal(struct trapframe *frame, vm_offset_t eva) kprintf("Idle\n"); } kprintf("current thread = pri %d ", curthread->td_pri); - if (curthread->td_pri >= TDPRI_CRIT) + if (curthread->td_critcount) kprintf("(CRIT)"); kprintf("\n"); #ifdef SMP @@ -1179,7 +1179,7 @@ syscall2(struct trapframe *frame) int error; int narg; #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif #ifdef SMP int have_mplock = 0; @@ -1399,9 +1399,9 @@ bad: #endif KTR_LOG(kernentry_syscall_ret, p->p_pid, lp->lwp_tid, error); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! 
%d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } diff --git a/sys/platform/pc32/icu/icu_vector.s b/sys/platform/pc32/icu/icu_vector.s index 7c7c7c9d45..d196f9ec11 100644 --- a/sys/platform/pc32/icu/icu_vector.s +++ b/sys/platform/pc32/icu/icu_vector.s @@ -147,8 +147,8 @@ IDTVEC(vec_name) ; \ pushl $0 ; /* DUMMY CPL FOR DORETI */ \ testl $-1,TD_NEST_COUNT(%ebx) ; \ jne 1f ; \ - cmpl $TDPRI_CRIT,TD_PRI(%ebx) ; \ - jl 2f ; \ + testl $-1,TD_CRITCOUNT(%ebx) ; \ + je 2f ; \ 1: ; \ /* set pending bit and return, leave interrupt masked */ \ orl $IRQ_LBIT(irq_num),PCPU(fpending) ; \ @@ -159,9 +159,9 @@ IDTVEC(vec_name) ; \ andl $~IRQ_LBIT(irq_num),PCPU(fpending) ; \ pushl $irq_num ; \ pushl %esp ; /* pass frame by reference */ \ - addl $TDPRI_CRIT,TD_PRI(%ebx) ; \ + incl TD_CRITCOUNT(%ebx) ; \ call ithread_fast_handler ; /* returns 0 to unmask int */ \ - subl $TDPRI_CRIT,TD_PRI(%ebx) ; \ + decl TD_CRITCOUNT(%ebx) ; \ addl $8,%esp ; \ UNMASK_IRQ(icu, irq_num) ; \ 5: ; \ diff --git a/sys/platform/pc32/isa/ipl.s b/sys/platform/pc32/isa/ipl.s index a6e5baebcd..f0c7d6e6db 100644 --- a/sys/platform/pc32/isa/ipl.s +++ b/sys/platform/pc32/isa/ipl.s @@ -111,12 +111,12 @@ doreti: popl %eax /* cpl to restore XXX */ movl $0,%eax /* irq mask unavailable due to BGL */ movl PCPU(curthread),%ebx - cli /* interlock with TDPRI_CRIT */ + cli /* interlock with td_critcount */ cmpl $0,PCPU(reqflags) /* short cut if nothing to do */ je 5f - cmpl $TDPRI_CRIT,TD_PRI(%ebx) /* can't unpend if in critical sec */ - jge 5f - addl $TDPRI_CRIT,TD_PRI(%ebx) /* force all ints to pending */ + testl $-1,TD_CRITCOUNT(%ebx) /* can't unpend if in critical sec */ + jne 5f + incl TD_CRITCOUNT(%ebx) /* force all ints to pending */ doreti_next: sti /* allow new interrupts */ movl %eax,%ecx /* irq mask unavailable due to BGL */ @@ -152,7 +152,7 @@ doreti_next: * BGL requirements. We can only clear RQF_INTPEND if *ALL* pending * interrupts have been processed. */ - subl $TDPRI_CRIT,TD_PRI(%ebx) /* interlocked with cli */ + decl TD_CRITCOUNT(%ebx) /* interlocked with cli */ testl %eax,%eax jnz 5f andl $~RQF_INTPEND,PCPU(reqflags) @@ -240,9 +240,9 @@ doreti_soft: pushl %eax pushl %ecx incl TD_NEST_COUNT(%ebx) /* prevent doreti/splz nesting */ - subl $TDPRI_CRIT,TD_PRI(%ebx) /* so we can preempt */ + decl TD_CRITCOUNT(%ebx) /* so we can preempt */ call sched_ithd /* YYY must pull in imasks */ - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) decl TD_NEST_COUNT(%ebx) addl $4,%esp popl %eax @@ -264,9 +264,9 @@ doreti_ast: movl %eax,%esi /* save cpl (can't use stack) */ movl $T_ASTFLT,TF_TRAPNO(%esp) pushl %esp /* pass frame by reference */ - subl $TDPRI_CRIT,TD_PRI(%ebx) + decl TD_CRITCOUNT(%ebx) call trap - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) addl $4,%esp movl %esi,%eax /* restore cpl for loop */ jmp doreti_next @@ -315,7 +315,7 @@ ENTRY(splz) pushfl pushl %ebx movl PCPU(curthread),%ebx - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) movl $0,%eax splz_next: @@ -335,7 +335,7 @@ splz_next: cmpl $0,%ecx jnz splz_soft - subl $TDPRI_CRIT,TD_PRI(%ebx) + decl TD_CRITCOUNT(%ebx) /* * Nothing left to do, finish up. Interrupts are still disabled. 
@@ -380,10 +380,10 @@ splz_soft: sti pushl %eax pushl %ecx - subl $TDPRI_CRIT,TD_PRI(%ebx) + decl TD_CRITCOUNT(%ebx) incl TD_NEST_COUNT(%ebx) /* prevent doreti/splz nesting */ call sched_ithd /* YYY must pull in imasks */ - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) decl TD_NEST_COUNT(%ebx) /* prevent doreti/splz nesting */ addl $4,%esp popl %eax diff --git a/sys/platform/pc64/apic/apic_vector.s b/sys/platform/pc64/apic/apic_vector.s index 32dd04846b..2bd8d07e20 100644 --- a/sys/platform/pc64/apic/apic_vector.s +++ b/sys/platform/pc64/apic/apic_vector.s @@ -130,8 +130,8 @@ IDTVEC(vec_name) ; \ movq PCPU(curthread),%rbx ; \ testl $-1,TD_NEST_COUNT(%rbx) ; \ jne 1f ; \ - cmpl $TDPRI_CRIT,TD_PRI(%rbx) ; \ - jl 2f ; \ + testl $-1,TD_CRITCOUNT(%rbx) ; \ + je 2f ; \ 1: ; \ /* in critical section, make interrupt pending */ \ /* set the pending bit and return, leave interrupt masked */ \ @@ -143,9 +143,9 @@ IDTVEC(vec_name) ; \ andl $~IRQ_LBIT(irq_num),PCPU(fpending) ; \ pushq $irq_num ; /* trapframe -> intrframe */ \ movq %rsp, %rdi ; /* pass frame by reference */ \ - addl $TDPRI_CRIT,TD_PRI(%rbx) ; \ + incl TD_CRITCOUNT(%rbx) ; \ call ithread_fast_handler ; /* returns 0 to unmask */ \ - subl $TDPRI_CRIT,TD_PRI(%rbx) ; \ + decl TD_CRITCOUNT(%rbx) ; \ addq $8, %rsp ; /* intrframe -> trapframe */ \ UNMASK_IRQ(irq_num) ; \ 5: ; \ @@ -305,14 +305,14 @@ Xipiq: incl PCPU(cnt) + V_IPI movq PCPU(curthread),%rbx - cmpl $TDPRI_CRIT,TD_PRI(%rbx) - jge 1f + testl $-1,TD_CRITCOUNT(%rbx) + jne 1f subq $8,%rsp /* make same as interrupt frame */ movq %rsp,%rdi /* pass frame by reference */ incl PCPU(intr_nesting_level) - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) call lwkt_process_ipiq_frame - subl $TDPRI_CRIT,TD_PRI(%rbx) + decl TD_CRITCOUNT(%rbx) decl PCPU(intr_nesting_level) addq $8,%rsp /* turn into trapframe */ MEXITCOUNT @@ -334,16 +334,16 @@ Xtimer: incl PCPU(cnt) + V_TIMER movq PCPU(curthread),%rbx - cmpl $TDPRI_CRIT,TD_PRI(%rbx) - jge 1f + testl $-1,TD_CRITCOUNT(%rbx) + jne 1f testl $-1,TD_NEST_COUNT(%rbx) jne 1f subq $8,%rsp /* make same as interrupt frame */ movq %rsp,%rdi /* pass frame by reference */ incl PCPU(intr_nesting_level) - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) call lapic_timer_process_frame - subl $TDPRI_CRIT,TD_PRI(%rbx) + decl TD_CRITCOUNT(%rbx) decl PCPU(intr_nesting_level) addq $8,%rsp /* turn into trapframe */ MEXITCOUNT diff --git a/sys/platform/pc64/icu/icu_vector.s b/sys/platform/pc64/icu/icu_vector.s index b10ef36d54..715b222518 100644 --- a/sys/platform/pc64/icu/icu_vector.s +++ b/sys/platform/pc64/icu/icu_vector.s @@ -142,8 +142,8 @@ IDTVEC(vec_name) ; \ movq PCPU(curthread),%rbx ; \ testl $-1,TD_NEST_COUNT(%rbx) ; \ jne 1f ; \ - cmpl $TDPRI_CRIT,TD_PRI(%rbx) ; \ - jl 2f ; \ + testl $-1,TD_CRITCOUNT(%rbx) ; \ + je 2f ; \ 1: ; \ /* set pending bit and return, leave interrupt masked */ \ orl $IRQ_LBIT(irq_num),PCPU(fpending) ; \ @@ -154,9 +154,9 @@ IDTVEC(vec_name) ; \ andl $~IRQ_LBIT(irq_num),PCPU(fpending) ; \ pushq $irq_num ; \ movq %rsp,%rdi ; /* rdi = call argument */ \ - addl $TDPRI_CRIT,TD_PRI(%rbx) ; \ + incl TD_CRITCOUNT(%rbx) ; \ call ithread_fast_handler ; /* returns 0 to unmask int */ \ - subl $TDPRI_CRIT,TD_PRI(%rbx) ; \ + decl TD_CRITCOUNT(%rbx) ; \ addq $8,%rsp ; /* intr frame -> trap frame */ \ UNMASK_IRQ(icu, irq_num) ; \ 5: ; \ diff --git a/sys/platform/pc64/x86_64/exception.S b/sys/platform/pc64/x86_64/exception.S index 4efea946ae..f462849e4e 100644 --- a/sys/platform/pc64/x86_64/exception.S +++ 
b/sys/platform/pc64/x86_64/exception.S @@ -408,7 +408,7 @@ nmi_restoreregs: */ ENTRY(fork_trampoline) movq PCPU(curthread),%rax - subl $TDPRI_CRIT,TD_PRI(%rax) + decl TD_CRITCOUNT(%rax) /* * cpu_set_fork_handler intercepts this function call to diff --git a/sys/platform/pc64/x86_64/genassym.c b/sys/platform/pc64/x86_64/genassym.c index 116204dcdb..d7a169b413 100644 --- a/sys/platform/pc64/x86_64/genassym.c +++ b/sys/platform/pc64/x86_64/genassym.c @@ -178,6 +178,7 @@ ASSYM(TD_LWP, offsetof(struct thread, td_lwp)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_SP, offsetof(struct thread, td_sp)); ASSYM(TD_PRI, offsetof(struct thread, td_pri)); +ASSYM(TD_CRITCOUNT, offsetof(struct thread, td_critcount)); ASSYM(TD_MACH, offsetof(struct thread, td_mach)); ASSYM(TD_WCHAN, offsetof(struct thread, td_wchan)); ASSYM(TD_NEST_COUNT, offsetof(struct thread, td_nest_count)); @@ -236,7 +237,6 @@ ASSYM(MSR_FSBASE, MSR_FSBASE); ASSYM(MACHINTR_INTREN, offsetof(struct machintr_abi, intren)); -ASSYM(TDPRI_CRIT, TDPRI_CRIT); ASSYM(TDPRI_INT_SUPPORT, TDPRI_INT_SUPPORT); #ifdef SMP ASSYM(CPUMASK_LOCK, CPUMASK_LOCK); diff --git a/sys/platform/pc64/x86_64/ipl.s b/sys/platform/pc64/x86_64/ipl.s index 2037850334..44214a7aff 100644 --- a/sys/platform/pc64/x86_64/ipl.s +++ b/sys/platform/pc64/x86_64/ipl.s @@ -138,12 +138,12 @@ doreti: FAKE_MCOUNT(bintr) /* init "from" bintr -> doreti */ movq $0,%rax /* irq mask unavailable due to BGL */ movq PCPU(curthread),%rbx - cli /* interlock with TDPRI_CRIT */ + cli /* interlock with critical section */ cmpl $0,PCPU(reqflags) /* short cut if nothing to do */ je 5f - cmpl $TDPRI_CRIT,TD_PRI(%rbx) /* can't unpend if in critical sec */ - jge 5f - addl $TDPRI_CRIT,TD_PRI(%rbx) /* force all ints to pending */ + testl $-1,TD_CRITCOUNT(%rbx) /* can't unpend if in critical sec */ + jne 5f + incl TD_CRITCOUNT(%rbx) /* force all ints to pending */ doreti_next: sti /* allow new interrupts */ movl %eax,%ecx /* irq mask unavailable due to BGL */ @@ -175,7 +175,7 @@ doreti_next: * BGL requirements. We can only clear RQF_INTPEND if *ALL* pending * interrupts have been processed. */ - subl $TDPRI_CRIT,TD_PRI(%rbx) /* interlocked with cli */ + decl TD_CRITCOUNT(%rbx) /* interlocked with cli */ testl %eax,%eax jnz 5f andl $~RQF_INTPEND,PCPU(reqflags) @@ -275,9 +275,9 @@ doreti_soft: pushq %rax movl %ecx,%edi /* argument to C call */ incl TD_NEST_COUNT(%rbx) /* prevent doreti/splz nesting */ - subl $TDPRI_CRIT,TD_PRI(%rbx) /* so we can preempt */ + decl TD_CRITCOUNT(%rbx) /* so we can preempt */ call sched_ithd /* YYY must pull in imasks */ - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) decl TD_NEST_COUNT(%rbx) popq %rax jmp doreti_next @@ -298,9 +298,9 @@ doreti_ast: movl %eax,%r12d /* save cpl (can't use stack) */ movl $T_ASTFLT,TF_TRAPNO(%rsp) movq %rsp,%rdi /* pass frame by ref (%edi = C arg) */ - subl $TDPRI_CRIT,TD_PRI(%rbx) + decl TD_CRITCOUNT(%rbx) call trap - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) movl %r12d,%eax /* restore cpl for loop */ jmp doreti_next @@ -348,7 +348,7 @@ ENTRY(splz) pushfq pushq %rbx movq PCPU(curthread),%rbx - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) movl $0,%eax splz_next: @@ -368,7 +368,7 @@ splz_next: cmpl $0,%ecx jnz splz_soft - subl $TDPRI_CRIT,TD_PRI(%rbx) + decl TD_CRITCOUNT(%rbx) /* * Nothing left to do, finish up. Interrupts are still disabled. 
@@ -431,10 +431,10 @@ splz_soft: sti pushq %rax movl %ecx,%edi /* C argument */ - subl $TDPRI_CRIT,TD_PRI(%rbx) + decl TD_CRITCOUNT(%rbx) incl TD_NEST_COUNT(%rbx) /* prevent doreti/splz nesting */ call sched_ithd /* YYY must pull in imasks */ - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) decl TD_NEST_COUNT(%rbx) /* prevent doreti/splz nesting */ popq %rax jmp splz_next diff --git a/sys/platform/pc64/x86_64/machdep.c b/sys/platform/pc64/x86_64/machdep.c index e342c39784..944b3ec50a 100644 --- a/sys/platform/pc64/x86_64/machdep.c +++ b/sys/platform/pc64/x86_64/machdep.c @@ -778,7 +778,7 @@ sendupcall(struct vmupcall *vu, int morepending) */ vu->vu_pending = 0; upcall.upc_pending = morepending; - crit_count += TDPRI_CRIT; + ++crit_count; copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, sizeof(upcall.upc_pending)); copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, @@ -836,7 +836,7 @@ fetchupcall(struct vmupcall *vu, int morepending, void *rsp) crit_count = 0; if (error == 0) error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); - crit_count += TDPRI_CRIT; + ++crit_count; if (error == 0) error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); regs->tf_rax = (register_t)vu->vu_func; @@ -928,7 +928,7 @@ cpu_idle(void) struct thread *td = curthread; crit_exit(); - KKASSERT(td->td_pri < TDPRI_CRIT); + KKASSERT(td->td_critcount == 0); for (;;) { /* * See if there are any LWKTs ready to go. diff --git a/sys/platform/pc64/x86_64/swtch.s b/sys/platform/pc64/x86_64/swtch.s index 87cc24aa48..80efbc8b3e 100644 --- a/sys/platform/pc64/x86_64/swtch.s +++ b/sys/platform/pc64/x86_64/swtch.s @@ -609,7 +609,7 @@ ENTRY(cpu_kthread_restore) /* rax and rbx come from the switchout code */ andl $~TDF_RUNNING,TD_FLAGS(%rbx) orl $TDF_RUNNING,TD_FLAGS(%rax) - subl $TDPRI_CRIT,TD_PRI(%rax) + decl TD_CRITCOUNT(%rax) movq PCB_R12(%rdx),%rdi /* argument to RBX function */ movq PCB_RBX(%rdx),%rax /* thread function */ /* note: top of stack return address inherited by function */ diff --git a/sys/platform/pc64/x86_64/trap.c b/sys/platform/pc64/x86_64/trap.c index b394da5852..fb57f4ccf5 100644 --- a/sys/platform/pc64/x86_64/trap.c +++ b/sys/platform/pc64/x86_64/trap.c @@ -366,7 +366,7 @@ trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -794,9 +794,9 @@ out2: ; if (p != NULL && lp != NULL) KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! 
%d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -971,7 +971,7 @@ trap_fatal(struct trapframe *frame, vm_offset_t eva) kprintf("Idle\n"); } kprintf("current thread = pri %d ", curthread->td_pri); - if (curthread->td_pri >= TDPRI_CRIT) + if (curthread->td_critcount) kprintf("(CRIT)"); kprintf("\n"); @@ -1032,7 +1032,7 @@ syscall2(struct trapframe *frame) int error; int narg; #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif #ifdef SMP int have_mplock = 0; @@ -1259,9 +1259,9 @@ bad: #endif KTR_LOG(kernentry_syscall_ret, p->p_pid, lp->lwp_tid, error); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } diff --git a/sys/platform/vkernel/i386/cpu_regs.c b/sys/platform/vkernel/i386/cpu_regs.c index 591632be0b..ff53178321 100644 --- a/sys/platform/vkernel/i386/cpu_regs.c +++ b/sys/platform/vkernel/i386/cpu_regs.c @@ -587,7 +587,7 @@ sendupcall(struct vmupcall *vu, int morepending) */ vu->vu_pending = 0; upcall.upc_pending = morepending; - crit_count += TDPRI_CRIT; + ++crit_count; copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, sizeof(upcall.upc_pending)); copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, @@ -645,7 +645,7 @@ fetchupcall (struct vmupcall *vu, int morepending, void *rsp) crit_count = 0; if (error == 0) error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); - crit_count += TDPRI_CRIT; + ++crit_count; if (error == 0) error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); regs->tf_eax = (register_t)vu->vu_func; @@ -704,7 +704,7 @@ cpu_idle(void) struct mdglobaldata *gd = mdcpu; crit_exit(); - KKASSERT(td->td_pri < TDPRI_CRIT); + KKASSERT(td->td_critcount == 0); cpu_enable_intr(); for (;;) { /* @@ -724,7 +724,7 @@ cpu_idle(void) struct timeval tv1, tv2; gettimeofday(&tv1, NULL); #endif - umtx_sleep(&gd->mi.gd_runqmask, 0, 1000000); + umtx_sleep(&gd->mi.gd_reqflags, 0, 1000000); #ifdef DEBUGIDLE gettimeofday(&tv2, NULL); if (tv2.tv_usec - tv1.tv_usec + @@ -732,7 +732,7 @@ cpu_idle(void) > 500000) { kprintf("cpu %d idlelock %08x %08x\n", gd->mi.gd_cpuid, - gd->mi.gd_runqmask, + gd->mi.gd_reqflags, gd->gd_fpending); } #endif diff --git a/sys/platform/vkernel/i386/exception.c b/sys/platform/vkernel/i386/exception.c index 7513a5f962..8b770ef4fa 100644 --- a/sys/platform/vkernel/i386/exception.c +++ b/sys/platform/vkernel/i386/exception.c @@ -78,12 +78,12 @@ static void ipisig(int nada, siginfo_t *info, void *ctxp) { - if (curthread->td_pri < TDPRI_CRIT) { - curthread->td_pri += TDPRI_CRIT; + if (curthread->td_critcount == 0) { + ++curthread->td_critcount; ++mycpu->gd_intr_nesting_level; lwkt_process_ipiq(); --mycpu->gd_intr_nesting_level; - curthread->td_pri -= TDPRI_CRIT; + --curthread->td_critcount; } else { need_ipiq(); } @@ -115,13 +115,13 @@ stopsig(int nada, siginfo_t *info, void *ctxp) sigaddset(&ss, SIGTERM); sigaddset(&ss, SIGWINCH); - curthread->td_pri += TDPRI_CRIT; + ++curthread->td_critcount; ++mycpu->gd_intr_nesting_level; while (stopped_cpus & mycpu->gd_cpumask) { sigsuspend(&ss); } --mycpu->gd_intr_nesting_level; - curthread->td_pri -= TDPRI_CRIT; + --curthread->td_critcount; } #endif diff --git a/sys/platform/vkernel/i386/fork_tramp.s 
b/sys/platform/vkernel/i386/fork_tramp.s index 3a4202b086..6c2eede5ea 100644 --- a/sys/platform/vkernel/i386/fork_tramp.s +++ b/sys/platform/vkernel/i386/fork_tramp.s @@ -59,7 +59,7 @@ */ ENTRY(fork_trampoline) movl PCPU(curthread),%eax - subl $TDPRI_CRIT,TD_PRI(%eax) + decl TD_CRITCOUNT(%eax) /* * cpu_set_fork_handler intercepts this function call to diff --git a/sys/platform/vkernel/i386/genassym.c b/sys/platform/vkernel/i386/genassym.c index cbabc033d6..5d332fd143 100644 --- a/sys/platform/vkernel/i386/genassym.c +++ b/sys/platform/vkernel/i386/genassym.c @@ -83,6 +83,7 @@ ASSYM(TD_LWP, offsetof(struct thread, td_lwp)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_SP, offsetof(struct thread, td_sp)); ASSYM(TD_PRI, offsetof(struct thread, td_pri)); +ASSYM(TD_CRITCOUNT, offsetof(struct thread, td_critcount)); ASSYM(TD_MACH, offsetof(struct thread, td_mach)); ASSYM(TD_WCHAN, offsetof(struct thread, td_wchan)); ASSYM(TD_NEST_COUNT, offsetof(struct thread, td_nest_count)); @@ -94,7 +95,6 @@ ASSYM(TDF_RUNNING, TDF_RUNNING); ASSYM(TD_SAVEFPU, offsetof(struct thread, td_mach) + offsetof(struct md_thread, mtd_savefpu)); -ASSYM(TDPRI_CRIT, TDPRI_CRIT); ASSYM(TDPRI_INT_SUPPORT, TDPRI_INT_SUPPORT); #ifdef SMP ASSYM(CPUMASK_LOCK, CPUMASK_LOCK); diff --git a/sys/platform/vkernel/i386/swtch.s b/sys/platform/vkernel/i386/swtch.s index 40705a3030..9e26f07152 100644 --- a/sys/platform/vkernel/i386/swtch.s +++ b/sys/platform/vkernel/i386/swtch.s @@ -532,7 +532,7 @@ ENTRY(cpu_kthread_restore) movl $0,%ebp andl $~TDF_RUNNING,TD_FLAGS(%ebx) orl $TDF_RUNNING,TD_FLAGS(%eax) - subl $TDPRI_CRIT,TD_PRI(%eax) + decl TD_CRITCOUNT(%eax) popl %eax /* kthread exit function */ pushl PCB_EBX(%edx) /* argument to ESI function */ pushl %eax /* set exit func as return address */ diff --git a/sys/platform/vkernel/i386/trap.c b/sys/platform/vkernel/i386/trap.c index 49fbd831f2..47d8cfd2dc 100644 --- a/sys/platform/vkernel/i386/trap.c +++ b/sys/platform/vkernel/i386/trap.c @@ -381,7 +381,7 @@ user_trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -659,9 +659,9 @@ out2: ; #endif KTR_LOG(kernentry_trap_ret, lp->lwp_proc->p_pid, lp->lwp_tid); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -677,7 +677,7 @@ kern_trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -871,9 +871,9 @@ out2: rel_mplock(); #endif #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! 
%d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -1016,7 +1016,7 @@ trap_fatal(struct trapframe *frame, int usermode, vm_offset_t eva) kprintf("Idle\n"); } kprintf("current thread = pri %d ", curthread->td_pri); - if (curthread->td_pri >= TDPRI_CRIT) + if (curthread->td_critcount) kprintf("(CRIT)"); kprintf("\n"); #ifdef SMP @@ -1096,7 +1096,7 @@ syscall2(struct trapframe *frame) int error; int narg; #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif #ifdef SMP int have_mplock = 0; @@ -1304,9 +1304,9 @@ bad: #endif KTR_LOG(kernentry_syscall_ret, lp->lwp_proc->p_pid, lp->lwp_tid, error); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } diff --git a/sys/platform/vkernel/platform/machintr.c b/sys/platform/vkernel/platform/machintr.c index 5b9163dd78..5b5c955303 100644 --- a/sys/platform/vkernel/platform/machintr.c +++ b/sys/platform/vkernel/platform/machintr.c @@ -160,7 +160,7 @@ signalintr(int intr) struct mdglobaldata *gd = mdcpu; thread_t td = gd->mi.gd_curthread; - if (td->td_pri >= TDPRI_CRIT || td->td_nest_count) { + if (td->td_critcount || td->td_nest_count) { atomic_set_int_nonlocked(&gd->gd_fpending, 1 << intr); atomic_set_int_nonlocked(&gd->mi.gd_reqflags, RQF_INTPEND); } else { diff --git a/sys/platform/vkernel64/platform/machintr.c b/sys/platform/vkernel64/platform/machintr.c index 7aa99ab457..44cc1daf19 100644 --- a/sys/platform/vkernel64/platform/machintr.c +++ b/sys/platform/vkernel64/platform/machintr.c @@ -160,7 +160,7 @@ signalintr(int intr) struct mdglobaldata *gd = mdcpu; thread_t td = gd->mi.gd_curthread; - if (td->td_pri >= TDPRI_CRIT || td->td_nest_count) { + if (td->td_critcount || td->td_nest_count) { atomic_set_int_nonlocked(&gd->gd_fpending, 1 << intr); atomic_set_int_nonlocked(&gd->mi.gd_reqflags, RQF_INTPEND); } else { diff --git a/sys/platform/vkernel64/x86_64/cpu_regs.c b/sys/platform/vkernel64/x86_64/cpu_regs.c index 926cc18c85..bd4f3efb03 100644 --- a/sys/platform/vkernel64/x86_64/cpu_regs.c +++ b/sys/platform/vkernel64/x86_64/cpu_regs.c @@ -591,7 +591,7 @@ sendupcall(struct vmupcall *vu, int morepending) */ vu->vu_pending = 0; upcall.upc_pending = morepending; - crit_count += TDPRI_CRIT; + ++crit_count; copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, sizeof(upcall.upc_pending)); copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, @@ -649,7 +649,7 @@ fetchupcall(struct vmupcall *vu, int morepending, void *rsp) crit_count = 0; if (error == 0) error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); - crit_count += TDPRI_CRIT; + ++crit_count; if (error == 0) error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); regs->tf_rax = (register_t)vu->vu_func; @@ -708,7 +708,7 @@ cpu_idle(void) struct mdglobaldata *gd = mdcpu; crit_exit(); - KKASSERT(td->td_pri < TDPRI_CRIT); + KKASSERT(td->td_critcount == 0); cpu_enable_intr(); for (;;) { /* @@ -728,7 +728,7 @@ cpu_idle(void) struct timeval tv1, tv2; gettimeofday(&tv1, NULL); #endif - umtx_sleep(&gd->mi.gd_runqmask, 0, 1000000); + umtx_sleep(&gd->mi.gd_reqflags, 0, 1000000); #ifdef DEBUGIDLE gettimeofday(&tv2, NULL); if (tv2.tv_usec - tv1.tv_usec + @@ -736,7 +736,7 @@ cpu_idle(void) > 500000) { kprintf("cpu %d 
idlelock %08x %08x\n", gd->mi.gd_cpuid, - gd->mi.gd_runqmask, + gd->mi.gd_reqflags, gd->gd_fpending); } #endif diff --git a/sys/platform/vkernel64/x86_64/exception.c b/sys/platform/vkernel64/x86_64/exception.c index 6236a2effa..34ef724984 100644 --- a/sys/platform/vkernel64/x86_64/exception.c +++ b/sys/platform/vkernel64/x86_64/exception.c @@ -78,12 +78,15 @@ static void ipisig(int nada, siginfo_t *info, void *ctxp) { - if (curthread->td_pri < TDPRI_CRIT) { - curthread->td_pri += TDPRI_CRIT; - ++mycpu->gd_intr_nesting_level; + globaldata_t gd = mycpu; + thread_t td = gd->gd_curthread; + + if (td->td_critcount == 0) { + ++td->td_critcount; + ++gd->gd_intr_nesting_level; lwkt_process_ipiq(); - --mycpu->gd_intr_nesting_level; - curthread->td_pri -= TDPRI_CRIT; + --gd->gd_intr_nesting_level; + --td->td_critcount; } else { need_ipiq(); } @@ -104,6 +107,8 @@ static void stopsig(int nada, siginfo_t *info, void *ctxp) { + globaldata_t gd = mycpu; + thread_t td = gd->gd_curthread; sigset_t ss; sigemptyset(&ss); @@ -115,13 +120,13 @@ stopsig(int nada, siginfo_t *info, void *ctxp) sigaddset(&ss, SIGTERM); sigaddset(&ss, SIGWINCH); - curthread->td_pri += TDPRI_CRIT; - ++mycpu->gd_intr_nesting_level; - while (stopped_cpus & mycpu->gd_cpumask) { + ++td->td_critcount; + ++gd->gd_intr_nesting_level; + while (stopped_cpus & gd->gd_cpumask) { sigsuspend(&ss); } - --mycpu->gd_intr_nesting_level; - curthread->td_pri -= TDPRI_CRIT; + --gd->gd_intr_nesting_level; + --td->td_critcount; } #endif diff --git a/sys/platform/vkernel64/x86_64/fork_tramp.s b/sys/platform/vkernel64/x86_64/fork_tramp.s index 1e3fb418ba..e804ef5096 100644 --- a/sys/platform/vkernel64/x86_64/fork_tramp.s +++ b/sys/platform/vkernel64/x86_64/fork_tramp.s @@ -56,7 +56,7 @@ */ ENTRY(fork_trampoline) movq PCPU(curthread),%rax - subl $TDPRI_CRIT,TD_PRI(%rax) + decl TD_CRITCOUNT(%rax) /* * cpu_set_fork_handler intercepts this function call to diff --git a/sys/platform/vkernel64/x86_64/genassym.c b/sys/platform/vkernel64/x86_64/genassym.c index af70e870f6..ee7725a0cd 100644 --- a/sys/platform/vkernel64/x86_64/genassym.c +++ b/sys/platform/vkernel64/x86_64/genassym.c @@ -109,6 +109,7 @@ ASSYM(TD_LWP, offsetof(struct thread, td_lwp)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_SP, offsetof(struct thread, td_sp)); ASSYM(TD_PRI, offsetof(struct thread, td_pri)); +ASSYM(TD_CRITCOUNT, offsetof(struct thread, td_critcount)); #ifdef SMP ASSYM(TD_MPCOUNT, offsetof(struct thread, td_mpcount)); #endif @@ -116,5 +117,3 @@ ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_SAVEFPU, offsetof(struct thread, td_savefpu)); ASSYM(TDF_RUNNING, TDF_RUNNING); ASSYM(GD_NPXTHREAD, offsetof(struct mdglobaldata, gd_npxthread)); - -ASSYM(TDPRI_CRIT, TDPRI_CRIT); diff --git a/sys/platform/vkernel64/x86_64/swtch.s b/sys/platform/vkernel64/x86_64/swtch.s index 0d755ab6d7..65929f39ab 100644 --- a/sys/platform/vkernel64/x86_64/swtch.s +++ b/sys/platform/vkernel64/x86_64/swtch.s @@ -593,7 +593,7 @@ ENTRY(cpu_kthread_restore) /* rax and rbx come from the switchout code */ andl $~TDF_RUNNING,TD_FLAGS(%rbx) orl $TDF_RUNNING,TD_FLAGS(%rax) - subl $TDPRI_CRIT,TD_PRI(%rax) + decl TD_CRITCOUNT(%rax) movq PCB_R12(%rdx),%rdi /* argument to RBX function */ movq PCB_RBX(%rdx),%rax /* thread function */ /* note: top of stack return address inherited by function */ diff --git a/sys/platform/vkernel64/x86_64/trap.c b/sys/platform/vkernel64/x86_64/trap.c index 1a0b07dfdb..b89b756298 100644 --- a/sys/platform/vkernel64/x86_64/trap.c +++ 
b/sys/platform/vkernel64/x86_64/trap.c @@ -381,7 +381,7 @@ user_trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -635,9 +635,9 @@ out2: ; #endif KTR_LOG(kernentry_trap_ret, lp->lwp_proc->p_pid, lp->lwp_tid); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -653,7 +653,7 @@ kern_trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -846,9 +846,9 @@ out2: rel_mplock(); #endif #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -1009,7 +1009,7 @@ trap_fatal(struct trapframe *frame, int usermode, vm_offset_t eva) kprintf("Idle\n"); } kprintf("current thread = pri %d ", curthread->td_pri); - if (curthread->td_pri >= TDPRI_CRIT) + if (curthread->td_critcount) kprintf("(CRIT)"); kprintf("\n"); #ifdef SMP @@ -1141,7 +1141,7 @@ syscall2(struct trapframe *frame) int error; int narg; #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif #ifdef SMP int have_mplock = 0; @@ -1363,9 +1363,9 @@ bad: #endif KTR_LOG(kernentry_syscall_ret, lp->lwp_proc->p_pid, lp->lwp_tid, error); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } diff --git a/sys/sys/globaldata.h b/sys/sys/globaldata.h index 7c546453e1..ba58de030b 100644 --- a/sys/sys/globaldata.h +++ b/sys/sys/globaldata.h @@ -129,9 +129,7 @@ struct globaldata { __uint32_t gd_reqflags; /* (see note above) */ void *gd_unused00B; lwkt_queue gd_tdallq; /* all threads */ - lwkt_queue gd_unused00C; - lwkt_queue gd_tdrunq[32]; /* runnable threads */ - __uint32_t gd_runqmask; /* which queues? */ + lwkt_queue gd_tdrunq; /* runnable threads */ __uint32_t gd_cpuid; cpumask_t gd_cpumask; /* mask = 1<<cpuid */ [...] diff --git a/sys/sys/thread.h b/sys/sys/thread.h [...] user->kernel syscall, * trap, and AST/signal transitions to provide a stable ucred for * (primarily) system calls.
This field will be NULL for pure kernel @@ -219,9 +216,10 @@ struct thread { const char *td_wmesg; /* string name for blockage */ const volatile void *td_wchan; /* waiting on channel */ int td_pri; /* 0-31, 31=highest priority (note 1) */ + int td_critcount; /* critical section priority */ int td_flags; /* TDF flags */ int td_wdomain; /* domain for wchan address (typ 0) */ - void (*td_preemptable)(struct thread *td, int critpri); + void (*td_preemptable)(struct thread *td, int critcount); void (*td_release)(struct thread *td); char *td_kstack; /* kernel stack */ int td_kstack_size; /* size of kernel stack */ @@ -231,7 +229,7 @@ struct thread { __uint64_t td_sticks; /* Statclock hits in system mode (uS) */ __uint64_t td_iticks; /* Statclock hits processing intr (uS) */ int td_locks; /* lockmgr lock debugging */ - int td_unused01; + int td_fairq_lticks; /* fairq wakeup accumulator reset */ void *td_dsched_priv1; /* priv data for I/O schedulers */ int td_refs; /* hold position in gd_tdallq / hold free */ int td_nest_count; /* prevent splz nesting */ @@ -257,6 +255,7 @@ struct thread { int td_crit_debug_index; int td_in_crit_report; #endif + int td_fairq_accum; /* fairq priority accumulator */ struct md_thread td_mach; }; @@ -311,6 +310,7 @@ struct thread { #define TDF_KERNELFP 0x01000000 /* kernel using fp coproc */ #define TDF_NETWORK 0x02000000 /* network proto thread */ #define TDF_CRYPTO 0x04000000 /* crypto thread */ +#define TDF_MARKER 0x80000000 /* fairq marker thread */ /* * Thread priorities. Typically only one thread from any given @@ -339,14 +339,21 @@ struct thread { #define TDPRI_INT_HIGH 29 /* high priority interrupt */ #define TDPRI_MAX 31 -#define TDPRI_MASK 31 -#define TDPRI_CRIT 32 /* high bits of td_pri used for crit */ +/* + * Scale is the approximate number of ticks for which we desire the + * entire gd_tdrunq to get service. With hz = 100 a scale of 8 is 80ms. + * + * Setting this value too small will result in inefficient switching + * rates. 
+ */ +#define TDFAIRQ_SCALE 8 +#define TDFAIRQ_MAX(gd) ((gd)->gd_fairq_total_pri * TDFAIRQ_SCALE) #define LWKT_THREAD_STACK (UPAGES * PAGE_SIZE) #define CACHE_NTHREADS 6 -#define IN_CRITICAL_SECT(td) ((td)->td_pri >= TDPRI_CRIT) +#define IN_CRITICAL_SECT(td) ((td)->td_critcount) #ifdef _KERNEL @@ -403,7 +410,11 @@ extern lwkt_token_t lwkt_getpooltoken(void *); extern void lwkt_setpri(thread_t, int); extern void lwkt_setpri_initial(thread_t, int); extern void lwkt_setpri_self(int); -extern int lwkt_check_resched(thread_t); +extern void lwkt_fairq_schedulerclock(thread_t td); +extern void lwkt_fairq_setpri_self(int pri); +extern int lwkt_fairq_push(int pri); +extern void lwkt_fairq_pop(int pri); +extern void lwkt_fairq_yield(void); extern void lwkt_setcpu_self(struct globaldata *); extern void lwkt_migratecpu(int); diff --git a/sys/sys/thread2.h b/sys/sys/thread2.h index 0087690c8e..442c50058f 100644 --- a/sys/sys/thread2.h +++ b/sys/sys/thread2.h @@ -121,10 +121,10 @@ _crit_enter(__DEBUG_CRIT_ARG__) struct thread *td = curthread; #ifdef INVARIANTS - if (td->td_pri < 0) + if (td->td_critcount < 0) crit_panic(); #endif - td->td_pri += TDPRI_CRIT; + ++td->td_critcount; __DEBUG_CRIT_ENTER(td); cpu_ccfence(); } @@ -132,7 +132,7 @@ _crit_enter(__DEBUG_CRIT_ARG__) static __inline void _crit_enter_quick(struct thread *curtd __DEBUG_CRIT_ADD_ARG__) { - curtd->td_pri += TDPRI_CRIT; + ++curtd->td_critcount; __DEBUG_CRIT_ENTER(curtd); cpu_ccfence(); } @@ -147,7 +147,7 @@ static __inline void _crit_exit_noyield(struct thread *curtd __DEBUG_CRIT_ADD_ARG__) { __DEBUG_CRIT_EXIT(curtd); - curtd->td_pri -= TDPRI_CRIT; + --curtd->td_critcount; #ifdef INVARIANTS if (curtd->td_pri < 0) crit_panic(); @@ -161,13 +161,13 @@ _crit_exit(__DEBUG_CRIT_ARG__) thread_t td = curthread; __DEBUG_CRIT_EXIT(td); - td->td_pri -= TDPRI_CRIT; + --td->td_critcount; #ifdef INVARIANTS if (td->td_pri < 0) crit_panic(); #endif cpu_ccfence(); /* prevent compiler reordering */ - if (td->td_gd->gd_reqflags && td->td_pri < TDPRI_CRIT) + if (td->td_gd->gd_reqflags && td->td_critcount == 0) splz_check(); } @@ -177,9 +177,9 @@ _crit_exit_quick(struct thread *curtd __DEBUG_CRIT_ADD_ARG__) globaldata_t gd = curtd->td_gd; __DEBUG_CRIT_EXIT(curtd); - curtd->td_pri -= TDPRI_CRIT; + --curtd->td_critcount; cpu_ccfence(); /* prevent compiler reordering */ - if (gd->gd_reqflags && curtd->td_pri < TDPRI_CRIT) + if (gd->gd_reqflags && curtd->td_critcount == 0) splz_check(); } @@ -192,7 +192,7 @@ _crit_exit_gd(globaldata_t mygd __DEBUG_CRIT_ADD_ARG__) static __inline int crit_test(thread_t td) { - return(td->td_pri >= TDPRI_CRIT); + return(td->td_critcount); } /* @@ -202,13 +202,13 @@ crit_test(thread_t td) static __inline int lwkt_runnable(void) { - return (mycpu->gd_runqmask != 0); + return (TAILQ_FIRST(&mycpu->gd_tdrunq) != NULL); } static __inline int lwkt_getpri(thread_t td) { - return(td->td_pri & TDPRI_MASK); + return(td->td_pri); } static __inline int diff --git a/sys/sys/uio.h b/sys/sys/uio.h index f0de1d038a..c14dfebbe8 100644 --- a/sys/sys/uio.h +++ b/sys/sys/uio.h @@ -93,7 +93,6 @@ struct uio { struct vm_object; struct vm_page; -void uio_yield (void); int uiomove (caddr_t, size_t, struct uio *); int uiomovez (size_t, struct uio *); int uiomove_frombuf (void *buf, size_t buflen, struct uio *uio); diff --git a/sys/sys/upcall.h b/sys/sys/upcall.h index 1eb26d44c5..324871fd58 100644 --- a/sys/sys/upcall.h +++ b/sys/sys/upcall.h @@ -59,7 +59,6 @@ struct upcall { #define UPC_CONTROL_POLLANDCLEAR 5 #define UPC_CONTROL_WAIT 6 -#define UPC_CRITADD 
32 /* NOTE! same as TDPRI_CRIT */ #define UPC_RESERVED 32 /* # of reserved id's */ #if defined(_KERNEL) diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c index 082cb85c3f..1c8a433f6d 100644 --- a/sys/vfs/hammer/hammer_flusher.c +++ b/sys/vfs/hammer/hammer_flusher.c @@ -333,7 +333,7 @@ hammer_flusher_flush(hammer_mount_t hmp) if (++hmp->check_yield > hammer_yield_check) { hmp->check_yield = 0; - lwkt_user_yield(); + lwkt_yield(); } /* diff --git a/sys/vfs/ufs/ffs_rawread.c b/sys/vfs/ufs/ffs_rawread.c index 72dfbbb8c9..13550dc1bc 100644 --- a/sys/vfs/ufs/ffs_rawread.c +++ b/sys/vfs/ufs/ffs_rawread.c @@ -52,7 +52,7 @@ #include static int ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t offset, - size_t len, struct buf *bp, int *baseticks); + size_t len, struct buf *bp); static int ffs_rawread_main(struct vnode *vp, struct uio *uio); @@ -142,7 +142,7 @@ done: static int ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t loffset, - size_t len, struct buf *bp, int *baseticks) + size_t len, struct buf *bp) { int error; int iolen; @@ -190,10 +190,7 @@ ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t loffset, if (vmapbuf(bp, udata, len) < 0) return EFAULT; - if (ticks - *baseticks >= hogticks) { - *baseticks = ticks; - uio_yield(); - } + lwkt_user_yield(); bzero(bp->b_data, bp->b_bcount); /* Mark operation completed (similar to bufdone()) */ @@ -230,7 +227,6 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) int error, nerror; struct buf *bp, *nbp, *tbp; int iolen; - int baseticks = ticks; caddr_t udata; int resid; off_t offset; @@ -250,8 +246,8 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) if (bp == NULL) { /* Setup first read */ /* XXX: Leave some bufs for swap */ bp = getpbuf_kva(&ffsrawbufcnt); - error = ffs_rawread_readahead(vp, udata, offset, resid, - bp, &baseticks); + error = ffs_rawread_readahead(vp, udata, offset, + resid, bp); if (error != 0) break; @@ -267,7 +263,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) udata + bp->b_bufsize, offset + bp->b_bufsize, resid - bp->b_bufsize, - nbp, &baseticks); + nbp); if (nerror) { relpbuf(nbp, &ffsrawbufcnt); nbp = NULL; @@ -298,7 +294,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) /* Incomplete read. Try to read remaining part */ error = ffs_rawread_readahead( vp, udata, offset, - bp->b_bufsize - iolen, bp, &baseticks); + bp->b_bufsize - iolen, bp); if (error != 0) break; } else if (nbp != NULL) { /* Complete read with readahead */ @@ -317,7 +313,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) vp, udata + bp->b_bufsize, offset + bp->b_bufsize, resid - bp->b_bufsize, - nbp, &baseticks); + nbp); if (nerror != 0) { relpbuf(nbp, &ffsrawbufcnt); nbp = NULL; @@ -327,8 +323,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) break; } else if (resid > 0) { /* More to read, no readahead */ error = ffs_rawread_readahead(vp, udata, offset, - resid, bp, - &baseticks); + resid, bp); if (error != 0) break; } diff --git a/sys/vm/vm_zeroidle.c b/sys/vm/vm_zeroidle.c index 15559323dc..700b773d0c 100644 --- a/sys/vm/vm_zeroidle.c +++ b/sys/vm/vm_zeroidle.c @@ -208,8 +208,7 @@ vm_pagezero(void __unused *arg) * resched has been requested. */ while (i < PAGE_SIZE) { - if (lwkt_check_resched(curthread)) - break; + lwkt_yield(); if (idlezero_nocache == 1) bzeront(&pg[i], IDLEZERO_RUN); else @@ -227,8 +226,7 @@ vm_pagezero(void __unused *arg) ++idlezero_count; break; } - if (lwkt_check_resched(curthread)) - lwkt_switch(); + lwkt_yield(); } } -- 2.41.0
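The following user-space sketches model the mechanisms this patch touches; none of them are code from the tree, and every *_model / toy_* identifier is invented for illustration.

First, the td_critcount split. With the critical section count separated from td_pri, crit_enter()/crit_exit() reduce to a plain nestable counter, and deferred interrupt work (the splz_check() path gated by gd_reqflags) only runs once the count returns to zero, roughly like this:

#include <stdio.h>

struct model_thread {
	int critcount;		/* models td_critcount */
	int reqflags;		/* models gd_reqflags: deferred work pending */
};

static void
splz_check_model(struct model_thread *td)
{
	if (td->reqflags && td->critcount == 0) {
		printf("running deferred interrupt work\n");
		td->reqflags = 0;
	}
}

static void
crit_enter_model(struct model_thread *td)
{
	++td->critcount;	/* nestable; no priority bits involved */
}

static void
crit_exit_model(struct model_thread *td)
{
	--td->critcount;
	splz_check_model(td);	/* only fires once fully uncounted */
}

int
main(void)
{
	struct model_thread td = { 0, 0 };

	crit_enter_model(&td);
	crit_enter_model(&td);	/* nested section */
	td.reqflags = 1;	/* "interrupt" arrives while protected */
	crit_exit_model(&td);	/* still nested: work stays pending */
	crit_exit_model(&td);	/* count hits 0: deferred work runs now */
	return 0;
}

This is the shape the thread2.h hunks express with ++td->td_critcount / --td->td_critcount and the gd_reqflags check in _crit_exit().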
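Second, the run queue change. gd_tdrunq[32] plus gd_runqmask collapse into a single per-cpu queue, and lwkt_runnable() becomes a simple TAILQ_FIRST() test. Below is a toy model assuming the single queue is kept sorted by hard priority; the ordering policy itself is not visible in these hunks, only the "one TAILQ, empty means nothing runnable" shape is.

#include <sys/queue.h>
#include <stdio.h>

struct toy_thread {
	TAILQ_ENTRY(toy_thread) threadq;	/* models td_threadq */
	const char *comm;
	int pri;				/* models td_pri, 0-31 */
};

TAILQ_HEAD(toy_queue, toy_thread);

/* enqueue so that higher-priority threads stay toward the head */
static void
toy_schedule(struct toy_queue *q, struct toy_thread *td)
{
	struct toy_thread *scan;

	TAILQ_FOREACH(scan, q, threadq) {
		if (scan->pri < td->pri) {
			TAILQ_INSERT_BEFORE(scan, td, threadq);
			return;
		}
	}
	TAILQ_INSERT_TAIL(q, td, threadq);
}

/* models the new lwkt_runnable(): queue non-empty == work to do */
static int
toy_runnable(const struct toy_queue *q)
{
	return (TAILQ_FIRST(q) != NULL);
}

int
main(void)
{
	struct toy_queue runq = TAILQ_HEAD_INITIALIZER(runq);
	struct toy_thread a = { .comm = "crypto0", .pri = 16 };
	struct toy_thread b = { .comm = "hammer-M", .pri = 20 };
	struct toy_thread c = { .comm = "userland", .pri = 6 };
	struct toy_thread *td;

	toy_schedule(&runq, &a);
	toy_schedule(&runq, &b);
	toy_schedule(&runq, &c);

	printf("runnable: %d\n", toy_runnable(&runq));
	TAILQ_FOREACH(td, &runq, threadq)
		printf("  pri %2d  %s\n", td->pri, td->comm);
	return 0;
}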
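Third, the fair-share quantum. TDFAIRQ_MAX(gd) budgets gd_fairq_total_pri * TDFAIRQ_SCALE, i.e. roughly TDFAIRQ_SCALE ticks of service for the whole run queue (8 ticks = 80ms at hz = 100). The per-tick bookkeeping on td_fairq_accum lives in lwkt_thread.c and is not shown in this patch, so the charging rule below (charge gd_fairq_total_pri per hardclock tick, ask for a reschedule once the budget is reached) is only an assumption used to show the arithmetic.

#include <stdio.h>

#define HZ		100	/* ticks per second, as in the comment above */
#define TDFAIRQ_SCALE	8	/* ~8 ticks of service for the run queue */

struct model_gd {		/* stands in for struct globaldata */
	int fairq_total_pri;	/* sum of priorities of runnable threads */
};

struct model_td {		/* stands in for struct thread */
	int pri;		/* hard LWKT priority, 0-31 */
	int fairq_accum;	/* per-tick charge accumulator (model) */
};

/* budget: the whole run queue serviced in about TDFAIRQ_SCALE ticks */
static int
tdfairq_max_model(const struct model_gd *gd)
{
	return (gd->fairq_total_pri * TDFAIRQ_SCALE);
}

/*
 * Called once per hardclock tick for the running thread (loosely models
 * the lwkt_fairq_schedulerclock() hook); returns 1 when a reschedule is
 * wanted.  The charging rule here is assumed, not taken from the kernel.
 */
static int
fairq_tick_model(struct model_gd *gd, struct model_td *td)
{
	td->fairq_accum += gd->fairq_total_pri;
	return (td->fairq_accum >= tdfairq_max_model(gd));
}

int
main(void)
{
	struct model_gd gd = { .fairq_total_pri = 25 }; /* one runnable thread */
	struct model_td td = { .pri = 25, .fairq_accum = 0 };
	int tick;

	for (tick = 1; tick <= 16; ++tick) {
		if (fairq_tick_model(&gd, &td)) {
			printf("tick %d (~%d ms at hz=%d): quantum exhausted\n",
			       tick, tick * 1000 / HZ, HZ);
			td.fairq_accum = 0;	/* model: fresh quantum */
		}
	}
	return 0;
}

With a single runnable thread this trips every 8 ticks, i.e. every ~80ms, matching the TDFAIRQ_SCALE comment; a cpu-bound kernel thread therefore loses the cpu periodically instead of never.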
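Finally, the caller-side pattern. The ffs_rawread.c and vm_zeroidle.c hunks drop the ticks/hogticks and lwkt_check_resched() bookkeeping and simply call lwkt_yield()/lwkt_user_yield() on every pass, trusting them to be cheap until the quantum is gone. A stand-alone sketch of that calling pattern, with a stub standing in for the real yield and an invented threshold:

#include <stdio.h>

static int quantum_used;	/* models the fair-share accumulator */

/* stub for lwkt_yield(): cheap early-out in the common case */
static void
lwkt_yield_model(void)
{
	if (++quantum_used < 4)
		return;
	printf("quantum exhausted -> would switch to another thread\n");
	quantum_used = 0;	/* model: fresh quantum after the switch */
}

static void
zero_chunk(int chunk)
{
	printf("zeroing chunk %d\n", chunk);
}

int
main(void)
{
	int chunk;

	/*
	 * Old style needed "did enough ticks pass?" tests around
	 * uio_yield(); new style just calls the yield helper every
	 * iteration of the cpu-bound loop.
	 */
	for (chunk = 0; chunk < 10; ++chunk) {
		lwkt_yield_model();
		zero_chunk(chunk);
	}
	return 0;
}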