From 52eedfb5ec3690b943858e66c64256b1683c1072 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 29 May 2006 03:57:21 +0000 Subject: [PATCH] Further isolate the user process scheduler data by moving more variables from the globaldata structure to the scheduler module(s). Make the user process scheduler MP safe. Make the LWKT 'pull thread' (to a different cpu) feature MP safe. Streamline the user process scheduler API. Do a near complete rewrite of the BSD4 scheduler. Remote reschedules (reschedules to other cpus), cpu pickup of queued processes, and locality of reference handling should make the new BSD4 scheduler a lot more responsive. Add a demonstration user process scheduler called 'dummy' (kern/usched_dummy.c). Add a kenv variable 'kern.user_scheduler' that can be set to the desired scheduler on boot (i.e. 'bsd4' or 'dummy'). NOTE: Until more of the system is taken out from under the MP lock, these changes actually slow things down slightly. Buildworlds are about ~2.7% slower. --- sys/conf/files | 3 +- sys/ddb/db_ps.c | 15 +- sys/i386/i386/trap.c | 38 +- sys/kern/init_main.c | 5 +- sys/kern/kern_synch.c | 10 +- sys/kern/kern_usched.c | 16 +- sys/kern/lwkt_thread.c | 68 +- sys/kern/usched_bsd4.c | 1288 ++++++++++++++++++--------------- sys/kern/usched_dummy.c | 545 ++++++++++++++ sys/platform/pc32/i386/trap.c | 38 +- sys/sys/globaldata.h | 13 +- sys/sys/thread.h | 5 +- sys/sys/usched.h | 8 +- 13 files changed, 1339 insertions(+), 713 deletions(-) create mode 100644 sys/kern/usched_dummy.c diff --git a/sys/conf/files b/sys/conf/files index a13d2e33ef..70bdeb5d84 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1,5 +1,5 @@ # $FreeBSD: src/sys/conf/files,v 1.340.2.137 2003/06/04 17:10:30 sam Exp $ -# $DragonFly: src/sys/conf/files,v 1.125 2006/05/23 20:35:07 dillon Exp $ +# $DragonFly: src/sys/conf/files,v 1.126 2006/05/29 03:57:16 dillon Exp $ # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and @@ -531,6 +531,7 @@ kern/kern_msfbuf.c standard kern/kern_subr.c standard kern/kern_usched.c standard kern/usched_bsd4.c standard +kern/usched_dummy.c standard kern/kern_umtx.c standard kern/lwkt_thread.c standard kern/lwkt_ipiq.c standard diff --git a/sys/ddb/db_ps.c b/sys/ddb/db_ps.c index d8efba594b..65343de3fe 100644 --- a/sys/ddb/db_ps.c +++ b/sys/ddb/db_ps.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/ddb/db_ps.c,v 1.20 1999/08/28 00:41:09 peter Exp $ - * $DragonFly: src/sys/ddb/db_ps.c,v 1.17 2006/05/19 18:26:27 dillon Exp $ + * $DragonFly: src/sys/ddb/db_ps.c,v 1.18 2006/05/29 03:57:18 dillon Exp $ */ #include #include @@ -104,11 +104,6 @@ db_ps(db_expr_t dummy1, boolean_t dummy2, db_expr_t dummy3, char *dummy4) db_printf("cpu %d tdrunqmask %08x curthread %p reqflags %04x\n", gd->gd_cpuid, gd->gd_runqmask, gd->gd_curthread, gd->gd_reqflags); - db_printf(" uschedcp %p (%d/%d) upri %d\n", - gd->gd_uschedcp, - (gd->gd_uschedcp ? gd->gd_uschedcp->lwp_proc->p_pid : -1), - (gd->gd_uschedcp ? gd->gd_uschedcp->lwp_tid : -1), - gd->gd_upri); if (gd->gd_curthread && gd->gd_curthread->td_preempted) { db_printf(" PREEMPTING THREAD %p\n", gd->gd_curthread->td_preempted); @@ -181,14 +176,10 @@ db_ps(db_expr_t dummy1, boolean_t dummy2, db_expr_t dummy3, char *dummy4) } if (db_more(&nl) < 0) return; - db_printf("CURCPU %d CURTHREAD %p (%d) USCHEDCP %p (%d/%d) UPRI %d\n", + db_printf("CURCPU %d CURTHREAD %p (%d)\n", mycpu->gd_cpuid, curthread, - (curthread->td_proc ? 
curthread->td_proc->p_pid : -1), - mycpu->gd_uschedcp, - (mycpu->gd_uschedcp ? mycpu->gd_uschedcp->lwp_proc->p_pid : -1), - (mycpu->gd_uschedcp ? mycpu->gd_uschedcp->lwp_tid : -1), - mycpu->gd_upri); + (curthread->td_proc ? curthread->td_proc->p_pid : -1)); db_dump_td_tokens(curthread); } diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index ba66ddd694..c77aa3d394 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -36,7 +36,7 @@ * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $ - * $DragonFly: src/sys/i386/i386/Attic/trap.c,v 1.75 2006/05/22 06:26:30 swildner Exp $ + * $DragonFly: src/sys/i386/i386/Attic/trap.c,v 1.76 2006/05/29 03:57:19 dillon Exp $ */ /* @@ -317,7 +317,6 @@ userexit(struct lwp *lp) lp->lwp_proc->p_usched->release_curproc(lp); #endif -again: /* * Handle a LWKT reschedule request first. Since our passive release * is still in place we do not have to do anything special. @@ -326,24 +325,12 @@ again: lwkt_switch(); /* - * Acquire the current process designation if we do not own it. - * Note that acquire_curproc() does not reset the user reschedule - * bit on purpose, because we may need to accumulate over several - * threads waking up at the same time. - * - * NOTE: userland scheduler cruft: because processes are removed - * from the userland scheduler's queue we run through loops to try - * to figure out which is the best of [ existing, waking-up ] - * threads. + * Acquire the current process designation for this user scheduler + * on this cpu. This will also handle any user-reschedule requests. */ - if (lp != gd->gd_uschedcp) { - ++slow_release; - lp->lwp_proc->p_usched->acquire_curproc(lp); - /* We may have switched cpus on acquisition */ - gd = td->td_gd; - } else { - ++fast_release; - } + lp->lwp_proc->p_usched->acquire_curproc(lp); + /* We may have switched cpus on acquisition */ + gd = td->td_gd; /* * Reduce our priority in preparation for a return to userland. If @@ -363,19 +350,6 @@ again: */ if (lwkt_checkpri_self()) lwkt_switch(); - - /* - * If a userland reschedule is [still] pending we may not be the best - * selected process. Select a better one. If another LWKT resched - * is pending the trap will be re-entered. - */ - if (user_resched_wanted()) { - lp->lwp_proc->p_usched->select_curproc(gd); - if (lp != gd->gd_uschedcp) { - lwkt_setpri_self(TDPRI_KERN_USER); - goto again; - } - } } /* diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 80a2c3c592..46b665aff5 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -40,7 +40,7 @@ * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 * $FreeBSD: src/sys/kern/init_main.c,v 1.134.2.8 2003/06/06 20:21:32 tegge Exp $ - * $DragonFly: src/sys/kern/init_main.c,v 1.55 2006/05/25 07:36:34 dillon Exp $ + * $DragonFly: src/sys/kern/init_main.c,v 1.56 2006/05/29 03:57:20 dillon Exp $ */ #include "opt_init_path.h" @@ -556,9 +556,8 @@ start_init(void *dummy) * release it. 
*/ if ((error = execve(&args)) == 0) { - if (lp->lwp_thread->td_gd->gd_uschedcp != lp) - lp->lwp_proc->p_usched->acquire_curproc(lp); rel_mplock(); + lp->lwp_proc->p_usched->acquire_curproc(lp); return; } if (error != ENOENT) diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index 8d9bf1e2b0..afccaf9c24 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -37,7 +37,7 @@ * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $ - * $DragonFly: src/sys/kern/kern_synch.c,v 1.62 2006/05/27 01:51:26 dillon Exp $ + * $DragonFly: src/sys/kern/kern_synch.c,v 1.63 2006/05/29 03:57:20 dillon Exp $ */ #include "opt_ktrace.h" @@ -251,6 +251,8 @@ schedcpu_resource(struct proc *p, void *data __unused) /* * This is only used by ps. Generate a cpu percentage use over * a period of one second. + * + * MPSAFE */ void updatepcpu(struct lwp *lp, int cpticks, int ttlticks) @@ -433,11 +435,9 @@ tsleep(void *ident, int flags, const char *wmesg, int timo) */ if (p) { /* - * Ok, we are sleeping. Remove us from the userland runq - * and place us in the SSLEEP state. + * Ok, we are sleeping. Place us in the SSLEEP state. */ - if (p->p_flag & P_ONRUNQ) - p->p_usched->remrunqueue(&p->p_lwp); + KKASSERT((p->p_flag & P_ONRUNQ) == 0); p->p_stat = SSLEEP; p->p_stats->p_ru.ru_nvcsw++; lwkt_switch(); diff --git a/sys/kern/kern_usched.c b/sys/kern/kern_usched.c index b6e101b196..cf8b6ce3ad 100644 --- a/sys/kern/kern_usched.c +++ b/sys/kern/kern_usched.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/kern/kern_usched.c,v 1.2 2006/03/23 14:04:24 drhodus Exp $ + * $DragonFly: src/sys/kern/kern_usched.c,v 1.3 2006/05/29 03:57:20 dillon Exp $ */ #include @@ -51,11 +51,21 @@ static TAILQ_HEAD(, usched) usched_list = TAILQ_HEAD_INITIALIZER(usched_list); struct usched * usched_init(void) { + const char *defsched; + + defsched = getenv("kern.user_scheduler"); + /* - * Add the bsd4 userland scheduler to the system. + * Add various userland schedulers to the system. */ usched_ctl(&usched_bsd4, USCH_ADD); - return(&usched_bsd4); + usched_ctl(&usched_dummy, USCH_ADD); + if (defsched == NULL ) + return(&usched_bsd4); + if (strcmp(defsched, "bsd4") == 0) + return(&usched_bsd4); + printf("WARNING: Running dummy userland scheduler\n"); + return(&usched_dummy); } /* diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index e9db34b45c..b024aa0910 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/kern/lwkt_thread.c,v 1.96 2006/05/21 20:23:25 dillon Exp $ + * $DragonFly: src/sys/kern/lwkt_thread.c,v 1.97 2006/05/29 03:57:20 dillon Exp $ */ /* @@ -309,6 +309,9 @@ lwkt_init_thread_remote(void *arg) { thread_t td = arg; + /* + * Protected by critical section held by IPI dispatch + */ TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq); } @@ -1059,35 +1062,59 @@ lwkt_schedule(thread_t td) crit_exit_gd(mygd); } +#ifdef SMP + /* - * Managed acquisition. This code assumes that the MP lock is held for - * the tdallq operation and that the thread has been descheduled from its - * original cpu. 
We also have to wait for the thread to be entirely switched - * out on its original cpu (this is usually fast enough that we never loop) - * since the LWKT system does not have to hold the MP lock while switching - * and the target may have released it before switching. + * Thread migration using a 'Pull' method. The thread may or may not be + * the current thread. It MUST be descheduled and in a stable state. + * lwkt_giveaway() must be called on the cpu owning the thread. + * + * At any point after lwkt_giveaway() is called, the target cpu may + * 'pull' the thread by calling lwkt_acquire(). + * + * MPSAFE - must be called under very specific conditions. */ +void +lwkt_giveaway(thread_t td) +{ + globaldata_t gd = mycpu; + + crit_enter_gd(gd); + KKASSERT(td->td_gd == gd); + TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq); + td->td_flags |= TDF_MIGRATING; + crit_exit_gd(gd); +} + void lwkt_acquire(thread_t td) { globaldata_t gd; globaldata_t mygd; + KKASSERT(td->td_flags & TDF_MIGRATING); gd = td->td_gd; mygd = mycpu; - cpu_lfence(); - KKASSERT((td->td_flags & TDF_RUNQ) == 0); - while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) /* XXX spin */ + if (gd != mycpu) { cpu_lfence(); - if (gd != mygd) { + KKASSERT((td->td_flags & TDF_RUNQ) == 0); crit_enter_gd(mygd); - TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq); /* protected by BGL */ + while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) + cpu_lfence(); td->td_gd = mygd; - TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq); /* protected by BGL */ + TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq); + td->td_flags &= ~TDF_MIGRATING; + crit_exit_gd(mygd); + } else { + crit_enter_gd(mygd); + TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq); + td->td_flags &= ~TDF_MIGRATING; crit_exit_gd(mygd); } } +#endif + /* * Generic deschedule. Descheduling threads other then your own should be * done only in carefully controlled circumstances. Descheduling is @@ -1188,11 +1215,12 @@ lwkt_checkpri_self(void) } /* - * Migrate the current thread to the specified cpu. The BGL must be held - * (for the gd_tdallq manipulation XXX). This is accomplished by - * descheduling ourselves from the current cpu, moving our thread to the - * tdallq of the target cpu, IPI messaging the target cpu, and switching out. - * TDF_MIGRATING prevents scheduling races while the thread is being migrated. + * Migrate the current thread to the specified cpu. + * + * This is accomplished by descheduling ourselves from the current cpu, + * moving our thread to the tdallq of the target cpu, IPI messaging the + * target cpu, and switching out. TDF_MIGRATING prevents scheduling + * races while the thread is being migrated. */ #ifdef SMP static void lwkt_setcpu_remote(void *arg); @@ -1208,11 +1236,11 @@ lwkt_setcpu_self(globaldata_t rgd) crit_enter_quick(td); td->td_flags |= TDF_MIGRATING; lwkt_deschedule_self(td); - TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq); /* protected by BGL */ - TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq); /* protected by BGL */ + TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq); lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td); lwkt_switch(); /* we are now on the target cpu */ + TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq); crit_exit_quick(td); } #endif diff --git a/sys/kern/usched_bsd4.c b/sys/kern/usched_bsd4.c index 415cdad6eb..47958ed7ef 100644 --- a/sys/kern/usched_bsd4.c +++ b/sys/kern/usched_bsd4.c @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.8 2006/04/23 17:48:59 dillon Exp $ + * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.9 2006/05/29 03:57:20 dillon Exp $ */ #include @@ -33,14 +33,17 @@ #include #include #include -#include #include #include #include +#include #include #include #include +#include +#include + /* * Priorities. Note that with 32 run queues per scheduler each queue * represents four priority levels. @@ -56,6 +59,7 @@ #define NQS 32 /* 32 run queues. */ #define PPQ (MAXPRI / NQS) /* priorities per queue */ +#define PPQMASK (PPQ - 1) /* * NICEPPQ - number of nice units per priority queue @@ -81,19 +85,25 @@ TAILQ_HEAD(rq, lwp); #define lwp_rqindex lwp_usdata.bsd4.rqindex #define lwp_origcpu lwp_usdata.bsd4.origcpu #define lwp_estcpu lwp_usdata.bsd4.estcpu +#define lwp_rqtype lwp_usdata.bsd4.rqtype static void bsd4_acquire_curproc(struct lwp *lp); static void bsd4_release_curproc(struct lwp *lp); static void bsd4_select_curproc(globaldata_t gd); static void bsd4_setrunqueue(struct lwp *lp); -static void bsd4_remrunqueue(struct lwp *lp); static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp); +static void bsd4_recalculate_estcpu(struct lwp *lp); static void bsd4_resetpriority(struct lwp *lp); static void bsd4_forking(struct lwp *plp, struct lwp *lp); static void bsd4_exiting(struct lwp *plp, struct lwp *lp); -static void bsd4_recalculate_estcpu(struct lwp *lp); +#ifdef SMP +static void need_user_resched_remote(void *dummy); +#endif +static struct lwp *chooseproc_locked(struct lwp *chklp); +static void bsd4_remrunqueue_locked(struct lwp *lp); +static void bsd4_setrunqueue_locked(struct lwp *lp); struct usched usched_bsd4 = { { NULL }, @@ -104,7 +114,6 @@ struct usched usched_bsd4 = { bsd4_release_curproc, bsd4_select_curproc, bsd4_setrunqueue, - bsd4_remrunqueue, bsd4_schedulerclock, bsd4_recalculate_estcpu, bsd4_resetpriority, @@ -113,6 +122,15 @@ struct usched usched_bsd4 = { NULL /* setcpumask not supported */ }; +struct usched_bsd4_pcpu { + struct thread helper_thread; + short rrcount; + short upri; + struct lwp *uschedcp; +}; + +typedef struct usched_bsd4_pcpu *bsd4_pcpu_t; + /* * We have NQS (32) run queues per scheduling class. For the normal * class, there are 128 priorities scaled onto these 32 queues. New @@ -125,20 +143,22 @@ struct usched usched_bsd4 = { * the state of all 32 queues and then a ffs() to find the first busy * queue. 
*/ -static struct rq queues[NQS]; -static struct rq rtqueues[NQS]; -static struct rq idqueues[NQS]; -static u_int32_t queuebits; -static u_int32_t rtqueuebits; -static u_int32_t idqueuebits; -static cpumask_t curprocmask = -1; /* currently running a user process */ -static cpumask_t rdyprocmask; /* ready to accept a user process */ -static int runqcount; +static struct rq bsd4_queues[NQS]; +static struct rq bsd4_rtqueues[NQS]; +static struct rq bsd4_idqueues[NQS]; +static u_int32_t bsd4_queuebits; +static u_int32_t bsd4_rtqueuebits; +static u_int32_t bsd4_idqueuebits; +static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */ +static cpumask_t bsd4_rdyprocmask; /* ready to accept a user process */ +static int bsd4_runqcount; #ifdef SMP -static int scancpu; +static volatile int bsd4_scancpu; #endif +static struct spinlock bsd4_spin; +static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU]; -SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0, ""); #ifdef INVARIANTS static int usched_nonoptimal; SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW, @@ -150,12 +170,9 @@ SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW, static int usched_debug = -1; SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, ""); #ifdef SMP -static int remote_resched = 1; static int remote_resched_nonaffinity; static int remote_resched_affinity; static int choose_affinity; -SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW, - &remote_resched, 0, "Resched to another cpu"); SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD, &remote_resched_nonaffinity, 0, "Number of remote rescheds"); SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD, @@ -179,106 +196,195 @@ rqinit(void *dummy) { int i; + spin_init(&bsd4_spin); for (i = 0; i < NQS; i++) { - TAILQ_INIT(&queues[i]); - TAILQ_INIT(&rtqueues[i]); - TAILQ_INIT(&idqueues[i]); + TAILQ_INIT(&bsd4_queues[i]); + TAILQ_INIT(&bsd4_rtqueues[i]); + TAILQ_INIT(&bsd4_idqueues[i]); } - atomic_clear_int(&curprocmask, 1); + atomic_clear_int(&bsd4_curprocmask, 1); } SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL) /* - * chooseproc() is called when a cpu needs a user process to LWKT schedule, - * it selects a user process and returns it. If chklp is non-NULL and chklp - * has a better or equal priority then the process that would otherwise be - * chosen, NULL is returned. + * BSD4_ACQUIRE_CURPROC * - * Until we fix the RUNQ code the chklp test has to be strict or we may - * bounce between processes trying to acquire the current process designation. + * This function is called when the kernel intends to return to userland. + * It is responsible for making the thread the current designated userland + * thread for this cpu, blocking if necessary. + * + * We are expected to handle userland reschedule requests here too. + * + * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE + * TO ANOTHER CPU! Because most of the kernel assumes that no migration will + * occur, this function is called only under very controlled circumstances. + * + * Basically we recalculate our estcpu to hopefully give us a more + * favorable disposition, setrunqueue, then wait for the curlwp + * designation to be handed to us (if the setrunqueue didn't do it). 
+ * + * MPSAFE */ -static -struct lwp * -chooseproc(struct lwp *chklp) +static void +bsd4_acquire_curproc(struct lwp *lp) { - struct lwp *lp; - struct rq *q; - u_int32_t *which; - u_int32_t pri; + globaldata_t gd = mycpu; + bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; - if (rtqueuebits) { - pri = bsfl(rtqueuebits); - q = &rtqueues[pri]; - which = &rtqueuebits; - } else if (queuebits) { - pri = bsfl(queuebits); - q = &queues[pri]; - which = &queuebits; - } else if (idqueuebits) { - pri = bsfl(idqueuebits); - q = &idqueues[pri]; - which = &idqueuebits; - } else { - return NULL; - } - lp = TAILQ_FIRST(q); - KASSERT(lp, ("chooseproc: no lwp on busy queue")); + /* + * Possibly select another thread, or keep the current thread. + */ + if (user_resched_wanted()) + bsd4_select_curproc(gd); /* - * If the passed lwp is reasonably close to the selected - * lwp , return NULL (indicating that should be kept). - * - * Note that we must error on the side of to avoid bouncing - * between threads in the acquire code. + * If uschedcp is still pointing to us, we're done */ - if (chklp) { - if (chklp->lwp_priority < lp->lwp_priority + PPQ) - return(NULL); - } + if (dd->uschedcp == lp) + return; -#ifdef SMP /* - * If the chosen lwp does not reside on this cpu spend a few - * cycles looking for a better candidate at the same priority level. - * This is a fallback check, setrunqueue() tries to wakeup the - * correct cpu and is our front-line affinity. + * If this cpu has no current thread, and the run queue is + * empty, we can safely select ourself. */ - if (lp->lwp_thread->td_gd != mycpu && - (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL - ) { - if (chklp->lwp_thread->td_gd == mycpu) { - ++choose_affinity; - lp = chklp; - } + if (dd->uschedcp == NULL && bsd4_runqcount == 0) { + atomic_set_int(&bsd4_curprocmask, gd->gd_cpumask); + dd->uschedcp = lp; + dd->upri = lp->lwp_priority; + return; } -#endif - TAILQ_REMOVE(q, lp, lwp_procq); - --runqcount; - if (TAILQ_EMPTY(q)) - *which &= ~(1 << pri); - KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!")); - lp->lwp_proc->p_flag &= ~P_ONRUNQ; - return lp; + /* + * Adjust estcpu and recalculate our priority, then put us back on + * the user process scheduler's runq. Only increment the involuntary + * context switch count if the setrunqueue call did not immediately + * schedule us. + * + * Loop until we become the currently scheduled process. Note that + * calling setrunqueue can cause us to be migrated to another cpu + * after we switch away. + */ + do { + crit_enter(); + bsd4_recalculate_estcpu(lp); + lwkt_deschedule_self(gd->gd_curthread); + bsd4_setrunqueue(lp); + if ((gd->gd_curthread->td_flags & TDF_RUNQ) == 0) + ++lp->lwp_stats->p_ru.ru_nivcsw; + lwkt_switch(); + crit_exit(); + gd = mycpu; + dd = &bsd4_pcpu[gd->gd_cpuid]; + } while (dd->uschedcp != lp); + KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0); +} + +/* + * BSD4_RELEASE_CURPROC + * + * This routine detaches the current thread from the userland scheduler, + * usually because the thread needs to run in the kernel (at kernel priority) + * for a while. + * + * This routine is also responsible for selecting a new thread to + * make the current thread. + * + * NOTE: This implementation differs from the dummy example in that + * bsd4_select_curproc() is able to select the current process, whereas + * dummy_select_curproc() is not able to select the current process. + * This means we have to NULL out uschedcp. 
+ *
+ * Additionally, note that we may already be on a run queue if releasing
+ * via the lwkt_switch() in bsd4_setrunqueue().
+ *
+ * WARNING! The MP lock may be in an unsynchronized state due to the
+ * way get_mplock() works and the fact that this function may be called
+ * from a passive release during a lwkt_switch(). try_mplock() will deal
+ * with this for us but you should be aware that td_mpcount may not be
+ * usable.
+ *
+ * MPSAFE
+ */
+static void
+bsd4_release_curproc(struct lwp *lp)
+{
+	globaldata_t gd = mycpu;
+	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
+
+	if (dd->uschedcp == lp) {
+		/*
+		 * Note: we leave our curprocmask bit set to prevent
+		 * unnecessary scheduler helper wakeups.
+		 * bsd4_select_curproc() will clean it up.
+		 */
+		KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
+		dd->uschedcp = NULL;	/* don't let lp be selected */
+		bsd4_select_curproc(gd);
+	}
 }
 
-#ifdef SMP
 /*
- * called via an ipi message to reschedule on another cpu.
+ * BSD4_SELECT_CURPROC
+ *
+ * Select a new current process for this cpu. This satisfies a user
+ * scheduler reschedule request so clear that too.
+ *
+ * This routine is also responsible for equal-priority round-robining,
+ * typically triggered from bsd4_schedulerclock(). In our dummy example
+ * all the 'user' threads are LWKT scheduled all at once and we just
+ * call lwkt_switch().
+ *
+ * MPSAFE
 */
 static
 void
-need_user_resched_remote(void *dummy)
+bsd4_select_curproc(globaldata_t gd)
 {
-	need_user_resched();
-}
+	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
+	struct lwp *nlp;
+	int cpuid = gd->gd_cpuid;
 
+	crit_enter_gd(gd);
+	clear_user_resched();	/* This satisfied the reschedule request */
+	dd->rrcount = 0;	/* Reset the round-robin counter */
+
+	spin_lock_wr(&bsd4_spin);
+	if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
+		atomic_set_int(&bsd4_curprocmask, 1 << cpuid);
+		dd->upri = nlp->lwp_priority;
+		dd->uschedcp = nlp;
+		spin_unlock_wr(&bsd4_spin);
+#ifdef SMP
+		lwkt_acquire(nlp->lwp_thread);
 #endif
+		lwkt_schedule(nlp->lwp_thread);
+	} else if (dd->uschedcp) {
+		dd->upri = dd->uschedcp->lwp_priority;
+		spin_unlock_wr(&bsd4_spin);
+		KKASSERT(bsd4_curprocmask & (1 << cpuid));
+	} else if (bsd4_runqcount && (bsd4_rdyprocmask & (1 << cpuid))) {
+		atomic_clear_int(&bsd4_curprocmask, 1 << cpuid);
+		atomic_clear_int(&bsd4_rdyprocmask, 1 << cpuid);
+		dd->uschedcp = NULL;
+		dd->upri = PRIBASE_NULL;
+		spin_unlock_wr(&bsd4_spin);
+		lwkt_schedule(&dd->helper_thread);
+	} else {
+		dd->uschedcp = NULL;
+		dd->upri = PRIBASE_NULL;
+		atomic_clear_int(&bsd4_curprocmask, 1 << cpuid);
+		spin_unlock_wr(&bsd4_spin);
+	}
+	crit_exit_gd(gd);
+}
 
 /*
- * setrunqueue() 'wakes up' a 'user' process. GIANT must be held. The
- * user process may represent any user process, including the current
- * process.
+ * BSD4_SETRUNQUEUE
+ *
+ * This routine is called to schedule a new user process after a fork.
+ *
+ * The caller may set P_PASSIVE_ACQ in p_flag to indicate that we should
+ * attempt to leave the thread on the current cpu.
  *
  * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
  * cpus in an attempt to keep the process on the current cpu at least for
@@ -291,36 +397,38 @@ need_user_resched_remote(void *dummy)
  * priority then the processes running on other cpus, we will allow the
  * process to be stolen by another cpu.
  *
- * WARNING! a thread can be acquired by another cpu the moment it is put
- * on the user scheduler's run queue AND we release the MP lock. Since we
- * release the MP lock before switching out another cpu may begin stealing
- * our current thread before we are completely switched out! The
- * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
- * thread before stealing it.
+ * WARNING! This routine cannot block. bsd4_acquire_curproc() does
+ * a deschedule/switch interlock and we can be moved to another cpu
+ * the moment we are switched out. Our LWKT run state is the only
+ * thing preventing the transfer.
  *
- * NOTE on need_user_resched() calls: we have to call need_user_resched()
- * if the new process is more important then the current process, or if
- * the new process is the current process and is now less important then
- * other processes.
+ * The associated thread must NOT currently be scheduled (but can be the
+ * current process after it has been LWKT descheduled). It must NOT be on
+ * a bsd4 scheduler queue either. The purpose of this routine is to put
+ * it on a scheduler queue or make it the current user process and LWKT
+ * schedule it. It is possible that the thread is in the middle of a LWKT
+ * switchout on another cpu, lwkt_acquire() deals with that case.
  *
- * The associated thread must NOT be scheduled.
  * The process must be runnable.
- * This must be called at splhigh().
+ *
+ * MPSAFE
  */
 static void
 bsd4_setrunqueue(struct lwp *lp)
 {
-	struct rq *q;
-	struct globaldata *gd;
-	int pri;
+	globaldata_t gd;
+	bsd4_pcpu_t dd;
 	int cpuid;
-	u_int32_t needresched;
 #ifdef SMP
-	int count;
 	cpumask_t mask;
+	cpumask_t tmpmask;
 #endif
 
-	ASSERT_MP_LOCK_HELD(lp->lwp_thread);
+	/*
+	 * First validate the process state relative to the current cpu.
+	 * We don't need the spinlock for this, just a critical section.
+	 * We are in control of the process.
+	 */
 	crit_enter();
 	KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
 	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0,
@@ -329,48 +437,54 @@ bsd4_setrunqueue(struct lwp *lp)
 	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
 
 	/*
-	 * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
+	 * Note: gd and dd are relative to the target thread's last cpu,
+	 * NOT our current cpu.
	 */
 	gd = lp->lwp_thread->td_gd;
+	dd = &bsd4_pcpu[gd->gd_cpuid];
 
 	/*
+	 * If setrunqueue is being called due to being woken up, versus
+	 * being called when acquiring the current process, recalculate
+	 * estcpu.
+	 *
 	 * Because recalculate is only called once or twice for long sleeps,
 	 * not every second forever while the process is sleeping, we have
 	 * to manually call it to resynchronize p_cpbase on wakeup or it
 	 * will wrap if the process was sleeping long enough (e.g. ~10 min
 	 * with the ACPI timer) and really mess up the nticks calculation.
+	 *
+	 * NOTE: because P_ONRUNQ is not set, bsd4_recalculate_estcpu()'s
+	 * calls to resetpriority will just play with the process's priority
+	 * fields and not mess with any queues, so it is MPSAFE in this
+	 * context.
	 */
-	if (lp->lwp_slptime) {
+	if (lp->lwp_slptime && (lp->lwp_thread->td_flags & TDF_RUNNING) == 0) {
 		bsd4_recalculate_estcpu(lp);
 		lp->lwp_slptime = 0;
 	}
+
 	/*
-	 * We have not been released, make sure that we are not the currently
-	 * designated process.
+	 * This process is not supposed to be scheduled anywhere or assigned
+	 * as the current process anywhere. Assert the condition.
	 */
-	KKASSERT(gd->gd_uschedcp != lp);
+	KKASSERT(dd->uschedcp != lp);
 
 	/*
-	 * Check cpu affinity. The associated thread is stable at the
-	 * moment. Note that we may be checking another cpu here so we
-	 * have to be careful. 
We are currently protected by the BGL. + * Check local cpu affinity. The associated thread is stable at + * the moment. Note that we may be checking another cpu here so we + * have to be careful. We can only assign uschedcp on OUR cpu. * * This allows us to avoid actually queueing the process. * acquire_curproc() will handle any threads we mistakenly schedule. */ cpuid = gd->gd_cpuid; - - if ((curprocmask & (1 << cpuid)) == 0) { - atomic_set_int(&curprocmask, 1 << cpuid); - gd->gd_uschedcp = lp; - gd->gd_upri = lp->lwp_priority; + if (gd == mycpu && (bsd4_curprocmask & (1 << cpuid)) == 0) { + atomic_set_int(&bsd4_curprocmask, 1 << cpuid); + dd->uschedcp = lp; + dd->upri = lp->lwp_priority; lwkt_schedule(lp->lwp_thread); - /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */ crit_exit(); -#ifdef SMP - if (gd != mycpu) - ++remote_resched_affinity; -#endif return; } @@ -379,189 +493,80 @@ bsd4_setrunqueue(struct lwp *lp) * to place this process on the userland scheduler's run queue for * action by the target cpu. */ - ++runqcount; - lp->lwp_proc->p_flag |= P_ONRUNQ; - if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) { - pri = (lp->lwp_priority & PRIMASK) / PPQ; - q = &queues[pri]; - queuebits |= 1 << pri; - needresched = (queuebits & ((1 << pri) - 1)); - } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME || - lp->lwp_rtprio.type == RTP_PRIO_FIFO) { - pri = (u_int8_t)lp->lwp_rtprio.prio; - q = &rtqueues[pri]; - rtqueuebits |= 1 << pri; - needresched = (rtqueuebits & ((1 << pri) - 1)); - } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) { - pri = (u_int8_t)lp->lwp_rtprio.prio; - q = &idqueues[pri]; - idqueuebits |= 1 << pri; - needresched = (idqueuebits & ((1 << pri) - 1)); - } else { - needresched = 0; - panic("setrunqueue: invalid rtprio type"); - } - KKASSERT(pri < 32); - lp->lwp_rqindex = pri; /* remember the queue index */ - TAILQ_INSERT_TAIL(q, lp, lwp_procq); - #ifdef SMP /* - * Either wakeup other cpus user thread scheduler or request - * preemption on other cpus (which will also wakeup a HLT). - * - * NOTE! gd and cpuid may still be our 'hint', not our current - * cpu info. + * XXX fixme. Could be part of a remrunqueue/setrunqueue + * operation when the priority is recalculated, so TDF_MIGRATING + * may already be set. */ - - count = runqcount; + if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0) + lwkt_giveaway(lp->lwp_thread); +#endif + spin_lock_wr(&bsd4_spin); + bsd4_setrunqueue_locked(lp); + spin_unlock_wr(&bsd4_spin); /* - * Check cpu affinity for user preemption (when the curprocmask bit - * is set). Note that gd_upri is a speculative field (we modify - * another cpu's gd_upri to avoid sending ipiq storms). + * gd and cpuid may still be our 'hint', not our current cpu info. + * + * Cpu locality of reference. If the LWP has higher priority + * (lower lwp_priority value) on its target cpu, reschedule on + * that cpu. 
*/ - if (gd == mycpu) { - if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) { - if (lp->lwp_priority < gd->gd_upri - PPQ) { - gd->gd_upri = lp->lwp_priority; - gd->gd_rrcount = 0; - need_user_resched(); - --count; - } else if (gd->gd_uschedcp == lp && needresched) { - gd->gd_rrcount = 0; + if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) { + if (dd->upri > lp->lwp_priority) { /* heuristic */ + dd->upri = lp->lwp_priority; /* heuristic */ +#ifdef SMP + if (gd == mycpu) { need_user_resched(); - --count; + } else { + lwkt_send_ipiq(gd, need_user_resched_remote, + NULL); } - } - } else if (remote_resched) { - if (lp->lwp_priority < gd->gd_upri - PPQ) { - gd->gd_upri = lp->lwp_priority; - lwkt_send_ipiq(gd, need_user_resched_remote, NULL); - --count; - ++remote_resched_affinity; - } - } - - /* - * No affinity, first schedule to any cpus that do not have a current - * process. If there is a free cpu we always schedule to it. - */ - if (count && - (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 && - (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) { - if (!mask) - printf("lwp %d/%d nocpu to schedule it on\n", - lp->lwp_proc->p_pid, lp->lwp_tid); - while (mask && count) { - cpuid = bsfl(mask); - KKASSERT((curprocmask & (1 << cpuid)) == 0); - atomic_clear_int(&rdyprocmask, 1 << cpuid); - lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread); - --count; - mask &= ~(1 << cpuid); +#else + need_user_resched(); +#endif + crit_exit(); + return; } } +#ifdef SMP /* - * If there are still runnable processes try to wakeup a random - * cpu that is running a much lower priority process in order to - * preempt on it. Note that gd_upri is only a hint, so we can - * overwrite it from the wrong cpu. If we can't find one, we - * are SOL. + * Otherwise the LWP has a lower priority or we were asked not + * to reschedule. Look for an idle cpu whos scheduler helper + * is ready to accept more work. * - * We depress the priority check so multiple cpu bound programs - * do not bounce between cpus. Remember that the clock interrupt - * will also cause all cpus to reschedule. + * Look for an idle cpu starting at our rotator (bsd4_scancpu). * - * We must mask against rdyprocmask or we will race in the boot - * code (before all cpus have working scheduler helpers), plus - * some cpus might not be operational and/or not configured to - * handle user processes. + * If no cpus are ready to accept work, just return. 
+ * + * XXX P_PASSIVE_ACQ */ - if (count && remote_resched && ncpus > 1) { - cpuid = scancpu; - do { - if (++cpuid == ncpus) - cpuid = 0; - } while (cpuid == mycpu->gd_cpuid); - scancpu = cpuid; - - if (rdyprocmask & (1 << cpuid)) { - gd = globaldata_find(cpuid); - - if (lp->lwp_priority < gd->gd_upri - PPQ) { - gd->gd_upri = lp->lwp_priority; - lwkt_send_ipiq(gd, need_user_resched_remote, NULL); - ++remote_resched_nonaffinity; - } - } - } -#else - if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) { - if (lp->lwp_priority < gd->gd_upri - PPQ) { - gd->gd_upri = lp->lwp_priority; - gd->gd_rrcount = 0; - need_user_resched(); - } else if (gd->gd_uschedcp == lp && needresched) { - gd->gd_rrcount = 0; - need_user_resched(); - } + mask = ~bsd4_curprocmask & bsd4_rdyprocmask & mycpu->gd_other_cpus; + if (mask) { + cpuid = bsd4_scancpu; + if (++cpuid == ncpus) + cpuid = 0; + tmpmask = ~((1 << cpuid) - 1); + if (mask & tmpmask) + cpuid = bsfl(mask & tmpmask); + else + cpuid = bsfl(mask); + atomic_clear_int(&bsd4_rdyprocmask, 1 << cpuid); + bsd4_scancpu = cpuid; + lwkt_schedule(&bsd4_pcpu[cpuid].helper_thread); } #endif crit_exit(); } -/* - * remrunqueue() removes a given process from the run queue that it is on, - * clearing the queue busy bit if it becomes empty. This function is called - * when a userland process is selected for LWKT scheduling. Note that - * LWKT scheduling is an abstraction of 'curproc'.. there could very well be - * several userland processes whos threads are scheduled or otherwise in - * a special state, and such processes are NOT on the userland scheduler's - * run queue. - * - * This must be called at splhigh(). - */ -static void -bsd4_remrunqueue(struct lwp *lp) -{ - struct rq *q; - u_int32_t *which; - u_int8_t pri; - - ASSERT_MP_LOCK_HELD(lp->lwp_thread); - crit_enter(); - KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!")); - lp->lwp_proc->p_flag &= ~P_ONRUNQ; - --runqcount; - KKASSERT(runqcount >= 0); - pri = lp->lwp_rqindex; - if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) { - q = &queues[pri]; - which = &queuebits; - } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME || - lp->lwp_rtprio.type == RTP_PRIO_FIFO) { - q = &rtqueues[pri]; - which = &rtqueuebits; - } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) { - q = &idqueues[pri]; - which = &idqueuebits; - } else { - panic("remrunqueue: invalid rtprio type"); - } - TAILQ_REMOVE(q, lp, lwp_procq); - if (TAILQ_EMPTY(q)) { - KASSERT((*which & (1 << pri)) != 0, - ("remrunqueue: remove from empty queue")); - *which &= ~(1 << pri); - } - crit_exit(); -} - /* * This routine is called from a systimer IPI. It MUST be MP-safe and - * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ. + * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on + * each cpu. + * + * Because this is effectively a 'fast' interrupt, we cannot safely * * MPSAFE */ @@ -570,13 +575,14 @@ void bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) { globaldata_t gd = mycpu; + bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; /* * Do we need to round-robin? We round-robin 10 times a second. * This should only occur for cpu-bound batch processes. 
*/ - if (++gd->gd_rrcount >= usched_bsd4_rrinterval) { - gd->gd_rrcount = 0; + if (++dd->rrcount >= usched_bsd4_rrinterval) { + dd->rrcount = 0; need_user_resched(); } @@ -594,254 +600,208 @@ bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) */ if (lp->lwp_origcpu) --lp->lwp_origcpu; - - /* XXX optimize, avoid lock if no reset is required */ - if (try_mplock()) { - bsd4_resetpriority(lp); - rel_mplock(); - } + bsd4_resetpriority(lp); } /* - * Release the current process designation on p. P MUST BE CURPROC. - * Attempt to assign a new current process from the run queue. - * - * This function is called from exit1(), tsleep(), and the passive - * release code setup in //trap.c + * Called from acquire and from kern_synch's one-second timer (one of the + * callout helper threads) with a critical section held. * - * If we do not have or cannot get the MP lock we just wakeup the userland - * helper scheduler thread for this cpu to do the work for us. + * Decay p_estcpu based on the number of ticks we haven't been running + * and our p_nice. As the load increases each process observes a larger + * number of idle ticks (because other processes are running in them). + * This observation leads to a larger correction which tends to make the + * system more 'batchy'. * - * WARNING! The MP lock may be in an unsynchronized state due to the - * way get_mplock() works and the fact that this function may be called - * from a passive release during a lwkt_switch(). try_mplock() will deal - * with this for us but you should be aware that td_mpcount may not be - * useable. - */ -static void -bsd4_release_curproc(struct lwp *lp) -{ - int cpuid; - globaldata_t gd = mycpu; - - KKASSERT(lp->lwp_thread->td_gd == gd); - crit_enter(); - cpuid = gd->gd_cpuid; - - if (gd->gd_uschedcp == lp) { - if (try_mplock()) { - /* - * If we can obtain the MP lock we can directly - * select the next current process. - * - * bsd4_select_curproc() will adjust curprocmask - * for us. - */ - gd->gd_uschedcp = NULL; - gd->gd_upri = PRIBASE_NULL; - bsd4_select_curproc(gd); - rel_mplock(); - } else { - /* - * If we cannot obtain the MP lock schedule our - * helper thread to select the next current - * process. - * - * This is the only place where we adjust curprocmask - * and rdyprocmask without holding the MP lock. - */ - gd->gd_uschedcp = NULL; - gd->gd_upri = PRIBASE_NULL; - atomic_clear_int(&curprocmask, 1 << cpuid); - if (runqcount && (rdyprocmask & (1 << cpuid))) { - atomic_clear_int(&rdyprocmask, 1 << cpuid); - lwkt_schedule(&mycpu->gd_schedthread); - } - } - } - crit_exit(); -} - -/* - * Select a new current process, potentially retaining gd_uschedcp. However, - * be sure to round-robin. This routine is generally only called if a - * reschedule is requested and that typically only occurs if a new process - * has a better priority or when we are round-robining. + * Note that no recalculation occurs for a process which sleeps and wakes + * up in the same tick. That is, a system doing thousands of context + * switches per second will still only do serious estcpu calculations + * ESTCPUFREQ times per second. * - * NOTE: Must be called with giant held and the current cpu's gd. - * NOTE: The caller must handle the situation where it loses a - * uschedcp designation that it previously held, typically by - * calling acquire_curproc() again. 
- * NOTE: May not block + * MPSAFE */ static -void -bsd4_select_curproc(globaldata_t gd) +void +bsd4_recalculate_estcpu(struct lwp *lp) { - struct lwp *nlp; - int cpuid = gd->gd_cpuid; - void *old; - - clear_user_resched(); - get_mplock(); + globaldata_t gd = mycpu; + sysclock_t cpbase; + int loadfac; + int ndecay; + int nticks; + int nleft; /* - * Choose the next designated current user process. - * Note that we cannot schedule gd_schedthread - * if runqcount is 0 without creating a scheduling - * loop. - * - * We do not clear the user resched request here, - * we need to test it later when we re-acquire. - * - * NOTE: chooseproc returns NULL if the chosen lwp - * is gd_uschedcp. XXX needs cleanup. + * We have to subtract periodic to get the last schedclock + * timeout time, otherwise we would get the upcoming timeout. + * Keep in mind that a process can migrate between cpus and + * while the scheduler clock should be very close, boundary + * conditions could lead to a small negative delta. */ - old = gd->gd_uschedcp; - if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) { - atomic_set_int(&curprocmask, 1 << cpuid); - gd->gd_upri = nlp->lwp_priority; - gd->gd_uschedcp = nlp; - lwkt_acquire(nlp->lwp_thread); - lwkt_schedule(nlp->lwp_thread); - } else if (gd->gd_uschedcp) { - gd->gd_upri = gd->gd_uschedcp->lwp_priority; - KKASSERT(curprocmask & (1 << cpuid)); - } else if (runqcount && (rdyprocmask & (1 << cpuid))) { - /*gd->gd_uschedcp = NULL;*/ - atomic_clear_int(&curprocmask, 1 << cpuid); - atomic_clear_int(&rdyprocmask, 1 << cpuid); - lwkt_schedule(&gd->gd_schedthread); - } else { - /*gd->gd_uschedcp = NULL;*/ - atomic_clear_int(&curprocmask, 1 << cpuid); - } - rel_mplock(); -} - -/* - * Acquire the current process designation on the CURRENT process only. - * This function is called at kernel-user priority (not userland priority) - * when curlwp does not match gd_uschedcp. - * - * This function is only called just prior to returning to user mode. - * - * Basically we recalculate our estcpu to hopefully give us a more - * favorable disposition, setrunqueue, then wait for the curlwp - * designation to be handed to us (if the setrunqueue didn't do it). - * - * WARNING! THIS FUNCTION MAY CAUSE THE CURRENT THREAD TO MIGRATE TO - * ANOTHER CPU! Because most of the kernel assumes that no migration will - * occur, this function is called only under very controlled circumstances. - */ -static void -bsd4_acquire_curproc(struct lwp *lp) -{ - globaldata_t gd = mycpu; + cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic; - get_mplock(); - crit_enter(); + if (lp->lwp_slptime > 1) { + /* + * Too much time has passed, do a coarse correction. + */ + lp->lwp_estcpu = lp->lwp_estcpu >> 1; + bsd4_resetpriority(lp); + lp->lwp_cpbase = cpbase; + lp->lwp_cpticks = 0; + } else if (lp->lwp_cpbase != cpbase) { + /* + * Adjust estcpu if we are in a different tick. Don't waste + * time if we are in the same tick. + * + * First calculate the number of ticks in the measurement + * interval. The nticks calculation can wind up 0 due to + * a bug in the handling of lwp_slptime (as yet not found), + * so make sure we do not get a divide by 0 panic. + */ + nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic; + if (nticks <= 0) + nticks = 1; + updatepcpu(lp, lp->lwp_cpticks, nticks); - /* - * Recalculate our priority and put us back on the userland - * scheduler's runq. - * - * Only increment the involuntary context switch count if the - * setrunqueue call did not immediately schedule us. 
- */ - KKASSERT(lp == gd->gd_curthread->td_lwp); - bsd4_recalculate_estcpu(lp); - lwkt_deschedule_self(gd->gd_curthread); - bsd4_setrunqueue(lp); - if ((gd->gd_curthread->td_flags & TDF_RUNQ) == 0) - ++lp->lwp_stats->p_ru.ru_nivcsw; - lwkt_switch(); + if ((nleft = nticks - lp->lwp_cpticks) < 0) + nleft = 0; + if (usched_debug == lp->lwp_proc->p_pid) { + printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d", + lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu, + lp->lwp_cpticks, nticks, nleft); + } - /* - * Because we put ourselves back on the userland scheduler's run - * queue, WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU - */ - gd = mycpu; + /* + * Calculate a decay value based on ticks remaining scaled + * down by the instantanious load and p_nice. + */ + if ((loadfac = bsd4_runqcount) < 2) + loadfac = 2; + ndecay = nleft * usched_bsd4_decay * 2 * + (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2); - /* - * We better be the current process when we wake up, and we had - * better not be on the run queue. - */ - KKASSERT(gd->gd_uschedcp == lp); - KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0); + /* + * Adjust p_estcpu. Handle a border case where batch jobs + * can get stalled long enough to decay to zero when they + * shouldn't. + */ + if (lp->lwp_estcpu > ndecay * 2) + lp->lwp_estcpu -= ndecay; + else + lp->lwp_estcpu >>= 1; - crit_exit(); - rel_mplock(); + if (usched_debug == lp->lwp_proc->p_pid) + printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu); + bsd4_resetpriority(lp); + lp->lwp_cpbase = cpbase; + lp->lwp_cpticks = 0; + } } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. + * + * This routine may be called with any process. + * + * This routine is called by fork1() for initial setup with the process + * of the run queue, and also may be called normally with the process on or + * off the run queue. + * + * MPSAFE */ static void bsd4_resetpriority(struct lwp *lp) { + bsd4_pcpu_t dd; int newpriority; - int opq; - int npq; - - ASSERT_MP_LOCK_HELD(curthread); + u_short newrqtype; + int reschedcpu; /* - * Set p_priority for general process comparisons + * Calculate the new priority and queue type */ - switch(lp->lwp_rtprio.type) { + crit_enter(); + spin_lock_wr(&bsd4_spin); + + newrqtype = lp->lwp_rtprio.type; + + switch(newrqtype) { case RTP_PRIO_REALTIME: - lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio; - return; + newpriority = PRIBASE_REALTIME + + (lp->lwp_rtprio.prio & PRIMASK); + break; case RTP_PRIO_NORMAL: + newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ; + newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ; + newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ / + NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ); + newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK); break; case RTP_PRIO_IDLE: - lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio; - return; + newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK); + break; case RTP_PRIO_THREAD: - lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio; - return; + newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK); + break; + default: + panic("Bad RTP_PRIO %d", newrqtype); + /* NOT REACHED */ } /* - * NORMAL priorities fall through. These are based on niceness - * and cpu use. Lower numbers == higher priorities. - * - * Calculate our priority based on our niceness and estimated cpu. 
- * Note that the nice value adjusts the baseline, which effects - * cpu bursts but does not effect overall cpu use between cpu-bound - * processes. The use of the nice field in the decay calculation - * controls the overall cpu use. - * - * This isn't an exact calculation. We fit the full nice and - * estcpu range into the priority range so the actual PPQ value - * is incorrect, but it's still a reasonable way to think about it. + * The newpriority incorporates the queue type so do a simple masked + * check to determine if the process has moved to another queue. If + * it has, and it is currently on a run queue, then move it. */ - newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ; - newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ; - newpriority = newpriority * MAXPRI / - (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ); - newpriority = MIN(newpriority, MAXPRI - 1); /* sanity */ - newpriority = MAX(newpriority, 0); /* sanity */ - npq = newpriority / PPQ; - crit_enter(); - opq = (lp->lwp_priority & PRIMASK) / PPQ; - if (lp->lwp_proc->p_stat == SRUN && (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) { - /* - * We have to move the process to another queue - */ - bsd4_remrunqueue(lp); - lp->lwp_priority = PRIBASE_NORMAL + newpriority; - bsd4_setrunqueue(lp); + if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) { + lp->lwp_priority = newpriority; + if (lp->lwp_proc->p_flag & P_ONRUNQ) { + bsd4_remrunqueue_locked(lp); + lp->lwp_rqtype = newrqtype; + lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; + bsd4_setrunqueue_locked(lp); + reschedcpu = lp->lwp_thread->td_gd->gd_cpuid; + } else { + lp->lwp_rqtype = newrqtype; + lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; + reschedcpu = -1; + } } else { - /* - * We can just adjust the priority and it will be picked - * up later. - */ - KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0); - lp->lwp_priority = PRIBASE_NORMAL + newpriority; + lp->lwp_rqtype = newrqtype; + lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; + lp->lwp_priority = newpriority; + reschedcpu = -1; + } + spin_unlock_wr(&bsd4_spin); + + /* + * Determine if we need to reschedule the target cpu. Since at + * most we are moving an already-scheduled lwp around, we don't + * have to be fancy here. + */ + if (reschedcpu >= 0) { + dd = &bsd4_pcpu[reschedcpu]; + if (dd->uschedcp == lp) { + /* + * We don't need to reschedule ourselves. In fact, + * this could lead to a livelock. + */ + dd->upri = lp->lwp_priority; + } else if (dd->upri > lp->lwp_priority) { /* heuristic */ + dd->upri = lp->lwp_priority; /* heuristic */ +#ifdef SMP + if (reschedcpu == mycpu->gd_cpuid) { + need_user_resched(); + } else { + lwkt_send_ipiq(lp->lwp_thread->td_gd, + need_user_resched_remote, NULL); + } +#else + need_user_resched(); +#endif + } } crit_exit(); } @@ -886,102 +846,204 @@ bsd4_exiting(struct lwp *plp, struct lwp *lp) } } + /* - * Called from acquire and from kern_synch's one-second timer with a - * critical section held. + * chooseproc() is called when a cpu needs a user process to LWKT schedule, + * it selects a user process and returns it. If chklp is non-NULL and chklp + * has a better or equal priority then the process that would otherwise be + * chosen, NULL is returned. * - * Decay p_estcpu based on the number of ticks we haven't been running - * and our p_nice. As the load increases each process observes a larger - * number of idle ticks (because other processes are running in them). 
- * This observation leads to a larger correction which tends to make the - * system more 'batchy'. + * Until we fix the RUNQ code the chklp test has to be strict or we may + * bounce between processes trying to acquire the current process designation. * - * Note that no recalculation occurs for a process which sleeps and wakes - * up in the same tick. That is, a system doing thousands of context - * switches per second will still only do serious estcpu calculations - * ESTCPUFREQ times per second. + * MPSAFE - must be called with bsd4_spin exclusive held. The spinlock is + * left intact through the entire routine. */ static -void -bsd4_recalculate_estcpu(struct lwp *lp) +struct lwp * +chooseproc_locked(struct lwp *chklp) { - globaldata_t gd = mycpu; - sysclock_t cpbase; - int loadfac; - int ndecay; - int nticks; - int nleft; + struct lwp *lp; + struct rq *q; + u_int32_t *which; + u_int32_t pri; - ASSERT_MP_LOCK_HELD(curthread); + if (bsd4_rtqueuebits) { + pri = bsfl(bsd4_rtqueuebits); + q = &bsd4_rtqueues[pri]; + which = &bsd4_rtqueuebits; + } else if (bsd4_queuebits) { + pri = bsfl(bsd4_queuebits); + q = &bsd4_queues[pri]; + which = &bsd4_queuebits; + } else if (bsd4_idqueuebits) { + pri = bsfl(bsd4_idqueuebits); + q = &bsd4_idqueues[pri]; + which = &bsd4_idqueuebits; + } else { + return NULL; + } + lp = TAILQ_FIRST(q); + KASSERT(lp, ("chooseproc: no lwp on busy queue")); /* - * We have to subtract periodic to get the last schedclock - * timeout time, otherwise we would get the upcoming timeout. - * Keep in mind that a process can migrate between cpus and - * while the scheduler clock should be very close, boundary - * conditions could lead to a small negative delta. + * If the passed lwp is reasonably close to the selected + * lwp , return NULL (indicating that should be kept). + * + * Note that we must error on the side of to avoid bouncing + * between threads in the acquire code. */ - cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic; - - if (lp->lwp_slptime > 1) { - /* - * Too much time has passed, do a coarse correction. - */ - lp->lwp_estcpu = lp->lwp_estcpu >> 1; - bsd4_resetpriority(lp); - lp->lwp_cpbase = cpbase; - lp->lwp_cpticks = 0; - } else if (lp->lwp_cpbase != cpbase) { - /* - * Adjust estcpu if we are in a different tick. Don't waste - * time if we are in the same tick. - * - * First calculate the number of ticks in the measurement - * interval. The nticks calculation can wind up 0 due to - * a bug in the handling of lwp_slptime (as yet not found), - * so make sure we do not get a divide by 0 panic. - */ - nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic; - if (nticks <= 0) - nticks = 1; - updatepcpu(lp, lp->lwp_cpticks, nticks); + if (chklp) { + if (chklp->lwp_priority < lp->lwp_priority + PPQ) + return(NULL); + } - if ((nleft = nticks - lp->lwp_cpticks) < 0) - nleft = 0; - if (usched_debug == lp->lwp_proc->p_pid) { - printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d", - lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu, - lp->lwp_cpticks, nticks, nleft); +#ifdef SMP + /* + * If the chosen lwp does not reside on this cpu spend a few + * cycles looking for a better candidate at the same priority level. + * This is a fallback check, setrunqueue() tries to wakeup the + * correct cpu and is our front-line affinity. 
+ */ + if (lp->lwp_thread->td_gd != mycpu && + (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL + ) { + if (chklp->lwp_thread->td_gd == mycpu) { + ++choose_affinity; + lp = chklp; } + } +#endif - /* - * Calculate a decay value based on ticks remaining scaled - * down by the instantanious load and p_nice. - */ - if ((loadfac = runqcount) < 2) - loadfac = 2; - ndecay = nleft * usched_bsd4_decay * 2 * - (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2); + TAILQ_REMOVE(q, lp, lwp_procq); + --bsd4_runqcount; + if (TAILQ_EMPTY(q)) + *which &= ~(1 << pri); + KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!")); + lp->lwp_proc->p_flag &= ~P_ONRUNQ; + return lp; +} - /* - * Adjust p_estcpu. Handle a border case where batch jobs - * can get stalled long enough to decay to zero when they - * shouldn't. - */ - if (lp->lwp_estcpu > ndecay * 2) - lp->lwp_estcpu -= ndecay; - else - lp->lwp_estcpu >>= 1; +#ifdef SMP +/* + * Called via an ipi message to reschedule on another cpu. + * + * MPSAFE + */ +static +void +need_user_resched_remote(void *dummy) +{ + need_user_resched(); +} - if (usched_debug == lp->lwp_proc->p_pid) - printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu); +#endif - bsd4_resetpriority(lp); - lp->lwp_cpbase = cpbase; - lp->lwp_cpticks = 0; + +/* + * bsd4_remrunqueue_locked() removes a given process from the run queue + * that it is on, clearing the queue busy bit if it becomes empty. + * + * Note that user process scheduler is different from the LWKT schedule. + * The user process scheduler only manages user processes but it uses LWKT + * underneath, and a user process operating in the kernel will often be + * 'released' from our management. + * + * MPSAFE - bsd4_spin must be held exclusively on call + */ +static void +bsd4_remrunqueue_locked(struct lwp *lp) +{ + struct rq *q; + u_int32_t *which; + u_int8_t pri; + + KKASSERT(lp->lwp_proc->p_flag & P_ONRUNQ); + lp->lwp_proc->p_flag &= ~P_ONRUNQ; + --bsd4_runqcount; + KKASSERT(bsd4_runqcount >= 0); + + pri = lp->lwp_rqindex; + switch(lp->lwp_rqtype) { + case RTP_PRIO_NORMAL: + q = &bsd4_queues[pri]; + which = &bsd4_queuebits; + break; + case RTP_PRIO_REALTIME: + case RTP_PRIO_FIFO: + q = &bsd4_rtqueues[pri]; + which = &bsd4_rtqueuebits; + break; + case RTP_PRIO_IDLE: + q = &bsd4_idqueues[pri]; + which = &bsd4_idqueuebits; + break; + default: + panic("remrunqueue: invalid rtprio type"); + /* NOT REACHED */ + } + TAILQ_REMOVE(q, lp, lwp_procq); + if (TAILQ_EMPTY(q)) { + KASSERT((*which & (1 << pri)) != 0, + ("remrunqueue: remove from empty queue")); + *which &= ~(1 << pri); } } +/* + * bsd4_setrunqueue_locked() + * + * Add a process whos rqtype and rqindex had previously been calculated + * onto the appropriate run queue. Determine if the addition requires + * a reschedule on a cpu and return the cpuid or -1. + * + * NOTE: Lower priorities are better priorities. 
+ * + * MPSAFE - bsd4_spin must be held exclusively on call + */ +static void +bsd4_setrunqueue_locked(struct lwp *lp) +{ + struct rq *q; + u_int32_t *which; + int pri; + + KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0); + lp->lwp_proc->p_flag |= P_ONRUNQ; + ++bsd4_runqcount; + + pri = lp->lwp_rqindex; + + switch(lp->lwp_rqtype) { + case RTP_PRIO_NORMAL: + q = &bsd4_queues[pri]; + which = &bsd4_queuebits; + break; + case RTP_PRIO_REALTIME: + case RTP_PRIO_FIFO: + q = &bsd4_rtqueues[pri]; + which = &bsd4_rtqueuebits; + break; + case RTP_PRIO_IDLE: + q = &bsd4_idqueues[pri]; + which = &bsd4_idqueuebits; + break; + default: + panic("remrunqueue: invalid rtprio type"); + /* NOT REACHED */ + } + + /* + * Add to the correct queue and set the appropriate bit. If no + * lower priority (i.e. better) processes are in the queue then + * we want a reschedule, calculate the best cpu for the job. + * + * Always run reschedules on the LWPs original cpu. + */ + TAILQ_INSERT_TAIL(q, lp, lwp_procq); + *which |= 1 << pri; +} + #ifdef SMP /* @@ -991,29 +1053,71 @@ bsd4_recalculate_estcpu(struct lwp *lp) * need the helper since there is only one cpu. We can't use the idle * thread for this because we need to hold the MP lock. Additionally, * doing things this way allows us to HLT idle cpus on MP systems. + * + * MPSAFE */ static void sched_thread(void *dummy) { - globaldata_t gd = mycpu; - int cpuid = gd->gd_cpuid; /* doesn't change */ - u_int32_t cpumask = 1 << cpuid; /* doesn't change */ + globaldata_t gd; + bsd4_pcpu_t dd; + struct lwp *nlp; + cpumask_t cpumask; + cpumask_t tmpmask; + int cpuid; + int tmpid; + + gd = mycpu; + cpuid = gd->gd_cpuid; /* doesn't change */ + cpumask = 1 << cpuid; /* doesn't change */ + dd = &bsd4_pcpu[cpuid]; + + /* + * Scheduler thread does not need to hold the MP lock + */ + rel_mplock(); - ASSERT_MP_LOCK_HELD(curthread); for (;;) { - struct lwp *nlp; - + crit_enter_gd(gd); lwkt_deschedule_self(gd->gd_curthread); /* interlock */ - atomic_set_int(&rdyprocmask, cpumask); - crit_enter_quick(gd->gd_curthread); - if ((curprocmask & cpumask) == 0 && (nlp = chooseproc(NULL)) != NULL) { - atomic_set_int(&curprocmask, cpumask); - gd->gd_upri = nlp->lwp_priority; - gd->gd_uschedcp = nlp; - lwkt_acquire(nlp->lwp_thread); - lwkt_schedule(nlp->lwp_thread); + spin_lock_wr(&bsd4_spin); + atomic_set_int(&bsd4_rdyprocmask, cpumask); + if ((bsd4_curprocmask & cpumask) == 0) { + if ((nlp = chooseproc_locked(NULL)) != NULL) { + atomic_set_int(&bsd4_curprocmask, cpumask); + dd->upri = nlp->lwp_priority; + dd->uschedcp = nlp; + spin_unlock_wr(&bsd4_spin); + lwkt_acquire(nlp->lwp_thread); + lwkt_schedule(nlp->lwp_thread); + } else { + spin_unlock_wr(&bsd4_spin); + } + } else { + /* + * Someone scheduled us but raced. In order to not lose + * track of the fact that there may be a LWP ready to go, + * forward the request to another cpu if available. + * + * Rotate through cpus starting with cpuid + 1. Since cpuid + * is already masked out by gd_other_cpus, just use ~cpumask. 
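
The forwarding logic in the hunk below picks a cpu that is ready (its helper is parked) but not currently running a user process, preferring cpus numbered above our own so repeated forwards rotate around the system instead of always waking cpu 0. A hedged restatement of just the mask arithmetic, with ffs() standing in for bsfl() and a local cpumask_t typedef standing in for the kernel's:

/*
 * next_ready_cpu.c -- standalone sketch of the helper-forwarding mask math.
 */
#include <stdio.h>
#include <strings.h>            /* ffs() */

typedef unsigned int cpumask_t; /* local stand-in, not the kernel type */

static int
next_ready_cpu(int mycpu, cpumask_t curprocmask, cpumask_t rdyprocmask,
               cpumask_t other_cpus)
{
    cpumask_t cpumask = 1u << mycpu;
    cpumask_t tmpmask = ~curprocmask & rdyprocmask & other_cpus;

    if (tmpmask == 0)
        return -1;              /* nobody to forward to */
    /*
     * ~(cpumask - 1) selects bits at or above mycpu; mycpu itself is
     * already excluded by other_cpus, so in effect "above mycpu".
     */
    if (tmpmask & ~(cpumask - 1))
        return ffs(tmpmask & ~(cpumask - 1)) - 1;
    return ffs(tmpmask) - 1;    /* wrap around to the lowest ready cpu */
}

int
main(void)
{
    /* four cpus; we are cpu 2; cpus 0 and 3 are ready and not running a user proc */
    cpumask_t rdy = 0x9, cur = 0x4, others = 0xb;

    printf("forward to cpu %d\n", next_ready_cpu(2, cur, rdy, others));
    return 0;
}
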
+ */ + tmpmask = ~bsd4_curprocmask & bsd4_rdyprocmask & + mycpu->gd_other_cpus; + if (tmpmask) { + if (tmpmask & ~(cpumask - 1)) + tmpid = bsfl(tmpmask & ~(cpumask - 1)); + else + tmpid = bsfl(tmpmask); + bsd4_scancpu = tmpid; + atomic_clear_int(&bsd4_rdyprocmask, 1 << tmpid); + spin_unlock_wr(&bsd4_spin); + lwkt_schedule(&bsd4_pcpu[tmpid].helper_thread); + } else { + spin_unlock_wr(&bsd4_spin); + } } - crit_exit_quick(gd->gd_curthread); + crit_exit_gd(gd); lwkt_switch(); } } @@ -1031,7 +1135,7 @@ sched_thread_cpu_init(void) printf("start scheduler helpers on cpus:"); for (i = 0; i < ncpus; ++i) { - globaldata_t dgd = globaldata_find(i); + bsd4_pcpu_t dd = &bsd4_pcpu[i]; cpumask_t mask = 1 << i; if ((mask & smp_active_mask) == 0) @@ -1040,7 +1144,7 @@ sched_thread_cpu_init(void) if (bootverbose) printf(" %d", i); - lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread, + lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread, TDF_STOPREQ, i, "usched %d", i); /* @@ -1048,8 +1152,8 @@ sched_thread_cpu_init(void) * been enabled in rqinit(). */ if (i) - atomic_clear_int(&curprocmask, mask); - atomic_set_int(&rdyprocmask, mask); + atomic_clear_int(&bsd4_curprocmask, mask); + atomic_set_int(&bsd4_rdyprocmask, mask); } if (bootverbose) printf("\n"); diff --git a/sys/kern/usched_dummy.c b/sys/kern/usched_dummy.c new file mode 100644 index 0000000000..1bf4b931b5 --- /dev/null +++ b/sys/kern/usched_dummy.c @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2006 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $DragonFly: src/sys/kern/usched_dummy.c,v 1.1 2006/05/29 03:57:20 dillon Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define MAXPRI 128 +#define PRIBASE_REALTIME 0 +#define PRIBASE_NORMAL MAXPRI +#define PRIBASE_IDLE (MAXPRI * 2) +#define PRIBASE_THREAD (MAXPRI * 3) +#define PRIBASE_NULL (MAXPRI * 4) + +#define lwp_priority lwp_usdata.bsd4.priority +#define lwp_estcpu lwp_usdata.bsd4.estcpu + +static void dummy_acquire_curproc(struct lwp *lp); +static void dummy_release_curproc(struct lwp *lp); +static void dummy_select_curproc(globaldata_t gd); +static void dummy_setrunqueue(struct lwp *lp); +static void dummy_schedulerclock(struct lwp *lp, sysclock_t period, + sysclock_t cpstamp); +static void dummy_recalculate_estcpu(struct lwp *lp); +static void dummy_resetpriority(struct lwp *lp); +static void dummy_forking(struct lwp *plp, struct lwp *lp); +static void dummy_exiting(struct lwp *plp, struct lwp *lp); + +struct usched usched_dummy = { + { NULL }, + "dummy", "Dummy DragonFly Scheduler", + NULL, /* default registration */ + NULL, /* default deregistration */ + dummy_acquire_curproc, + dummy_release_curproc, + dummy_select_curproc, + dummy_setrunqueue, + dummy_schedulerclock, + dummy_recalculate_estcpu, + dummy_resetpriority, + dummy_forking, + dummy_exiting, + NULL /* setcpumask not supported */ +}; + +struct usched_dummy_pcpu { + int rrcount; + struct thread helper_thread; + struct lwp *uschedcp; +}; + +typedef struct usched_dummy_pcpu *dummy_pcpu_t; + +static struct usched_dummy_pcpu dummy_pcpu[MAXCPU]; +static cpumask_t dummy_curprocmask = -1; +static cpumask_t dummy_rdyprocmask; +static struct spinlock dummy_spin; +static TAILQ_HEAD(rq, lwp) dummy_runq; +static int dummy_runqcount; + +static int usched_dummy_rrinterval = (ESTCPUFREQ + 9) / 10; +SYSCTL_INT(_kern, OID_AUTO, usched_dummy_rrinterval, CTLFLAG_RW, + &usched_dummy_rrinterval, 0, ""); + +/* + * Initialize the run queues at boot time, clear cpu 0 in curprocmask + * to allow dummy scheduling on cpu 0. + */ +static void +dummyinit(void *dummy) +{ + TAILQ_INIT(&dummy_runq); + spin_init(&dummy_spin); + atomic_clear_int(&dummy_curprocmask, 1); +} +SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, dummyinit, NULL) + +/* + * DUMMY_ACQUIRE_CURPROC + * + * This function is called when the kernel intends to return to userland. + * It is responsible for making the thread the current designated userland + * thread for this cpu, blocking if necessary. + * + * We are expected to handle userland reschedule requests here too. + * + * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE + * TO ANOTHER CPU! Because most of the kernel assumes that no migration will + * occur, this function is called only under very controlled circumstances. + * + * MPSAFE + */ +static void +dummy_acquire_curproc(struct lwp *lp) +{ + globaldata_t gd = mycpu; + dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid]; + thread_t td = lp->lwp_thread; + + /* + * Possibly select another thread + */ + if (user_resched_wanted()) + dummy_select_curproc(gd); + + /* + * If this cpu has no current thread, select ourself + */ + if (dd->uschedcp == NULL && TAILQ_EMPTY(&dummy_runq)) { + atomic_set_int(&dummy_curprocmask, gd->gd_cpumask); + dd->uschedcp = lp; + return; + } + + /* + * If this cpu's current user process thread is not our thread, + * deschedule ourselves and place us on the run queue, then + * switch away. 
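
The loop this comment describes, parking until the thread holds the per-cpu current-process designation and re-testing after every wakeup, has a familiar userland shape. The pthread analogue below models only that control flow; the slot and owner names are invented for the example and none of the kernel primitives (lwkt_deschedule_self(), lwkt_switch(), the per-cpu uschedcp pointer) are involved:

/*
 * acquire_loop.c -- pthread analogue of "loop until we are the designated
 * current thread", re-testing after every wakeup because another thread
 * may have claimed the slot first.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  slot_cond = PTHREAD_COND_INITIALIZER;
static pthread_t       owner;           /* designated "current" thread */
static int             have_owner;

static void
acquire_slot(void)
{
    pthread_mutex_lock(&slot_lock);
    for (;;) {
        if (!have_owner) {              /* slot free: claim it */
            owner = pthread_self();
            have_owner = 1;
            break;
        }
        if (pthread_equal(owner, pthread_self()))
            break;                      /* already ours */
        /* "deschedule" and retry once the owner releases the slot */
        pthread_cond_wait(&slot_cond, &slot_lock);
    }
    pthread_mutex_unlock(&slot_lock);
}

static void
release_slot(void)
{
    pthread_mutex_lock(&slot_lock);
    have_owner = 0;
    pthread_cond_broadcast(&slot_cond); /* let the waiters re-test */
    pthread_mutex_unlock(&slot_lock);
}

static void *
worker(void *arg)
{
    acquire_slot();
    printf("%s owns the slot\n", (const char *)arg);
    release_slot();
    return NULL;
}

int
main(void)
{
    pthread_t t1, t2;

    pthread_create(&t1, NULL, worker, "worker 1");
    pthread_create(&t2, NULL, worker, "worker 2");
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    return 0;
}
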
+ * + * We loop until we become the current process. Its a good idea + * to run any passive release(s) before we mess with the scheduler + * so our thread is in the expected state. + */ + KKASSERT(dd->uschedcp != lp); + if (td->td_release) + td->td_release(lp->lwp_thread); + do { + crit_enter(); + lwkt_deschedule_self(td); + dummy_setrunqueue(lp); + if ((td->td_flags & TDF_RUNQ) == 0) + ++lp->lwp_stats->p_ru.ru_nivcsw; + lwkt_switch(); /* WE MAY MIGRATE TO ANOTHER CPU */ + crit_exit(); + gd = mycpu; + dd = &dummy_pcpu[gd->gd_cpuid]; + KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0); + } while (dd->uschedcp != lp); +} + +/* + * DUMMY_RELEASE_CURPROC + * + * This routine detaches the current thread from the userland scheduler, + * usually because the thread needs to run in the kernel (at kernel priority) + * for a while. + * + * This routine is also responsible for selecting a new thread to + * make the current thread. + * + * WARNING! The MP lock may be in an unsynchronized state due to the + * way get_mplock() works and the fact that this function may be called + * from a passive release during a lwkt_switch(). try_mplock() will deal + * with this for us but you should be aware that td_mpcount may not be + * useable. + * + * MPSAFE + */ +static void +dummy_release_curproc(struct lwp *lp) +{ + globaldata_t gd = mycpu; + dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid]; + + KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0); + if (dd->uschedcp == lp) { + dummy_select_curproc(gd); + } +} + +/* + * DUMMY_SELECT_CURPROC + * + * Select a new current process for this cpu. This satisfies a user + * scheduler reschedule request so clear that too. + * + * This routine is also responsible for equal-priority round-robining, + * typically triggered from dummy_schedulerclock(). In our dummy example + * all the 'user' threads are LWKT scheduled all at once and we just + * call lwkt_switch(). + * + * MPSAFE + */ +static +void +dummy_select_curproc(globaldata_t gd) +{ + dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid]; + struct lwp *lp; + + clear_user_resched(); + spin_lock_wr(&dummy_spin); + if ((lp = TAILQ_FIRST(&dummy_runq)) == NULL) { + dd->uschedcp = NULL; + atomic_clear_int(&dummy_curprocmask, gd->gd_cpumask); + spin_unlock_wr(&dummy_spin); + } else { + --dummy_runqcount; + TAILQ_REMOVE(&dummy_runq, lp, lwp_procq); + lp->lwp_proc->p_flag &= ~P_ONRUNQ; + dd->uschedcp = lp; + atomic_set_int(&dummy_curprocmask, gd->gd_cpumask); + spin_unlock_wr(&dummy_spin); +#ifdef SMP + lwkt_acquire(lp->lwp_thread); +#endif + lwkt_schedule(lp->lwp_thread); + } +} + +/* + * DUMMY_SETRUNQUEUE + * + * This routine is called to schedule a new user process after a fork. + * The scheduler module itself might also call this routine to place + * the current process on the userland scheduler's run queue prior + * to calling dummy_select_curproc(). + * + * The caller may set P_PASSIVE_ACQ in p_flag to indicate that we should + * attempt to leave the thread on the current cpu. 
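
dummy_select_curproc() above follows a simple pattern: under one global lock, either pop the head of the run queue and install it as the cpu's designated process, or record that the cpu has gone idle by clearing its curprocmask bit. A small sketch of that pattern with stand-in types (a mutex and a singly linked list instead of the kernel's spinlock and lwp TAILQ):

/*
 * select_next.c -- sketch of the select-curproc pattern; not kernel code.
 */
#include <pthread.h>
#include <stdio.h>

struct proc {
    struct proc *next;
    int id;
};

struct percpu {
    struct proc *uschedcp;              /* designated current process, or NULL */
};

static pthread_mutex_t runq_lock = PTHREAD_MUTEX_INITIALIZER;
static struct proc *runq_head;
static unsigned int curprocmask;        /* bit set => cpu has a current proc */

static void
select_curproc(struct percpu *dd, int cpuid)
{
    struct proc *p;

    pthread_mutex_lock(&runq_lock);
    if ((p = runq_head) == NULL) {
        dd->uschedcp = NULL;            /* nothing runnable: cpu goes idle */
        curprocmask &= ~(1u << cpuid);
    } else {
        runq_head = p->next;            /* pop the head of the global runq */
        dd->uschedcp = p;
        curprocmask |= 1u << cpuid;
    }
    pthread_mutex_unlock(&runq_lock);
}

int
main(void)
{
    struct proc a = { NULL, 7 };
    struct percpu cpu0 = { NULL };

    runq_head = &a;
    select_curproc(&cpu0, 0);
    printf("cpu0 current: %d, mask %#x\n",
        cpu0.uschedcp ? cpu0.uschedcp->id : -1, curprocmask);
    select_curproc(&cpu0, 0);
    printf("cpu0 current: %d, mask %#x\n",
        cpu0.uschedcp ? cpu0.uschedcp->id : -1, curprocmask);
    return 0;
}
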
+ * + * MPSAFE + */ +static void +dummy_setrunqueue(struct lwp *lp) +{ + globaldata_t gd = mycpu; + dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid]; + cpumask_t mask; + int cpuid; + + if (dd->uschedcp == NULL) { + dd->uschedcp = lp; + atomic_set_int(&dummy_curprocmask, gd->gd_cpumask); + lwkt_schedule(lp->lwp_thread); + } else { + /* + * Add to our global runq + */ + KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0); + spin_lock_wr(&dummy_spin); + ++dummy_runqcount; + TAILQ_INSERT_TAIL(&dummy_runq, lp, lwp_procq); + lp->lwp_proc->p_flag |= P_ONRUNQ; +#ifdef SMP + lwkt_giveaway(lp->lwp_thread); +#endif + + /* lp = TAILQ_FIRST(&dummy_runq); */ + + /* + * Notify the next available cpu. P.S. some + * cpu affinity could be done here. + * + * The rdyprocmask bit placeholds the knowledge that there + * is a process on the runq that needs service. If the + * helper thread cannot find a home for it it will forward + * the request to another available cpu. + */ + mask = ~dummy_curprocmask & dummy_rdyprocmask & + gd->gd_other_cpus; + if (mask) { + cpuid = bsfl(mask); + atomic_clear_int(&dummy_rdyprocmask, 1 << cpuid); + spin_unlock_wr(&dummy_spin); + lwkt_schedule(&dummy_pcpu[cpuid].helper_thread); + } else { + spin_unlock_wr(&dummy_spin); + } + } +} + +/* + * This routine is called from a systimer IPI. Thus it is called with + * a critical section held. Any spinlocks we get here that are also + * obtained in other procedures must be proected by a critical section + * in those other procedures to avoid a deadlock. + * + * The MP lock may or may not be held on entry and cannot be obtained + * by this routine (because it is called from a systimer IPI). + * + * This routine is called at ESTCPUFREQ on each cpu independantly. + * + * This routine typically queues a reschedule request, which will cause + * the scheduler's BLAH_select_curproc() to be called as soon as possible. + * + * MPSAFE + */ +static +void +dummy_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) +{ + globaldata_t gd = mycpu; + dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid]; + + if (++dd->rrcount >= usched_dummy_rrinterval) { + dd->rrcount = 0; + need_user_resched(); + } +} + +/* + * DUMMY_RECALCULATE_ESTCPU + * + * Called once a second for any process that is running or has slept + * for less then 2 seconds. + * + * MPSAFE + */ +static +void +dummy_recalculate_estcpu(struct lwp *lp) +{ +} + +/* + * DUMMY_RESETPRIORITY + * + * This routine is called after the kernel has potentially modified + * the lwp_rtprio structure. The target process may be running or sleeping + * or scheduled but not yet running or owned by another cpu. Basically, + * it can be in virtually any state. + * + * This routine is called by fork1() for initial setup with the process + * of the run queue, and also may be called normally with the process on or + * off the run queue. 
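
The switch that follows gives each rtprio class its own band of MAXPRI slots, so a single integer comparison orders realtime ahead of normal, normal ahead of idle, and idle ahead of pure threads, with lower values always better. A standalone restatement of that mapping; the RTP_PRIO_* enumerators here are local stand-ins rather than the values from <sys/rtprio.h>:

/*
 * prio_bands.c -- restatement of the priority-band mapping; illustrative only.
 */
#include <stdio.h>

#define MAXPRI           128
#define PRIBASE_REALTIME 0
#define PRIBASE_NORMAL   MAXPRI
#define PRIBASE_IDLE     (MAXPRI * 2)
#define PRIBASE_THREAD   (MAXPRI * 3)

enum rtp_type { RTP_PRIO_REALTIME, RTP_PRIO_NORMAL, RTP_PRIO_IDLE, RTP_PRIO_THREAD };

static int
band_priority(enum rtp_type type, int prio)
{
    switch (type) {
    case RTP_PRIO_REALTIME:
        return PRIBASE_REALTIME + prio;
    case RTP_PRIO_NORMAL:
        return PRIBASE_NORMAL + prio;
    case RTP_PRIO_IDLE:
        return PRIBASE_IDLE + prio;
    case RTP_PRIO_THREAD:
        return PRIBASE_THREAD + prio;
    }
    return PRIBASE_THREAD;      /* not reached */
}

int
main(void)
{
    printf("realtime 10 -> %d\n", band_priority(RTP_PRIO_REALTIME, 10));
    printf("normal   10 -> %d\n", band_priority(RTP_PRIO_NORMAL, 10));
    printf("idle      0 -> %d\n", band_priority(RTP_PRIO_IDLE, 0));
    return 0;
}
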
+ * + * MPSAFE + */ +static void +dummy_resetpriority(struct lwp *lp) +{ + /* XXX spinlock usually needed */ + /* + * Set p_priority for general process comparisons + */ + switch(lp->lwp_rtprio.type) { + case RTP_PRIO_REALTIME: + lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio; + return; + case RTP_PRIO_NORMAL: + lp->lwp_priority = PRIBASE_NORMAL + lp->lwp_rtprio.prio; + break; + case RTP_PRIO_IDLE: + lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio; + return; + case RTP_PRIO_THREAD: + lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio; + return; + } + /* XXX spinlock usually needed */ +} + + +/* + * DUMMY_FORKING + * + * Called from fork1() when a new child process is being created. Allows + * the scheduler to predispose the child process before it gets scheduled. + * + * MPSAFE + */ +static void +dummy_forking(struct lwp *plp, struct lwp *lp) +{ + lp->lwp_estcpu = plp->lwp_estcpu; +#if 0 + ++plp->lwp_estcpu; +#endif +} + +/* + * DUMMY_EXITING + * + * Called when the parent reaps a child. Typically used to propogate cpu + * use by the child back to the parent as part of a batch detection + * heuristic. + * + * NOTE: cpu use is not normally back-propogated to PID 1. + * + * MPSAFE + */ +static void +dummy_exiting(struct lwp *plp, struct lwp *lp) +{ +} + +/* + * SMP systems may need a scheduler helper thread. This is how one can be + * setup. + * + * We use a neat LWKT scheduling trick to interlock the helper thread. It + * is possible to deschedule an LWKT thread and then do some work before + * switching away. The thread can be rescheduled at any time, even before + * we switch away. + */ +#ifdef SMP + +static void +dummy_sched_thread(void *dummy) +{ + globaldata_t gd; + dummy_pcpu_t dd; + struct lwp *lp; + cpumask_t cpumask; + cpumask_t tmpmask; + int cpuid; + int tmpid; + + gd = mycpu; + cpuid = gd->gd_cpuid; + dd = &dummy_pcpu[cpuid]; + cpumask = 1 << cpuid; + + /* + * Our Scheduler helper thread does not need to hold the MP lock + */ + rel_mplock(); + + for (;;) { + lwkt_deschedule_self(gd->gd_curthread); /* interlock */ + atomic_set_int(&dummy_rdyprocmask, cpumask); + spin_lock_wr(&dummy_spin); + if (dd->uschedcp) { + /* + * We raced another cpu trying to schedule a thread onto us. + * If the runq isn't empty hit another free cpu. + */ + tmpmask = ~dummy_curprocmask & dummy_rdyprocmask & + gd->gd_other_cpus; + if (tmpmask && dummy_runqcount) { + tmpid = bsfl(tmpmask); + KKASSERT(tmpid != cpuid); + atomic_clear_int(&dummy_rdyprocmask, 1 << tmpid); + spin_unlock_wr(&dummy_spin); + lwkt_schedule(&dummy_pcpu[tmpid].helper_thread); + } else { + spin_unlock_wr(&dummy_spin); + } + } else if ((lp = TAILQ_FIRST(&dummy_runq)) != NULL) { + --dummy_runqcount; + TAILQ_REMOVE(&dummy_runq, lp, lwp_procq); + lp->lwp_proc->p_flag &= ~P_ONRUNQ; + dd->uschedcp = lp; + atomic_set_int(&dummy_curprocmask, cpumask); + spin_unlock_wr(&dummy_spin); +#ifdef SMP + lwkt_acquire(lp->lwp_thread); +#endif + lwkt_schedule(lp->lwp_thread); + } else { + spin_unlock_wr(&dummy_spin); + } + lwkt_switch(); + } +} + +/* + * Setup our scheduler helpers. Note that curprocmask bit 0 has already + * been cleared by rqinit() and we should not mess with it further. 
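
dummy_sched_thread() above advertises itself in dummy_rdyprocmask before parking, and dummy_setrunqueue() claims a helper by clearing its ready bit before waking it, so a parked helper is targeted at most once per park. The sketch below restates that handshake with C11 atomics as a lock-free variant; in the patch itself the masks are manipulated under dummy_spin (or bsd4_spin on the bsd4 side), and the function names here are invented for the example:

/*
 * rdymask.c -- sketch of the ready-mask claim-before-wake handshake.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <strings.h>            /* ffs() */

static _Atomic unsigned int rdyprocmask;

static void
helper_park(int cpuid)
{
    /* advertise before "descheduling"; a racing producer will see the bit */
    atomic_fetch_or(&rdyprocmask, 1u << cpuid);
}

/* Claim one ready helper, or return -1 if none is advertised. */
static int
claim_helper(void)
{
    unsigned int mask, bit;
    int cpuid;

    mask = atomic_load(&rdyprocmask);
    while (mask != 0) {
        cpuid = ffs(mask) - 1;
        bit = 1u << cpuid;
        /*
         * Clear the bit only if it is still set; on failure the CAS
         * refreshes 'mask' and we retry with the new value.
         */
        if (atomic_compare_exchange_weak(&rdyprocmask, &mask, mask & ~bit))
            return cpuid;       /* we own this helper's wakeup */
    }
    return -1;
}

int
main(void)
{
    helper_park(1);
    helper_park(3);
    printf("claimed cpu %d\n", claim_helper());
    printf("claimed cpu %d\n", claim_helper());
    printf("claimed cpu %d\n", claim_helper());
    return 0;
}
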
+ */ +static void +dummy_sched_thread_cpu_init(void) +{ + int i; + + if (bootverbose) + printf("start dummy scheduler helpers on cpus:"); + + for (i = 0; i < ncpus; ++i) { + dummy_pcpu_t dd = &dummy_pcpu[i]; + cpumask_t mask = 1 << i; + + if ((mask & smp_active_mask) == 0) + continue; + + if (bootverbose) + printf(" %d", i); + + lwkt_create(dummy_sched_thread, NULL, NULL, &dd->helper_thread, + TDF_STOPREQ, i, "dsched %d", i); + + /* + * Allow user scheduling on the target cpu. cpu #0 has already + * been enabled in rqinit(). + */ + if (i) + atomic_clear_int(&dummy_curprocmask, mask); + atomic_set_int(&dummy_rdyprocmask, mask); + } + if (bootverbose) + printf("\n"); +} +SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, + dummy_sched_thread_cpu_init, NULL) + +#endif + diff --git a/sys/platform/pc32/i386/trap.c b/sys/platform/pc32/i386/trap.c index 48d00b978e..a34c0dfee2 100644 --- a/sys/platform/pc32/i386/trap.c +++ b/sys/platform/pc32/i386/trap.c @@ -36,7 +36,7 @@ * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $ - * $DragonFly: src/sys/platform/pc32/i386/trap.c,v 1.75 2006/05/22 06:26:30 swildner Exp $ + * $DragonFly: src/sys/platform/pc32/i386/trap.c,v 1.76 2006/05/29 03:57:19 dillon Exp $ */ /* @@ -317,7 +317,6 @@ userexit(struct lwp *lp) lp->lwp_proc->p_usched->release_curproc(lp); #endif -again: /* * Handle a LWKT reschedule request first. Since our passive release * is still in place we do not have to do anything special. @@ -326,24 +325,12 @@ again: lwkt_switch(); /* - * Acquire the current process designation if we do not own it. - * Note that acquire_curproc() does not reset the user reschedule - * bit on purpose, because we may need to accumulate over several - * threads waking up at the same time. - * - * NOTE: userland scheduler cruft: because processes are removed - * from the userland scheduler's queue we run through loops to try - * to figure out which is the best of [ existing, waking-up ] - * threads. + * Acquire the current process designation for this user scheduler + * on this cpu. This will also handle any user-reschedule requests. */ - if (lp != gd->gd_uschedcp) { - ++slow_release; - lp->lwp_proc->p_usched->acquire_curproc(lp); - /* We may have switched cpus on acquisition */ - gd = td->td_gd; - } else { - ++fast_release; - } + lp->lwp_proc->p_usched->acquire_curproc(lp); + /* We may have switched cpus on acquisition */ + gd = td->td_gd; /* * Reduce our priority in preparation for a return to userland. If @@ -363,19 +350,6 @@ again: */ if (lwkt_checkpri_self()) lwkt_switch(); - - /* - * If a userland reschedule is [still] pending we may not be the best - * selected process. Select a better one. If another LWKT resched - * is pending the trap will be re-entered. - */ - if (user_resched_wanted()) { - lp->lwp_proc->p_usched->select_curproc(gd); - if (lp != gd->gd_uschedcp) { - lwkt_setpri_self(TDPRI_KERN_USER); - goto again; - } - } } /* diff --git a/sys/sys/globaldata.h b/sys/sys/globaldata.h index 9f9f3df71c..66a4f19859 100644 --- a/sys/sys/globaldata.h +++ b/sys/sys/globaldata.h @@ -55,7 +55,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/i386/include/globaldata.h,v 1.11.2.1 2000/05/16 06:58:10 dillon Exp $ - * $DragonFly: src/sys/sys/globaldata.h,v 1.42 2006/05/21 20:23:27 dillon Exp $ + * $DragonFly: src/sys/sys/globaldata.h,v 1.43 2006/05/29 03:57:21 dillon Exp $ */ #ifndef _SYS_GLOBALDATA_H_ @@ -110,9 +110,6 @@ * in various vm_map related operations. 
gd_vme_avail is *NOT* a count of * the number of structures in the cache but is instead a count of the number * of unreserved structures in the cache. See vm_map_entry_reserve(). - * - * gd_uschedcp is internal to the userland scheduler. It does not represent - * the currently running process. */ struct sysmsg; @@ -139,9 +136,9 @@ struct globaldata { struct vmmeter gd_cnt; struct lwkt_ipiq *gd_ipiq; /* array[ncpu] of ipiq's */ struct lwkt_ipiq gd_cpusyncq; /* ipiq for cpu synchro */ - short gd_upri; /* userland scheduler helper */ - short gd_rrcount; /* userland scheduler helper */ - struct thread gd_schedthread; /* userland scheduler helper */ + short gd_unused01; + short gd_unused02; + struct thread gd_unused02B; struct thread gd_idlethread; SLGlobalData gd_slab; /* slab allocator */ int gd_trap_nesting_level; /* track traps */ @@ -160,7 +157,7 @@ struct globaldata { struct nchstats *gd_nchstats; /* namecache effectiveness */ int gd_pipeqcount; /* number of structures */ void *gd_unused04; - struct lwp *gd_uschedcp; /* userland scheduler */ + void *gd_unused05; struct tslpque *gd_tsleep_hash; /* tsleep/wakeup support */ int gd_spinlocks_rd; /* Shared spinlocks held */ diff --git a/sys/sys/thread.h b/sys/sys/thread.h index e9033a1646..1986fb653f 100644 --- a/sys/sys/thread.h +++ b/sys/sys/thread.h @@ -7,7 +7,7 @@ * Types which must already be defined when this header is included by * userland: struct md_thread * - * $DragonFly: src/sys/sys/thread.h,v 1.81 2006/05/21 20:23:27 dillon Exp $ + * $DragonFly: src/sys/sys/thread.h,v 1.82 2006/05/29 03:57:21 dillon Exp $ */ #ifndef _SYS_THREAD_H_ @@ -373,7 +373,6 @@ extern void lwkt_schedule(thread_t td); extern void lwkt_schedule_self(thread_t td); extern void lwkt_deschedule(thread_t td); extern void lwkt_deschedule_self(thread_t td); -extern void lwkt_acquire(thread_t td); extern void lwkt_yield(void); extern void lwkt_yield_quick(void); extern void lwkt_token_wait(void); @@ -412,6 +411,8 @@ extern void lwkt_migratecpu(int cpuid); #ifdef SMP +extern void lwkt_giveaway(struct thread *); +extern void lwkt_acquire(struct thread *); extern int lwkt_send_ipiq3(struct globaldata *targ, ipifunc3_t func, void *arg1, int arg2); extern int lwkt_send_ipiq3_passive(struct globaldata *targ, ipifunc3_t func, diff --git a/sys/sys/usched.h b/sys/sys/usched.h index cba59b9ee9..e5f4fdaf72 100644 --- a/sys/sys/usched.h +++ b/sys/sys/usched.h @@ -3,7 +3,7 @@ * * Userland scheduler API * - * $DragonFly: src/sys/sys/usched.h,v 1.9 2006/05/20 02:42:13 dillon Exp $ + * $DragonFly: src/sys/sys/usched.h,v 1.10 2006/05/29 03:57:21 dillon Exp $ */ #ifndef _SYS_USCHED_H_ @@ -37,7 +37,6 @@ struct usched { void (*release_curproc)(struct lwp *); void (*select_curproc)(struct globaldata *); void (*setrunqueue)(struct lwp *); - void (*remrunqueue)(struct lwp *); void (*schedulerclock)(struct lwp *, sysclock_t, sysclock_t); void (*recalculate)(struct lwp *); void (*resetpriority)(struct lwp *); @@ -52,10 +51,12 @@ union usched_data { */ struct { short priority; /* lower is better */ - char interactive; /* (currently not used) */ + char unused01; /* (currently not used) */ char rqindex; int origcpu; int estcpu; /* dynamic priority modification */ + u_short rqtype; /* protected copy of rtprio type */ + u_short unused02; } bsd4; int pad[4]; /* PAD for future expansion */ @@ -75,6 +76,7 @@ union usched_data { #ifdef _KERNEL extern struct usched usched_bsd4; +extern struct usched usched_dummy; int usched_ctl(struct usched *, int); struct usched *usched_init(void); -- 2.41.0
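
The usched.h hunk above is the whole of the pluggable-scheduler contract: a scheduler is a named table of function pointers, and each scheduler overlays its private per-lwp state on union usched_data. The standalone illustration below mirrors only that shape; the two entries, their hooks, and the lookup helper are made up for the example and are not the kernel's registration mechanism:

/*
 * usched_table.c -- illustration of "scheduler as a table of hooks" plus a
 * per-lwp private union; names are invented for the example.
 */
#include <stdio.h>
#include <string.h>

struct fake_lwp;

struct sched_ops {
    const char *name;
    void (*acquire_curproc)(struct fake_lwp *);
    void (*setrunqueue)(struct fake_lwp *);
};

union sched_data {              /* per-lwp scheduler-private area */
    struct { short priority; int estcpu; } bsd4;
    struct { int slot; } dummy;
    int pad[4];
};

struct fake_lwp {
    union sched_data usdata;
};

static void bsd4_acquire(struct fake_lwp *lp)  { (void)lp; puts("bsd4: acquire"); }
static void bsd4_setrunq(struct fake_lwp *lp)  { (void)lp; puts("bsd4: setrunqueue"); }
static void dummy_acquire(struct fake_lwp *lp) { (void)lp; puts("dummy: acquire"); }
static void dummy_setrunq(struct fake_lwp *lp) { (void)lp; puts("dummy: setrunqueue"); }

static const struct sched_ops schedulers[] = {
    { "bsd4",  bsd4_acquire,  bsd4_setrunq  },
    { "dummy", dummy_acquire, dummy_setrunq },
};

/* Pick a scheduler by name, falling back to the first entry. */
static const struct sched_ops *
sched_lookup(const char *name)
{
    size_t i;

    for (i = 0; i < sizeof(schedulers) / sizeof(schedulers[0]); ++i) {
        if (strcmp(schedulers[i].name, name) == 0)
            return &schedulers[i];
    }
    return &schedulers[0];
}

int
main(void)
{
    const struct sched_ops *ops = sched_lookup("dummy");
    struct fake_lwp lp;

    memset(&lp, 0, sizeof(lp));
    ops->acquire_curproc(&lp);
    ops->setrunqueue(&lp);
    return 0;
}
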