kernel - usched_dfly revamp
author     Matthew Dillon <dillon@apollo.backplane.com>
           Fri, 21 Sep 2012 23:09:25 +0000 (16:09 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Fri, 21 Sep 2012 23:09:25 +0000 (16:09 -0700)
* NOTE: This introduces a few regressions at high loads.  They've been
  identified and will be fixed in another iteration.

  We've identified an issue with weight2.  When weight2 successfully
  schedules a process pair on the same cpu it can lead to inefficiencies
  elsewhere in the scheduler related to user-mode and kernel-mode
  priority switching.  When testing pgbench/postgres pairs in this
  situation (e.g. -j $ncpus -c $ncpus) we sometimes see serious
  regressions on multi-socket machines, and at other times remarkably
  high performance.

* Fix a reported panic.

* Revamp the weights and algorithms significantly.  Fix algorithmic errors
  and improve the accuracy of weight3.  Add weight4, which basically tells
  the scheduler to try harder to find a free cpu to schedule the lwp on
  when the current cpu is busy doing something else.
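
  To make the weighting concrete, the following is a simplified user-space
  sketch (not the kernel code) of how weight1..weight4 bias the per-cpu
  load score that dfly_choose_best_queue() minimizes.  The struct, fields,
  and default values are hypothetical stand-ins for the per-cpu data in
  usched_dfly.c; only the arithmetic pattern is meant to be illustrative.

/*
 * Illustrative sketch only -- not kernel code.  Models how weight1..weight4
 * bias the per-cpu load score used when picking a queue for an lwp.
 * Lower score wins.
 */
#include <stdio.h>

struct cpu_model {
	int uload;	/* aggregate weighted load of queued user threads */
	int ucount;	/* number of runnable user threads accounted */
	int idle;	/* no current user thread and empty run queue */
	int is_curcpu;	/* lwp is already on this cpu */
	int is_wakecpu;	/* lwp was last woken up from this cpu */
};

static int weight1 = 30;	/* stickiness to the thread's current cpu */
static int weight2 = 15;	/* pull toward the synchronous partner's cpu */
static int weight3 = 10;	/* penalty per runnable thread on the queue */
static int weight4 = 50;	/* large advantage for an idle cpu */

static int
score(const struct cpu_model *c)
{
	int load = c->uload + c->ucount * weight3;

	if (c->idle)
		load -= weight4;	/* strongly favor idle cpus */
	if (c->is_curcpu)
		load -= weight1;	/* mild stickiness to the current cpu */
	if (c->is_wakecpu)
		load -= weight2;	/* mild pull toward the wakeup source */
	return (load);
}

int
main(void)
{
	struct cpu_model busy = { 400, 3, 0, 1, 0 };	/* current cpu, loaded */
	struct cpu_model idle = { 0, 0, 1, 0, 1 };	/* idle cpu near partner */

	/* the idle cpu wins here despite the weight1 stickiness */
	printf("current cpu score %d, idle cpu score %d\n",
	       score(&busy), score(&idle));
	return (0);
}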

sys/kern/kern_clock.c
sys/kern/kern_synch.c
sys/kern/kern_usched.c
sys/kern/lwkt_thread.c
sys/kern/usched_bsd4.c
sys/kern/usched_dfly.c
sys/kern/usched_dummy.c
sys/sys/globaldata.h
sys/sys/proc.h
sys/sys/usched.h

sys/kern/kern_clock.c
index fccc361..7d4cc7c 100644
@@ -781,8 +781,9 @@ schedclock(systimer_t info, int in_ipi __unused, struct intrframe *frame)
                 * HERE.
                 */
                ++lp->lwp_cpticks;
-               lp->lwp_proc->p_usched->schedulerclock(lp, info->periodic,
-                                                      info->time);
+               usched_schedulerclock(lp, info->periodic, info->time);
+       } else {
+               usched_schedulerclock(NULL, info->periodic, info->time);
        }
        if ((lp = curthread->td_lwp) != NULL) {
                /*
sys/kern/kern_synch.c
index 26edd34..8f3d49f 100644
@@ -216,8 +216,11 @@ schedcpu_stats(struct proc *p, void *data __unused)
 
        p->p_swtime++;
        FOREACH_LWP_IN_PROC(lp, p) {
-               if (lp->lwp_stat == LSSLEEP)
-                       lp->lwp_slptime++;
+               if (lp->lwp_stat == LSSLEEP) {
+                       ++lp->lwp_slptime;
+                       if (lp->lwp_slptime == 1)
+                               p->p_usched->uload_update(lp);
+               }
 
                /*
                 * Only recalculate processes that are active or have slept
@@ -481,7 +484,7 @@ tsleep(const volatile void *ident, int flags, const char *wmesg, int timo)
        logtsleep2(tsleep_beg, ident);
        gd = td->td_gd;
        KKASSERT(td != &gd->gd_idlethread);     /* you must be kidding! */
-       td->td_wakefromcpu = gd->gd_cpuid;      /* overwritten by _wakeup */
+       td->td_wakefromcpu = -1;                /* overwritten by _wakeup */
 
        /*
         * NOTE: all of this occurs on the current cpu, including any
@@ -607,21 +610,18 @@ tsleep(const volatile void *ident, int flags, const char *wmesg, int timo)
                if (lp->lwp_stat != LSSTOP)
                        lp->lwp_stat = LSSLEEP;
                lp->lwp_ru.ru_nvcsw++;
-               if (gd->gd_sleeping_lwp)
-                       p->p_usched->uload_update(gd->gd_sleeping_lwp);
-               gd->gd_sleeping_lwp = lp;
-               lwkt_switch();
-               if (gd->gd_sleeping_lwp == lp)
-                       gd->gd_sleeping_lwp = NULL;
                p->p_usched->uload_update(lp);
+               lwkt_switch();
 
                /*
                 * And when we are woken up, put us back in LSRUN.  If we
                 * slept for over a second, recalculate our estcpu.
                 */
                lp->lwp_stat = LSRUN;
-               if (lp->lwp_slptime)
+               if (lp->lwp_slptime) {
+                       p->p_usched->uload_update(lp);
                        p->p_usched->recalculate(lp);
+               }
                lp->lwp_slptime = 0;
        } else {
                lwkt_switch();
sys/kern/kern_usched.c
index a651874..eaf3cdf 100644
@@ -144,6 +144,24 @@ usched_ctl(struct usched *usched, int action)
 }
 
 /*
+ * Called from the scheduler clock on each cpu independently at the
+ * common scheduling rate.  If the scheduler clock interrupted a running
+ * lwp the lp will be non-NULL.
+ */
+void
+usched_schedulerclock(struct lwp *lp, sysclock_t periodic, sysclock_t time)
+{
+       struct usched *item;
+
+       TAILQ_FOREACH(item, &usched_list, entry) {
+               if (lp && lp->lwp_proc->p_usched == item)
+                       item->schedulerclock(lp, periodic, time);
+               else
+                       item->schedulerclock(NULL, periodic, time);
+       }
+}
+
+/*
  * USCHED_SET(syscall)
  *
  * SYNOPSIS:
sys/kern/lwkt_thread.c
index 7f3fc32..3b95762 100644
@@ -238,6 +238,7 @@ _lwkt_dequeue(thread_t td)
 
        td->td_flags &= ~TDF_RUNQ;
        TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
+       --gd->gd_tdrunqcount;
        if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL)
                atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING);
     }
@@ -271,6 +272,7 @@ _lwkt_enqueue(thread_t td)
            else
                TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
        }
+       ++gd->gd_tdrunqcount;
 
        /*
         * Request a LWKT reschedule if we are now at the head of the queue.
@@ -1227,6 +1229,10 @@ lwkt_yield(void)
 /*
  * The quick version processes pending interrupts and higher-priority
  * LWKT threads but will not round-robin same-priority LWKT threads.
+ *
+ * When called while attempting to return to userland the only same-pri
+ * threads are the ones which have already tried to become the current
+ * user process.
  */
 void
 lwkt_yield_quick(void)
sys/kern/usched_bsd4.c
index 57b3e9a..629be2e 100644
@@ -920,6 +920,10 @@ found:
  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
  * each cpu.
  *
+ * This routine is called on every sched tick.  If the currently running
+ * thread belongs to this scheduler it will be called with a non-NULL lp,
+ * otherwise it will be called with a NULL lp.
+ *
  * MPSAFE
  */
 static
@@ -930,6 +934,12 @@ bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
        bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
 
        /*
+        * No impl if no lp running.
+        */
+       if (lp == NULL)
+               return;
+
+       /*
         * Do we need to round-robin?  We round-robin 10 times a second.
         * This should only occur for cpu-bound batch processes.
         */
@@ -950,13 +960,6 @@ bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
        KKASSERT(gd->gd_spinlocks_wr == 0);
 
        bsd4_resetpriority(lp);
-#if 0
-       /*
-       * if we can't call bsd4_resetpriority for some reason we must call
-        * need user_resched().
-        */
-       need_user_resched();
-#endif
 }
 
 /*
sys/kern/usched_dfly.c
index f276bf5..4d180a0 100644
@@ -103,6 +103,7 @@ struct usched_dfly_pcpu {
        short           rrcount;
        short           upri;
        int             uload;
+       int             ucount;
        struct lwp      *uschedcp;
        struct rq       queues[NQS];
        struct rq       rtqueues[NQS];
@@ -134,8 +135,10 @@ static void dfly_exiting(struct lwp *lp, struct proc *);
 static void dfly_uload_update(struct lwp *lp);
 static void dfly_yield(struct lwp *lp);
 #ifdef SMP
+static void dfly_changeqcpu_locked(struct lwp *lp,
+                               dfly_pcpu_t dd, dfly_pcpu_t rdd);
 static dfly_pcpu_t dfly_choose_best_queue(struct lwp *lp);
-static dfly_pcpu_t dfly_choose_worst_queue(dfly_pcpu_t dd, int weight);
+static dfly_pcpu_t dfly_choose_worst_queue(dfly_pcpu_t dd);
 static dfly_pcpu_t dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp);
 #endif
 
@@ -212,43 +215,36 @@ SYSCTL_INT(_debug, OID_AUTO, dfly_chooser, CTLFLAG_RW,
  *
  * weight2 - If non-zero, detects thread pairs undergoing synchronous
  *          communications and tries to move them closer together.
- *          This only matters under very heavy loads because if there
- *          are plenty of cpu's available the pairs will be placed
- *          on separate cpu's.  ONLY APPLIES TO PROCESS PAIRS UNDERGOING
- *          SYNCHRONOUS COMMUNICATIONS!  e.g. client/server on same host.
+ *          Behavior is adjusted by bit 4 of features (0x10).
  *
- *          Given A x N processes and B x N processes (for 2*N total),
- *          any low value of N up to around the number of hw threads
- *          in the system, '15' is a good setting, because you don't want
- *          to force process pairs together when you have tons of cpu
- *          cores available.
+ *          WARNING!  Weight2 is a ridiculously sensitive parameter,
+ *          a small value is recommended.
  *
- *          For heavier loads, '35' is a good setting which will still
- *          be fairly optimal at lighter loads.
- *
- *          For extreme loads you can try a higher value like '55', or you
- *          can actually force dispersal by setting a small negative value
- *          like -15.
- *
- *          15: Fewer threads.
- *          35: Heavily loaded.        (default)
- *          50: Very heavily loaded.   (not recommended)
- *          -15: Extreme loads
- *
- * weight3 - Weighting based on the number of runnable threads on the
- *          userland scheduling queue and ignoring their loads.
+ * weight3 - Weighting based on the number of recently runnable threads
+ *          on the userland scheduling queue (ignoring their loads).
  *          A nominal value here prevents high-priority (low-load)
  *          threads from accumulating on one cpu core when other
  *          cores are available.
  *
+ *          This value should be left fairly small relative to weight1
+ *          and weight4.
+ *
+ * weight4 - Weighting based on other cpu queues being available
+ *          or running processes with higher lwp_priority's.
+ *
+ *          This allows a thread to migrate to another nearby cpu if it
+ *          is unable to run on the current cpu based on the other cpu
+ *          being idle or running a lower priority (higher lwp_priority)
+ *          thread.  This value should be large enough to override weight1
+ *
  * features - These flags can be set or cleared to enable or disable various
  *           features.
  *
- *           0x01      Pull when cpu becomes idle              (default)
- *           0x02
+ *           0x01      Enable idle-cpu pulling                 (default)
+ *           0x02      Enable proactive pushing                (default)
  *           0x04      Enable rebalancing rover                (default)
- *           0x08
- *           0x10
+ *           0x08      Enable more proactive pushing           (default)
+ *           0x10      (flip weight2 limit on same cpu)        (default)
  *           0x20      choose best cpu for forked process
  *           0x40      choose current cpu for forked process
  *           0x80      choose random cpu for forked process    (default)
@@ -256,9 +252,10 @@ SYSCTL_INT(_debug, OID_AUTO, dfly_chooser, CTLFLAG_RW,
 #ifdef SMP
 static int usched_dfly_smt = 0;
 static int usched_dfly_cache_coherent = 0;
-static int usched_dfly_weight1 = 50;   /* keep thread on current cpu */
-static int usched_dfly_weight2 = 35;   /* synchronous peer's current cpu */
+static int usched_dfly_weight1 = 30;   /* keep thread on current cpu */
+static int usched_dfly_weight2 = 15;   /* synchronous peer's current cpu */
 static int usched_dfly_weight3 = 10;   /* number of threads on queue */
+static int usched_dfly_weight4 = 50;   /* availability of idle cores */
 static int usched_dfly_features = 0x8F;        /* allow pulls */
 #endif
 static int usched_dfly_rrinterval = (ESTCPUFREQ + 9) / 10;
@@ -273,103 +270,11 @@ KTR_INFO_MASTER(usched);
 #define        KTR_USCHED_DFLY KTR_ALL
 #endif
 
-#if 0
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_acquire_curproc_urw, 0,
-    "USCHED_DFLY(dfly_acquire_curproc in user_reseched_wanted "
-    "after release: pid %d, cpuid %d, curr_cpuid %d)",
-    pid_t pid, int cpuid, int curr);
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_acquire_curproc_before_loop, 0,
-    "USCHED_DFLY(dfly_acquire_curproc before loop: pid %d, cpuid %d, "
-    "curr_cpuid %d)",
-    pid_t pid, int cpuid, int curr);
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_acquire_curproc_not, 0,
-    "USCHED_DFLY(dfly_acquire_curproc couldn't acquire after "
-    "dfly_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)",
-    pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid);
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_acquire_curproc_switch, 0,
-    "USCHED_DFLY(dfly_acquire_curproc after lwkt_switch: pid %d, "
-    "cpuid %d, curr_cpuid %d)",
-    pid_t pid, int cpuid, int curr);
-
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_release_curproc, 0,
-    "USCHED_DFLY(dfly_release_curproc before select: pid %d, "
-    "cpuid %d, curr_cpuid %d)",
-    pid_t pid, int cpuid, int curr);
-
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_select_curproc, 0,
-    "USCHED_DFLY(dfly_release_curproc before select: pid %d, "
-    "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)",
-    pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr);
-
-#ifdef SMP
-KTR_INFO(KTR_USCHED_DFLY, usched, batchy_test_false, 0,
-    "USCHED_DFLY(batchy_looser_pri_test false: pid %d, "
-    "cpuid %d, verify_mask %lu)",
-    pid_t pid, int cpuid, cpumask_t mask);
-KTR_INFO(KTR_USCHED_DFLY, usched, batchy_test_true, 0,
-    "USCHED_DFLY(batchy_looser_pri_test true: pid %d, "
-    "cpuid %d, verify_mask %lu)",
-    pid_t pid, int cpuid, cpumask_t mask);
-
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_fc_smt, 0,
-    "USCHED_DFLY(dfly_setrunqueue free cpus smt: pid %d, cpuid %d, "
-    "mask %lu, curr_cpuid %d)",
-    pid_t pid, int cpuid, cpumask_t mask, int curr);
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_fc_non_smt, 0,
-    "USCHED_DFLY(dfly_setrunqueue free cpus check non_smt: pid %d, "
-    "cpuid %d, mask %lu, curr_cpuid %d)",
-    pid_t pid, int cpuid, cpumask_t mask, int curr);
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_rc, 0,
-    "USCHED_DFLY(dfly_setrunqueue running cpus check: pid %d, "
-    "cpuid %d, mask %lu, curr_cpuid %d)",
-    pid_t pid, int cpuid, cpumask_t mask, int curr);
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_found, 0,
-    "USCHED_DFLY(dfly_setrunqueue found cpu: pid %d, cpuid %d, "
-    "mask %lu, found_cpuid %d, curr_cpuid %d)",
-    pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_not_found, 0,
-    "USCHED_DFLY(dfly_setrunqueue not found cpu: pid %d, cpuid %d, "
-    "try_cpuid %d, curr_cpuid %d)",
-    pid_t pid, int cpuid, int try_cpuid, int curr);
-KTR_INFO(KTR_USCHED_DFLY, usched, dfly_setrunqueue_found_best_cpuid, 0,
-    "USCHED_DFLY(dfly_setrunqueue found cpu: pid %d, cpuid %d, "
-    "mask %lu, found_cpuid %d, curr_cpuid %d)",
-    pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
-#endif
-#endif
-
 KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc, 0,
     "USCHED_DFLY(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
     pid_t pid, int old_cpuid, int curr);
-#ifdef SMP
-#if 0
-KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc_cc, 0,
-    "USCHED_DFLY(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)",
-    pid_t pid, int old_cpuid, int curr);
-KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc_cc_not_good, 0,
-    "USCHED_DFLY(chooseproc_cc not good: pid %d, old_cpumask %lu, "
-    "sibling_mask %lu, curr_cpumask %lu)",
-    pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
-KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc_cc_elected, 0,
-    "USCHED_DFLY(chooseproc_cc elected: pid %d, old_cpumask %lu, "
-    "sibling_mask %lu, curr_cpumask: %lu)",
-    pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
-
-KTR_INFO(KTR_USCHED_DFLY, usched, sched_thread_no_process, 0,
-    "USCHED_DFLY(sched_thread %d no process scheduled: pid %d, old_cpuid %d)",
-    int id, pid_t pid, int cpuid);
-KTR_INFO(KTR_USCHED_DFLY, usched, sched_thread_process, 0,
-    "USCHED_DFLY(sched_thread %d process scheduled: pid %d, old_cpuid %d)",
-    int id, pid_t pid, int cpuid);
-KTR_INFO(KTR_USCHED_DFLY, usched, sched_thread_no_process_found, 0,
-    "USCHED_DFLY(sched_thread %d no process found; tmpmask %lu)",
-    int id, cpumask_t tmpmask);
-#endif
-#endif
 
 /*
- * DFLY_ACQUIRE_CURPROC
- *
  * This function is called when the kernel intends to return to userland.
  * It is responsible for making the thread the current designated userland
  * thread for this cpu, blocking if necessary.
@@ -386,9 +291,11 @@ dfly_acquire_curproc(struct lwp *lp)
 {
        globaldata_t gd;
        dfly_pcpu_t dd;
+#ifdef SMP
        dfly_pcpu_t rdd;
+#endif
        thread_t td;
-       int doresched;
+       int force_resched;
 
        /*
         * Make sure we aren't sitting on a tsleep queue.
@@ -399,57 +306,49 @@ dfly_acquire_curproc(struct lwp *lp)
                tsleep_remove(td);
        dfly_recalculate_estcpu(lp);
 
+       gd = mycpu;
+       dd = &dfly_pcpu[gd->gd_cpuid];
+
        /*
-        * If a reschedule was requested give another thread the
-        * driver's seat.
+        * Process any pending interrupts/ipi's, then handle reschedule
+        * requests.  dfly_release_curproc() will try to assign a new
+        * uschedcp that isn't us and otherwise NULL it out.
         */
+       force_resched = 0;
        if (user_resched_wanted()) {
+               if (dd->uschedcp == lp)
+                       force_resched = 1;
                clear_user_resched();
                dfly_release_curproc(lp);
-               doresched = 1;
-       } else {
-               doresched = 0;
        }
 
        /*
-        * Loop until we are the current user thread
+        * Loop until we are the current user thread.
+        *
+        * NOTE: dd spinlock not held at top of loop.
         */
-       gd = mycpu;
-       dd = &dfly_pcpu[gd->gd_cpuid];
+       if (dd->uschedcp == lp)
+               lwkt_yield_quick();
 
-       do {
-               /*
-                * Process any pending events and higher priority threads
-                * only.  Do not try to round-robin same-priority lwkt
-                * threads.
-                */
+       while (dd->uschedcp != lp) {
                lwkt_yield_quick();
 
+               spin_lock(&dd->spin);
+
                /*
-                * Become the currently scheduled user thread for this cpu
-                * if we can do so trivially.
+                * We are not or are no longer the current lwp and a forced
+                * reschedule was requested.  Figure out the best cpu to
+                * run on (our current cpu will be given significant weight).
                 *
-                * We can steal another thread's current thread designation
-                * on this cpu since if we are running that other thread
-                * must not be, so we can safely deschedule it.
+                * (if a reschedule was not requested we want to move this
+                *  step after the uschedcp tests).
                 */
-               if (dd->uschedcp == lp) {
-                       /*
-                        * We are already the current lwp (hot path).
-                        */
-                       dd->upri = lp->lwp_priority;
-                       continue;
-               }
-               if (doresched && (rdd = dfly_choose_best_queue(lp)) != dd) {
-                       /*
-                        * We are not or are no longer the current lwp and
-                        * a reschedule was requested.  Figure out the
-                        * best cpu to run on (our current cpu will be
-                        * given significant weight).
-                        *
-                        * (if a reschedule was not requested we want to
-                        * move this step after the uschedcp tests).
-                        */
+#ifdef SMP
+               if (force_resched &&
+                   (usched_dfly_features & 0x08) &&
+                   (rdd = dfly_choose_best_queue(lp)) != dd) {
+                       dfly_changeqcpu_locked(lp, dd, rdd);
+                       spin_unlock(&dd->spin);
                        lwkt_deschedule(lp->lwp_thread);
                        dfly_setrunqueue_dd(rdd, lp);
                        lwkt_switch();
@@ -457,78 +356,86 @@ dfly_acquire_curproc(struct lwp *lp)
                        dd = &dfly_pcpu[gd->gd_cpuid];
                        continue;
                }
-               spin_lock(&dd->spin);
+#endif
+
+               /*
+                * Either no reschedule was requested or the best queue was
+                * dd, and no current process has been selected.  We can
+                * trivially become the current lwp on the current cpu.
+                */
                if (dd->uschedcp == NULL) {
-                       /*
-                        * Either no reschedule was requested or the best
-                        * queue was dd, and no current process has been
-                        * selected.  We can trivially become the current
-                        * lwp on the current cpu.
-                        */
-                       atomic_set_cpumask(&dfly_curprocmask,
-                                          gd->gd_cpumask);
+                       atomic_set_cpumask(&dfly_curprocmask, gd->gd_cpumask);
                        dd->uschedcp = lp;
                        dd->upri = lp->lwp_priority;
                        KKASSERT(lp->lwp_qcpu == dd->cpuid);
                        spin_unlock(&dd->spin);
-                       continue;
+                       break;
                }
+
+               /*
+                * Can we steal the current designated user thread?
+                *
+                * If we do the other thread will stall when it tries to
+                * return to userland, possibly rescheduling elsewhere.
+                *
+                * It is important to do a masked test to avoid the edge
+                * case where two near-equal-priority threads are constantly
+                * interrupting each other.
+                */
                if (dd->uschedcp &&
                   (dd->upri & ~PPQMASK) >
                   (lp->lwp_priority & ~PPQMASK)) {
-                       /*
-                        * Can we steal the current designated user thread?
-                        *
-                        * If we do the other thread will stall when it tries
-                        * to return to userland, possibly rescheduling
-                        * elsewhere.
-                        *
-                        * It is important to do a masked test to
-                        * avoid the edge case where two
-                        * near-equal-priority threads
-                        * are constantly interrupting each other.
-                        */
                        dd->uschedcp = lp;
                        dd->upri = lp->lwp_priority;
                        KKASSERT(lp->lwp_qcpu == dd->cpuid);
                        spin_unlock(&dd->spin);
-                       continue;
+                       break;
                }
-               if (doresched == 0 &&
+
+#ifdef SMP
+               /*
+                * We are not the current lwp, figure out the best cpu
+                * to run on (our current cpu will be given significant
+                * weight).  Loop on cpu change.
+                */
+               if ((usched_dfly_features & 0x02) &&
+                   force_resched == 0 &&
                    (rdd = dfly_choose_best_queue(lp)) != dd) {
-                       /*
-                        * We are not the current lwp, figure out the
-                        * best cpu to run on (our current cpu will be
-                        * given significant weight).  Loop on cpu change.
-                        */
+                       dfly_changeqcpu_locked(lp, dd, rdd);
                        spin_unlock(&dd->spin);
                        lwkt_deschedule(lp->lwp_thread);
                        dfly_setrunqueue_dd(rdd, lp);
                        lwkt_switch();
                        gd = mycpu;
                        dd = &dfly_pcpu[gd->gd_cpuid];
-               } else {
-                       /*
-                        * We cannot become the current lwp, place
-                        * the lp on the run-queue of this or another
-                        * cpu and deschedule ourselves.
-                        *
-                        * When we are reactivated we will have another
-                        * chance.
-                        *
-                        * Reload after a switch or setrunqueue/switch
-                        * possibly moved us to another cpu.
-                        */
-                       spin_unlock(&dd->spin);
-                       lwkt_deschedule(lp->lwp_thread);
-                       dfly_setrunqueue_dd(dd, lp);
-                       lwkt_switch();
-                       gd = mycpu;
-                       dd = &dfly_pcpu[gd->gd_cpuid];
+                       continue;
                }
-       } while (dd->uschedcp != lp);
+#endif
+
+               /*
+                * We cannot become the current lwp, place the lp on the
+                * run-queue of this or another cpu and deschedule ourselves.
+                *
+                * When we are reactivated we will have another chance.
+                *
+                * Reload after a switch or setrunqueue/switch possibly
+                * moved us to another cpu.
+                */
+               spin_unlock(&dd->spin);
+               lwkt_deschedule(lp->lwp_thread);
+               dfly_setrunqueue_dd(dd, lp);
+               lwkt_switch();
+               gd = mycpu;
+               dd = &dfly_pcpu[gd->gd_cpuid];
+       }
 
+       /*
+        * Make sure upri is synchronized, then yield to LWKT threads as
+        * needed before returning.  This could result in another reschedule.
+        * XXX
+        */
        crit_exit_quick(td);
+
        KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
 }
 
@@ -560,8 +467,6 @@ dfly_release_curproc(struct lwp *lp)
         * Make sure td_wakefromcpu is defaulted.  This will be overwritten
         * by wakeup().
         */
-       lp->lwp_thread->td_wakefromcpu = gd->gd_cpuid;
-
        if (dd->uschedcp == lp) {
                KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
                spin_lock(&dd->spin);
@@ -629,6 +534,7 @@ dfly_select_curproc(globaldata_t gd)
 static void
 dfly_setrunqueue(struct lwp *lp)
 {
+       dfly_pcpu_t dd;
        dfly_pcpu_t rdd;
 
        /*
@@ -641,11 +547,12 @@ dfly_setrunqueue(struct lwp *lp)
        KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
 
        /*
-        * NOTE: rdd does not necessarily represent the current cpu.
-        *       Instead it represents the cpu the thread was last
-        *       scheduled on.
+        * NOTE: dd/rdd do not necessarily represent the current cpu.
+        *       Instead they may represent the cpu the thread was last
+        *       scheduled on or inherited by its parent.
         */
-       rdd = &dfly_pcpu[lp->lwp_qcpu];
+       dd = &dfly_pcpu[lp->lwp_qcpu];
+       rdd = dd;
 
        /*
         * This process is not supposed to be scheduled anywhere or assigned
@@ -713,10 +620,41 @@ dfly_setrunqueue(struct lwp *lp)
                rdd = dfly_choose_best_queue(lp);
                /* rdd = &dfly_pcpu[lp->lwp_qcpu]; */
        }
+       if (lp->lwp_qcpu != rdd->cpuid) {
+               spin_lock(&dd->spin);
+               dfly_changeqcpu_locked(lp, dd, rdd);
+               spin_unlock(&dd->spin);
+       }
 #endif
        dfly_setrunqueue_dd(rdd, lp);
 }
 
+#ifdef SMP
+
+/*
+ * Change qcpu to rdd->cpuid.  The dd the lp is CURRENTLY on must be
+ * spin-locked on-call.  rdd does not have to be.
+ */
+static void
+dfly_changeqcpu_locked(struct lwp *lp, dfly_pcpu_t dd, dfly_pcpu_t rdd)
+{
+       if (lp->lwp_qcpu != rdd->cpuid) {
+               if (lp->lwp_mpflags & LWP_MP_ULOAD) {
+                       atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
+                       atomic_add_int(&dd->uload,
+                                  -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
+                       atomic_add_int(&dd->ucount, -1);
+               }
+               lp->lwp_qcpu = rdd->cpuid;
+       }
+}
+
+#endif
+
+/*
+ * Place lp on rdd's runqueue.  Nothing is locked on call.  This function
+ * also performs all necessary ancillary notification actions.
+ */
 static void
 dfly_setrunqueue_dd(dfly_pcpu_t rdd, struct lwp *lp)
 {
@@ -804,11 +742,15 @@ dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
         */
        KKASSERT(gd->gd_spinlocks_wr == 0);
 
+       if (lp == NULL)
+               return;
+
        /*
         * Do we need to round-robin?  We round-robin 10 times a second.
         * This should only occur for cpu-bound batch processes.
         */
        if (++dd->rrcount >= usched_dfly_rrinterval) {
+               lp->lwp_thread->td_wakefromcpu = -1;
                dd->rrcount = 0;
                need_user_resched();
        }
@@ -841,19 +783,26 @@ dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
                 * because it likely finished its batch on that cpu and is
                 * now waiting for cpu again.
                 */
-               rdd = dfly_choose_worst_queue(dd, usched_dfly_weight1 * 4);
-               if (rdd && spin_trylock(&rdd->spin)) {
-                       nlp = dfly_chooseproc_locked(rdd, dd, NULL, 1);
-                       spin_unlock(&rdd->spin);
+               rdd = dfly_choose_worst_queue(dd);
+               if (rdd) {
+                       spin_lock(&dd->spin);
+                       if (spin_trylock(&rdd->spin)) {
+                               nlp = dfly_chooseproc_locked(rdd, dd, NULL, 1);
+                               spin_unlock(&rdd->spin);
+                               if (nlp == NULL)
+                                       spin_unlock(&dd->spin);
+                       } else {
+                               spin_unlock(&dd->spin);
+                               nlp = NULL;
+                       }
                } else {
                        nlp = NULL;
                }
+               /* dd->spin held if nlp != NULL */
 
                /*
                 * Either schedule it or add it to our queue.
                 */
-               if (nlp)
-                       spin_lock(&dd->spin);
                if (nlp &&
                    (nlp->lwp_priority & ~PPQMASK) < (dd->upri & ~PPQMASK)) {
                        atomic_set_cpumask(&dfly_curprocmask, dd->cpumask);
@@ -1111,6 +1060,7 @@ dfly_resetpriority(struct lwp *lp)
                                ((newpriority & ~PPQMASK) & PRIMASK);
                        atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload,
                                       delta_uload);
+                       /* no change in ucount */
                }
                if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
                        dfly_remrunqueue_locked(rdd, lp);
@@ -1250,16 +1200,16 @@ dfly_exiting(struct lwp *lp, struct proc *child_proc)
                atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
                atomic_add_int(&dd->uload,
                               -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
+               atomic_add_int(&dd->ucount, -1);
        }
 }
 
 /*
- * This function cannot block in any way.
+ * This function cannot block in any way, but spinlocks are ok.
  *
  * Update the uload based on the state of the thread (whether it is going
- * to sleep or running again).  Keep a one-entry cache of retained uload
- * for the last thread that had gone to sleep.  This cache prevents uload
- * from dropping when threads block for extremely short periods of time.
+ * to sleep or running again).  The uload is meant to be a longer-term
+ * load and not an instantaneous load.
  */
 static void
 dfly_uload_update(struct lwp *lp)
@@ -1268,15 +1218,27 @@ dfly_uload_update(struct lwp *lp)
 
        if (lp->lwp_thread->td_flags & TDF_RUNQ) {
                if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
-                       atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
-                       atomic_add_int(&dd->uload,
+                       spin_lock(&dd->spin);
+                       if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
+                               atomic_set_int(&lp->lwp_mpflags,
+                                              LWP_MP_ULOAD);
+                               atomic_add_int(&dd->uload,
                                   ((lp->lwp_priority & ~PPQMASK) & PRIMASK));
+                               atomic_add_int(&dd->ucount, 1);
+                       }
+                       spin_unlock(&dd->spin);
                }
-       } else {
+       } else if (lp->lwp_slptime > 0) {
                if (lp->lwp_mpflags & LWP_MP_ULOAD) {
-                       atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
-                       atomic_add_int(&dd->uload,
+                       spin_lock(&dd->spin);
+                       if (lp->lwp_mpflags & LWP_MP_ULOAD) {
+                               atomic_clear_int(&lp->lwp_mpflags,
+                                                LWP_MP_ULOAD);
+                               atomic_add_int(&dd->uload,
                                   -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
+                               atomic_add_int(&dd->ucount, -1);
+                       }
+                       spin_unlock(&dd->spin);
                }
        }
 }
@@ -1315,17 +1277,17 @@ dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
 
        if (worst) {
                if (idqbits) {
-                       pri = bsfl(idqbits);
+                       pri = bsrl(idqbits);
                        q = &rdd->idqueues[pri];
                        which = &rdd->idqueuebits;
                        which2 = &idqbits;
                } else if (tsqbits) {
-                       pri = bsfl(tsqbits);
+                       pri = bsrl(tsqbits);
                        q = &rdd->queues[pri];
                        which = &rdd->queuebits;
                        which2 = &tsqbits;
                } else if (rtqbits) {
-                       pri = bsfl(rtqbits);
+                       pri = bsrl(rtqbits);
                        q = &rdd->rtqueues[pri];
                        which = &rdd->rtqueuebits;
                        which2 = &rtqbits;
@@ -1381,14 +1343,21 @@ dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
        if (TAILQ_EMPTY(q))
                *which &= ~(1 << pri);
 
+       /*
+        * If we are choosing a process from rdd with the intent to
+        * move it to dd, lwp_qcpu must be adjusted while rdd's spinlock
+        * is still held.
+        */
        if (rdd != dd) {
                if (lp->lwp_mpflags & LWP_MP_ULOAD) {
                        atomic_add_int(&rdd->uload,
                            -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
+                       atomic_add_int(&rdd->ucount, -1);
                }
                lp->lwp_qcpu = dd->cpuid;
                atomic_add_int(&dd->uload,
                    ((lp->lwp_priority & ~PPQMASK) & PRIMASK));
+               atomic_add_int(&dd->ucount, 1);
                atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
        }
        return lp;
@@ -1400,7 +1369,9 @@ dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
  * USED TO PUSH RUNNABLE LWPS TO THE LEAST LOADED CPU.
  *
  * Choose a cpu node to schedule lp on, hopefully nearby its current
- * node.  We give the current node a modest advantage for obvious reasons.
+ * node.
+ *
+ * We give the current node a modest advantage for obvious reasons.
  *
  * We also give the node the thread was woken up FROM a slight advantage
  * in order to try to schedule paired threads which synchronize/block waiting
@@ -1410,6 +1381,9 @@ dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
  * algorithm as it heuristically groups synchronizing processes for locality
  * of reference in multi-socket systems.
  *
+ * We check against running processes and give a big advantage if there
+ * are none running.
+ *
  * The caller will normally dfly_setrunqueue() lp on the returned queue.
  *
  * When the topology is known choose a cpu whos group has, in aggregate,
@@ -1419,13 +1393,14 @@ static
 dfly_pcpu_t
 dfly_choose_best_queue(struct lwp *lp)
 {
+       cpumask_t wakemask;
        cpumask_t mask;
        cpu_node_t *cpup;
        cpu_node_t *cpun;
        cpu_node_t *cpub;
-       dfly_pcpu_t dd1 = &dfly_pcpu[lp->lwp_qcpu];
-       dfly_pcpu_t dd2 = &dfly_pcpu[lp->lwp_thread->td_wakefromcpu];
+       dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
        dfly_pcpu_t rdd;
+       int wakecpu;
        int cpuid;
        int n;
        int count;
@@ -1436,15 +1411,23 @@ dfly_choose_best_queue(struct lwp *lp)
         * When the topology is unknown choose a random cpu that is hopefully
         * idle.
         */
-       if (dd1->cpunode == NULL)
-               return (dfly_choose_queue_simple(dd1, lp));
+       if (dd->cpunode == NULL)
+               return (dfly_choose_queue_simple(dd, lp));
+
+       /*
+        * Pairing mask
+        */
+       if ((wakecpu = lp->lwp_thread->td_wakefromcpu) >= 0)
+               wakemask = dfly_pcpu[wakecpu].cpumask;
+       else
+               wakemask = 0;
 
        /*
         * When the topology is known choose a cpu whos group has, in
         * aggregate, has the lowest weighted load.
         */
        cpup = root_cpu_node;
-       rdd = dd1;
+       rdd = dd;
 
        while (cpup) {
                /*
@@ -1477,51 +1460,81 @@ dfly_choose_best_queue(struct lwp *lp)
                        if (mask == 0)
                                continue;
 
+                       count = 0;
+                       load = 0;
+
+                       while (mask) {
+                               cpuid = BSFCPUMASK(mask);
+                               rdd = &dfly_pcpu[cpuid];
+                               load += rdd->uload;
+                               load += rdd->ucount * usched_dfly_weight3;
+
+                               if (rdd->uschedcp == NULL &&
+                                   rdd->runqcount == 0) {
+                                       load -= usched_dfly_weight4;
+                               } else if (rdd->upri > lp->lwp_priority + PPQ) {
+                                       load -= usched_dfly_weight4 / 2;
+                               }
+                               mask &= ~CPUMASK(cpuid);
+                               ++count;
+                       }
+
                        /*
                         * Compensate if the lp is already accounted for in
                         * the aggregate uload for this mask set.  We want
-                        * to calculate the loads as if lp was not present.
+                        * to calculate the loads as if lp were not present,
+                        * otherwise the calculation is bogus.
                         */
                        if ((lp->lwp_mpflags & LWP_MP_ULOAD) &&
-                           CPUMASK(lp->lwp_qcpu) & mask) {
-                               load = -((lp->lwp_priority & ~PPQMASK) &
+                           (dd->cpumask & cpun->members)) {
+                               load -= ((lp->lwp_priority & ~PPQMASK) &
                                         PRIMASK);
-                       } else {
-                               load = 0;
+                               load -= usched_dfly_weight3;
                        }
 
-                       count = 0;
-                       while (mask) {
-                               cpuid = BSFCPUMASK(mask);
-                               load += dfly_pcpu[cpuid].uload;
-                               load += dfly_pcpu[cpuid].runqcount *
-                                       usched_dfly_weight3;
-                               mask &= ~CPUMASK(cpuid);
-                               ++count;
-                       }
                        load /= count;
 
                        /*
-                        * Give a slight advantage to the cpu groups (lp)
-                        * belongs to.
-                        *
-                        * Give a slight advantage to the cpu groups our
-                        * synchronous partner belongs to.  However, to
-                        * avoid flapping in a two-way comm environment
-                        * we only employ this measure when the wake-from's
-                        * cpu is higher than lp's cpu.
+                        * Advantage the cpu group (lp) is already on.
                         */
-                       if (cpun->members & dd1->cpumask)
+                       if (cpun->members & dd->cpumask)
                                load -= usched_dfly_weight1;
-                       else if ((cpun->members & dd2->cpumask) && dd1 < dd2)
-                               load -= usched_dfly_weight2;
+
+                       /*
+                        * Advantage the cpu group we want to pair (lp) to,
+                        * but don't let it go to the exact same cpu as
+                        * the wakecpu target.
+                        *
+                        * We do this by checking whether cpun is a
+                        * terminal node or not.  All cpun's at the same
+                        * level will either all be terminal or all not
+                        * terminal.
+                        *
+                        * If it is and we match we disadvantage the load.
+                        * If it is and we don't match we advantage the load.
+                        *
+                        * Also note that we are effectively disadvantaging
+                        * all-but-one by the same amount, so it won't affect
+                        * the weight1 factor for the all-but-one nodes.
+                        */
+                       if (cpun->members & wakemask) {
+                               if (cpun->child_node != NULL) {
+                                       /* advantage */
+                                       load -= usched_dfly_weight2;
+                               } else {
+                                       if (usched_dfly_features & 0x10)
+                                               load += usched_dfly_weight2;
+                                       else
+                                               load -= usched_dfly_weight2;
+                               }
+                       }
 
                        /*
                         * Calculate the best load
                         */
                        if (cpub == NULL || lowest_load > load ||
                            (lowest_load == load &&
-                            (cpun->members & dd1->cpumask))
+                            (cpun->members & dd->cpumask))
                        ) {
                                lowest_load = load;
                                cpub = cpun;
@@ -1539,7 +1552,8 @@ dfly_choose_best_queue(struct lwp *lp)
  * USED TO PULL RUNNABLE LWPS FROM THE MOST LOADED CPU.
  *
  * Choose the worst queue close to dd's cpu node with a non-empty runq
- * that is NOT dd.
+ * that is NOT dd.  Also require that the moving of the highest-load thread
+ * from rdd to dd does not cause the uload's to cross each other.
  *
  * This is used by the thread chooser when the current cpu's queues are
  * empty to steal a thread from another cpu's queue.  We want to offload
@@ -1547,7 +1561,7 @@ dfly_choose_best_queue(struct lwp *lp)
  */
 static
 dfly_pcpu_t
-dfly_choose_worst_queue(dfly_pcpu_t dd, int weight)
+dfly_choose_worst_queue(dfly_pcpu_t dd)
 {
        cpumask_t mask;
        cpu_node_t *cpup;
@@ -1558,6 +1572,8 @@ dfly_choose_worst_queue(dfly_pcpu_t dd, int weight)
        int n;
        int count;
        int load;
+       int pri;
+       int hpri;
        int highest_load;
 
        /*
@@ -1606,11 +1622,20 @@ dfly_choose_worst_queue(dfly_pcpu_t dd, int weight)
                                continue;
                        count = 0;
                        load = 0;
+
                        while (mask) {
                                cpuid = BSFCPUMASK(mask);
-                               load += dfly_pcpu[cpuid].uload;
-                               load += dfly_pcpu[cpuid].runqcount *
-                                       usched_dfly_weight3;
+                               rdd = &dfly_pcpu[cpuid];
+                               load += rdd->uload;
+                               load += rdd->ucount * usched_dfly_weight3;
+                               if (rdd->uschedcp == NULL &&
+                                   rdd->runqcount == 0 &&
+                                   globaldata_find(cpuid)->gd_tdrunqcount == 0
+                               ) {
+                                       load -= usched_dfly_weight4;
+                               } else if (rdd->upri > dd->upri + PPQ) {
+                                       load -= usched_dfly_weight4 / 2;
+                               }
                                mask &= ~CPUMASK(cpuid);
                                ++count;
                        }
@@ -1643,8 +1668,19 @@ dfly_choose_worst_queue(dfly_pcpu_t dd, int weight)
         * This also helps us avoid breaking paired threads apart which
         * can have disastrous effects on performance.
         */
-       if (rdd == dd || rdd->uload < dd->uload + weight)
-               rdd = NULL;
+       if (rdd == dd)
+               return(NULL);
+
+       hpri = 0;
+       if (rdd->rtqueuebits && hpri < (pri = bsrl(rdd->rtqueuebits)))
+               hpri = pri;
+       if (rdd->queuebits && hpri < (pri = bsrl(rdd->queuebits)))
+               hpri = pri;
+       if (rdd->idqueuebits && hpri < (pri = bsrl(rdd->idqueuebits)))
+               hpri = pri;
+       hpri *= PPQ;
+       if (rdd->uload - hpri < dd->uload + hpri)
+               return(NULL);
        return (rdd);
 }
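
As an illustrative aside (a simplified user-space model, not the kernel code
itself): the threshold above estimates the load of the highest-priority
runnable thread on the candidate queue from its run-queue index (hpri * PPQ)
and refuses the pull if moving that thread would leave the source lighter
than the destination, i.e. if the uloads would cross.  The PPQ value and the
parameter names below are hypothetical stand-ins.

/*
 * Illustrative sketch of the "don't let the uloads cross" pull test in
 * dfly_choose_worst_queue().  PPQ and the field names are stand-ins.
 */
#include <stdio.h>

#define PPQ	32	/* hypothetical priorities-per-queue value */

static int
worth_pulling(int rdd_uload, int rdd_hpri_index, int dd_uload)
{
	int hpri = rdd_hpri_index * PPQ; /* estimated load of the stolen thread */

	/*
	 * After the move the source drops by roughly hpri and the
	 * destination gains roughly hpri; only pull if the source
	 * would still be at least as loaded as the destination.
	 */
	return (rdd_uload - hpri >= dd_uload + hpri);
}

int
main(void)
{
	/* source only slightly heavier: pulling would just flip the imbalance */
	printf("%d\n", worth_pulling(300, 4, 250));	/* prints 0: skip the pull */
	/* source much heavier: pulling helps */
	printf("%d\n", worth_pulling(900, 4, 100));	/* prints 1: do the pull */
	return (0);
}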
 
@@ -1755,6 +1791,7 @@ dfly_remrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
        KKASSERT(rdd->runqcount >= 0);
 
        pri = lp->lwp_rqindex;
+
        switch(lp->lwp_rqtype) {
        case RTP_PRIO_NORMAL:
                q = &rdd->queues[pri];
@@ -1807,19 +1844,13 @@ dfly_setrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
        u_int32_t *which;
        int pri;
 
-       if (lp->lwp_qcpu != rdd->cpuid) {
-               if (lp->lwp_mpflags & LWP_MP_ULOAD) {
-                       atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
-                       atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload,
-                                  -((lp->lwp_priority & ~PPQMASK) & PRIMASK));
-               }
-               lp->lwp_qcpu = rdd->cpuid;
-       }
+       KKASSERT(lp->lwp_qcpu == rdd->cpuid);
 
        if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
                atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
                atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload,
                               (lp->lwp_priority & ~PPQMASK) & PRIMASK);
+               atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].ucount, 1);
        }
 
        pri = lp->lwp_rqindex;
@@ -1944,7 +1975,7 @@ dfly_helper_thread(void *dummy)
                 *       partially unbalanced (e.g. 6 runnables and only
                 *       4 cores).
                 */
-               rdd = dfly_choose_worst_queue(dd, usched_dfly_weight1 * 8);
+               rdd = dfly_choose_worst_queue(dd);
                if (rdd && spin_trylock(&rdd->spin)) {
                        nlp = dfly_chooseproc_locked(rdd, dd, NULL, 0);
                        spin_unlock(&rdd->spin);
@@ -2180,6 +2211,12 @@ dfly_helper_thread_cpu_init(void)
 
                SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
                               SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
+                              OID_AUTO, "weight4", CTLFLAG_RW,
+                              &usched_dfly_weight4, 50,
+                              "Availability of other idle cpus");
+
+               SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
+                              SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
                               OID_AUTO, "features", CTLFLAG_RW,
                               &usched_dfly_features, 15,
                               "Allow pulls into empty queues");
sys/kern/usched_dummy.c
index 936966e..50293ef 100644
@@ -316,23 +316,15 @@ dummy_setrunqueue(struct lwp *lp)
 }
 
 /*
- * This routine is called from a systimer IPI.  Thus it is called with 
- * a critical section held.  Any spinlocks we get here that are also
- * obtained in other procedures must be proected by a critical section
- * in those other procedures to avoid a deadlock.
- *
- * The MP lock may or may not be held on entry and cannot be obtained
- * by this routine (because it is called from a systimer IPI).  Additionally,
- * because this is equivalent to a FAST interrupt, spinlocks cannot be used
- * (or at least, you have to check that gd_spin* counts are 0 before you
- * can).
+ * This routine is called from a systimer IPI.  It must NEVER block.
+ * If a lwp compatible with this scheduler is the currently running
+ * thread this function is called with a non-NULL lp, otherwise it
+ * will be called with a NULL lp.
  *
  * This routine is called at ESTCPUFREQ on each cpu independantly.
  *
  * This routine typically queues a reschedule request, which will cause
  * the scheduler's BLAH_select_curproc() to be called as soon as possible.
- *
- * MPSAFE
  */
 static
 void
@@ -341,6 +333,9 @@ dummy_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
        globaldata_t gd = mycpu;
        dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
 
+       if (lp == NULL)
+               return;
+
        if (++dd->rrcount >= usched_dummy_rrinterval) {
                dd->rrcount = 0;
                need_user_resched();
sys/sys/globaldata.h
index 2234e68..d1f7393 100644
@@ -137,7 +137,7 @@ struct globaldata {
        struct lwkt_ipiq *gd_ipiq;              /* array[ncpu] of ipiq's */
        struct lwkt_ipiq gd_cpusyncq;           /* ipiq for cpu synchro */
        u_int           gd_npoll;               /* ipiq synchronization */
-       int             gd_unused01;
+       int             gd_tdrunqcount;
        struct thread   gd_unused02B;
        struct thread   gd_idlethread;
        SLGlobalData    gd_slab;                /* slab allocator */
@@ -166,8 +166,7 @@ struct globaldata {
        int             gd_ireserved[7];
        const char      *gd_infomsg;            /* debugging */
        struct lwkt_tokref gd_handoff;          /* hand-off tokref */
-       struct lwp      *gd_sleeping_lwp;
-       void            *gd_preserved[7];       /* future fields */
+       void            *gd_preserved[8];       /* future fields */
        /* extended by <machine/globaldata.h> */
 };
 
sys/sys/proc.h
index 1753bb2..27d5cf5 100644
@@ -401,6 +401,7 @@ struct      proc {
 #define LWP_MP_WEXIT   0x0000002 /* working on exiting */
 #define        LWP_MP_WSTOP    0x0000004 /* working on stopping */
 #define        LWP_MP_ULOAD    0x0000008 /* uload accounting for current cpu */
+#define        LWP_MP_RRFORCE  0x0000010 /* forced resched due to rrcount */
 
 #define        FIRST_LWP_IN_PROC(p)            RB_FIRST(lwp_rb_tree, &(p)->p_lwp_tree)
 #define        FOREACH_LWP_IN_PROC(lp, p)      \
sys/sys/usched.h
index 3b445f1..27d82d3 100644
@@ -99,6 +99,7 @@ extern int sched_ticks; /* From sys/kern/kern_clock.c */
 
 int usched_ctl(struct usched *, int);
 struct usched *usched_init(void);
+void usched_schedulerclock(struct lwp *, sysclock_t, sysclock_t);
 
 #endif