logtsleep2(tsleep_beg, ident);
gd = td->td_gd;
KKASSERT(td != &gd->gd_idlethread); /* you must be kidding! */
+ td->td_wakefromcpu = gd->gd_cpuid; /* overwritten by _wakeup */
/*
* NOTE: all of this occurs on the current cpu, including any
* Make sure the current process has been untangled from
* the userland scheduler and initialize slptime to start
* counting.
+ *
+ * NOTE: td->td_wakefromcpu is pre-set by the release function
+ * for the dfly scheduler, and then adjusted by _wakeup()
*/
if (lp) {
p->p_usched->release_curproc(lp);
* Make all processes sleeping on the specified identifier runnable.
* count may be zero or one only.
*
- * The domain encodes the sleep/wakeup domain AND the first cpu to check
- * (which is always the current cpu). As we iterate across cpus
+ * The domain encodes the sleep/wakeup domain, flags, plus the originating
+ * cpu.
*
* This call may run without the MP lock held. We can only manipulate thread
* state on the cpu owning the thread. We CANNOT manipulate process state
) {
KKASSERT(td->td_gd == gd);
_tsleep_remove(td);
+ td->td_wakefromcpu = PWAKEUP_DECODE(domain);
if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
lwkt_schedule(td);
if (domain & PWAKEUP_ONE)
wakeup_one(const volatile void *ident)
{
/* XXX potentially round-robin the first responding cpu */
- _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) | PWAKEUP_ONE);
+ _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
+ PWAKEUP_ONE);
}
/*
void
wakeup_mycpu(const volatile void *ident)
{
- _wakeup(__DEALL(ident), PWAKEUP_MYCPU);
+ _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) | /* tag the domain with the calling (originating) cpu */
+ PWAKEUP_MYCPU); /* the encoded cpu is decoded into td_wakefromcpu when a thread is woken */
}
/*
wakeup_mycpu_one(const volatile void *ident)
{
/* XXX potentially round-robin the first responding cpu */
- _wakeup(__DEALL(ident), PWAKEUP_MYCPU|PWAKEUP_ONE);
+ _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
+ PWAKEUP_MYCPU | PWAKEUP_ONE);
}
/*
wakeup_oncpu(globaldata_t gd, const volatile void *ident)
{
#ifdef SMP
+ globaldata_t mygd = mycpu;
if (gd == mycpu) {
- _wakeup(__DEALL(ident), PWAKEUP_MYCPU);
+ _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
+ PWAKEUP_MYCPU);
} else {
- lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident), PWAKEUP_MYCPU);
+ lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
+ PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
+ PWAKEUP_MYCPU);
}
#else
_wakeup(__DEALL(ident), PWAKEUP_MYCPU);
wakeup_oncpu_one(globaldata_t gd, const volatile void *ident)
{
#ifdef SMP
- if (gd == mycpu) {
- _wakeup(__DEALL(ident), PWAKEUP_MYCPU | PWAKEUP_ONE);
+ globaldata_t mygd = mycpu;
+ if (gd == mygd) {
+ _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
+ PWAKEUP_MYCPU | PWAKEUP_ONE);
} else {
lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
+ PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
PWAKEUP_MYCPU | PWAKEUP_ONE);
}
#else
static void dfly_uload_update(struct lwp *lp);
static void dfly_yield(struct lwp *lp);
#ifdef SMP
-static dfly_pcpu_t dfly_choose_best_queue(dfly_pcpu_t dd, struct lwp *lp);
+static dfly_pcpu_t dfly_choose_best_queue(struct lwp *lp);
static dfly_pcpu_t dfly_choose_worst_queue(dfly_pcpu_t dd);
static dfly_pcpu_t dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp);
#endif
#ifdef SMP
static int usched_dfly_smt = 0;
static int usched_dfly_cache_coherent = 0;
-static int usched_dfly_upri_affinity = 16; /* 32 queues - half-way */
-static int usched_dfly_queue_checks = 5;
+static int usched_dfly_weight1 = 10;
+static int usched_dfly_weight2 = 5;
static int usched_dfly_stick_to_level = 0;
#endif
static int usched_dfly_rrinterval = (ESTCPUFREQ + 9) / 10;
globaldata_t gd = mycpu;
dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
+ /*
+ * Make sure td_wakefromcpu is defaulted. This will be overwritten
+ * by wakeup().
+ */
+ lp->lwp_thread->td_wakefromcpu = gd->gd_cpuid;
+
if (dd->uschedcp == lp) {
crit_enter();
KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
{
globaldata_t rgd;
dfly_pcpu_t rdd;
- int cpuid;
/*
* First validate the process LWKT state.
KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
/*
- * NOTE: gd and dd are relative to the target thread's last cpu,
- * NOT our current cpu.
+ * NOTE: rdd does not necessarily represent the current cpu.
+ * Instead it represents the cpu the thread was last
+ * scheduled on.
*/
- rgd = globaldata_find(lp->lwp_qcpu);
rdd = &dfly_pcpu[lp->lwp_qcpu];
- cpuid = rdd->cpuid;
/*
* This process is not supposed to be scheduled anywhere or assigned
if (rdd->uschedcp == NULL) {
spin_lock(&rdd->spin);
if (rdd->uschedcp == NULL) {
- atomic_set_cpumask(&dfly_curprocmask, rgd->gd_cpumask);
+ atomic_set_cpumask(&dfly_curprocmask, 1);
rdd->uschedcp = lp;
rdd->upri = lp->lwp_priority;
spin_unlock(&rdd->spin);
* sibling has a thread assigned).
*/
/*spin_lock(&dfly_spin);*/
- rdd = dfly_choose_best_queue(rdd, lp);
+ rdd = dfly_choose_best_queue(lp);
rgd = globaldata_find(rdd->cpuid);
/*
spin_unlock(&rdd->spin);
}
} else {
- atomic_clear_cpumask(&dfly_rdyprocmask, CPUMASK(cpuid));
+ atomic_clear_cpumask(&dfly_rdyprocmask, rgd->gd_cpumask);
if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
spin_unlock(&rdd->spin);
lwkt_send_ipiq(rgd, dfly_need_user_resched_remote,
u_int32_t rtqbits;
u_int32_t tsqbits;
u_int32_t idqbits;
- /*usched_dfly_queue_checks*/
rtqbits = dd->rtqueuebits;
tsqbits = dd->queuebits;
* USED TO PUSH RUNNABLE LWPS TO THE LEAST LOADED CPU.
*
* Choose a cpu node to schedule lp on, hopefully nearby its current
- * node. The current node is passed in (dd) (though it can also be obtained
- * from lp->lwp_qcpu). The caller will dfly_setrunqueue() lp on the queue
- * we return.
+ * node. We give the current node a modest advantage for obvious reasons.
+ *
+ * We also give the node the thread was woken up FROM a slight advantage
+ * so that paired threads which synchronize with, or block waiting on, each
+ * other tend to be scheduled close together. Similarly, in a network
+ * setting this attempts to place a user process near the kernel protocol
+ * thread that is feeding it data. THIS IS A CRITICAL PART of the
+ * algorithm as it heuristically groups synchronizing processes for locality
+ * of reference in multi-socket systems.
+ *
+ * The caller will normally dfly_setrunqueue() lp on the returned queue.
*
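- * When the topology is known choose a cpu whos group has, in aggregate,
- * has the lowest weighted load.
+ * When the topology is known choose a cpu whose group has, in aggregate,
+ * the lowest weighted load.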
*/
static
dfly_pcpu_t
-dfly_choose_best_queue(dfly_pcpu_t dd, struct lwp *lp)
+dfly_choose_best_queue(struct lwp *lp)
{
cpumask_t mask;
cpu_node_t *cpup;
cpu_node_t *cpun;
cpu_node_t *cpub;
+ dfly_pcpu_t dd1 = &dfly_pcpu[lp->lwp_qcpu];
+ dfly_pcpu_t dd2 = &dfly_pcpu[lp->lwp_thread->td_wakefromcpu];
dfly_pcpu_t rdd;
int cpuid;
int n;
* When the topology is unknown choose a random cpu that is hopefully
* idle.
*/
- if (dd->cpunode == NULL)
- return (dfly_choose_queue_simple(dd, lp));
+ if (dd1->cpunode == NULL)
+ return (dfly_choose_queue_simple(dd1, lp));
/*
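- * When the topology is known choose a cpu whos group has, in
- * aggregate, has the lowest weighted load.
+ * When the topology is known choose a cpu whose group has, in
+ * aggregate, the lowest weighted load.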
*/
cpup = root_cpu_node;
- rdd = dd;
+ rdd = dd1;
level = cpu_topology_levels_number;
while (cpup) {
/*
* Give a slight advantage to nearby cpus.
*/
- if (cpun->members & dd->cpumask)
- load -= PPQ * level;
+ if (cpun->members & dd1->cpumask)
+ load -= PPQ * level * usched_dfly_weight1 / 10;
+ else if (cpun->members & dd2->cpumask)
+ load -= PPQ * level * usched_dfly_weight2 / 10;
/*
* Calculate the best load
*/
if (cpub == NULL || lowest_load > load ||
(lowest_load == load &&
- (cpun->members & dd->cpumask))
+ (cpun->members & dd1->cpumask))
) {
lowest_load = load;
cpub = cpun;
SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
- OID_AUTO, "upri_affinity", CTLFLAG_RW,
- &usched_dfly_upri_affinity, 1,
- "Number of PPQs in user priority check");
+ OID_AUTO, "weight1", CTLFLAG_RW,
+ &usched_dfly_weight1, 10,
+ "Weight selection for current cpu");
SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
- OID_AUTO, "queue_checks", CTLFLAG_RW,
- &usched_dfly_queue_checks, 5,
- "LWPs to check from a queue before giving up");
+ OID_AUTO, "weight2", CTLFLAG_RW,
+ &usched_dfly_weight2, 5,
+ "Weight selection for wakefrom cpu");
SYSCTL_ADD_PROC(&usched_dfly_sysctl_ctx,
SYSCTL_CHILDREN(usched_dfly_sysctl_tree),