From: Matthew Dillon
Date: Tue, 18 Sep 2012 18:01:35 +0000 (-0700)
Subject: kernel - Add usched_dfly algorithm, set as default for now (3)
X-Git-Tag: v3.2.0~118
X-Git-Url: https://gitweb.dragonflybsd.org/~nant/dragonfly.git/commitdiff_plain/c75e41b70db3548a580791be81447d2d22e74aaf

kernel - Add usched_dfly algorithm, set as default for now (3)

* Add a field to the thread structure, td_wakefromcpu.  All wakeup()
  family calls will load this field with the cpu the thread was woken
  up FROM.

* Use this field in usched_dfly to weight scheduling such that pairs of
  synchronously-dependent threads (for example, a pgbench thread and a
  postgres server process) are placed closer to each other in the cpu
  topology.

* Weighting:

  - Load matters the most
  - The cpu the thread is currently scheduled on is next
  - Synchronous wait/wakeup weighting is last

* Tests on monster yield better all-around results, with a new all-time
  high w/ pgbench -j 40 -c 40 -T 60 -S bench:

    25% idle at 40:40
    tps = 215293.173300 (excluding connections establishing)

  Without the wait/wakeup weighting (but with load and current-cpu
  weighting):

    41% idle at 40:40
    tps = 162352.813046 (excluding connections establishing)

  Without wait/wakeup or current-cpu weighting (load balancing only):

    43% idle at 40:40
    tps = 159047.440641 (excluding connections establishing)
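* To make the weighting concrete, the sketch below is a minimal
  stand-alone C rendering of the three-tier comparison (lower score
  wins).  PPQ and the default weights (10 and 5) mirror the scheduler
  code, but struct candidate, score(), and the sample load figures are
  hypothetical illustrations, not kernel APIs:

    /*
     * Userland sketch of the three-tier weighting.  Load dominates;
     * the bonuses only decide near-ties.
     */
    #include <stdio.h>

    #define PPQ     32              /* priorities per queue */

    struct candidate {
            int cpuid;
            int load;               /* aggregate load of the node */
            int is_current;         /* contains lp's current cpu */
            int is_wakefrom;        /* contains td_wakefromcpu */
    };

    static int
    score(const struct candidate *c, int level, int w1, int w2)
    {
            int s = c->load;

            if (c->is_current)
                    s -= PPQ * level * w1 / 10;  /* current-cpu bonus */
            else if (c->is_wakefrom)
                    s -= PPQ * level * w2 / 10;  /* wakeup-source bonus */
            return (s);
    }

    int
    main(void)
    {
            struct candidate nodes[] = {
                    { 0, 4000, 1, 0 },  /* current cpu, heavily loaded */
                    { 1, 3900, 0, 1 },  /* wakeup source, lighter */
                    { 2, 3950, 0, 0 },  /* unrelated node */
            };
            int best = 0;
            int i;

            for (i = 1; i < 3; ++i) {
                    if (score(&nodes[i], 2, 10, 5) <
                        score(&nodes[best], 2, 10, 5))
                            best = i;
            }
            printf("chose cpu %d\n", nodes[best].cpuid);
            return (0);
    }

  With these sample numbers the wakeup-source node wins: its lower load
  plus the smaller weight2 bonus beats the current cpu's larger weight1
  bonus, which is the pairing behavior the pgbench figures above
  reflect.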
---

diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index a67a76e78f..c8e5ae55a1 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -481,6 +481,7 @@ tsleep(const volatile void *ident, int flags, const char *wmesg, int timo)
 	logtsleep2(tsleep_beg, ident);
 	gd = td->td_gd;
 	KKASSERT(td != &gd->gd_idlethread);	/* you must be kidding! */
+	td->td_wakefromcpu = gd->gd_cpuid;	/* overwritten by _wakeup */
 
 	/*
 	 * NOTE: all of this occurs on the current cpu, including any
@@ -545,6 +546,9 @@ tsleep(const volatile void *ident, int flags, const char *wmesg, int timo)
 	 * Make sure the current process has been untangled from
 	 * the userland scheduler and initialize slptime to start
 	 * counting.
+	 *
+	 * NOTE: td->td_wakefromcpu is pre-set by the release function
+	 *	 for the dfly scheduler, and then adjusted by _wakeup()
 	 */
 	if (lp) {
 		p->p_usched->release_curproc(lp);
@@ -854,8 +858,8 @@ endtsleep(void *arg)
  * Make all processes sleeping on the specified identifier runnable.
  * count may be zero or one only.
  *
- * The domain encodes the sleep/wakeup domain AND the first cpu to check
- * (which is always the current cpu).  As we iterate across cpus
+ * The domain encodes the sleep/wakeup domain, flags, plus the originating
+ * cpu.
  *
  * This call may run without the MP lock held.  We can only manipulate thread
  * state on the cpu owning the thread.  We CANNOT manipulate process state
@@ -889,6 +893,7 @@ restart:
 		) {
 			KKASSERT(td->td_gd == gd);
 			_tsleep_remove(td);
+			td->td_wakefromcpu = PWAKEUP_DECODE(domain);
 			if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
 				lwkt_schedule(td);
 				if (domain & PWAKEUP_ONE)
@@ -943,7 +948,8 @@ void
 wakeup_one(const volatile void *ident)
 {
 	/* XXX potentially round-robin the first responding cpu */
-	_wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) | PWAKEUP_ONE);
+	_wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
+				PWAKEUP_ONE);
 }
 
 /*
@@ -953,7 +959,8 @@ wakeup_one(const volatile void *ident)
 void
 wakeup_mycpu(const volatile void *ident)
 {
-	_wakeup(__DEALL(ident), PWAKEUP_MYCPU);
+	_wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
+				PWAKEUP_MYCPU);
 }
 
 /*
@@ -964,7 +971,8 @@ void
 wakeup_mycpu_one(const volatile void *ident)
 {
 	/* XXX potentially round-robin the first responding cpu */
-	_wakeup(__DEALL(ident), PWAKEUP_MYCPU|PWAKEUP_ONE);
+	_wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
+				PWAKEUP_MYCPU | PWAKEUP_ONE);
 }
 
 /*
@@ -975,10 +983,14 @@ void
 wakeup_oncpu(globaldata_t gd, const volatile void *ident)
 {
 #ifdef SMP
+	globaldata_t mygd = mycpu;
+
 	if (gd == mycpu) {
-		_wakeup(__DEALL(ident), PWAKEUP_MYCPU);
+		_wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
+					PWAKEUP_MYCPU);
 	} else {
-		lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident), PWAKEUP_MYCPU);
+		lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
+				PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
+				PWAKEUP_MYCPU);
 	}
 #else
 	_wakeup(__DEALL(ident), PWAKEUP_MYCPU);
@@ -993,10 +1005,13 @@ void
 wakeup_oncpu_one(globaldata_t gd, const volatile void *ident)
 {
 #ifdef SMP
-	if (gd == mycpu) {
-		_wakeup(__DEALL(ident), PWAKEUP_MYCPU | PWAKEUP_ONE);
+	globaldata_t mygd = mycpu;
+
+	if (gd == mygd) {
+		_wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
+					PWAKEUP_MYCPU | PWAKEUP_ONE);
 	} else {
 		lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
+				PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
 				PWAKEUP_MYCPU | PWAKEUP_ONE);
 	}
 #else
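Every wakeup path above now funnels the originating cpu through the
domain word handed to _wakeup(), alongside the PWAKEUP_MYCPU/PWAKEUP_ONE
flags.  The sketch below shows the idea in isolation; the mask values
and macro bodies are assumed for illustration only (the real PWAKEUP_*
definitions live in <sys/param.h>):

    /*
     * Stand-alone sketch of the wakeup domain word.  Mask values and
     * macro bodies are illustrative assumptions, not the kernel's.
     */
    #include <assert.h>
    #include <stdio.h>

    #define PWAKEUP_CPUMASK 0x00003FFF  /* assumed: originating cpu bits */
    #define PWAKEUP_MYCPU   0x00004000  /* restrict wakeup to this cpu */
    #define PWAKEUP_ONE     0x00008000  /* wake at most one thread */

    #define PWAKEUP_ENCODE(domain, cpu) ((domain) | (cpu))
    #define PWAKEUP_DECODE(domain)      ((domain) & PWAKEUP_CPUMASK)

    int
    main(void)
    {
            int waker_cpu = 7;          /* cpu issuing wakeup_one() */
            int domain;

            /* what wakeup_one() now builds */
            domain = PWAKEUP_ENCODE(0, waker_cpu) | PWAKEUP_ONE;

            /* what _wakeup() stores into td_wakefromcpu on the far side */
            assert(PWAKEUP_DECODE(domain) == waker_cpu);
            printf("woken from cpu %d, wake-one=%d\n",
                PWAKEUP_DECODE(domain), (domain & PWAKEUP_ONE) != 0);
            return (0);
    }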
diff --git a/sys/kern/usched_dfly.c b/sys/kern/usched_dfly.c
index c525e0d2b5..5e2b0e3a91 100644
--- a/sys/kern/usched_dfly.c
+++ b/sys/kern/usched_dfly.c
@@ -132,7 +132,7 @@ static void dfly_exiting(struct lwp *lp, struct proc *);
 static void dfly_uload_update(struct lwp *lp);
 static void dfly_yield(struct lwp *lp);
 #ifdef SMP
-static dfly_pcpu_t dfly_choose_best_queue(dfly_pcpu_t dd, struct lwp *lp);
+static dfly_pcpu_t dfly_choose_best_queue(struct lwp *lp);
 static dfly_pcpu_t dfly_choose_worst_queue(dfly_pcpu_t dd);
 static dfly_pcpu_t dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp);
 #endif
@@ -206,8 +206,8 @@ SYSCTL_INT(_debug, OID_AUTO, dfly_chooser, CTLFLAG_RW,
 #ifdef SMP
 static int usched_dfly_smt = 0;
 static int usched_dfly_cache_coherent = 0;
-static int usched_dfly_upri_affinity = 16; /* 32 queues - half-way */
-static int usched_dfly_queue_checks = 5;
+static int usched_dfly_weight1 = 10;
+static int usched_dfly_weight2 = 5;
 static int usched_dfly_stick_to_level = 0;
 #endif
 static int usched_dfly_rrinterval = (ESTCPUFREQ + 9) / 10;
@@ -452,6 +452,12 @@ dfly_release_curproc(struct lwp *lp)
 	globaldata_t gd = mycpu;
 	dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
 
+	/*
+	 * Make sure td_wakefromcpu is defaulted.  This will be overwritten
+	 * by wakeup().
+	 */
+	lp->lwp_thread->td_wakefromcpu = gd->gd_cpuid;
+
 	if (dd->uschedcp == lp) {
 		crit_enter();
 		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
@@ -521,7 +527,6 @@ dfly_setrunqueue(struct lwp *lp)
 {
 	globaldata_t rgd;
 	dfly_pcpu_t rdd;
-	int cpuid;
 
 	/*
 	 * First validate the process LWKT state.
@@ -534,12 +539,11 @@ dfly_setrunqueue(struct lwp *lp)
 	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
 
 	/*
-	 * NOTE: gd and dd are relative to the target thread's last cpu,
-	 *	 NOT our current cpu.
+	 * NOTE: rdd does not necessarily represent the current cpu.
+	 *	 Instead it represents the cpu the thread was last
+	 *	 scheduled on.
 	 */
-	rgd = globaldata_find(lp->lwp_qcpu);
 	rdd = &dfly_pcpu[lp->lwp_qcpu];
-	cpuid = rdd->cpuid;
 
 	/*
 	 * This process is not supposed to be scheduled anywhere or assigned
@@ -563,7 +567,7 @@
 	if (rdd->uschedcp == NULL) {
 		spin_lock(&rdd->spin);
 		if (rdd->uschedcp == NULL) {
-			atomic_set_cpumask(&dfly_curprocmask, rgd->gd_cpumask);
+			atomic_set_cpumask(&dfly_curprocmask, 1);
 			rdd->uschedcp = lp;
 			rdd->upri = lp->lwp_priority;
 			spin_unlock(&rdd->spin);
@@ -596,7 +600,7 @@
 	 * sibling has a thread assigned).
 	 */
 	/*spin_lock(&dfly_spin);*/
-	rdd = dfly_choose_best_queue(rdd, lp);
+	rdd = dfly_choose_best_queue(lp);
 	rgd = globaldata_find(rdd->cpuid);
 
 	/*
@@ -624,7 +628,7 @@
 			spin_unlock(&rdd->spin);
 		}
 	} else {
-		atomic_clear_cpumask(&dfly_rdyprocmask, CPUMASK(cpuid));
+		atomic_clear_cpumask(&dfly_rdyprocmask, rgd->gd_cpumask);
 		if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
 			spin_unlock(&rdd->spin);
 			lwkt_send_ipiq(rgd, dfly_need_user_resched_remote,
@@ -1107,7 +1111,6 @@ dfly_chooseproc_locked(dfly_pcpu_t dd, struct lwp *chklp, int isremote)
 	u_int32_t rtqbits;
 	u_int32_t tsqbits;
 	u_int32_t idqbits;
-	/*usched_dfly_queue_checks*/
 
 	rtqbits = dd->rtqueuebits;
 	tsqbits = dd->queuebits;
@@ -1204,21 +1207,31 @@
  * USED TO PUSH RUNNABLE LWPS TO THE LEAST LOADED CPU.
  *
  * Choose a cpu node to schedule lp on, hopefully nearby its current
- * node.  The current node is passed in (dd) (though it can also be obtained
- * from lp->lwp_qcpu).  The caller will dfly_setrunqueue() lp on the queue
- * we return.
+ * node.  We give the current node a modest advantage for obvious reasons.
+ *
+ * We also give the node the thread was woken up FROM a slight advantage
+ * in order to try to schedule paired threads which synchronize/block waiting
+ * for each other fairly close to each other.  Similarly, in a network setting
+ * this feature will also attempt to place a user process near the kernel
+ * protocol thread that is feeding it data.  THIS IS A CRITICAL PART of the
+ * algorithm as it heuristically groups synchronizing processes for locality
+ * of reference in multi-socket systems.
+ *
+ * The caller will normally dfly_setrunqueue() lp on the returned queue.
+ *
  * When the topology is known, choose a cpu whose group has, in aggregate,
  * the lowest weighted load.
  */
 static dfly_pcpu_t
-dfly_choose_best_queue(dfly_pcpu_t dd, struct lwp *lp)
+dfly_choose_best_queue(struct lwp *lp)
 {
 	cpumask_t mask;
 	cpu_node_t *cpup;
 	cpu_node_t *cpun;
 	cpu_node_t *cpub;
+	dfly_pcpu_t dd1 = &dfly_pcpu[lp->lwp_qcpu];
+	dfly_pcpu_t dd2 = &dfly_pcpu[lp->lwp_thread->td_wakefromcpu];
 	dfly_pcpu_t rdd;
 	int cpuid;
 	int n;
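With dd1 (the last-scheduled cpu) and dd2 (the wakeup-source cpu) in
hand, the function descends the topology tree one level at a time,
always recursing into the child with the lowest weighted load.  The
sketch below reduces that descent to a synthetic two-child tree and
omits the weight bonuses shown earlier; struct node and descend() are
stand-ins for the kernel's cpu_node_t walk, not its API:

    /*
     * Toy topology descent: from the root, always step into the
     * lighter child until a leaf (a single cpu) is reached.
     */
    #include <stdio.h>

    struct node {
            const char *name;
            int load;                   /* aggregate load of members */
            struct node *child[2];      /* NULL at the cpu (leaf) level */
    };

    static struct node *
    descend(struct node *root)
    {
            struct node *n = root;

            while (n->child[0] != NULL) {
                    struct node *a = n->child[0];
                    struct node *b = n->child[1];

                    n = (a->load <= b->load) ? a : b;
            }
            return (n);
    }

    int
    main(void)
    {
            struct node cpu0 = { "cpu0", 3, { NULL, NULL } };
            struct node cpu1 = { "cpu1", 1, { NULL, NULL } };
            struct node cpu2 = { "cpu2", 2, { NULL, NULL } };
            struct node cpu3 = { "cpu3", 5, { NULL, NULL } };
            struct node core0 = { "core0", 4, { &cpu0, &cpu1 } };
            struct node core1 = { "core1", 7, { &cpu2, &cpu3 } };
            struct node socket0 = { "socket0", 11, { &core0, &core1 } };

            printf("placed on %s\n", descend(&socket0)->name);
            return (0);
    }

In the real function the per-node load is further adjusted by the
weight1/weight2 bonuses before the comparison, as the next hunk shows.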
@@ -1230,15 +1243,15 @@
 	 * When the topology is unknown choose a random cpu that is hopefully
 	 * idle.
 	 */
-	if (dd->cpunode == NULL)
-		return (dfly_choose_queue_simple(dd, lp));
+	if (dd1->cpunode == NULL)
+		return (dfly_choose_queue_simple(dd1, lp));
 
 	/*
 	 * When the topology is known choose a cpu whose group has, in
 	 * aggregate, the lowest weighted load.
 	 */
 	cpup = root_cpu_node;
-	rdd = dd;
+	rdd = dd1;
 	level = cpu_topology_levels_number;
 
 	while (cpup) {
@@ -1282,15 +1295,17 @@
 		/*
 		 * Give a slight advantage to nearby cpus.
 		 */
-		if (cpun->members & dd->cpumask)
-			load -= PPQ * level;
+		if (cpun->members & dd1->cpumask)
+			load -= PPQ * level * usched_dfly_weight1 / 10;
+		else if (cpun->members & dd2->cpumask)
+			load -= PPQ * level * usched_dfly_weight2 / 10;
 
 		/*
 		 * Calculate the best load
 		 */
 		if (cpub == NULL || lowest_load > load ||
 		    (lowest_load == load &&
-		     (cpun->members & dd->cpumask))
+		     (cpun->members & dd1->cpumask))
 		) {
 			lowest_load = load;
 			cpub = cpun;
@@ -1918,15 +1933,15 @@ dfly_helper_thread_cpu_init(void)
 
 		SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
-			       OID_AUTO, "upri_affinity", CTLFLAG_RW,
-			       &usched_dfly_upri_affinity, 1,
-			       "Number of PPQs in user priority check");
+			       OID_AUTO, "weight1", CTLFLAG_RW,
+			       &usched_dfly_weight1, 10,
+			       "Weight selection for current cpu");
 
 		SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
 			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
-			       OID_AUTO, "queue_checks", CTLFLAG_RW,
-			       &usched_dfly_queue_checks, 5,
-			       "LWPs to check from a queue before giving up");
+			       OID_AUTO, "weight2", CTLFLAG_RW,
+			       &usched_dfly_weight2, 5,
+			       "Weight selection for wakefrom cpu");
 
 		SYSCTL_ADD_PROC(&usched_dfly_sysctl_ctx,
 			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
diff --git a/sys/sys/thread.h b/sys/sys/thread.h
index 0d9c44452c..66c126e4d2 100644
--- a/sys/sys/thread.h
+++ b/sys/sys/thread.h
@@ -284,7 +284,8 @@ struct thread {
 #else
 	int	td_cscount_unused;
 #endif
-	int	td_unused02[4];		/* for future fields */
+	int	td_wakefromcpu;		/* who woke me up? */
+	int	td_unused02[3];		/* for future fields */
 	int	td_unused03[4];		/* for future fields */
 	struct iosched_data td_iosdata;	/* Dynamic I/O scheduling data */
 	struct timeval td_start;	/* start time for a thread/process */
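Taken together, td_wakefromcpu follows a default-then-override
lifecycle: tsleep() and dfly_release_curproc() stamp the sleeper's own
cpu as a safe default, and _wakeup() overwrites it with the waker's cpu
before dfly_choose_best_queue() reads it.  A reduced stand-alone sketch
of that ordering (the struct and helpers are stand-ins, not the kernel
declarations):

    #include <stdio.h>

    struct thread {
            int td_wakefromcpu;
    };

    /* tsleep()/dfly_release_curproc(): default to the sleeper's cpu */
    static void
    sleep_on(struct thread *td, int curcpu)
    {
            td->td_wakefromcpu = curcpu;    /* harmless if never woken */
    }

    /* _wakeup(): stamp the cpu the wakeup originated from */
    static void
    wake(struct thread *td, int wakercpu)
    {
            td->td_wakefromcpu = wakercpu;
    }

    int
    main(void)
    {
            struct thread td;

            sleep_on(&td, 3);       /* thread sleeps on cpu 3 */
            wake(&td, 9);           /* a producer on cpu 9 wakes it */

            /* the scheduler now biases placement toward cpu 9's node */
            printf("schedule near cpu %d\n", td.td_wakefromcpu);
            return (0);
    }

The strength of the two bonuses remains tunable at runtime through the
new weight1 and weight2 sysctls registered above; setting either weight
to 0 disables that tier of the heuristic.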