From: Matthew Dillon
Date: Mon, 24 Sep 2012 20:32:11 +0000 (-0700)
Subject: kernel - usched_dfly revamp (4), improve tail
X-Git-Tag: v3.2.0~75
X-Git-Url: https://gitweb.dragonflybsd.org/~nant/dragonfly.git/commitdiff_plain/d992c3771baa6d7578bf112b8dbea1326e461f01

kernel - usched_dfly revamp (4), improve tail

* Improve tail performance (many more cpu-bound processes than available
  cpus).

* Experiment with removing the LWKT priority adjustments for kernel vs user.
  Instead give LWKT a hint about the user scheduler when scheduling a thread.
  LWKT's round-robin is left unhinted to hopefully round-robin starved LWKTs
  running in kernel mode.

* Implement a better calculation for the per-thread uload than the priority.
  Instead, use estcpu.

* Adjust default weightings for the new uload calculation scale.
---
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index cf69eb35b9..22cedcd8c5 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -673,7 +673,11 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags)
 	td->td_proc = destproc;
 	td->td_lwp = lp;
 	td->td_switch = cpu_heavy_switch;
+#ifdef LWKT_SPLIT_USERPRI
 	lwkt_setpri(td, TDPRI_KERN_USER);
+#else
+	lwkt_setpri(td, TDPRI_USER_NORM);
+#endif
 	lwkt_set_comm(td, "%s", destproc->p_comm);
 
 	/*
diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c
index 3501185b51..beaf7a0810 100644
--- a/sys/kern/lwkt_thread.c
+++ b/sys/kern/lwkt_thread.c
@@ -180,8 +180,17 @@ _lwkt_dequeue(thread_t td)
 /*
  * Priority enqueue.
  *
- * NOTE: There are a limited number of lwkt threads runnable since user
- *	 processes only schedule one at a time per cpu.
+ * There are a limited number of lwkt threads runnable since user
+ * processes only schedule one at a time per cpu.  However, there can
+ * be many user processes in kernel mode exiting from a tsleep() which
+ * become runnable.  We do a secondary comparison using td_upri to try
+ * to order these in the situation where several wake up at the same time
+ * to avoid excessive switching.
+ *
+ * NOTE: lwkt_schedulerclock() will force a round-robin based on td_pri and
+ *	 will ignore user priority.  This is to ensure that user threads in
+ *	 kernel mode get cpu at some point regardless of what the user
+ *	 scheduler thinks.
  */
 static __inline
 void
@@ -198,8 +207,12 @@ _lwkt_enqueue(thread_t td)
 		TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
 		atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
 	} else {
-		while (xtd && xtd->td_pri >= td->td_pri)
+		while (xtd &&
+		       (xtd->td_pri > td->td_pri ||
+			(xtd->td_pri == td->td_pri &&
+			 xtd->td_upri >= td->td_pri))) {
 			xtd = TAILQ_NEXT(xtd, td_threadq);
+		}
 		if (xtd)
 			TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
 		else
@@ -706,6 +719,7 @@ lwkt_switch(void)
 		goto skip;
 
 	while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
+#ifdef LWKT_SPLIT_USERPRI
 		/*
 		 * Never schedule threads returning to userland or the
 		 * user thread scheduler helper thread when higher priority
@@ -717,6 +731,7 @@ lwkt_switch(void)
 			ntd = NULL;
 			break;
 		}
+#endif
 
 		/*
 		 * Try this one.
@@ -1129,8 +1144,11 @@ lwkt_passive_release(struct thread *td)
 {
 	struct lwp *lp = td->td_lwp;
 
+#ifdef LWKT_SPLIT_USERPRI
 	td->td_release = NULL;
 	lwkt_setpri_self(TDPRI_KERN_USER);
+#endif
+
 	lp->lwp_proc->p_usched->release_curproc(lp);
 }
 
@@ -1497,6 +1515,10 @@ lwkt_schedulerclock(thread_t td)
 	 * If the current thread is at the head of the runq shift it to the
 	 * end of any equal-priority threads and request a LWKT reschedule
 	 * if it moved.
+	 *
+	 * Ignore upri in this situation.
There will only be one user thread + * in user mode, all others will be user threads running in kernel + * mode and we have to make sure they get some cpu. */ xtd = TAILQ_NEXT(td, td_threadq); if (xtd && xtd->td_pri == td->td_pri) { diff --git a/sys/kern/usched_bsd4.c b/sys/kern/usched_bsd4.c index 629be2e695..043b264d2c 100644 --- a/sys/kern/usched_bsd4.c +++ b/sys/kern/usched_bsd4.c @@ -1173,7 +1173,11 @@ bsd4_resetpriority(struct lwp *lp) * The newpriority incorporates the queue type so do a simple masked * check to determine if the process has moved to another queue. If * it has, and it is currently on a run queue, then move it. + * + * td_upri has normal sense (higher values are more desireable), so + * negate it. */ + lp->lwp_thread->td_upri = -(newpriority & ~PPQMASK); if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) { lp->lwp_priority = newpriority; if (lp->lwp_mpflags & LWP_MP_ONRUNQ) { diff --git a/sys/kern/usched_dfly.c b/sys/kern/usched_dfly.c index 121af38792..5c64851c33 100644 --- a/sys/kern/usched_dfly.c +++ b/sys/kern/usched_dfly.c @@ -94,6 +94,7 @@ TAILQ_HEAD(rq, lwp); #define lwp_rqindex lwp_usdata.dfly.rqindex #define lwp_estcpu lwp_usdata.dfly.estcpu #define lwp_estfast lwp_usdata.dfly.estfast +#define lwp_uload lwp_usdata.dfly.uload #define lwp_rqtype lwp_usdata.dfly.rqtype #define lwp_qcpu lwp_usdata.dfly.qcpu @@ -253,11 +254,12 @@ SYSCTL_INT(_debug, OID_AUTO, dfly_chooser, CTLFLAG_RW, #ifdef SMP static int usched_dfly_smt = 0; static int usched_dfly_cache_coherent = 0; -static int usched_dfly_weight1 = 50; /* keep thread on current cpu */ -static int usched_dfly_weight2 = 30; /* synchronous peer's current cpu */ -static int usched_dfly_weight3 = 10; /* number of threads on queue */ -static int usched_dfly_weight4 = 40; /* availability of idle cores */ +static int usched_dfly_weight1 = 200; /* keep thread on current cpu */ +static int usched_dfly_weight2 = 1200; /* synchronous peer's current cpu */ +static int usched_dfly_weight3 = 40; /* number of threads on queue */ +static int usched_dfly_weight4 = 160; /* availability of idle cores */ static int usched_dfly_features = 0x8F; /* allow pulls */ +static int usched_dfly_swmask = ~PPQMASK; /* allow pulls */ #endif static int usched_dfly_rrinterval = (ESTCPUFREQ + 9) / 10; static int usched_dfly_decay = 8; @@ -380,10 +382,12 @@ dfly_acquire_curproc(struct lwp *lp) * * It is important to do a masked test to avoid the edge * case where two near-equal-priority threads are constantly - * interrupting each other. + * interrupting each other. Since our context is the one + * that is active NOW, we WANT to steal the uschedcp + * designation and not switch-flap. */ if (dd->uschedcp && - (dd->upri & ~PPQMASK) > + (dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) { dd->uschedcp = lp; dd->upri = lp->lwp_priority; @@ -641,8 +645,7 @@ dfly_changeqcpu_locked(struct lwp *lp, dfly_pcpu_t dd, dfly_pcpu_t rdd) if (lp->lwp_qcpu != rdd->cpuid) { if (lp->lwp_mpflags & LWP_MP_ULOAD) { atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD); - atomic_add_int(&dd->uload, - -((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&dd->uload, -lp->lwp_uload); atomic_add_int(&dd->ucount, -1); atomic_add_int(&dfly_ucount, -1); } @@ -764,26 +767,37 @@ dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) dfly_resetpriority(lp); /* - * Rebalance cpus on each scheduler tick. Each cpu in turn will - * calculate the worst queue and, if sufficiently loaded, will - * pull a process from that queue into our current queue. 
+ * Rebalance two cpus every 8 ticks, pulling the worst thread + * from the worst cpu's queue into a rotating cpu number. * - * To try to avoid always moving the same thread. XXX + * This mechanic is needed because the push algorithms can + * steady-state in an non-optimal configuration. We need to mix it + * up a little, even if it means breaking up a paired thread, so + * the push algorithms can rebalance the degenerate conditions. + * This portion of the algorithm exists to ensure stability at the + * selected weightings. + * + * Because we might be breaking up optimal conditions we do not want + * to execute this too quickly, hence we only rebalance approximately + * ~7-8 times per second. The push's, on the otherhand, are capable + * moving threads to other cpus at a much higher rate. + * + * We choose the most heavily loaded thread from the worst queue + * in order to ensure that multiple heavy-weight threads on the same + * queue get broken up, and also because these threads are the most + * likely to be able to remain in place. Hopefully then any pairings, + * if applicable, migrate to where these threads are. */ #ifdef SMP if ((usched_dfly_features & 0x04) && - ((uint16_t)sched_ticks % ncpus) == gd->gd_cpuid) { + ((u_int)sched_ticks & 7) == 0 && + (u_int)sched_ticks / 8 % ncpus == gd->gd_cpuid) { /* * Our cpu is up. */ struct lwp *nlp; dfly_pcpu_t rdd; - /* - * We have to choose the worst thread in the worst queue - * because it likely finished its batch on that cpu and is - * now waiting for cpu again. - */ rdd = dfly_choose_worst_queue(dd); if (rdd) { spin_lock(&dd->spin); @@ -972,6 +986,7 @@ dfly_resetpriority(struct lwp *lp) int rcpu; int checkpri; int estcpu; + int delta_uload; crit_enter(); @@ -1028,6 +1043,20 @@ dfly_resetpriority(struct lwp *lp) /* NOT REACHED */ } + /* + * The LWKT scheduler doesn't dive usched structures, give it a hint + * on the relative priority of user threads running in the kernel. + * The LWKT scheduler will always ensure that a user thread running + * in the kernel will get cpu some time, regardless of its upri, + * but can decide not to instantly switch from one kernel or user + * mode user thread to a kernel-mode user thread when it has a less + * desireable user priority. + * + * td_upri has normal sense (higher values are more desireable), so + * negate it. + */ + lp->lwp_thread->td_upri = -(newpriority & usched_dfly_swmask); + /* * The newpriority incorporates the queue type so do a simple masked * check to determine if the process has moved to another queue. If @@ -1037,21 +1066,6 @@ dfly_resetpriority(struct lwp *lp) * we end up in the same run queue. */ if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) { - int delta_uload; - - /* - * uload can change, calculate the adjustment to reduce - * edge cases since choosers scan the cpu topology without - * locks. 
- */ - if (lp->lwp_mpflags & LWP_MP_ULOAD) { - delta_uload = - -((lp->lwp_priority & ~PPQMASK) & PRIMASK) + - ((newpriority & ~PPQMASK) & PRIMASK); - atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload, - delta_uload); - /* no change in ucount */ - } if (lp->lwp_mpflags & LWP_MP_ONRUNQ) { dfly_remrunqueue_locked(rdd, lp); lp->lwp_priority = newpriority; @@ -1074,6 +1088,15 @@ dfly_resetpriority(struct lwp *lp) rcpu = -1; } + /* + * Adjust effective load + */ + delta_uload = lp->lwp_estcpu / NQS; /* 0-511, 0-100% cpu */ + delta_uload -= lp->lwp_uload; + lp->lwp_uload += delta_uload; + if (lp->lwp_mpflags & LWP_MP_ULOAD) + atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload, delta_uload); + /* * Determine if we need to reschedule the target cpu. This only * occurs if the LWP is already on a scheduler queue, which means @@ -1094,7 +1117,8 @@ dfly_resetpriority(struct lwp *lp) if (rcpu >= 0) { if ((dfly_rdyprocmask & CPUMASK(rcpu)) && (checkpri == 0 || - (rdd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) { + (rdd->upri & ~PRIMASK) > + (lp->lwp_priority & ~PRIMASK))) { #ifdef SMP if (rcpu == mycpu->gd_cpuid) { spin_unlock(&rdd->spin); @@ -1183,8 +1207,7 @@ dfly_exiting(struct lwp *lp, struct proc *child_proc) if (lp->lwp_mpflags & LWP_MP_ULOAD) { atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD); - atomic_add_int(&dd->uload, - -((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&dd->uload, -lp->lwp_uload); atomic_add_int(&dd->ucount, -1); atomic_add_int(&dfly_ucount, -1); } @@ -1208,8 +1231,7 @@ dfly_uload_update(struct lwp *lp) if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) { atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD); - atomic_add_int(&dd->uload, - ((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&dd->uload, lp->lwp_uload); atomic_add_int(&dd->ucount, 1); atomic_add_int(&dfly_ucount, 1); } @@ -1221,8 +1243,7 @@ dfly_uload_update(struct lwp *lp) if (lp->lwp_mpflags & LWP_MP_ULOAD) { atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD); - atomic_add_int(&dd->uload, - -((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&dd->uload, -lp->lwp_uload); atomic_add_int(&dd->ucount, -1); atomic_add_int(&dfly_ucount, -1); } @@ -1338,14 +1359,12 @@ dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd, */ if (rdd != dd) { if (lp->lwp_mpflags & LWP_MP_ULOAD) { - atomic_add_int(&rdd->uload, - -((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&rdd->uload, -lp->lwp_uload); atomic_add_int(&rdd->ucount, -1); atomic_add_int(&dfly_ucount, -1); } lp->lwp_qcpu = dd->cpuid; - atomic_add_int(&dd->uload, - ((lp->lwp_priority & ~PPQMASK) & PRIMASK)); + atomic_add_int(&dd->uload, lp->lwp_uload); atomic_add_int(&dd->ucount, 1); atomic_add_int(&dfly_ucount, 1); atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD); @@ -1477,8 +1496,7 @@ dfly_choose_best_queue(struct lwp *lp) */ if ((lp->lwp_mpflags & LWP_MP_ULOAD) && (dd->cpumask & cpun->members)) { - load -= ((lp->lwp_priority & ~PPQMASK) & - PRIMASK); + load -= lp->lwp_uload; load -= usched_dfly_weight3; } @@ -1562,8 +1580,10 @@ dfly_choose_worst_queue(dfly_pcpu_t dd) int n; int count; int load; +#if 0 int pri; int hpri; +#endif int highest_load; /* @@ -1661,6 +1681,7 @@ dfly_choose_worst_queue(dfly_pcpu_t dd) if (rdd == dd) return(NULL); +#if 0 hpri = 0; if (rdd->rtqueuebits && hpri < (pri = bsrl(rdd->rtqueuebits))) hpri = pri; @@ -1671,6 +1692,7 @@ dfly_choose_worst_queue(dfly_pcpu_t dd) hpri *= PPQ; if (rdd->uload - hpri < dd->uload + hpri) return(NULL); +#endif return (rdd); } @@ -1838,8 +1860,7 @@ dfly_setrunqueue_locked(dfly_pcpu_t rdd, 
struct lwp *lp) if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) { atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD); - atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload, - (lp->lwp_priority & ~PPQMASK) & PRIMASK); + atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].uload, lp->lwp_uload); atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].ucount, 1); atomic_add_int(&dfly_ucount, 1); } @@ -1958,6 +1979,8 @@ dfly_helper_thread(void *dummy) * from another cpu. Since we're stealing, might as well * load balance at the same time. * + * We choose the highest-loaded thread from the worst queue. + * * NOTE! This function only returns a non-NULL rdd when * another cpu's queue is obviously overloaded. We * do not want to perform the type of rebalancing @@ -1968,7 +1991,7 @@ dfly_helper_thread(void *dummy) */ rdd = dfly_choose_worst_queue(dd); if (rdd && spin_trylock(&rdd->spin)) { - nlp = dfly_chooseproc_locked(rdd, dd, NULL, 0); + nlp = dfly_chooseproc_locked(rdd, dd, NULL, 1); spin_unlock(&rdd->spin); } else { nlp = NULL; @@ -2208,6 +2231,12 @@ dfly_helper_thread_cpu_init(void) &usched_dfly_features, 15, "Allow pulls into empty queues"); + SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx, + SYSCTL_CHILDREN(usched_dfly_sysctl_tree), + OID_AUTO, "swmask", CTLFLAG_RW, + &usched_dfly_swmask, ~PPQMASK, + "Queue mask to force thread switch"); + #if 0 SYSCTL_ADD_PROC(&usched_dfly_sysctl_ctx, diff --git a/sys/kern/usched_dummy.c b/sys/kern/usched_dummy.c index 50293ef956..3d89756923 100644 --- a/sys/kern/usched_dummy.c +++ b/sys/kern/usched_dummy.c @@ -401,6 +401,12 @@ dummy_resetpriority(struct lwp *lp) lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio; return; } + + /* + * td_upri has normal sense (higher numbers are more desireable), + * so negate it. + */ + lp->lwp_thread->td_upri = -lp->lwp_priority; /* XXX spinlock usually needed */ } diff --git a/sys/sys/thread.h b/sys/sys/thread.h index cc1f794921..d60e2628e3 100644 --- a/sys/sys/thread.h +++ b/sys/sys/thread.h @@ -285,7 +285,8 @@ struct thread { int td_cscount_unused; #endif int td_wakefromcpu; /* who woke me up? 
*/ - int td_unused02[3]; /* for future fields */ + int td_upri; /* user priority (sub-priority under td_pri) */ + int td_unused02[2]; /* for future fields */ int td_unused03[4]; /* for future fields */ struct iosched_data td_iosdata; /* Dynamic I/O scheduling data */ struct timeval td_start; /* start time for a thread/process */ @@ -373,6 +374,7 @@ struct thread { #define TDF_KERNELFP 0x01000000 /* kernel using fp coproc */ #define TDF_DELAYED_WAKEUP 0x02000000 #define TDF_CRYPTO 0x04000000 /* crypto thread */ +#define TDF_USERMODE 0x08000000 /* in or entering user mode */ #define TDF_MP_STOPREQ 0x00000001 /* suspend_kproc */ #define TDF_MP_WAKEREQ 0x00000002 /* resume_kproc */ diff --git a/sys/sys/thread2.h b/sys/sys/thread2.h index 13bbff050e..0d8db6e101 100644 --- a/sys/sys/thread2.h +++ b/sys/sys/thread2.h @@ -261,9 +261,11 @@ lwkt_getpri_self(void) static __inline void lwkt_passive_recover(thread_t td) { +#ifdef LWKT_SPLIT_USERPRI if (td->td_release == NULL) lwkt_setpri_self(TDPRI_USER_NORM); td->td_release = NULL; +#endif } /* diff --git a/sys/sys/usched.h b/sys/sys/usched.h index bb9e9cd43a..82e7c5446d 100644 --- a/sys/sys/usched.h +++ b/sys/sys/usched.h @@ -64,7 +64,7 @@ union usched_data { char forked; /* lock cpu during fork */ char rqindex; short estfast; /* fast estcpu collapse mode */ - short unused01; + short uload; /* for delta uload adjustments */ int estcpu; /* dynamic priority modification */ u_short rqtype; /* protected copy of rtprio type */ u_short qcpu; /* which cpu are we enqueued on? */
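
The core of the uload rework is the new arithmetic in dfly_resetpriority():
lwp_uload is now derived from lwp_estcpu (lwp_estcpu / NQS, roughly 0-511 for
0-100% of a cpu) instead of from the masked lwp_priority, and only the delta
is folded into the per-cpu uload so the cached value can be subtracted back
out on dequeue, exit, or migration.  A minimal standalone sketch of that
delta logic, assuming NQS is 32 and using made-up estcpu figures (not the
kernel's authoritative constants):

/*
 * Sketch of the estcpu-based uload accounting (userland demo only).
 * The kernel performs the per-cpu update with atomic_add_int().
 */
#include <stdio.h>

#define NQS	32			/* run queue count, assumed for the demo */

struct demo_lwp {
	int	estcpu;			/* dynamic cpu usage estimate */
	int	uload;			/* cached load contribution */
};

static int demo_cpu_uload;		/* stands in for dfly_pcpu[qcpu].uload */

/* Re-derive the thread's load and fold only the delta into the cpu total */
static void
demo_reset_uload(struct demo_lwp *lp)
{
	int delta_uload;

	delta_uload = lp->estcpu / NQS;	/* ~0-511, i.e. 0-100% of one cpu */
	delta_uload -= lp->uload;
	lp->uload += delta_uload;
	demo_cpu_uload += delta_uload;
}

int
main(void)
{
	struct demo_lwp lp = { 0, 0 };

	lp.estcpu = 8000;		/* thread ramps up */
	demo_reset_uload(&lp);
	printf("uload=%d cpu_uload=%d\n", lp.uload, demo_cpu_uload);

	lp.estcpu = 2000;		/* thread cools off */
	demo_reset_uload(&lp);
	printf("uload=%d cpu_uload=%d\n", lp.uload, demo_cpu_uload);
	return 0;
}

Because every add/remove path in the diff (dfly_exiting(), dfly_uload_update(),
dfly_chooseproc_locked(), dfly_setrunqueue_locked()) now uses the same cached
lwp_uload, the per-cpu totals cannot drift when estcpu and priority disagree.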
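
The td_upri hint has the opposite sense of the user scheduler's lwp_priority
(lwp_priority is smaller-is-better, td_upri is larger-is-better), which is why
all three schedulers in the diff store a negated priority in td_upri.
_lwkt_enqueue() then consults the hint only to break ties between threads at
the same LWKT td_pri.  One quirk worth flagging: the tie-break in the hunk
above compares xtd->td_upri against td->td_pri; the sketch below assumes the
intended comparison is against td->td_upri.  The struct and sample values are
fabricated for illustration:

/*
 * Sketch of the two-level enqueue ordering (userland demo only).
 * td_pri still dominates; td_upri only orders equal-td_pri threads.
 */
#include <stdio.h>

struct demo_td {
	const char *comm;
	int	td_pri;		/* LWKT priority, higher runs first */
	int	td_upri;	/* usched hint, higher is more desirable */
};

/* Nonzero if 'newtd' should be queued ahead of 'curtd' */
static int
demo_queue_before(const struct demo_td *newtd, const struct demo_td *curtd)
{
	if (newtd->td_pri != curtd->td_pri)
		return (newtd->td_pri > curtd->td_pri);
	return (newtd->td_upri > curtd->td_upri);
}

int
main(void)
{
	/* two user threads waking up in kernel mode at the same td_pri */
	struct demo_td a = { "niced", 10, -40 };
	struct demo_td b = { "interactive", 10, -4 };

	printf("queue %s ahead of %s: %d\n",
	       b.comm, a.comm, demo_queue_before(&b, &a));
	return 0;
}

As the lwkt_schedulerclock() comment notes, the round-robin path deliberately
ignores the hint, so a user thread stuck in kernel mode still gets cpu even if
the user scheduler considers it undesirable.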
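
The dfly_schedulerclock() hunk replaces the old per-tick rebalance with a much
slower rotating pull: globally one rebalance every 8 scheduler ticks (the
"~7-8 times per second" cited in the comment), performed by a rotating cpu, so
any individual cpu only pulls from the worst queue once every 8 * ncpus ticks.
The sketch below just exercises that gating expression; the tick range and cpu
count are made up for the demo:

/* Sketch of the rotating rebalance gate (userland demo only) */
#include <stdio.h>

int
main(void)
{
	unsigned int sched_ticks;
	unsigned int ncpus = 4;		/* assumed cpu count for the demo */

	for (sched_ticks = 0; sched_ticks < 64; ++sched_ticks) {
		if ((sched_ticks & 7) != 0)
			continue;	/* only every 8th global tick */
		printf("tick %2u: cpu %u pulls from the worst queue\n",
		       sched_ticks, sched_ticks / 8 % ncpus);
	}
	return 0;
}

Keeping this path slow relative to the push algorithms is intentional: it
exists to break up degenerate steady states, and dfly_chooseproc_locked() is
now asked for the most heavily loaded thread on the worst queue (the final
argument changed from 0 to 1), the thread most likely to be able to remain in
place after the move.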