From c730be2050146dae9bf5350b6f154244805cbb63 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 9 Sep 2008 04:06:20 +0000 Subject: [PATCH] Fix issues with the scheduler that were causing unnecessary reschedules between tightly coupled processes as well as inefficient reschedules under heavy loads. The basic problem is that a process entering the kernel is 'passively released', meaning its thread priority is left at TDPRI_USER_NORM. The thread priority is only raised to TDPRI_KERN_USER if the thread switches out. This has the side effect of forcing a LWKT reschedule when any other user process woke up from a blocked condition in the kernel, regardless of its user priority, because its LWKT thread was at the higher TDPRI_KERN_USER priority. This resulted in some significant switching cavitation under load. There is a twist here because we do not want to starve threads running in the kernel acting on behalf of a very low priority user process, because doing so can deadlock the namecache or other kernel elements that sleep with lockmgr locks held. In addition, the 'other' LWKT thread might be associated with a much higher priority user process that we *DO* in fact want to give cpu to. The solution is elegant. First, do not force a LWKT reschedule for the above case. Second, force a LWKT reschedule on every hard clock. Remove all the old hacks. That's it! The result is that the current thread is allowed to return to user mode and run until the next hard clock even if other LWKT threads (running on behalf of a user process) are runnable. Pure kernel LWKT threads still get absolute priority, of course. When the hard clock occurs the other LWKT threads get the cpu and at the end of that whole mess most of those LWKT threads will be trying to return to user mode and the user scheduler will be able to select the best one. Doing this on a hardclock boundary prevents cavitation from occurring at the syscall enter and return boundary. 
With this change the TDF_NORESCHED and PNORESCHED flags and their associated code hacks have also been removed, along with lwkt_checkpri_self() which is no longer needed. --- sys/kern/kern_clock.c | 11 ++++++- sys/kern/kern_synch.c | 5 +-- sys/kern/lwkt_thread.c | 52 +++++++++++++------------------- sys/kern/sys_pipe.c | 10 +++--- sys/kern/usched_bsd4.c | 4 +-- sys/platform/pc32/i386/trap.c | 26 ++++++++-------- sys/platform/pc64/amd64/trap.c | 17 +++-------- sys/platform/vkernel/i386/trap.c | 16 +++------- sys/sys/param.h | 3 +- sys/sys/thread.h | 5 ++- 10 files changed, 66 insertions(+), 83 deletions(-) diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index df125952c4..5b4dd9154c 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -70,7 +70,7 @@ * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 * $FreeBSD: src/sys/kern/kern_clock.c,v 1.105.2.10 2002/10/17 13:19:40 maxim Exp $ - * $DragonFly: src/sys/kern/kern_clock.c,v 1.61 2007/09/30 04:37:27 sephe Exp $ + * $DragonFly: src/sys/kern/kern_clock.c,v 1.62 2008/09/09 04:06:13 dillon Exp $ */ #include "opt_ntp.h" @@ -507,6 +507,15 @@ hardclock(systimer_t info, struct intrframe *frame) */ hardclock_softtick(gd); + /* + * The LWKT scheduler will generally allow the current process to + * return to user mode even if there are other runnable LWKT threads + * running in kernel mode on behalf of a user process. This will + * ensure that those other threads have an opportunity to run in + * fairly short order (but not instantly). + */ + need_lwkt_resched(); + /* * ITimer handling is per-tick, per-cpu. I don't think ksignal() * is mpsafe on curproc, so XXX get the mplock. 
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index 70f17daf6f..a9ad205faf 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -37,7 +37,7 @@ * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $ - * $DragonFly: src/sys/kern/kern_synch.c,v 1.90 2008/04/30 04:19:57 dillon Exp $ + * $DragonFly: src/sys/kern/kern_synch.c,v 1.91 2008/09/09 04:06:13 dillon Exp $ */ #include "opt_ktrace.h" @@ -437,8 +437,6 @@ tsleep(void *ident, int flags, const char *wmesg, int timo) * the userland scheduler and initialize slptime to start * counting. */ - if (flags & PNORESCHED) - td->td_flags |= TDF_NORESCHED; p->p_usched->release_curproc(lp); lp->lwp_slptime = 0; } @@ -496,7 +494,6 @@ tsleep(void *ident, int flags, const char *wmesg, int timo) * not supposed to happen. Cleanup our temporary flags. */ KKASSERT(gd == td->td_gd); - td->td_flags &= ~TDF_NORESCHED; /* * Cleanup the timeout. diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index a0e7f87ce0..71348f950c 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/kern/lwkt_thread.c,v 1.116 2008/06/16 02:00:05 dillon Exp $ + * $DragonFly: src/sys/kern/lwkt_thread.c,v 1.117 2008/09/09 04:06:13 dillon Exp $ */ /* @@ -966,13 +966,29 @@ static __inline void _lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri) { + int mypri; + if (ntd->td_flags & TDF_RUNQ) { if (ntd->td_preemptable) { ntd->td_preemptable(ntd, cpri); /* YYY +token */ - } else if ((ntd->td_flags & TDF_NORESCHED) == 0 && - (ntd->td_pri & TDPRI_MASK) > (gd->gd_curthread->td_pri & TDPRI_MASK) - ) { - need_lwkt_resched(); + } else { + /* + * This is a little sticky. Due to the passive release function + * the LWKT priority can wiggle around for threads acting in + * the kernel on behalf of a user process. 
We do not want this + * to affect the comparison per se. + * + * What will happen is that the current user process will be + * allowed to run until the next hardclock at which time a + * forced need_lwkt_resched() will allow the other kernel mode + * threads to get in their two cents. This prevents cavitation. + */ + mypri = gd->gd_curthread->td_pri & TDPRI_MASK; + if (mypri >= TDPRI_USER_IDLE && mypri <= TDPRI_USER_REAL) + mypri = TDPRI_KERN_USER; + + if ((ntd->td_pri & TDPRI_MASK) > mypri) + need_lwkt_resched(); } } } @@ -1138,32 +1154,6 @@ lwkt_setpri_self(int pri) crit_exit(); } -/* - * Determine if there is a runnable thread at a higher priority then - * the current thread. lwkt_setpri() does not check this automatically. - * Return 1 if there is, 0 if there isn't. - * - * Example: if bit 31 of runqmask is set and the current thread is priority - * 30, then we wind up checking the mask: 0x80000000 against 0x7fffffff. - * - * If nq reaches 31 the shift operation will overflow to 0 and we will wind - * up comparing against 0xffffffff, a comparison that will always be false. - */ -int -lwkt_checkpri_self(void) -{ - globaldata_t gd = mycpu; - thread_t td = gd->gd_curthread; - int nq = td->td_pri & TDPRI_MASK; - - while (gd->gd_runqmask > (__uint32_t)(2 << nq) - 1) { - if (TAILQ_FIRST(&gd->gd_tdrunq[nq + 1])) - return(1); - ++nq; - } - return(0); -} - /* * Migrate the current thread to the specified cpu. * diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index e6dd0e0b6b..0292f19cf2 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -17,7 +17,7 @@ * are met. 
* * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.60.2.13 2002/08/05 15:05:15 des Exp $ - * $DragonFly: src/sys/kern/sys_pipe.c,v 1.49 2008/06/05 18:06:32 swildner Exp $ + * $DragonFly: src/sys/kern/sys_pipe.c,v 1.50 2008/09/09 04:06:13 dillon Exp $ */ /* @@ -612,8 +612,8 @@ pipe_read(struct file *fp, struct uio *uio, struct ucred *cred, int fflags) error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; - if ((error = tsleep(rpipe, PCATCH|PNORESCHED, - "piperd", 0)) == 0) { + if ((error = tsleep(rpipe, PCATCH, + "piperd", 0)) == 0) { error = pipelock(rpipe, 1); } } @@ -824,7 +824,7 @@ retry: wakeup(wpipe); } pipeselwakeup(wpipe); - error = tsleep(wpipe, PCATCH|PNORESCHED, "pipdwt", 0); + error = tsleep(wpipe, PCATCH, "pipdwt", 0); } pipelock(wpipe,0); if (wpipe->pipe_state & PIPE_DIRECTW) { @@ -1120,7 +1120,7 @@ pipe_write(struct file *fp, struct uio *uio, struct ucred *cred, int fflags) pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; - error = tsleep(wpipe, PCATCH|PNORESCHED, "pipewr", 0); + error = tsleep(wpipe, PCATCH, "pipewr", 0); if (error != 0) break; /* diff --git a/sys/kern/usched_bsd4.c b/sys/kern/usched_bsd4.c index bbb5bfab32..706c35c785 100644 --- a/sys/kern/usched_bsd4.c +++ b/sys/kern/usched_bsd4.c @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.24 2008/06/19 05:34:23 y0netan1 Exp $ + * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.25 2008/09/09 04:06:13 dillon Exp $ */ #include @@ -503,7 +503,7 @@ bsd4_setrunqueue(struct lwp *lp) * If the LWP has higher priority (lower lwp_priority value) on * its target cpu, reschedule on that cpu. 
*/ - if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) { + { if ((dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK)) { dd->upri = lp->lwp_priority; spin_unlock_wr(&bsd4_spin); diff --git a/sys/platform/pc32/i386/trap.c b/sys/platform/pc32/i386/trap.c index 34f86049e3..d6c793702f 100644 --- a/sys/platform/pc32/i386/trap.c +++ b/sys/platform/pc32/i386/trap.c @@ -36,7 +36,7 @@ * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $ - * $DragonFly: src/sys/platform/pc32/i386/trap.c,v 1.114 2008/07/13 10:28:51 nth Exp $ + * $DragonFly: src/sys/platform/pc32/i386/trap.c,v 1.115 2008/09/09 04:06:17 dillon Exp $ */ /* @@ -337,7 +337,10 @@ userexit(struct lwp *lp) /* * Handle a LWKT reschedule request first. Since our passive release - * is still in place we do not have to do anything special. + * is still in place we do not have to do anything special. This + * will also allow other kernel threads running on behalf of user + * mode to force us to release and acquire the current process before + * we do. */ while (lwkt_resched_wanted()) { lwkt_switch(); } @@ -355,6 +358,11 @@ /* * Acquire the current process designation for this user scheduler * on this cpu. This will also handle any user-reschedule requests. + * + * If we never blocked we never released and this function is + * typically a nop. However, if a user reschedule was requested + * this function may allow another user process to run before + * returning. */ lp->lwp_proc->p_usched->acquire_curproc(lp); /* We may have switched cpus on acquisition */ @@ -364,20 +372,14 @@ * Reduce our priority in preparation for a return to userland. If * our passive release function was still in place, our priority was * never raised and does not need to be reduced. + * + * Note that at this point there may be other LWKT threads at + * TDPRI_KERN_USER (aka higher than our current priority). 
We + * do NOT want to run these threads yet. */ if (td->td_release == NULL) lwkt_setpri_self(TDPRI_USER_NORM); td->td_release = NULL; - - /* - * After reducing our priority there might be other kernel-level - * LWKTs that now have a greater priority. Run them as necessary. - * We don't have to worry about losing cpu to userland because - * we still control the current-process designation and we no longer - * have a passive release function installed. - */ - if (lwkt_checkpri_self()) - lwkt_switch(); } #if !defined(KTR_KERNENTRY) diff --git a/sys/platform/pc64/amd64/trap.c b/sys/platform/pc64/amd64/trap.c index db29f39c01..17d6836a9f 100644 --- a/sys/platform/pc64/amd64/trap.c +++ b/sys/platform/pc64/amd64/trap.c @@ -38,7 +38,7 @@ * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $ - * $DragonFly: src/sys/platform/pc64/amd64/trap.c,v 1.2 2008/08/29 17:07:10 dillon Exp $ + * $DragonFly: src/sys/platform/pc64/amd64/trap.c,v 1.3 2008/09/09 04:06:18 dillon Exp $ */ /* @@ -335,23 +335,16 @@ userexit(struct lwp *lp) * Reduce our priority in preparation for a return to userland. If * our passive release function was still in place, our priority was * never raised and does not need to be reduced. + * + * Note that at this point there may be other LWKT thread at + * TDPRI_KERN_USER (aka higher then our currenet priority). We + * do NOT want to run these threads yet. */ if (td->td_release == NULL) lwkt_setpri_self(TDPRI_USER_NORM); td->td_release = NULL; - - /* - * After reducing our priority there might be other kernel-level - * LWKTs that now have a greater priority. Run them as necessary. - * We don't have to worry about losing cpu to userland because - * we still control the current-process designation and we no longer - * have a passive release function installed. - */ - if (lwkt_checkpri_self()) - lwkt_switch(); } - /* * Exception, fault, and trap interface to the kernel. 
* This common code is called from assembly language IDT gate entry diff --git a/sys/platform/vkernel/i386/trap.c b/sys/platform/vkernel/i386/trap.c index 5b57216e0f..3eca41c38b 100644 --- a/sys/platform/vkernel/i386/trap.c +++ b/sys/platform/vkernel/i386/trap.c @@ -36,7 +36,7 @@ * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $ - * $DragonFly: src/sys/platform/vkernel/i386/trap.c,v 1.34 2008/07/13 10:28:51 nth Exp $ + * $DragonFly: src/sys/platform/vkernel/i386/trap.c,v 1.35 2008/09/09 04:06:19 dillon Exp $ */ /* @@ -348,20 +348,14 @@ userexit(struct lwp *lp) * Reduce our priority in preparation for a return to userland. If * our passive release function was still in place, our priority was * never raised and does not need to be reduced. + * + * Note that at this point there may be other LWKT threads at + * TDPRI_KERN_USER (aka higher than our current priority). We + * do NOT want to run these threads yet. + */ if (td->td_release == NULL) lwkt_setpri_self(TDPRI_USER_NORM); td->td_release = NULL; - - /* - * After reducing our priority there might be other kernel-level - * LWKTs that now have a greater priority. Run them as necessary. - * We don't have to worry about losing cpu to userland because - * we still control the current-process designation and we no longer - * have a passive release function installed. 
- */ - if (lwkt_checkpri_self()) - lwkt_switch(); } diff --git a/sys/sys/param.h b/sys/sys/param.h index 1ebbfa755f..ea1a4c21fb 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -37,7 +37,7 @@ * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD: src/sys/sys/param.h,v 1.61.2.38 2003/05/22 17:12:01 fjoe Exp $ - * $DragonFly: src/sys/sys/param.h,v 1.51 2008/07/14 04:01:42 dillon Exp $ + * $DragonFly: src/sys/sys/param.h,v 1.52 2008/09/09 04:06:20 dillon Exp $ */ #ifndef _SYS_PARAM_H_ @@ -124,7 +124,6 @@ #define PCATCH 0x00000100 /* tsleep checks signals */ #define PUSRFLAG1 0x00000200 /* Subsystem specific flag */ -#define PNORESCHED 0x00000400 /* No reschedule on wakeup */ #define PWAKEUP_CPUMASK 0x00003FFF /* start cpu for chained wakeups */ #define PWAKEUP_MYCPU 0x00004000 /* wakeup on current cpu only */ #define PWAKEUP_ONE 0x00008000 /* argument to wakeup: only one */ diff --git a/sys/sys/thread.h b/sys/sys/thread.h index 4971519734..8a277ddff8 100644 --- a/sys/sys/thread.h +++ b/sys/sys/thread.h @@ -7,7 +7,7 @@ * Types which must already be defined when this header is included by * userland: struct md_thread * - * $DragonFly: src/sys/sys/thread.h,v 1.94 2008/07/01 02:02:55 dillon Exp $ + * $DragonFly: src/sys/sys/thread.h,v 1.95 2008/09/09 04:06:20 dillon Exp $ */ #ifndef _SYS_THREAD_H_ @@ -275,7 +275,7 @@ struct thread { #define TDF_WAKEREQ 0x4000 /* resume_kproc */ #define TDF_TIMEOUT 0x8000 /* tsleep timeout */ #define TDF_INTTHREAD 0x00010000 /* interrupt thread */ -#define TDF_NORESCHED 0x00020000 /* Do not reschedule on wake */ +#define TDF_UNUSED20000 0x00020000 #define TDF_BLOCKED 0x00040000 /* Thread is blocked */ #define TDF_PANICWARN 0x00080000 /* panic warning in switch */ #define TDF_BLOCKQ 0x00100000 /* on block queue */ @@ -358,7 +358,6 @@ extern lwkt_token_t lwkt_token_pool_get(void *); extern void lwkt_setpri(thread_t, int); extern void lwkt_setpri_self(int); -extern int lwkt_checkpri_self(void); extern void lwkt_setcpu_self(struct globaldata 
*); extern void lwkt_migratecpu(int); -- 2.41.0