From f9235b6d9cd4b6ef2a6f977a1e659de0ac635e32 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Mon, 23 Aug 2010 18:39:45 -0700
Subject: [PATCH] kernel - rewrite the LWKT scheduler's priority mechanism

The purpose of these changes is to begin to address the issue of cpu-bound
kernel threads. For example, the crypto threads, or a HAMMER prune cycle
that operates entirely out of the buffer cache. These threads tend to
hiccup the system, creating temporary lockups, because they never switch
away due to their nature as kernel threads.

* Change the LWKT scheduler from a strict hard priority model to a
  fair-share with hard priority queueing model. A kernel thread will be
  queued with a hard priority, giving it dibs on the cpu earlier if it
  has a higher priority. However, if the thread runs past its fair-share
  quantum it will then become limited by that quantum and other
  lower-priority threads will be allowed to run.

* Rewrite lwkt_yield() and lwkt_user_yield(), remove uio_yield(). Both
  yield functions are now very fast and can be called without further
  timing conditionals, simplifying numerous callers. lwkt_user_yield()
  now uses the fair-share quantum to determine when to yield the cpu for
  a cpu-bound kernel thread.

* Implement the new yield in the crypto kernel threads, HAMMER, and other
  places (many of which already used the old yield functions, which
  didn't work very well).

* lwkt_switch() now only round-robins after the fair share quantum is
  exhausted. It does not necessarily always round robin.

* Separate the critical section count from td_pri. Add td_critcount.
---
 sys/ddb/db_ps.c | 47 +-
 sys/kern/kern_clock.c | 5 +
 sys/kern/kern_intr.c | 2 +-
 sys/kern/kern_kinfo.c | 2 +-
 sys/kern/kern_subr.c | 6 +-
 sys/kern/kern_synch.c | 37 +-
 sys/kern/kern_threads.c | 3 +-
 sys/kern/kern_time.c | 7 +-
 sys/kern/lwkt_ipiq.c | 8 +-
 sys/kern/lwkt_thread.c | 612 ++++++++++++---------
 sys/kern/lwkt_token.c | 4 +-
 sys/kern/usched_bsd4.c | 5 +-
 sys/kern/vfs_vnops.c | 4 +-
 sys/opencrypto/crypto.c | 2 +
 sys/platform/pc32/apic/apic_vector.s | 24 +-
 sys/platform/pc32/i386/bcopy.s | 8 +-
 sys/platform/pc32/i386/exception.s | 2 +-
 sys/platform/pc32/i386/genassym.c | 2 +-
 sys/platform/pc32/i386/machdep.c | 6 +-
 sys/platform/pc32/i386/swtch.s | 2 +-
 sys/platform/pc32/i386/trap.c | 14 +-
 sys/platform/pc32/icu/icu_vector.s | 8 +-
 sys/platform/pc32/isa/ipl.s | 26 +-
 sys/platform/pc64/apic/apic_vector.s | 24 +-
 sys/platform/pc64/icu/icu_vector.s | 8 +-
 sys/platform/pc64/x86_64/exception.S | 2 +-
 sys/platform/pc64/x86_64/genassym.c | 2 +-
 sys/platform/pc64/x86_64/ipl.s | 26 +-
 sys/platform/pc64/x86_64/machdep.c | 6 +-
 sys/platform/pc64/x86_64/swtch.s | 2 +-
 sys/platform/pc64/x86_64/trap.c | 14 +-
 sys/platform/vkernel/i386/cpu_regs.c | 10 +-
 sys/platform/vkernel/i386/exception.c | 10 +-
 sys/platform/vkernel/i386/fork_tramp.s | 2 +-
 sys/platform/vkernel/i386/genassym.c | 2 +-
 sys/platform/vkernel/i386/swtch.s | 2 +-
 sys/platform/vkernel/i386/trap.c | 20 +-
 sys/platform/vkernel/platform/machintr.c | 2 +-
 sys/platform/vkernel64/platform/machintr.c | 2 +-
 sys/platform/vkernel64/x86_64/cpu_regs.c | 10 +-
 sys/platform/vkernel64/x86_64/exception.c | 25 +-
 sys/platform/vkernel64/x86_64/fork_tramp.s | 2 +-
 sys/platform/vkernel64/x86_64/genassym.c | 3 +-
 sys/platform/vkernel64/x86_64/swtch.s | 2 +-
 sys/platform/vkernel64/x86_64/trap.c | 20 +-
 sys/sys/globaldata.h | 10 +-
 sys/sys/thread.h | 29 +-
 sys/sys/thread2.h | 22 +-
 sys/sys/uio.h | 1 -
 sys/sys/upcall.h | 1 -
 sys/vfs/hammer/hammer_flusher.c | 2 +-
 sys/vfs/ufs/ffs_rawread.c | 23 +-
sys/vm/vm_zeroidle.c | 6 +- 53 files changed, 586 insertions(+), 540 deletions(-) diff --git a/sys/ddb/db_ps.c b/sys/ddb/db_ps.c index ef128a187e..ddc849838b 100644 --- a/sys/ddb/db_ps.c +++ b/sys/ddb/db_ps.c @@ -113,9 +113,8 @@ db_ps(db_expr_t dummy1, boolean_t dummy2, db_expr_t dummy3, char *dummy4) if (db_more(&nl) < 0) return; - db_printf("cpu %d tdrunqmask %08x curthread %p reqflags %04x\n", - gd->gd_cpuid, gd->gd_runqmask, - gd->gd_curthread, gd->gd_reqflags); + db_printf("cpu %d curthread %p reqflags %04x\n", + gd->gd_cpuid, gd->gd_curthread, gd->gd_reqflags); if (gd->gd_curthread && gd->gd_curthread->td_preempted) { db_printf(" PREEMPTING THREAD %p\n", gd->gd_curthread->td_preempted); @@ -137,28 +136,26 @@ db_ps(db_expr_t dummy1, boolean_t dummy2, db_expr_t dummy3, char *dummy4) if (db_more(&nl) < 0) return; db_printf(" tdq thread pid flags pri/cs/mp sp wmesg comm\n"); - for (np = 0; np < 32; ++np) { - TAILQ_FOREACH(td, &gd->gd_tdrunq[np], td_threadq) { - if (db_more(&nl) < 0) - return; - db_printf(" %3d %p %3d %08x %2d/%02d/%02d %p %8.8s %s\n", - np, td, - (td->td_proc ? td->td_proc->p_pid : -1), - td->td_flags, - td->td_pri & TDPRI_MASK, - td->td_pri / TDPRI_CRIT, + TAILQ_FOREACH(td, &gd->gd_tdrunq, td_threadq) { + if (db_more(&nl) < 0) + return; + db_printf(" %p %3d %08x %2d/%02d/%02d %p %8.8s %s\n", + td, + (td->td_proc ? td->td_proc->p_pid : -1), + td->td_flags, + td->td_pri, + td->td_critcount, #ifdef SMP - td->td_mpcount, + td->td_mpcount, #else - 0, + 0, #endif - td->td_sp, - td->td_wmesg ? td->td_wmesg : "-", - td->td_proc ? td->td_proc->p_comm : td->td_comm); - if (td->td_preempted) - db_printf(" PREEMPTING THREAD %p\n", td->td_preempted); - db_dump_td_tokens(td); - } + td->td_sp, + td->td_wmesg ? td->td_wmesg : "-", + td->td_proc ? td->td_proc->p_comm : td->td_comm); + if (td->td_preempted) + db_printf(" PREEMPTING THREAD %p\n", td->td_preempted); + db_dump_td_tokens(td); } if (db_more(&nl) < 0) return; @@ -173,8 +170,8 @@ db_ps(db_expr_t dummy1, boolean_t dummy2, db_expr_t dummy3, char *dummy4) np, td, (td->td_proc ? td->td_proc->p_pid : -1), td->td_flags, - td->td_pri & TDPRI_MASK, - td->td_pri / TDPRI_CRIT, + td->td_pri, + td->td_critcount, #ifdef SMP td->td_mpcount, #else @@ -209,7 +206,7 @@ db_dump_td_tokens(thread_t td) db_printf(" %p[tok=%p", ref, ref->tr_tok); #ifdef SMP - if (td == tok->t_ref->tr_owner) + if (tok->t_ref && td == tok->t_ref->tr_owner) db_printf(",held"); #endif db_printf("]"); diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 83818c2276..6a4660bde8 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -518,6 +518,11 @@ hardclock(systimer_t info, struct intrframe *frame) vm_fault_ratecheck(); } + /* + * lwkt thread scheduler fair queueing + */ + lwkt_fairq_schedulerclock(curthread); + /* * softticks are handled for all cpus */ diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c index 5bc5b59f90..a63bcc7127 100644 --- a/sys/kern/kern_intr.c +++ b/sys/kern/kern_intr.c @@ -633,7 +633,7 @@ ithread_fast_handler(struct intrframe *frame) td = curthread; /* We must be in critical section. 
*/ - KKASSERT(td->td_pri >= TDPRI_CRIT); + KKASSERT(td->td_critcount); info = &intr_info_ary[intr]; diff --git a/sys/kern/kern_kinfo.c b/sys/kern/kern_kinfo.c index 94b96dc930..63de1d232d 100644 --- a/sys/kern/kern_kinfo.c +++ b/sys/kern/kern_kinfo.c @@ -250,7 +250,7 @@ fill_kinfo_proc_kthread(struct thread *td, struct kinfo_proc *kp) kp->kp_lwp.kl_tdprio = td->td_pri; kp->kp_lwp.kl_rtprio.type = RTP_PRIO_THREAD; - kp->kp_lwp.kl_rtprio.prio = td->td_pri & TDPRI_MASK; + kp->kp_lwp.kl_rtprio.prio = td->td_pri; kp->kp_lwp.kl_uticks = td->td_uticks; kp->kp_lwp.kl_sticks = td->td_sticks; diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index 4abceec919..7991ff5031 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -82,7 +82,6 @@ uiomove(caddr_t cp, size_t n, struct uio *uio) size_t cnt; int error = 0; int save = 0; - int baseticks = ticks; KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, ("uiomove: mode")); @@ -108,10 +107,7 @@ uiomove(caddr_t cp, size_t n, struct uio *uio) switch (uio->uio_segflg) { case UIO_USERSPACE: - if (ticks - baseticks >= hogticks) { - uio_yield(); - baseticks = ticks; - } + lwkt_user_yield(); if (uio->uio_rw == UIO_READ) error = copyout(cp, iov->iov_base, cnt); else diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index bf645d5294..6230739044 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -486,7 +486,7 @@ tsleep(const volatile void *ident, int flags, const char *wmesg, int timo) * in case this is the idle process and already asleep. */ splz(); - oldpri = td->td_pri & TDPRI_MASK; + oldpri = td->td_pri; lwkt_setpri_self(safepri); lwkt_switch(); lwkt_setpri_self(oldpri); @@ -1102,41 +1102,6 @@ tstop(void) crit_exit(); } -/* - * Yield / synchronous reschedule. This is a bit tricky because the trap - * code might have set a lazy release on the switch function. Setting - * P_PASSIVE_ACQ will ensure that the lazy release executes when we call - * switch, and that we are given a greater chance of affinity with our - * current cpu. - * - * We call lwkt_setpri_self() to rotate our thread to the end of the lwkt - * run queue. lwkt_switch() will also execute any assigned passive release - * (which usually calls release_curproc()), allowing a same/higher priority - * process to be designated as the current process. - * - * While it is possible for a lower priority process to be designated, - * it's call to lwkt_maybe_switch() in acquire_curproc() will likely - * round-robin back to us and we will be able to re-acquire the current - * process designation. - * - * MPSAFE - */ -void -uio_yield(void) -{ - struct thread *td = curthread; - struct proc *p = td->td_proc; - - lwkt_setpri_self(td->td_pri & TDPRI_MASK); - if (p) { - p->p_flag |= P_PASSIVE_ACQ; - lwkt_switch(); - p->p_flag &= ~P_PASSIVE_ACQ; - } else { - lwkt_switch(); - } -} - /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. 
diff --git a/sys/kern/kern_threads.c b/sys/kern/kern_threads.c index 52df714162..581cace982 100644 --- a/sys/kern/kern_threads.c +++ b/sys/kern/kern_threads.c @@ -56,7 +56,6 @@ #include #include #include -#include /* uio_yield() fixme */ #if 0 @@ -164,7 +163,7 @@ int sys_yield(struct yield_args *uap) { uap->sysmsg_result = 0; - uio_yield(); + lwkt_user_yield(); return(0); } diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c index 6621c81810..65ea66d3d3 100644 --- a/sys/kern/kern_time.c +++ b/sys/kern/kern_time.c @@ -296,7 +296,6 @@ nanosleep1(struct timespec *rqt, struct timespec *rmt) struct timespec ts, ts2, ts3; struct timeval tv; int error; - int tried_yield; if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) return (EINVAL); @@ -306,7 +305,6 @@ nanosleep1(struct timespec *rqt, struct timespec *rmt) nanouptime(&ts); timespecadd(&ts, rqt); /* ts = target timestamp compare */ TIMESPEC_TO_TIMEVAL(&tv, rqt); /* tv = sleep interval */ - tried_yield = 0; for (;;) { int ticks; @@ -316,9 +314,8 @@ nanosleep1(struct timespec *rqt, struct timespec *rmt) if (tv.tv_sec == 0 && ticks == 0) { thread_t td = curthread; - if (tried_yield || tv.tv_usec < sleep_hard_us) { - tried_yield = 0; - uio_yield(); + if (tv.tv_usec < sleep_hard_us) { + lwkt_user_yield(); } else { crit_enter_quick(td); systimer_init_oneshot(&info, ns1_systimer, diff --git a/sys/kern/lwkt_ipiq.c b/sys/kern/lwkt_ipiq.c index c4533af694..9be618a85e 100644 --- a/sys/kern/lwkt_ipiq.c +++ b/sys/kern/lwkt_ipiq.c @@ -163,7 +163,7 @@ lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2) if (gd->gd_intr_nesting_level > 20) panic("lwkt_send_ipiq: TOO HEAVILY NESTED!"); #endif - KKASSERT(curthread->td_pri >= TDPRI_CRIT); + KKASSERT(curthread->td_critcount); ++ipiq_count; ip = &gd->gd_ipiq[target->gd_cpuid]; @@ -253,7 +253,7 @@ lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func, if (gd->gd_intr_nesting_level > 20) panic("lwkt_send_ipiq: TOO HEAVILY NESTED!"); #endif - KKASSERT(curthread->td_pri >= TDPRI_CRIT); + KKASSERT(curthread->td_critcount); ++ipiq_count; ++ipiq_passive; ip = &gd->gd_ipiq[target->gd_cpuid]; @@ -322,7 +322,7 @@ lwkt_send_ipiq3_nowait(globaldata_t target, ipifunc3_t func, struct globaldata *gd = mycpu; logipiq(send_nbio, func, arg1, arg2, gd, target); - KKASSERT(curthread->td_pri >= TDPRI_CRIT); + KKASSERT(curthread->td_critcount); if (target == gd) { func(arg1, arg2, NULL); logipiq(send_end, func, arg1, arg2, gd, target); @@ -530,7 +530,7 @@ lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip, * Issue a load fence to prevent speculative reads of e.g. data written * by the other cpu prior to it updating the index. 
*/ - KKASSERT(curthread->td_pri >= TDPRI_CRIT); + KKASSERT(curthread->td_critcount); wi = ip->ip_windex; cpu_lfence(); diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index e6b75897e1..48c498c73d 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -99,6 +99,7 @@ static struct objcache *thread_cache; #ifdef SMP static void lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame); #endif +static void lwkt_fairq_accumulate(globaldata_t gd, thread_t td); extern void cpu_heavy_restore(void); extern void cpu_lwkt_restore(void); @@ -147,6 +148,8 @@ SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, ""); SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW, &token_contention_count, 0, "spinning due to token contention"); #endif +static int fairq_enable = 1; +SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW, &fairq_enable, 0, ""); /* * These helper procedures handle the runq, they can only be called from @@ -162,26 +165,45 @@ void _lwkt_dequeue(thread_t td) { if (td->td_flags & TDF_RUNQ) { - int nq = td->td_pri & TDPRI_MASK; struct globaldata *gd = td->td_gd; td->td_flags &= ~TDF_RUNQ; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq); - /* runqmask is passively cleaned up by the switcher */ + TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq); + gd->gd_fairq_total_pri -= td->td_pri; + if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL) + atomic_clear_int_nonlocked(&gd->gd_reqflags, RQF_RUNNING); } } +/* + * Priority enqueue. + * + * NOTE: There are a limited number of lwkt threads runnable since user + * processes only schedule one at a time per cpu. + */ static __inline void _lwkt_enqueue(thread_t td) { + thread_t xtd; + if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_BLOCKQ)) == 0) { - int nq = td->td_pri & TDPRI_MASK; struct globaldata *gd = td->td_gd; td->td_flags |= TDF_RUNQ; - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq); - gd->gd_runqmask |= 1 << nq; + xtd = TAILQ_FIRST(&gd->gd_tdrunq); + if (xtd == NULL) { + TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq); + atomic_set_int_nonlocked(&gd->gd_reqflags, RQF_RUNNING); + } else { + while (xtd && xtd->td_pri > td->td_pri) + xtd = TAILQ_NEXT(xtd, td_threadq); + if (xtd) + TAILQ_INSERT_BEFORE(xtd, td, td_threadq); + else + TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq); + } + gd->gd_fairq_total_pri += td->td_pri; } } @@ -232,7 +254,8 @@ void lwkt_schedule_self(thread_t td) { crit_enter_quick(td); - KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!")); + KASSERT(td != &td->td_gd->gd_idlethread, + ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!")); KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); _lwkt_enqueue(td); crit_exit_quick(td); @@ -259,11 +282,7 @@ lwkt_deschedule_self(thread_t td) void lwkt_gdinit(struct globaldata *gd) { - int i; - - for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i) - TAILQ_INIT(&gd->gd_tdrunq[i]); - gd->gd_runqmask = 0; + TAILQ_INIT(&gd->gd_tdrunq); TAILQ_INIT(&gd->gd_tdallq); } @@ -357,7 +376,8 @@ lwkt_init_thread(thread_t td, void *stack, int stksize, int flags, td->td_kstack_size = stksize; td->td_flags = flags; td->td_gd = gd; - td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT; + td->td_pri = TDPRI_KERN_DAEMON; + td->td_critcount = 1; td->td_toks_stop = &td->td_toks_base; #ifdef SMP if ((flags & TDF_MPSAFE) == 0) @@ -474,9 +494,13 @@ lwkt_switch(void) globaldata_t gd = mycpu; thread_t td = gd->gd_curthread; thread_t ntd; + thread_t xtd; + 
thread_t nlast; #ifdef SMP + int nquserok; int mpheld; #endif + int didaccumulate; /* * Switching from within a 'fast' (non thread switched) interrupt or IPI @@ -557,17 +581,18 @@ lwkt_switch(void) } #endif #endif + + /* + * If we had preempted another thread on this cpu, resume the preempted + * thread. This occurs transparently, whether the preempted thread + * was scheduled or not (it may have been preempted after descheduling + * itself). + * + * We have to setup the MP lock for the original thread after backing + * out the adjustment that was made to curthread when the original + * was preempted. + */ if ((ntd = td->td_preempted) != NULL) { - /* - * We had preempted another thread on this cpu, resume the preempted - * thread. This occurs transparently, whether the preempted thread - * was scheduled or not (it may have been preempted after descheduling - * itself). - * - * We have to setup the MP lock for the original thread after backing - * out the adjustment that was made to curthread when the original - * was preempted. - */ KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK); #ifdef SMP if (ntd->td_mpcount && mpheld == 0) { @@ -586,181 +611,197 @@ lwkt_switch(void) * set the reschedule flag if the originally interrupted thread is * at a lower priority. */ - if (gd->gd_runqmask > (2 << (ntd->td_pri & TDPRI_MASK)) - 1) + if (TAILQ_FIRST(&gd->gd_tdrunq) && + TAILQ_FIRST(&gd->gd_tdrunq)->td_pri > ntd->td_pri) { need_lwkt_resched(); + } /* YYY release mp lock on switchback if original doesn't need it */ - } else { + goto havethread_preempted; + } + + /* + * Implement round-robin fairq with priority insertion. The priority + * insertion is handled by _lwkt_enqueue() + * + * We have to adjust the MP lock for the target thread. If we + * need the MP lock and cannot obtain it we try to locate a + * thread that does not need the MP lock. If we cannot, we spin + * instead of HLT. + * + * A similar issue exists for the tokens held by the target thread. + * If we cannot obtain ownership of the tokens we cannot immediately + * schedule the thread. + */ + for (;;) { + clear_lwkt_resched(); + didaccumulate = 0; + ntd = TAILQ_FIRST(&gd->gd_tdrunq); + /* - * Priority queue / round-robin at each priority. Note that user - * processes run at a fixed, low priority and the user process - * scheduler deals with interactions between user processes - * by scheduling and descheduling them from the LWKT queue as - * necessary. - * - * We have to adjust the MP lock for the target thread. If we - * need the MP lock and cannot obtain it we try to locate a - * thread that does not need the MP lock. If we cannot, we spin - * instead of HLT. + * Hotpath if we can get all necessary resources. * - * A similar issue exists for the tokens held by the target thread. - * If we cannot obtain ownership of the tokens we cannot immediately - * schedule the thread. + * If nothing is runnable switch to the idle thread */ + if (ntd == NULL) { + ntd = &gd->gd_idlethread; + if (gd->gd_reqflags & RQF_IDLECHECK_MASK) + ntd->td_flags |= TDF_IDLE_NOHLT; + if (ntd->td_mpcount) { + if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) + panic("Idle thread %p was holding the BGL!", ntd); + if (mpheld == 0) { + cpu_pause(); + continue; + } + } + goto haveidle; + } /* - * If an LWKT reschedule was requested, well that is what we are - * doing now so clear it. 
+ * Hotpath schedule + */ + if (ntd->td_fairq_accum >= 0 && +#ifdef SMP + (ntd->td_mpcount == 0 || mpheld || cpu_try_mplock()) && +#endif + (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd)) + ) { +#ifdef SMP + clr_mplock_contention_mask(gd); +#endif + goto havethread; + } + +#ifdef SMP + /* Reload mpheld (it become stale after mplock/token ops) */ + mpheld = MP_LOCK_HELD(); +#endif + + /* + * Coldpath - unable to schedule ntd, continue looking for threads + * to schedule. This is only allowed of the (presumably) kernel + * thread exhausted its fair share. A kernel thread stuck on + * resources does not currently allow a user thread to get in + * front of it. */ - clear_lwkt_resched(); -again: - if (gd->gd_runqmask) { - int nq = bsrl(gd->gd_runqmask); - if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) { - gd->gd_runqmask &= ~(1 << nq); - goto again; - } #ifdef SMP + nquserok = ((ntd->td_pri < TDPRI_KERN_LPSCHED) || + (ntd->td_fairq_accum < 0)); +#endif + nlast = NULL; + + for (;;) { /* - * THREAD SELECTION FOR AN SMP MACHINE BUILD + * If the fair-share scheduler ran out ntd gets moved to the + * end and its accumulator will be bumped, if it didn't we + * maintain the same queue position. * - * If the target needs the MP lock and we couldn't get it, - * or if the target is holding tokens and we could not - * gain ownership of the tokens, continue looking for a - * thread to schedule and spin instead of HLT if we can't. - * - * NOTE: the mpheld variable invalid after this conditional, it - * can change due to both cpu_try_mplock() returning success - * AND interactions in lwkt_getalltokens() due to the fact that - * we are trying to check the mpcount of a thread other then - * the current thread. Because of this, if the current thread - * is not holding td_mpcount, an IPI indirectly run via - * lwkt_getalltokens() can obtain and release the MP lock and - * cause the core MP lock to be released. + * nlast keeps track of the last element prior to any moves. */ - if ((ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) || - (TD_TOKS_HELD(ntd) && lwkt_getalltokens(ntd) == 0) - ) { - u_int32_t rqmask = gd->gd_runqmask; + if (ntd->td_fairq_accum < 0) { + xtd = TAILQ_NEXT(ntd, td_threadq); + lwkt_fairq_accumulate(gd, ntd); + didaccumulate = 1; + TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq); + TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq); + if (nlast == NULL) { + nlast = ntd; + if (xtd == NULL) + xtd = ntd; + } + ntd = xtd; + } else { + ntd = TAILQ_NEXT(ntd, td_threadq); + } + /* + * If we exhausted the run list switch to the idle thread. + * Since one or more threads had resource acquisition issues + * we do not allow the idle thread to halt. + * + * NOTE: nlast can be NULL. + */ + if (ntd == nlast) { cpu_pause(); - - mpheld = MP_LOCK_HELD(); - ntd = NULL; - while (rqmask) { - TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) { - if (ntd->td_mpcount && !mpheld && !cpu_try_mplock()) { - /* spinning due to MP lock being held */ - continue; - } - - /* - * mpheld state invalid after getalltokens call returns - * failure, but the variable is only needed for - * the loop. - */ - if (TD_TOKS_HELD(ntd) && !lwkt_getalltokens(ntd)) { - /* spinning due to token contention */ -#ifdef INVARIANTS - ++token_contention_count; -#endif - mpheld = MP_LOCK_HELD(); - continue; - } - break; - } - if (ntd) - break; - rqmask &= ~(1 << nq); - nq = bsrl(rqmask); - - /* - * We have two choices. 
We can either refuse to run a - * user thread when a kernel thread needs the MP lock - * but could not get it, or we can allow it to run but - * then expect an IPI (hopefully) later on to force a - * reschedule when the MP lock might become available. - */ - if (nq < TDPRI_KERN_LPSCHED) { - break; /* for now refuse to run */ -#if 0 - if (chain_mplock == 0) - break; - /* continue loop, allow user threads to be scheduled */ -#endif + ntd = &gd->gd_idlethread; + ntd->td_flags |= TDF_IDLE_NOHLT; + set_mplock_contention_mask(gd); + cpu_mplock_contested(); + if (ntd->td_mpcount) { + mpheld = MP_LOCK_HELD(); + if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) + panic("Idle thread %p was holding the BGL!", ntd); + if (mpheld == 0) { + cpu_pause(); + break; /* try again from the top, almost */ } } /* - * Case where a (kernel) thread needed the MP lock and could - * not get one, and we may or may not have found another - * thread which does not need the MP lock to run while - * we wait (ntd). + * If fairq accumulations occured we do not schedule the + * idle thread. This will cause us to try again from + * the (almost) top. */ - if (ntd == NULL) { - ntd = &gd->gd_idlethread; - ntd->td_flags |= TDF_IDLE_NOHLT; - set_mplock_contention_mask(gd); - cpu_mplock_contested(); - goto using_idle_thread; - } else { - clr_mplock_contention_mask(gd); - ++gd->gd_cnt.v_swtch; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); - } - } else { - clr_mplock_contention_mask(gd); - ++gd->gd_cnt.v_swtch; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); + if (didaccumulate) + break; + goto haveidle; } -#else + /* - * THREAD SELECTION FOR A UP MACHINE BUILD. We don't have to - * worry about tokens or the BGL. However, we still have - * to call lwkt_getalltokens() in order to properly detect - * stale tokens. This call cannot fail for a UP build! + * Try to switch to this thread. */ - lwkt_getalltokens(ntd); - ++gd->gd_cnt.v_swtch; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); + if ((ntd->td_pri >= TDPRI_KERN_LPSCHED || nquserok) && + ntd->td_fairq_accum >= 0 && +#ifdef SMP + (ntd->td_mpcount == 0 || mpheld || cpu_try_mplock()) && #endif - } else { - /* - * We have nothing to run but only let the idle loop halt - * the cpu if there are no pending interrupts. - */ - ntd = &gd->gd_idlethread; - if (gd->gd_reqflags & RQF_IDLECHECK_MASK) - ntd->td_flags |= TDF_IDLE_NOHLT; + (!TD_TOKS_HELD(ntd) || lwkt_getalltokens(ntd)) + ) { #ifdef SMP -using_idle_thread: - /* - * The idle thread should not be holding the MP lock unless we - * are trapping in the kernel or in a panic. Since we select the - * idle thread unconditionally when no other thread is available, - * if the MP lock is desired during a panic or kernel trap, we - * have to loop in the scheduler until we get it. 
- */ - if (ntd->td_mpcount) { - mpheld = MP_LOCK_HELD(); - if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) - panic("Idle thread %p was holding the BGL!", ntd); - if (mpheld == 0) - goto again; + clr_mplock_contention_mask(gd); +#endif + goto havethread; } +#ifdef SMP + /* Reload mpheld (it become stale after mplock/token ops) */ + mpheld = MP_LOCK_HELD(); + if (ntd->td_pri >= TDPRI_KERN_LPSCHED && ntd->td_fairq_accum >= 0) + nquserok = 0; #endif } } - KASSERT(ntd->td_pri >= TDPRI_CRIT, - ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri)); /* - * Do the actual switch. If the new target does not need the MP lock - * and we are holding it, release the MP lock. If the new target requires - * the MP lock we have already acquired it for the target. + * Do the actual switch. WARNING: mpheld is stale here. + * + * We must always decrement td_fairq_accum on non-idle threads just + * in case a thread never gets a tick due to being in a continuous + * critical section. The page-zeroing code does that. + * + * If the thread we came up with is a higher or equal priority verses + * the thread at the head of the queue we move our thread to the + * front. This way we can always check the front of the queue. */ +havethread: + ++gd->gd_cnt.v_swtch; + --ntd->td_fairq_accum; + xtd = TAILQ_FIRST(&gd->gd_tdrunq); + if (ntd != xtd && ntd->td_pri >= xtd->td_pri) { + TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq); + TAILQ_INSERT_HEAD(&gd->gd_tdrunq, ntd, td_threadq); + } +havethread_preempted: + ; + /* + * If the new target does not need the MP lock and we are holding it, + * release the MP lock. If the new target requires the MP lock we have + * already acquired it for the target. + * + * WARNING: mpheld is stale here. + */ +haveidle: + KASSERT(ntd->td_critcount, + ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri)); #ifdef SMP if (ntd->td_mpcount == 0 ) { if (MP_LOCK_HELD()) @@ -772,10 +813,10 @@ using_idle_thread: if (td != ntd) { ++switch_count; #ifdef __x86_64__ - { - int tos_ok __debugvar = jg_tos_ok(ntd); - KKASSERT(tos_ok); - } + { + int tos_ok __debugvar = jg_tos_ok(ntd); + KKASSERT(tos_ok); + } #endif KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd); td->td_switch(ntd); @@ -798,7 +839,7 @@ using_idle_thread: * * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically * this is called via lwkt_schedule() through the td_preemptable callback. - * critpri is the managed critical priority that we should ignore in order + * critcount is the managed critical priority that we should ignore in order * to determine whether preemption is possible (aka usually just the crit * priority of lwkt_schedule() itself). * @@ -819,7 +860,7 @@ using_idle_thread: * can leave it synchronized on return). */ void -lwkt_preempt(thread_t ntd, int critpri) +lwkt_preempt(thread_t ntd, int critcount) { struct globaldata *gd = mycpu; thread_t td; @@ -831,7 +872,7 @@ lwkt_preempt(thread_t ntd, int critpri) /* * The caller has put us in a critical section. We can only preempt * if the caller of the caller was not in a critical section (basically - * a local interrupt), as determined by the 'critpri' parameter. We + * a local interrupt), as determined by the 'critcount' parameter. We * also can't preempt if the caller is holding any spinlocks (even if * he isn't in a critical section). This also handles the tokens test. * @@ -840,14 +881,14 @@ lwkt_preempt(thread_t ntd, int critpri) * * Set need_lwkt_resched() unconditionally for now YYY. 
*/ - KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri)); + KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri)); td = gd->gd_curthread; - if ((ntd->td_pri & TDPRI_MASK) <= (td->td_pri & TDPRI_MASK)) { + if (ntd->td_pri <= td->td_pri) { ++preempt_miss; return; } - if ((td->td_pri & ~TDPRI_MASK) > critpri) { + if (td->td_critcount > critcount) { ++preempt_miss; need_lwkt_resched(); return; @@ -952,43 +993,63 @@ splz_check(void) globaldata_t gd = mycpu; thread_t td = gd->gd_curthread; - if (gd->gd_reqflags && td->td_nest_count < 2) + if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) splz(); } /* - * This implements a normal yield which will yield to equal priority - * threads as well as higher priority threads. Note that gd_reqflags - * tests will be handled by the crit_exit() call in lwkt_switch(). - * - * (self contained on a per cpu basis) + * This function is used to negotiate a passive release of the current + * process/lwp designation with the user scheduler, allowing the user + * scheduler to schedule another user thread. The related kernel thread + * (curthread) continues running in the released state. */ void -lwkt_yield(void) +lwkt_passive_release(struct thread *td) { - lwkt_schedule_self(curthread); - lwkt_switch(); + struct lwp *lp = td->td_lwp; + + td->td_release = NULL; + lwkt_setpri_self(TDPRI_KERN_USER); + lp->lwp_proc->p_usched->release_curproc(lp); } + /* - * This function is used along with the lwkt_passive_recover() inline - * by the trap code to negotiate a passive release of the current - * process/lwp designation with the user scheduler. + * This implements a normal yield. This routine is virtually a nop if + * there is nothing to yield to but it will always run any pending interrupts + * if called from a critical section. + * + * This yield is designed for kernel threads without a user context. + * + * (self contained on a per cpu basis) */ void -lwkt_passive_release(struct thread *td) +lwkt_yield(void) { - struct lwp *lp = td->td_lwp; + globaldata_t gd = mycpu; + thread_t td = gd->gd_curthread; + thread_t xtd; - td->td_release = NULL; - lwkt_setpri_self(TDPRI_KERN_USER); - lp->lwp_proc->p_usched->release_curproc(lp); + if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) + splz(); + if (td->td_fairq_accum < 0) { + lwkt_schedule_self(curthread); + lwkt_switch(); + } else { + xtd = TAILQ_FIRST(&gd->gd_tdrunq); + if (xtd && xtd->td_pri > td->td_pri) { + lwkt_schedule_self(curthread); + lwkt_switch(); + } + } } /* - * Make a kernel thread act as if it were in user mode with regards - * to scheduling, to avoid becoming cpu-bound in the kernel. Kernel - * loops which may be potentially cpu-bound can call lwkt_user_yield(). + * This yield is designed for kernel threads with a user context. + * + * The kernel acting on behalf of the user is potentially cpu-bound, + * this function will efficiently allow other threads to run and also + * switch to other processes by releasing. * * The lwkt_user_yield() function is designed to have very low overhead * if no yield is determined to be needed. @@ -996,8 +1057,15 @@ lwkt_passive_release(struct thread *td) void lwkt_user_yield(void) { - thread_t td = curthread; - struct lwp *lp = td->td_lwp; + globaldata_t gd = mycpu; + thread_t td = gd->gd_curthread; + + /* + * Always run any pending interrupts in case we are in a critical + * section. 
+ */ + if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) + splz(); #ifdef SMP /* @@ -1013,53 +1081,30 @@ lwkt_user_yield(void) #endif /* - * Another kernel thread wants the cpu + * Switch (which forces a release) if another kernel thread needs + * the cpu, if userland wants us to resched, or if our kernel + * quantum has run out. */ - if (lwkt_resched_wanted()) + if (lwkt_resched_wanted() || + user_resched_wanted() || + td->td_fairq_accum < 0) + { lwkt_switch(); - - /* - * If the user scheduler has asynchronously determined that the current - * process (when running in user mode) needs to lose the cpu then make - * sure we are released. - */ - if (user_resched_wanted()) { - if (td->td_release) - td->td_release(td); } +#if 0 /* - * If we are released reduce our priority + * Reacquire the current process if we are released. + * + * XXX not implemented atm. The kernel may be holding locks and such, + * so we want the thread to continue to receive cpu. */ - if (td->td_release == NULL) { - if (lwkt_check_resched(td) > 0) - lwkt_switch(); - if (lp) { - lp->lwp_proc->p_usched->acquire_curproc(lp); - td->td_release = lwkt_passive_release; - lwkt_setpri_self(TDPRI_USER_NORM); - } + if (td->td_release == NULL && lp) { + lp->lwp_proc->p_usched->acquire_curproc(lp); + td->td_release = lwkt_passive_release; + lwkt_setpri_self(TDPRI_USER_NORM); } -} - -/* - * Return 0 if no runnable threads are pending at the same or higher - * priority as the passed thread. - * - * Return 1 if runnable threads are pending at the same priority. - * - * Return 2 if runnable threads are pending at a higher priority. - */ -int -lwkt_check_resched(thread_t td) -{ - int pri = td->td_pri & TDPRI_MASK; - - if (td->td_gd->gd_runqmask > (2 << pri) - 1) - return(2); - if (TAILQ_NEXT(td, td_threadq)) - return(1); - return(0); +#endif } /* @@ -1083,18 +1128,31 @@ lwkt_check_resched(thread_t td) */ static __inline void -_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri, int reschedok) +_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount, int reschedok) { thread_t otd; if (ntd->td_flags & TDF_RUNQ) { if (ntd->td_preemptable && reschedok) { - ntd->td_preemptable(ntd, cpri); /* YYY +token */ + ntd->td_preemptable(ntd, ccount); /* YYY +token */ } else if (reschedok) { otd = curthread; - if ((ntd->td_pri & TDPRI_MASK) > (otd->td_pri & TDPRI_MASK)) + if (ntd->td_pri > otd->td_pri) need_lwkt_resched(); } + + /* + * Give the thread a little fair share scheduler bump if it + * has been asleep for a while. This is primarily to avoid + * a degenerate case for interrupt threads where accumulator + * crosses into negative territory unnecessarily. + */ + if (ntd->td_fairq_lticks != ticks) { + ntd->td_fairq_lticks = ticks; + ntd->td_fairq_accum += gd->gd_fairq_total_pri; + if (ntd->td_fairq_accum > TDFAIRQ_MAX(gd)) + ntd->td_fairq_accum = TDFAIRQ_MAX(gd); + } } } @@ -1118,13 +1176,13 @@ _lwkt_schedule(thread_t td, int reschedok) #ifdef SMP if (td->td_gd == mygd) { _lwkt_enqueue(td); - _lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok); + _lwkt_schedule_post(mygd, td, 1, reschedok); } else { lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0); } #else _lwkt_enqueue(td); - _lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok); + _lwkt_schedule_post(mygd, td, 1, reschedok); #endif } crit_exit_gd(mygd); @@ -1257,29 +1315,23 @@ lwkt_deschedule(thread_t td) * Set the target thread's priority. 
This routine does not automatically * switch to a higher priority thread, LWKT threads are not designed for * continuous priority changes. Yield if you want to switch. - * - * We have to retain the critical section count which uses the high bits - * of the td_pri field. The specified priority may also indicate zero or - * more critical sections by adding TDPRI_CRIT*N. - * - * Note that we requeue the thread whether it winds up on a different runq - * or not. uio_yield() depends on this and the routine is not normally - * called with the same priority otherwise. */ void lwkt_setpri(thread_t td, int pri) { - KKASSERT(pri >= 0); KKASSERT(td->td_gd == mycpu); - crit_enter(); - if (td->td_flags & TDF_RUNQ) { - _lwkt_dequeue(td); - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; - _lwkt_enqueue(td); - } else { - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + if (td->td_pri != pri) { + KKASSERT(pri >= 0); + crit_enter(); + if (td->td_flags & TDF_RUNQ) { + _lwkt_dequeue(td); + td->td_pri = pri; + _lwkt_enqueue(td); + } else { + td->td_pri = pri; + } + crit_exit(); } - crit_exit(); } /* @@ -1296,7 +1348,7 @@ lwkt_setpri_initial(thread_t td, int pri) { KKASSERT(pri >= 0); KKASSERT((td->td_flags & TDF_RUNQ) == 0); - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + td->td_pri = pri; } void @@ -1308,14 +1360,46 @@ lwkt_setpri_self(int pri) crit_enter(); if (td->td_flags & TDF_RUNQ) { _lwkt_dequeue(td); - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + td->td_pri = pri; _lwkt_enqueue(td); } else { - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + td->td_pri = pri; } crit_exit(); } +/* + * 1/hz tick (typically 10ms) x TDFAIRQ_SCALE (typ 8) = 80ms full cycle. + * + * Example: two competing threads, same priority N. decrement by (2*N) + * increment by N*8, each thread will get 4 ticks. + */ +void +lwkt_fairq_schedulerclock(thread_t td) +{ + if (fairq_enable) { + while (td) { + if (td != &td->td_gd->gd_idlethread) { + td->td_fairq_accum -= td->td_gd->gd_fairq_total_pri; + if (td->td_fairq_accum < -TDFAIRQ_MAX(td->td_gd)) + td->td_fairq_accum = -TDFAIRQ_MAX(td->td_gd); + if (td->td_fairq_accum < 0) + need_lwkt_resched(); + td->td_fairq_lticks = ticks; + } + td = td->td_preempted; + } + } +} + +static void +lwkt_fairq_accumulate(globaldata_t gd, thread_t td) +{ + td->td_fairq_accum += td->td_pri * TDFAIRQ_SCALE; + if (td->td_fairq_accum > TDFAIRQ_MAX(td->td_gd)) + td->td_fairq_accum = TDFAIRQ_MAX(td->td_gd); +} + /* * Migrate the current thread to the specified cpu. * diff --git a/sys/kern/lwkt_token.c b/sys/kern/lwkt_token.c index eaeb554637..ccb251f017 100644 --- a/sys/kern/lwkt_token.c +++ b/sys/kern/lwkt_token.c @@ -401,7 +401,7 @@ _lwkt_gettokref(lwkt_tokref_t ref, thread_t td) * * Since the tokref is already active the scheduler now * takes care of acquisition, so we need only call - * lwkt_yield(). + * lwkt_switch(). * * Since we failed this was not a recursive token so upon * return tr_tok->t_ref should be assigned to this specific @@ -409,7 +409,7 @@ _lwkt_gettokref(lwkt_tokref_t ref, thread_t td) */ atomic_add_long(&ref->tr_tok->t_collisions, 1); logtoken(fail, ref); - lwkt_yield(); + lwkt_switch(); logtoken(succ, ref); KKASSERT(ref->tr_tok->t_ref == ref); } diff --git a/sys/kern/usched_bsd4.c b/sys/kern/usched_bsd4.c index 8737423ccd..048270c73c 100644 --- a/sys/kern/usched_bsd4.c +++ b/sys/kern/usched_bsd4.c @@ -291,10 +291,7 @@ bsd4_acquire_curproc(struct lwp *lp) * the run queue. When we are reactivated we will have * another chance. 
*/ - if (lwkt_check_resched(lp->lwp_thread) > 1) { - lwkt_switch(); - continue; - } + lwkt_yield(); } while (dd->uschedcp != lp); crit_exit(); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 7e5635699e..5acce8e6bb 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -573,7 +573,7 @@ vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len, * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we - * check bwillwrite() before calling vn_rdwr(). We also call uio_yield() + * check bwillwrite() before calling vn_rdwr(). We also call lwkt_user_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). * @@ -616,7 +616,7 @@ vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len, break; offset += chunk; base += chunk; - uio_yield(); + lwkt_user_yield(); } while (len); if (aresid) *aresid += len; diff --git a/sys/opencrypto/crypto.c b/sys/opencrypto/crypto.c index 7e34357c80..cbc6bee690 100644 --- a/sys/opencrypto/crypto.c +++ b/sys/opencrypto/crypto.c @@ -830,6 +830,7 @@ crypto_dispatch(struct cryptop *crp) KASSERT(cap != NULL, ("%s: Driver disappeared.", __func__)); if (!cap->cc_qblocked) { result = crypto_invoke(cap, crp, 0); + lwkt_yield(); if (result != ERESTART) return (result); /* @@ -1362,6 +1363,7 @@ crypto_proc(void *arg) CRYPTO_Q_UNLOCK(tdinfo); result = crypto_invoke(cap, submit, hint); + lwkt_yield(); CRYPTO_Q_LOCK(tdinfo); if (result == ERESTART) { diff --git a/sys/platform/pc32/apic/apic_vector.s b/sys/platform/pc32/apic/apic_vector.s index a536fbcf57..99add4d95c 100644 --- a/sys/platform/pc32/apic/apic_vector.s +++ b/sys/platform/pc32/apic/apic_vector.s @@ -147,8 +147,8 @@ IDTVEC(vec_name) ; \ pushl %eax ; \ testl $-1,TD_NEST_COUNT(%ebx) ; \ jne 1f ; \ - cmpl $TDPRI_CRIT,TD_PRI(%ebx) ; \ - jl 2f ; \ + testl $-1,TD_CRITCOUNT(%ebx) ; \ + je 2f ; \ 1: ; \ /* in critical section, make interrupt pending */ \ /* set the pending bit and return, leave interrupt masked */ \ @@ -160,9 +160,9 @@ IDTVEC(vec_name) ; \ andl $~IRQ_LBIT(irq_num),PCPU(fpending) ; \ pushl $irq_num ; \ pushl %esp ; /* pass frame by reference */ \ - addl $TDPRI_CRIT,TD_PRI(%ebx) ; \ + incl TD_CRITCOUNT(%ebx) ; \ call ithread_fast_handler ; /* returns 0 to unmask */ \ - subl $TDPRI_CRIT,TD_PRI(%ebx) ; \ + decl TD_CRITCOUNT(%ebx) ; \ addl $8, %esp ; \ UNMASK_IRQ(irq_num) ; \ 5: ; \ @@ -299,14 +299,14 @@ Xipiq: incl PCPU(cnt) + V_IPI movl PCPU(curthread),%ebx - cmpl $TDPRI_CRIT,TD_PRI(%ebx) - jge 1f + testl $-1,TD_CRITCOUNT(%ebx) + jne 1f subl $8,%esp /* make same as interrupt frame */ pushl %esp /* pass frame by reference */ incl PCPU(intr_nesting_level) - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) call lwkt_process_ipiq_frame - subl $TDPRI_CRIT,TD_PRI(%ebx) + decl TD_CRITCOUNT(%ebx) decl PCPU(intr_nesting_level) addl $12,%esp pushl $0 /* CPL for frame (REMOVED) */ @@ -328,16 +328,16 @@ Xtimer: incl PCPU(cnt) + V_TIMER movl PCPU(curthread),%ebx - cmpl $TDPRI_CRIT,TD_PRI(%ebx) - jge 1f + testl $-1,TD_CRITCOUNT(%ebx) + jne 1f testl $-1,TD_NEST_COUNT(%ebx) jne 1f subl $8,%esp /* make same as interrupt frame */ pushl %esp /* pass frame by reference */ incl PCPU(intr_nesting_level) - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) call lapic_timer_process_frame - subl $TDPRI_CRIT,TD_PRI(%ebx) + decl 
TD_CRITCOUNT(%ebx) decl PCPU(intr_nesting_level) addl $12,%esp pushl $0 /* CPL for frame (REMOVED) */ diff --git a/sys/platform/pc32/i386/bcopy.s b/sys/platform/pc32/i386/bcopy.s index 131ab0da54..b68f55c5d6 100644 --- a/sys/platform/pc32/i386/bcopy.s +++ b/sys/platform/pc32/i386/bcopy.s @@ -302,7 +302,7 @@ ENTRY(asm_generic_bcopy) pushl %ecx ; \ movl GD_CURTHREAD(%eax),%edx ; /* EDX = CURTHREAD */ \ movl TD_SAVEFPU(%edx),%ebx ; /* save app save area */\ - addl $TDPRI_CRIT,TD_PRI(%edx) ; \ + incl TD_CRITCOUNT(%edx) ; \ cmpl $0,GD_NPXTHREAD(%eax) ; \ je 100f ; \ fxsave 0(%ebx) ; /* race(1) */ \ @@ -315,11 +315,11 @@ ENTRY(asm_generic_bcopy) orl $TDF_KERNELFP,TD_FLAGS(%edx) ; \ clts ; \ movl %edx,GD_NPXTHREAD(%eax) ; /* race(3) */ \ - subl $TDPRI_CRIT,TD_PRI(%edx) ; /* crit_exit() */ \ + decl TD_CRITCOUNT(%edx) ; /* crit_exit() */ \ cmpl $0,GD_REQFLAGS(%eax) ; \ je 101f ; \ - cmpl $TDPRI_CRIT,TD_PRI(%edx) ; \ - jge 101f ; \ + testl $-1,TD_CRITCOUNT(%edx) ; \ + jne 101f ; \ call splz_check ; \ /* note: eax,ecx,edx destroyed */ \ 101: ; \ diff --git a/sys/platform/pc32/i386/exception.s b/sys/platform/pc32/i386/exception.s index c65f37d182..27da25b3e6 100644 --- a/sys/platform/pc32/i386/exception.s +++ b/sys/platform/pc32/i386/exception.s @@ -895,7 +895,7 @@ IDTVEC(int0x80_syscall) */ ENTRY(fork_trampoline) movl PCPU(curthread),%eax - subl $TDPRI_CRIT,TD_PRI(%eax) + decl TD_CRITCOUNT(%eax) /* * cpu_set_fork_handler intercepts this function call to diff --git a/sys/platform/pc32/i386/genassym.c b/sys/platform/pc32/i386/genassym.c index 65e5c4daad..0176996cfd 100644 --- a/sys/platform/pc32/i386/genassym.c +++ b/sys/platform/pc32/i386/genassym.c @@ -92,6 +92,7 @@ ASSYM(TD_NEST_COUNT, offsetof(struct thread, td_nest_count)); #ifdef SMP ASSYM(TD_MPCOUNT, offsetof(struct thread, td_mpcount)); #endif +ASSYM(TD_CRITCOUNT, offsetof(struct thread, td_critcount)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TDF_RUNNING, TDF_RUNNING); ASSYM(TDF_USINGFP, TDF_USINGFP); @@ -100,7 +101,6 @@ ASSYM(MACHINTR_INTREN, offsetof(struct machintr_abi, intren)); ASSYM(TD_SAVEFPU, offsetof(struct thread, td_mach) + offsetof(struct md_thread, mtd_savefpu)); -ASSYM(TDPRI_CRIT, TDPRI_CRIT); ASSYM(TDPRI_INT_SUPPORT, TDPRI_INT_SUPPORT); #ifdef SMP ASSYM(CPUMASK_LOCK, CPUMASK_LOCK); diff --git a/sys/platform/pc32/i386/machdep.c b/sys/platform/pc32/i386/machdep.c index 171b029b7c..4d09c15ff0 100644 --- a/sys/platform/pc32/i386/machdep.c +++ b/sys/platform/pc32/i386/machdep.c @@ -753,7 +753,7 @@ sendupcall(struct vmupcall *vu, int morepending) */ vu->vu_pending = 0; upcall.upc_pending = morepending; - crit_count += TDPRI_CRIT; + ++crit_count; copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, sizeof(upcall.upc_pending)); copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, @@ -811,7 +811,7 @@ fetchupcall(struct vmupcall *vu, int morepending, void *rsp) crit_count = 0; if (error == 0) error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); - crit_count += TDPRI_CRIT; + ++crit_count; if (error == 0) error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); regs->tf_eax = (register_t)vu->vu_func; @@ -903,7 +903,7 @@ cpu_idle(void) struct thread *td = curthread; crit_exit(); - KKASSERT(td->td_pri < TDPRI_CRIT); + KKASSERT(td->td_critcount == 0); for (;;) { /* * See if there are any LWKTs ready to go. 
diff --git a/sys/platform/pc32/i386/swtch.s b/sys/platform/pc32/i386/swtch.s index 91a66e0932..e19990991a 100644 --- a/sys/platform/pc32/i386/swtch.s +++ b/sys/platform/pc32/i386/swtch.s @@ -546,7 +546,7 @@ ENTRY(cpu_kthread_restore) movl %ecx,%cr3 andl $~TDF_RUNNING,TD_FLAGS(%ebx) orl $TDF_RUNNING,TD_FLAGS(%eax) - subl $TDPRI_CRIT,TD_PRI(%eax) + decl TD_CRITCOUNT(%eax) popl %eax /* kthread exit function */ pushl PCB_EBX(%edx) /* argument to ESI function */ pushl %eax /* set exit func as return address */ diff --git a/sys/platform/pc32/i386/trap.c b/sys/platform/pc32/i386/trap.c index e9b125d669..fed82300c1 100644 --- a/sys/platform/pc32/i386/trap.c +++ b/sys/platform/pc32/i386/trap.c @@ -406,7 +406,7 @@ trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -929,9 +929,9 @@ out2: ; if (p != NULL && lp != NULL) KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -1099,7 +1099,7 @@ trap_fatal(struct trapframe *frame, vm_offset_t eva) kprintf("Idle\n"); } kprintf("current thread = pri %d ", curthread->td_pri); - if (curthread->td_pri >= TDPRI_CRIT) + if (curthread->td_critcount) kprintf("(CRIT)"); kprintf("\n"); #ifdef SMP @@ -1179,7 +1179,7 @@ syscall2(struct trapframe *frame) int error; int narg; #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif #ifdef SMP int have_mplock = 0; @@ -1399,9 +1399,9 @@ bad: #endif KTR_LOG(kernentry_syscall_ret, p->p_pid, lp->lwp_tid, error); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! 
%d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } diff --git a/sys/platform/pc32/icu/icu_vector.s b/sys/platform/pc32/icu/icu_vector.s index 7c7c7c9d45..d196f9ec11 100644 --- a/sys/platform/pc32/icu/icu_vector.s +++ b/sys/platform/pc32/icu/icu_vector.s @@ -147,8 +147,8 @@ IDTVEC(vec_name) ; \ pushl $0 ; /* DUMMY CPL FOR DORETI */ \ testl $-1,TD_NEST_COUNT(%ebx) ; \ jne 1f ; \ - cmpl $TDPRI_CRIT,TD_PRI(%ebx) ; \ - jl 2f ; \ + testl $-1,TD_CRITCOUNT(%ebx) ; \ + je 2f ; \ 1: ; \ /* set pending bit and return, leave interrupt masked */ \ orl $IRQ_LBIT(irq_num),PCPU(fpending) ; \ @@ -159,9 +159,9 @@ IDTVEC(vec_name) ; \ andl $~IRQ_LBIT(irq_num),PCPU(fpending) ; \ pushl $irq_num ; \ pushl %esp ; /* pass frame by reference */ \ - addl $TDPRI_CRIT,TD_PRI(%ebx) ; \ + incl TD_CRITCOUNT(%ebx) ; \ call ithread_fast_handler ; /* returns 0 to unmask int */ \ - subl $TDPRI_CRIT,TD_PRI(%ebx) ; \ + decl TD_CRITCOUNT(%ebx) ; \ addl $8,%esp ; \ UNMASK_IRQ(icu, irq_num) ; \ 5: ; \ diff --git a/sys/platform/pc32/isa/ipl.s b/sys/platform/pc32/isa/ipl.s index a6e5baebcd..f0c7d6e6db 100644 --- a/sys/platform/pc32/isa/ipl.s +++ b/sys/platform/pc32/isa/ipl.s @@ -111,12 +111,12 @@ doreti: popl %eax /* cpl to restore XXX */ movl $0,%eax /* irq mask unavailable due to BGL */ movl PCPU(curthread),%ebx - cli /* interlock with TDPRI_CRIT */ + cli /* interlock with td_critcount */ cmpl $0,PCPU(reqflags) /* short cut if nothing to do */ je 5f - cmpl $TDPRI_CRIT,TD_PRI(%ebx) /* can't unpend if in critical sec */ - jge 5f - addl $TDPRI_CRIT,TD_PRI(%ebx) /* force all ints to pending */ + testl $-1,TD_CRITCOUNT(%ebx) /* can't unpend if in critical sec */ + jne 5f + incl TD_CRITCOUNT(%ebx) /* force all ints to pending */ doreti_next: sti /* allow new interrupts */ movl %eax,%ecx /* irq mask unavailable due to BGL */ @@ -152,7 +152,7 @@ doreti_next: * BGL requirements. We can only clear RQF_INTPEND if *ALL* pending * interrupts have been processed. */ - subl $TDPRI_CRIT,TD_PRI(%ebx) /* interlocked with cli */ + decl TD_CRITCOUNT(%ebx) /* interlocked with cli */ testl %eax,%eax jnz 5f andl $~RQF_INTPEND,PCPU(reqflags) @@ -240,9 +240,9 @@ doreti_soft: pushl %eax pushl %ecx incl TD_NEST_COUNT(%ebx) /* prevent doreti/splz nesting */ - subl $TDPRI_CRIT,TD_PRI(%ebx) /* so we can preempt */ + decl TD_CRITCOUNT(%ebx) /* so we can preempt */ call sched_ithd /* YYY must pull in imasks */ - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) decl TD_NEST_COUNT(%ebx) addl $4,%esp popl %eax @@ -264,9 +264,9 @@ doreti_ast: movl %eax,%esi /* save cpl (can't use stack) */ movl $T_ASTFLT,TF_TRAPNO(%esp) pushl %esp /* pass frame by reference */ - subl $TDPRI_CRIT,TD_PRI(%ebx) + decl TD_CRITCOUNT(%ebx) call trap - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) addl $4,%esp movl %esi,%eax /* restore cpl for loop */ jmp doreti_next @@ -315,7 +315,7 @@ ENTRY(splz) pushfl pushl %ebx movl PCPU(curthread),%ebx - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) movl $0,%eax splz_next: @@ -335,7 +335,7 @@ splz_next: cmpl $0,%ecx jnz splz_soft - subl $TDPRI_CRIT,TD_PRI(%ebx) + decl TD_CRITCOUNT(%ebx) /* * Nothing left to do, finish up. Interrupts are still disabled. 
@@ -380,10 +380,10 @@ splz_soft: sti pushl %eax pushl %ecx - subl $TDPRI_CRIT,TD_PRI(%ebx) + decl TD_CRITCOUNT(%ebx) incl TD_NEST_COUNT(%ebx) /* prevent doreti/splz nesting */ call sched_ithd /* YYY must pull in imasks */ - addl $TDPRI_CRIT,TD_PRI(%ebx) + incl TD_CRITCOUNT(%ebx) decl TD_NEST_COUNT(%ebx) /* prevent doreti/splz nesting */ addl $4,%esp popl %eax diff --git a/sys/platform/pc64/apic/apic_vector.s b/sys/platform/pc64/apic/apic_vector.s index 32dd04846b..2bd8d07e20 100644 --- a/sys/platform/pc64/apic/apic_vector.s +++ b/sys/platform/pc64/apic/apic_vector.s @@ -130,8 +130,8 @@ IDTVEC(vec_name) ; \ movq PCPU(curthread),%rbx ; \ testl $-1,TD_NEST_COUNT(%rbx) ; \ jne 1f ; \ - cmpl $TDPRI_CRIT,TD_PRI(%rbx) ; \ - jl 2f ; \ + testl $-1,TD_CRITCOUNT(%rbx) ; \ + je 2f ; \ 1: ; \ /* in critical section, make interrupt pending */ \ /* set the pending bit and return, leave interrupt masked */ \ @@ -143,9 +143,9 @@ IDTVEC(vec_name) ; \ andl $~IRQ_LBIT(irq_num),PCPU(fpending) ; \ pushq $irq_num ; /* trapframe -> intrframe */ \ movq %rsp, %rdi ; /* pass frame by reference */ \ - addl $TDPRI_CRIT,TD_PRI(%rbx) ; \ + incl TD_CRITCOUNT(%rbx) ; \ call ithread_fast_handler ; /* returns 0 to unmask */ \ - subl $TDPRI_CRIT,TD_PRI(%rbx) ; \ + decl TD_CRITCOUNT(%rbx) ; \ addq $8, %rsp ; /* intrframe -> trapframe */ \ UNMASK_IRQ(irq_num) ; \ 5: ; \ @@ -305,14 +305,14 @@ Xipiq: incl PCPU(cnt) + V_IPI movq PCPU(curthread),%rbx - cmpl $TDPRI_CRIT,TD_PRI(%rbx) - jge 1f + testl $-1,TD_CRITCOUNT(%rbx) + jne 1f subq $8,%rsp /* make same as interrupt frame */ movq %rsp,%rdi /* pass frame by reference */ incl PCPU(intr_nesting_level) - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) call lwkt_process_ipiq_frame - subl $TDPRI_CRIT,TD_PRI(%rbx) + decl TD_CRITCOUNT(%rbx) decl PCPU(intr_nesting_level) addq $8,%rsp /* turn into trapframe */ MEXITCOUNT @@ -334,16 +334,16 @@ Xtimer: incl PCPU(cnt) + V_TIMER movq PCPU(curthread),%rbx - cmpl $TDPRI_CRIT,TD_PRI(%rbx) - jge 1f + testl $-1,TD_CRITCOUNT(%rbx) + jne 1f testl $-1,TD_NEST_COUNT(%rbx) jne 1f subq $8,%rsp /* make same as interrupt frame */ movq %rsp,%rdi /* pass frame by reference */ incl PCPU(intr_nesting_level) - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) call lapic_timer_process_frame - subl $TDPRI_CRIT,TD_PRI(%rbx) + decl TD_CRITCOUNT(%rbx) decl PCPU(intr_nesting_level) addq $8,%rsp /* turn into trapframe */ MEXITCOUNT diff --git a/sys/platform/pc64/icu/icu_vector.s b/sys/platform/pc64/icu/icu_vector.s index b10ef36d54..715b222518 100644 --- a/sys/platform/pc64/icu/icu_vector.s +++ b/sys/platform/pc64/icu/icu_vector.s @@ -142,8 +142,8 @@ IDTVEC(vec_name) ; \ movq PCPU(curthread),%rbx ; \ testl $-1,TD_NEST_COUNT(%rbx) ; \ jne 1f ; \ - cmpl $TDPRI_CRIT,TD_PRI(%rbx) ; \ - jl 2f ; \ + testl $-1,TD_CRITCOUNT(%rbx) ; \ + je 2f ; \ 1: ; \ /* set pending bit and return, leave interrupt masked */ \ orl $IRQ_LBIT(irq_num),PCPU(fpending) ; \ @@ -154,9 +154,9 @@ IDTVEC(vec_name) ; \ andl $~IRQ_LBIT(irq_num),PCPU(fpending) ; \ pushq $irq_num ; \ movq %rsp,%rdi ; /* rdi = call argument */ \ - addl $TDPRI_CRIT,TD_PRI(%rbx) ; \ + incl TD_CRITCOUNT(%rbx) ; \ call ithread_fast_handler ; /* returns 0 to unmask int */ \ - subl $TDPRI_CRIT,TD_PRI(%rbx) ; \ + decl TD_CRITCOUNT(%rbx) ; \ addq $8,%rsp ; /* intr frame -> trap frame */ \ UNMASK_IRQ(icu, irq_num) ; \ 5: ; \ diff --git a/sys/platform/pc64/x86_64/exception.S b/sys/platform/pc64/x86_64/exception.S index 4efea946ae..f462849e4e 100644 --- a/sys/platform/pc64/x86_64/exception.S +++ 
b/sys/platform/pc64/x86_64/exception.S @@ -408,7 +408,7 @@ nmi_restoreregs: */ ENTRY(fork_trampoline) movq PCPU(curthread),%rax - subl $TDPRI_CRIT,TD_PRI(%rax) + decl TD_CRITCOUNT(%rax) /* * cpu_set_fork_handler intercepts this function call to diff --git a/sys/platform/pc64/x86_64/genassym.c b/sys/platform/pc64/x86_64/genassym.c index 116204dcdb..d7a169b413 100644 --- a/sys/platform/pc64/x86_64/genassym.c +++ b/sys/platform/pc64/x86_64/genassym.c @@ -178,6 +178,7 @@ ASSYM(TD_LWP, offsetof(struct thread, td_lwp)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_SP, offsetof(struct thread, td_sp)); ASSYM(TD_PRI, offsetof(struct thread, td_pri)); +ASSYM(TD_CRITCOUNT, offsetof(struct thread, td_critcount)); ASSYM(TD_MACH, offsetof(struct thread, td_mach)); ASSYM(TD_WCHAN, offsetof(struct thread, td_wchan)); ASSYM(TD_NEST_COUNT, offsetof(struct thread, td_nest_count)); @@ -236,7 +237,6 @@ ASSYM(MSR_FSBASE, MSR_FSBASE); ASSYM(MACHINTR_INTREN, offsetof(struct machintr_abi, intren)); -ASSYM(TDPRI_CRIT, TDPRI_CRIT); ASSYM(TDPRI_INT_SUPPORT, TDPRI_INT_SUPPORT); #ifdef SMP ASSYM(CPUMASK_LOCK, CPUMASK_LOCK); diff --git a/sys/platform/pc64/x86_64/ipl.s b/sys/platform/pc64/x86_64/ipl.s index 2037850334..44214a7aff 100644 --- a/sys/platform/pc64/x86_64/ipl.s +++ b/sys/platform/pc64/x86_64/ipl.s @@ -138,12 +138,12 @@ doreti: FAKE_MCOUNT(bintr) /* init "from" bintr -> doreti */ movq $0,%rax /* irq mask unavailable due to BGL */ movq PCPU(curthread),%rbx - cli /* interlock with TDPRI_CRIT */ + cli /* interlock with critical section */ cmpl $0,PCPU(reqflags) /* short cut if nothing to do */ je 5f - cmpl $TDPRI_CRIT,TD_PRI(%rbx) /* can't unpend if in critical sec */ - jge 5f - addl $TDPRI_CRIT,TD_PRI(%rbx) /* force all ints to pending */ + testl $-1,TD_CRITCOUNT(%rbx) /* can't unpend if in critical sec */ + jne 5f + incl TD_CRITCOUNT(%rbx) /* force all ints to pending */ doreti_next: sti /* allow new interrupts */ movl %eax,%ecx /* irq mask unavailable due to BGL */ @@ -175,7 +175,7 @@ doreti_next: * BGL requirements. We can only clear RQF_INTPEND if *ALL* pending * interrupts have been processed. */ - subl $TDPRI_CRIT,TD_PRI(%rbx) /* interlocked with cli */ + decl TD_CRITCOUNT(%rbx) /* interlocked with cli */ testl %eax,%eax jnz 5f andl $~RQF_INTPEND,PCPU(reqflags) @@ -275,9 +275,9 @@ doreti_soft: pushq %rax movl %ecx,%edi /* argument to C call */ incl TD_NEST_COUNT(%rbx) /* prevent doreti/splz nesting */ - subl $TDPRI_CRIT,TD_PRI(%rbx) /* so we can preempt */ + decl TD_CRITCOUNT(%rbx) /* so we can preempt */ call sched_ithd /* YYY must pull in imasks */ - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) decl TD_NEST_COUNT(%rbx) popq %rax jmp doreti_next @@ -298,9 +298,9 @@ doreti_ast: movl %eax,%r12d /* save cpl (can't use stack) */ movl $T_ASTFLT,TF_TRAPNO(%rsp) movq %rsp,%rdi /* pass frame by ref (%edi = C arg) */ - subl $TDPRI_CRIT,TD_PRI(%rbx) + decl TD_CRITCOUNT(%rbx) call trap - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) movl %r12d,%eax /* restore cpl for loop */ jmp doreti_next @@ -348,7 +348,7 @@ ENTRY(splz) pushfq pushq %rbx movq PCPU(curthread),%rbx - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) movl $0,%eax splz_next: @@ -368,7 +368,7 @@ splz_next: cmpl $0,%ecx jnz splz_soft - subl $TDPRI_CRIT,TD_PRI(%rbx) + decl TD_CRITCOUNT(%rbx) /* * Nothing left to do, finish up. Interrupts are still disabled. 
@@ -431,10 +431,10 @@ splz_soft: sti pushq %rax movl %ecx,%edi /* C argument */ - subl $TDPRI_CRIT,TD_PRI(%rbx) + decl TD_CRITCOUNT(%rbx) incl TD_NEST_COUNT(%rbx) /* prevent doreti/splz nesting */ call sched_ithd /* YYY must pull in imasks */ - addl $TDPRI_CRIT,TD_PRI(%rbx) + incl TD_CRITCOUNT(%rbx) decl TD_NEST_COUNT(%rbx) /* prevent doreti/splz nesting */ popq %rax jmp splz_next diff --git a/sys/platform/pc64/x86_64/machdep.c b/sys/platform/pc64/x86_64/machdep.c index e342c39784..944b3ec50a 100644 --- a/sys/platform/pc64/x86_64/machdep.c +++ b/sys/platform/pc64/x86_64/machdep.c @@ -778,7 +778,7 @@ sendupcall(struct vmupcall *vu, int morepending) */ vu->vu_pending = 0; upcall.upc_pending = morepending; - crit_count += TDPRI_CRIT; + ++crit_count; copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, sizeof(upcall.upc_pending)); copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, @@ -836,7 +836,7 @@ fetchupcall(struct vmupcall *vu, int morepending, void *rsp) crit_count = 0; if (error == 0) error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); - crit_count += TDPRI_CRIT; + ++crit_count; if (error == 0) error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); regs->tf_rax = (register_t)vu->vu_func; @@ -928,7 +928,7 @@ cpu_idle(void) struct thread *td = curthread; crit_exit(); - KKASSERT(td->td_pri < TDPRI_CRIT); + KKASSERT(td->td_critcount == 0); for (;;) { /* * See if there are any LWKTs ready to go. diff --git a/sys/platform/pc64/x86_64/swtch.s b/sys/platform/pc64/x86_64/swtch.s index 87cc24aa48..80efbc8b3e 100644 --- a/sys/platform/pc64/x86_64/swtch.s +++ b/sys/platform/pc64/x86_64/swtch.s @@ -609,7 +609,7 @@ ENTRY(cpu_kthread_restore) /* rax and rbx come from the switchout code */ andl $~TDF_RUNNING,TD_FLAGS(%rbx) orl $TDF_RUNNING,TD_FLAGS(%rax) - subl $TDPRI_CRIT,TD_PRI(%rax) + decl TD_CRITCOUNT(%rax) movq PCB_R12(%rdx),%rdi /* argument to RBX function */ movq PCB_RBX(%rdx),%rax /* thread function */ /* note: top of stack return address inherited by function */ diff --git a/sys/platform/pc64/x86_64/trap.c b/sys/platform/pc64/x86_64/trap.c index b394da5852..fb57f4ccf5 100644 --- a/sys/platform/pc64/x86_64/trap.c +++ b/sys/platform/pc64/x86_64/trap.c @@ -366,7 +366,7 @@ trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -794,9 +794,9 @@ out2: ; if (p != NULL && lp != NULL) KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! 
%d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -971,7 +971,7 @@ trap_fatal(struct trapframe *frame, vm_offset_t eva) kprintf("Idle\n"); } kprintf("current thread = pri %d ", curthread->td_pri); - if (curthread->td_pri >= TDPRI_CRIT) + if (curthread->td_critcount) kprintf("(CRIT)"); kprintf("\n"); @@ -1032,7 +1032,7 @@ syscall2(struct trapframe *frame) int error; int narg; #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif #ifdef SMP int have_mplock = 0; @@ -1259,9 +1259,9 @@ bad: #endif KTR_LOG(kernentry_syscall_ret, p->p_pid, lp->lwp_tid, error); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } diff --git a/sys/platform/vkernel/i386/cpu_regs.c b/sys/platform/vkernel/i386/cpu_regs.c index 591632be0b..ff53178321 100644 --- a/sys/platform/vkernel/i386/cpu_regs.c +++ b/sys/platform/vkernel/i386/cpu_regs.c @@ -587,7 +587,7 @@ sendupcall(struct vmupcall *vu, int morepending) */ vu->vu_pending = 0; upcall.upc_pending = morepending; - crit_count += TDPRI_CRIT; + ++crit_count; copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, sizeof(upcall.upc_pending)); copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, @@ -645,7 +645,7 @@ fetchupcall (struct vmupcall *vu, int morepending, void *rsp) crit_count = 0; if (error == 0) error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); - crit_count += TDPRI_CRIT; + ++crit_count; if (error == 0) error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); regs->tf_eax = (register_t)vu->vu_func; @@ -704,7 +704,7 @@ cpu_idle(void) struct mdglobaldata *gd = mdcpu; crit_exit(); - KKASSERT(td->td_pri < TDPRI_CRIT); + KKASSERT(td->td_critcount == 0); cpu_enable_intr(); for (;;) { /* @@ -724,7 +724,7 @@ cpu_idle(void) struct timeval tv1, tv2; gettimeofday(&tv1, NULL); #endif - umtx_sleep(&gd->mi.gd_runqmask, 0, 1000000); + umtx_sleep(&gd->mi.gd_reqflags, 0, 1000000); #ifdef DEBUGIDLE gettimeofday(&tv2, NULL); if (tv2.tv_usec - tv1.tv_usec + @@ -732,7 +732,7 @@ cpu_idle(void) > 500000) { kprintf("cpu %d idlelock %08x %08x\n", gd->mi.gd_cpuid, - gd->mi.gd_runqmask, + gd->mi.gd_reqflags, gd->gd_fpending); } #endif diff --git a/sys/platform/vkernel/i386/exception.c b/sys/platform/vkernel/i386/exception.c index 7513a5f962..8b770ef4fa 100644 --- a/sys/platform/vkernel/i386/exception.c +++ b/sys/platform/vkernel/i386/exception.c @@ -78,12 +78,12 @@ static void ipisig(int nada, siginfo_t *info, void *ctxp) { - if (curthread->td_pri < TDPRI_CRIT) { - curthread->td_pri += TDPRI_CRIT; + if (curthread->td_critcount == 0) { + ++curthread->td_critcount; ++mycpu->gd_intr_nesting_level; lwkt_process_ipiq(); --mycpu->gd_intr_nesting_level; - curthread->td_pri -= TDPRI_CRIT; + --curthread->td_critcount; } else { need_ipiq(); } @@ -115,13 +115,13 @@ stopsig(int nada, siginfo_t *info, void *ctxp) sigaddset(&ss, SIGTERM); sigaddset(&ss, SIGWINCH); - curthread->td_pri += TDPRI_CRIT; + ++curthread->td_critcount; ++mycpu->gd_intr_nesting_level; while (stopped_cpus & mycpu->gd_cpumask) { sigsuspend(&ss); } --mycpu->gd_intr_nesting_level; - curthread->td_pri -= TDPRI_CRIT; + --curthread->td_critcount; } #endif diff --git a/sys/platform/vkernel/i386/fork_tramp.s 
b/sys/platform/vkernel/i386/fork_tramp.s index 3a4202b086..6c2eede5ea 100644 --- a/sys/platform/vkernel/i386/fork_tramp.s +++ b/sys/platform/vkernel/i386/fork_tramp.s @@ -59,7 +59,7 @@ */ ENTRY(fork_trampoline) movl PCPU(curthread),%eax - subl $TDPRI_CRIT,TD_PRI(%eax) + decl TD_CRITCOUNT(%eax) /* * cpu_set_fork_handler intercepts this function call to diff --git a/sys/platform/vkernel/i386/genassym.c b/sys/platform/vkernel/i386/genassym.c index cbabc033d6..5d332fd143 100644 --- a/sys/platform/vkernel/i386/genassym.c +++ b/sys/platform/vkernel/i386/genassym.c @@ -83,6 +83,7 @@ ASSYM(TD_LWP, offsetof(struct thread, td_lwp)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_SP, offsetof(struct thread, td_sp)); ASSYM(TD_PRI, offsetof(struct thread, td_pri)); +ASSYM(TD_CRITCOUNT, offsetof(struct thread, td_critcount)); ASSYM(TD_MACH, offsetof(struct thread, td_mach)); ASSYM(TD_WCHAN, offsetof(struct thread, td_wchan)); ASSYM(TD_NEST_COUNT, offsetof(struct thread, td_nest_count)); @@ -94,7 +95,6 @@ ASSYM(TDF_RUNNING, TDF_RUNNING); ASSYM(TD_SAVEFPU, offsetof(struct thread, td_mach) + offsetof(struct md_thread, mtd_savefpu)); -ASSYM(TDPRI_CRIT, TDPRI_CRIT); ASSYM(TDPRI_INT_SUPPORT, TDPRI_INT_SUPPORT); #ifdef SMP ASSYM(CPUMASK_LOCK, CPUMASK_LOCK); diff --git a/sys/platform/vkernel/i386/swtch.s b/sys/platform/vkernel/i386/swtch.s index 40705a3030..9e26f07152 100644 --- a/sys/platform/vkernel/i386/swtch.s +++ b/sys/platform/vkernel/i386/swtch.s @@ -532,7 +532,7 @@ ENTRY(cpu_kthread_restore) movl $0,%ebp andl $~TDF_RUNNING,TD_FLAGS(%ebx) orl $TDF_RUNNING,TD_FLAGS(%eax) - subl $TDPRI_CRIT,TD_PRI(%eax) + decl TD_CRITCOUNT(%eax) popl %eax /* kthread exit function */ pushl PCB_EBX(%edx) /* argument to ESI function */ pushl %eax /* set exit func as return address */ diff --git a/sys/platform/vkernel/i386/trap.c b/sys/platform/vkernel/i386/trap.c index 49fbd831f2..47d8cfd2dc 100644 --- a/sys/platform/vkernel/i386/trap.c +++ b/sys/platform/vkernel/i386/trap.c @@ -381,7 +381,7 @@ user_trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -659,9 +659,9 @@ out2: ; #endif KTR_LOG(kernentry_trap_ret, lp->lwp_proc->p_pid, lp->lwp_tid); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -677,7 +677,7 @@ kern_trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -871,9 +871,9 @@ out2: rel_mplock(); #endif #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! 
%d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -1016,7 +1016,7 @@ trap_fatal(struct trapframe *frame, int usermode, vm_offset_t eva) kprintf("Idle\n"); } kprintf("current thread = pri %d ", curthread->td_pri); - if (curthread->td_pri >= TDPRI_CRIT) + if (curthread->td_critcount) kprintf("(CRIT)"); kprintf("\n"); #ifdef SMP @@ -1096,7 +1096,7 @@ syscall2(struct trapframe *frame) int error; int narg; #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif #ifdef SMP int have_mplock = 0; @@ -1304,9 +1304,9 @@ bad: #endif KTR_LOG(kernentry_syscall_ret, lp->lwp_proc->p_pid, lp->lwp_tid, error); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } diff --git a/sys/platform/vkernel/platform/machintr.c b/sys/platform/vkernel/platform/machintr.c index 5b9163dd78..5b5c955303 100644 --- a/sys/platform/vkernel/platform/machintr.c +++ b/sys/platform/vkernel/platform/machintr.c @@ -160,7 +160,7 @@ signalintr(int intr) struct mdglobaldata *gd = mdcpu; thread_t td = gd->mi.gd_curthread; - if (td->td_pri >= TDPRI_CRIT || td->td_nest_count) { + if (td->td_critcount || td->td_nest_count) { atomic_set_int_nonlocked(&gd->gd_fpending, 1 << intr); atomic_set_int_nonlocked(&gd->mi.gd_reqflags, RQF_INTPEND); } else { diff --git a/sys/platform/vkernel64/platform/machintr.c b/sys/platform/vkernel64/platform/machintr.c index 7aa99ab457..44cc1daf19 100644 --- a/sys/platform/vkernel64/platform/machintr.c +++ b/sys/platform/vkernel64/platform/machintr.c @@ -160,7 +160,7 @@ signalintr(int intr) struct mdglobaldata *gd = mdcpu; thread_t td = gd->mi.gd_curthread; - if (td->td_pri >= TDPRI_CRIT || td->td_nest_count) { + if (td->td_critcount || td->td_nest_count) { atomic_set_int_nonlocked(&gd->gd_fpending, 1 << intr); atomic_set_int_nonlocked(&gd->mi.gd_reqflags, RQF_INTPEND); } else { diff --git a/sys/platform/vkernel64/x86_64/cpu_regs.c b/sys/platform/vkernel64/x86_64/cpu_regs.c index 926cc18c85..bd4f3efb03 100644 --- a/sys/platform/vkernel64/x86_64/cpu_regs.c +++ b/sys/platform/vkernel64/x86_64/cpu_regs.c @@ -591,7 +591,7 @@ sendupcall(struct vmupcall *vu, int morepending) */ vu->vu_pending = 0; upcall.upc_pending = morepending; - crit_count += TDPRI_CRIT; + ++crit_count; copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, sizeof(upcall.upc_pending)); copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, @@ -649,7 +649,7 @@ fetchupcall(struct vmupcall *vu, int morepending, void *rsp) crit_count = 0; if (error == 0) error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); - crit_count += TDPRI_CRIT; + ++crit_count; if (error == 0) error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); regs->tf_rax = (register_t)vu->vu_func; @@ -708,7 +708,7 @@ cpu_idle(void) struct mdglobaldata *gd = mdcpu; crit_exit(); - KKASSERT(td->td_pri < TDPRI_CRIT); + KKASSERT(td->td_critcount == 0); cpu_enable_intr(); for (;;) { /* @@ -728,7 +728,7 @@ cpu_idle(void) struct timeval tv1, tv2; gettimeofday(&tv1, NULL); #endif - umtx_sleep(&gd->mi.gd_runqmask, 0, 1000000); + umtx_sleep(&gd->mi.gd_reqflags, 0, 1000000); #ifdef DEBUGIDLE gettimeofday(&tv2, NULL); if (tv2.tv_usec - tv1.tv_usec + @@ -736,7 +736,7 @@ cpu_idle(void) > 500000) { kprintf("cpu %d 
idlelock %08x %08x\n", gd->mi.gd_cpuid, - gd->mi.gd_runqmask, + gd->mi.gd_reqflags, gd->gd_fpending); } #endif diff --git a/sys/platform/vkernel64/x86_64/exception.c b/sys/platform/vkernel64/x86_64/exception.c index 6236a2effa..34ef724984 100644 --- a/sys/platform/vkernel64/x86_64/exception.c +++ b/sys/platform/vkernel64/x86_64/exception.c @@ -78,12 +78,15 @@ static void ipisig(int nada, siginfo_t *info, void *ctxp) { - if (curthread->td_pri < TDPRI_CRIT) { - curthread->td_pri += TDPRI_CRIT; - ++mycpu->gd_intr_nesting_level; + globaldata_t gd = mycpu; + thread_t td = gd->gd_curthread; + + if (td->td_critcount == 0) { + ++td->td_critcount; + ++gd->gd_intr_nesting_level; lwkt_process_ipiq(); - --mycpu->gd_intr_nesting_level; - curthread->td_pri -= TDPRI_CRIT; + --gd->gd_intr_nesting_level; + --td->td_critcount; } else { need_ipiq(); } @@ -104,6 +107,8 @@ static void stopsig(int nada, siginfo_t *info, void *ctxp) { + globaldata_t gd = mycpu; + thread_t td = gd->gd_curthread; sigset_t ss; sigemptyset(&ss); @@ -115,13 +120,13 @@ stopsig(int nada, siginfo_t *info, void *ctxp) sigaddset(&ss, SIGTERM); sigaddset(&ss, SIGWINCH); - curthread->td_pri += TDPRI_CRIT; - ++mycpu->gd_intr_nesting_level; - while (stopped_cpus & mycpu->gd_cpumask) { + ++td->td_critcount; + ++gd->gd_intr_nesting_level; + while (stopped_cpus & gd->gd_cpumask) { sigsuspend(&ss); } - --mycpu->gd_intr_nesting_level; - curthread->td_pri -= TDPRI_CRIT; + --gd->gd_intr_nesting_level; + --td->td_critcount; } #endif diff --git a/sys/platform/vkernel64/x86_64/fork_tramp.s b/sys/platform/vkernel64/x86_64/fork_tramp.s index 1e3fb418ba..e804ef5096 100644 --- a/sys/platform/vkernel64/x86_64/fork_tramp.s +++ b/sys/platform/vkernel64/x86_64/fork_tramp.s @@ -56,7 +56,7 @@ */ ENTRY(fork_trampoline) movq PCPU(curthread),%rax - subl $TDPRI_CRIT,TD_PRI(%rax) + decl TD_CRITCOUNT(%rax) /* * cpu_set_fork_handler intercepts this function call to diff --git a/sys/platform/vkernel64/x86_64/genassym.c b/sys/platform/vkernel64/x86_64/genassym.c index af70e870f6..ee7725a0cd 100644 --- a/sys/platform/vkernel64/x86_64/genassym.c +++ b/sys/platform/vkernel64/x86_64/genassym.c @@ -109,6 +109,7 @@ ASSYM(TD_LWP, offsetof(struct thread, td_lwp)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_SP, offsetof(struct thread, td_sp)); ASSYM(TD_PRI, offsetof(struct thread, td_pri)); +ASSYM(TD_CRITCOUNT, offsetof(struct thread, td_critcount)); #ifdef SMP ASSYM(TD_MPCOUNT, offsetof(struct thread, td_mpcount)); #endif @@ -116,5 +117,3 @@ ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_SAVEFPU, offsetof(struct thread, td_savefpu)); ASSYM(TDF_RUNNING, TDF_RUNNING); ASSYM(GD_NPXTHREAD, offsetof(struct mdglobaldata, gd_npxthread)); - -ASSYM(TDPRI_CRIT, TDPRI_CRIT); diff --git a/sys/platform/vkernel64/x86_64/swtch.s b/sys/platform/vkernel64/x86_64/swtch.s index 0d755ab6d7..65929f39ab 100644 --- a/sys/platform/vkernel64/x86_64/swtch.s +++ b/sys/platform/vkernel64/x86_64/swtch.s @@ -593,7 +593,7 @@ ENTRY(cpu_kthread_restore) /* rax and rbx come from the switchout code */ andl $~TDF_RUNNING,TD_FLAGS(%rbx) orl $TDF_RUNNING,TD_FLAGS(%rax) - subl $TDPRI_CRIT,TD_PRI(%rax) + decl TD_CRITCOUNT(%rax) movq PCB_R12(%rdx),%rdi /* argument to RBX function */ movq PCB_RBX(%rdx),%rax /* thread function */ /* note: top of stack return address inherited by function */ diff --git a/sys/platform/vkernel64/x86_64/trap.c b/sys/platform/vkernel64/x86_64/trap.c index 1a0b07dfdb..b89b756298 100644 --- a/sys/platform/vkernel64/x86_64/trap.c +++ 
b/sys/platform/vkernel64/x86_64/trap.c @@ -381,7 +381,7 @@ user_trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -635,9 +635,9 @@ out2: ; #endif KTR_LOG(kernentry_trap_ret, lp->lwp_proc->p_pid, lp->lwp_tid); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -653,7 +653,7 @@ kern_trap(struct trapframe *frame) int have_mplock = 0; #endif #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif vm_offset_t eva; @@ -846,9 +846,9 @@ out2: rel_mplock(); #endif #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } @@ -1009,7 +1009,7 @@ trap_fatal(struct trapframe *frame, int usermode, vm_offset_t eva) kprintf("Idle\n"); } kprintf("current thread = pri %d ", curthread->td_pri); - if (curthread->td_pri >= TDPRI_CRIT) + if (curthread->td_critcount) kprintf("(CRIT)"); kprintf("\n"); #ifdef SMP @@ -1141,7 +1141,7 @@ syscall2(struct trapframe *frame) int error; int narg; #ifdef INVARIANTS - int crit_count = td->td_pri & ~TDPRI_MASK; + int crit_count = td->td_critcount; #endif #ifdef SMP int have_mplock = 0; @@ -1363,9 +1363,9 @@ bad: #endif KTR_LOG(kernentry_syscall_ret, lp->lwp_proc->p_pid, lp->lwp_tid, error); #ifdef INVARIANTS - KASSERT(crit_count == (td->td_pri & ~TDPRI_MASK), + KASSERT(crit_count == td->td_critcount, ("syscall: critical section count mismatch! %d/%d", - crit_count / TDPRI_CRIT, td->td_pri / TDPRI_CRIT)); + crit_count, td->td_pri)); #endif } diff --git a/sys/sys/globaldata.h b/sys/sys/globaldata.h index 7c546453e1..ba58de030b 100644 --- a/sys/sys/globaldata.h +++ b/sys/sys/globaldata.h @@ -129,9 +129,7 @@ struct globaldata { __uint32_t gd_reqflags; /* (see note above) */ void *gd_unused00B; lwkt_queue gd_tdallq; /* all threads */ - lwkt_queue gd_unused00C; - lwkt_queue gd_tdrunq[32]; /* runnable threads */ - __uint32_t gd_runqmask; /* which queues? */ + lwkt_queue gd_tdrunq; /* runnable threads */ __uint32_t gd_cpuid; cpumask_t gd_cpumask; /* mask = 1<<cpuid */ [...] diff --git a/sys/sys/thread.h b/sys/sys/thread.h [...] user->kernel syscall, * trap, and AST/signal transitions to provide a stable ucred for * (primarily) system calls.
This field will be NULL for pure kernel @@ -219,9 +216,10 @@ struct thread { const char *td_wmesg; /* string name for blockage */ const volatile void *td_wchan; /* waiting on channel */ int td_pri; /* 0-31, 31=highest priority (note 1) */ + int td_critcount; /* critical section priority */ int td_flags; /* TDF flags */ int td_wdomain; /* domain for wchan address (typ 0) */ - void (*td_preemptable)(struct thread *td, int critpri); + void (*td_preemptable)(struct thread *td, int critcount); void (*td_release)(struct thread *td); char *td_kstack; /* kernel stack */ int td_kstack_size; /* size of kernel stack */ @@ -231,7 +229,7 @@ struct thread { __uint64_t td_sticks; /* Statclock hits in system mode (uS) */ __uint64_t td_iticks; /* Statclock hits processing intr (uS) */ int td_locks; /* lockmgr lock debugging */ - int td_unused01; + int td_fairq_lticks; /* fairq wakeup accumulator reset */ void *td_dsched_priv1; /* priv data for I/O schedulers */ int td_refs; /* hold position in gd_tdallq / hold free */ int td_nest_count; /* prevent splz nesting */ @@ -257,6 +255,7 @@ struct thread { int td_crit_debug_index; int td_in_crit_report; #endif + int td_fairq_accum; /* fairq priority accumulator */ struct md_thread td_mach; }; @@ -311,6 +310,7 @@ struct thread { #define TDF_KERNELFP 0x01000000 /* kernel using fp coproc */ #define TDF_NETWORK 0x02000000 /* network proto thread */ #define TDF_CRYPTO 0x04000000 /* crypto thread */ +#define TDF_MARKER 0x80000000 /* fairq marker thread */ /* * Thread priorities. Typically only one thread from any given @@ -339,14 +339,21 @@ struct thread { #define TDPRI_INT_HIGH 29 /* high priority interrupt */ #define TDPRI_MAX 31 -#define TDPRI_MASK 31 -#define TDPRI_CRIT 32 /* high bits of td_pri used for crit */ +/* + * Scale is the approximate number of ticks for which we desire the + * entire gd_tdrunq to get service. With hz = 100 a scale of 8 is 80ms. + * + * Setting this value too small will result in inefficient switching + * rates. 
+ */ +#define TDFAIRQ_SCALE 8 +#define TDFAIRQ_MAX(gd) ((gd)->gd_fairq_total_pri * TDFAIRQ_SCALE) #define LWKT_THREAD_STACK (UPAGES * PAGE_SIZE) #define CACHE_NTHREADS 6 -#define IN_CRITICAL_SECT(td) ((td)->td_pri >= TDPRI_CRIT) +#define IN_CRITICAL_SECT(td) ((td)->td_critcount) #ifdef _KERNEL @@ -403,7 +410,11 @@ extern lwkt_token_t lwkt_getpooltoken(void *); extern void lwkt_setpri(thread_t, int); extern void lwkt_setpri_initial(thread_t, int); extern void lwkt_setpri_self(int); -extern int lwkt_check_resched(thread_t); +extern void lwkt_fairq_schedulerclock(thread_t td); +extern void lwkt_fairq_setpri_self(int pri); +extern int lwkt_fairq_push(int pri); +extern void lwkt_fairq_pop(int pri); +extern void lwkt_fairq_yield(void); extern void lwkt_setcpu_self(struct globaldata *); extern void lwkt_migratecpu(int); diff --git a/sys/sys/thread2.h b/sys/sys/thread2.h index 0087690c8e..442c50058f 100644 --- a/sys/sys/thread2.h +++ b/sys/sys/thread2.h @@ -121,10 +121,10 @@ _crit_enter(__DEBUG_CRIT_ARG__) struct thread *td = curthread; #ifdef INVARIANTS - if (td->td_pri < 0) + if (td->td_critcount < 0) crit_panic(); #endif - td->td_pri += TDPRI_CRIT; + ++td->td_critcount; __DEBUG_CRIT_ENTER(td); cpu_ccfence(); } @@ -132,7 +132,7 @@ _crit_enter(__DEBUG_CRIT_ARG__) static __inline void _crit_enter_quick(struct thread *curtd __DEBUG_CRIT_ADD_ARG__) { - curtd->td_pri += TDPRI_CRIT; + ++curtd->td_critcount; __DEBUG_CRIT_ENTER(curtd); cpu_ccfence(); } @@ -147,7 +147,7 @@ static __inline void _crit_exit_noyield(struct thread *curtd __DEBUG_CRIT_ADD_ARG__) { __DEBUG_CRIT_EXIT(curtd); - curtd->td_pri -= TDPRI_CRIT; + --curtd->td_critcount; #ifdef INVARIANTS if (curtd->td_pri < 0) crit_panic(); @@ -161,13 +161,13 @@ _crit_exit(__DEBUG_CRIT_ARG__) thread_t td = curthread; __DEBUG_CRIT_EXIT(td); - td->td_pri -= TDPRI_CRIT; + --td->td_critcount; #ifdef INVARIANTS if (td->td_pri < 0) crit_panic(); #endif cpu_ccfence(); /* prevent compiler reordering */ - if (td->td_gd->gd_reqflags && td->td_pri < TDPRI_CRIT) + if (td->td_gd->gd_reqflags && td->td_critcount == 0) splz_check(); } @@ -177,9 +177,9 @@ _crit_exit_quick(struct thread *curtd __DEBUG_CRIT_ADD_ARG__) globaldata_t gd = curtd->td_gd; __DEBUG_CRIT_EXIT(curtd); - curtd->td_pri -= TDPRI_CRIT; + --curtd->td_critcount; cpu_ccfence(); /* prevent compiler reordering */ - if (gd->gd_reqflags && curtd->td_pri < TDPRI_CRIT) + if (gd->gd_reqflags && curtd->td_critcount == 0) splz_check(); } @@ -192,7 +192,7 @@ _crit_exit_gd(globaldata_t mygd __DEBUG_CRIT_ADD_ARG__) static __inline int crit_test(thread_t td) { - return(td->td_pri >= TDPRI_CRIT); + return(td->td_critcount); } /* @@ -202,13 +202,13 @@ crit_test(thread_t td) static __inline int lwkt_runnable(void) { - return (mycpu->gd_runqmask != 0); + return (TAILQ_FIRST(&mycpu->gd_tdrunq) != NULL); } static __inline int lwkt_getpri(thread_t td) { - return(td->td_pri & TDPRI_MASK); + return(td->td_pri); } static __inline int diff --git a/sys/sys/uio.h b/sys/sys/uio.h index f0de1d038a..c14dfebbe8 100644 --- a/sys/sys/uio.h +++ b/sys/sys/uio.h @@ -93,7 +93,6 @@ struct uio { struct vm_object; struct vm_page; -void uio_yield (void); int uiomove (caddr_t, size_t, struct uio *); int uiomovez (size_t, struct uio *); int uiomove_frombuf (void *buf, size_t buflen, struct uio *uio); diff --git a/sys/sys/upcall.h b/sys/sys/upcall.h index 1eb26d44c5..324871fd58 100644 --- a/sys/sys/upcall.h +++ b/sys/sys/upcall.h @@ -59,7 +59,6 @@ struct upcall { #define UPC_CONTROL_POLLANDCLEAR 5 #define UPC_CONTROL_WAIT 6 -#define UPC_CRITADD 
32 /* NOTE! same as TDPRI_CRIT */ #define UPC_RESERVED 32 /* # of reserved id's */ #if defined(_KERNEL) diff --git a/sys/vfs/hammer/hammer_flusher.c b/sys/vfs/hammer/hammer_flusher.c index 082cb85c3f..1c8a433f6d 100644 --- a/sys/vfs/hammer/hammer_flusher.c +++ b/sys/vfs/hammer/hammer_flusher.c @@ -333,7 +333,7 @@ hammer_flusher_flush(hammer_mount_t hmp) if (++hmp->check_yield > hammer_yield_check) { hmp->check_yield = 0; - lwkt_user_yield(); + lwkt_yield(); } /* diff --git a/sys/vfs/ufs/ffs_rawread.c b/sys/vfs/ufs/ffs_rawread.c index 72dfbbb8c9..13550dc1bc 100644 --- a/sys/vfs/ufs/ffs_rawread.c +++ b/sys/vfs/ufs/ffs_rawread.c @@ -52,7 +52,7 @@ #include static int ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t offset, - size_t len, struct buf *bp, int *baseticks); + size_t len, struct buf *bp); static int ffs_rawread_main(struct vnode *vp, struct uio *uio); @@ -142,7 +142,7 @@ done: static int ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t loffset, - size_t len, struct buf *bp, int *baseticks) + size_t len, struct buf *bp) { int error; int iolen; @@ -190,10 +190,7 @@ ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t loffset, if (vmapbuf(bp, udata, len) < 0) return EFAULT; - if (ticks - *baseticks >= hogticks) { - *baseticks = ticks; - uio_yield(); - } + lwkt_user_yield(); bzero(bp->b_data, bp->b_bcount); /* Mark operation completed (similar to bufdone()) */ @@ -230,7 +227,6 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) int error, nerror; struct buf *bp, *nbp, *tbp; int iolen; - int baseticks = ticks; caddr_t udata; int resid; off_t offset; @@ -250,8 +246,8 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) if (bp == NULL) { /* Setup first read */ /* XXX: Leave some bufs for swap */ bp = getpbuf_kva(&ffsrawbufcnt); - error = ffs_rawread_readahead(vp, udata, offset, resid, - bp, &baseticks); + error = ffs_rawread_readahead(vp, udata, offset, + resid, bp); if (error != 0) break; @@ -267,7 +263,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) udata + bp->b_bufsize, offset + bp->b_bufsize, resid - bp->b_bufsize, - nbp, &baseticks); + nbp); if (nerror) { relpbuf(nbp, &ffsrawbufcnt); nbp = NULL; @@ -298,7 +294,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) /* Incomplete read. Try to read remaining part */ error = ffs_rawread_readahead( vp, udata, offset, - bp->b_bufsize - iolen, bp, &baseticks); + bp->b_bufsize - iolen, bp); if (error != 0) break; } else if (nbp != NULL) { /* Complete read with readahead */ @@ -317,7 +313,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) vp, udata + bp->b_bufsize, offset + bp->b_bufsize, resid - bp->b_bufsize, - nbp, &baseticks); + nbp); if (nerror != 0) { relpbuf(nbp, &ffsrawbufcnt); nbp = NULL; @@ -327,8 +323,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) break; } else if (resid > 0) { /* More to read, no readahead */ error = ffs_rawread_readahead(vp, udata, offset, - resid, bp, - &baseticks); + resid, bp); if (error != 0) break; } diff --git a/sys/vm/vm_zeroidle.c b/sys/vm/vm_zeroidle.c index 15559323dc..700b773d0c 100644 --- a/sys/vm/vm_zeroidle.c +++ b/sys/vm/vm_zeroidle.c @@ -208,8 +208,7 @@ vm_pagezero(void __unused *arg) * resched has been requested. */ while (i < PAGE_SIZE) { - if (lwkt_check_resched(curthread)) - break; + lwkt_yield(); if (idlezero_nocache == 1) bzeront(&pg[i], IDLEZERO_RUN); else @@ -227,8 +226,7 @@ vm_pagezero(void __unused *arg) ++idlezero_count; break; } - if (lwkt_check_resched(curthread)) - lwkt_switch(); + lwkt_yield(); } } -- 2.41.0
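The following user-space sketches model the mechanisms this patch touches; none of them are code from the tree, and every *_model / toy_* identifier is invented for illustration.

First, the td_critcount split. With the critical section count separated from td_pri, crit_enter()/crit_exit() reduce to a plain nestable counter, and deferred interrupt work (the splz_check() path gated by gd_reqflags) only runs once the count returns to zero, roughly like this:

#include <stdio.h>

struct model_thread {
	int critcount;		/* models td_critcount */
	int reqflags;		/* models gd_reqflags: deferred work pending */
};

static void
splz_check_model(struct model_thread *td)
{
	if (td->reqflags && td->critcount == 0) {
		printf("running deferred interrupt work\n");
		td->reqflags = 0;
	}
}

static void
crit_enter_model(struct model_thread *td)
{
	++td->critcount;	/* nestable; no priority bits involved */
}

static void
crit_exit_model(struct model_thread *td)
{
	--td->critcount;
	splz_check_model(td);	/* only fires once fully uncounted */
}

int
main(void)
{
	struct model_thread td = { 0, 0 };

	crit_enter_model(&td);
	crit_enter_model(&td);	/* nested section */
	td.reqflags = 1;	/* "interrupt" arrives while protected */
	crit_exit_model(&td);	/* still nested: work stays pending */
	crit_exit_model(&td);	/* count hits 0: deferred work runs now */
	return 0;
}

This is the shape the thread2.h hunks express with ++td->td_critcount / --td->td_critcount and the gd_reqflags check in _crit_exit().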
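Second, the run queue change. gd_tdrunq[32] plus gd_runqmask collapse into a single per-cpu queue, and lwkt_runnable() becomes a simple TAILQ_FIRST() test. Below is a toy model assuming the single queue is kept sorted by hard priority; the ordering policy itself is not visible in these hunks, only the "one TAILQ, empty means nothing runnable" shape is.

#include <sys/queue.h>
#include <stdio.h>

struct toy_thread {
	TAILQ_ENTRY(toy_thread) threadq;	/* models td_threadq */
	const char *comm;
	int pri;				/* models td_pri, 0-31 */
};

TAILQ_HEAD(toy_queue, toy_thread);

/* enqueue so that higher-priority threads stay toward the head */
static void
toy_schedule(struct toy_queue *q, struct toy_thread *td)
{
	struct toy_thread *scan;

	TAILQ_FOREACH(scan, q, threadq) {
		if (scan->pri < td->pri) {
			TAILQ_INSERT_BEFORE(scan, td, threadq);
			return;
		}
	}
	TAILQ_INSERT_TAIL(q, td, threadq);
}

/* models the new lwkt_runnable(): queue non-empty == work to do */
static int
toy_runnable(const struct toy_queue *q)
{
	return (TAILQ_FIRST(q) != NULL);
}

int
main(void)
{
	struct toy_queue runq = TAILQ_HEAD_INITIALIZER(runq);
	struct toy_thread a = { .comm = "crypto0", .pri = 16 };
	struct toy_thread b = { .comm = "hammer-M", .pri = 20 };
	struct toy_thread c = { .comm = "userland", .pri = 6 };
	struct toy_thread *td;

	toy_schedule(&runq, &a);
	toy_schedule(&runq, &b);
	toy_schedule(&runq, &c);

	printf("runnable: %d\n", toy_runnable(&runq));
	TAILQ_FOREACH(td, &runq, threadq)
		printf("  pri %2d  %s\n", td->pri, td->comm);
	return 0;
}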
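Third, the fair-share quantum. TDFAIRQ_MAX(gd) budgets gd_fairq_total_pri * TDFAIRQ_SCALE, i.e. roughly TDFAIRQ_SCALE ticks of service for the whole run queue (8 ticks = 80ms at hz = 100). The per-tick bookkeeping on td_fairq_accum lives in lwkt_thread.c and is not shown in this patch, so the charging rule below (charge gd_fairq_total_pri per hardclock tick, ask for a reschedule once the budget is reached) is only an assumption used to show the arithmetic.

#include <stdio.h>

#define HZ		100	/* ticks per second, as in the comment above */
#define TDFAIRQ_SCALE	8	/* ~8 ticks of service for the run queue */

struct model_gd {		/* stands in for struct globaldata */
	int fairq_total_pri;	/* sum of priorities of runnable threads */
};

struct model_td {		/* stands in for struct thread */
	int pri;		/* hard LWKT priority, 0-31 */
	int fairq_accum;	/* per-tick charge accumulator (model) */
};

/* budget: the whole run queue serviced in about TDFAIRQ_SCALE ticks */
static int
tdfairq_max_model(const struct model_gd *gd)
{
	return (gd->fairq_total_pri * TDFAIRQ_SCALE);
}

/*
 * Called once per hardclock tick for the running thread (loosely models
 * the lwkt_fairq_schedulerclock() hook); returns 1 when a reschedule is
 * wanted.  The charging rule here is assumed, not taken from the kernel.
 */
static int
fairq_tick_model(struct model_gd *gd, struct model_td *td)
{
	td->fairq_accum += gd->fairq_total_pri;
	return (td->fairq_accum >= tdfairq_max_model(gd));
}

int
main(void)
{
	struct model_gd gd = { .fairq_total_pri = 25 }; /* one runnable thread */
	struct model_td td = { .pri = 25, .fairq_accum = 0 };
	int tick;

	for (tick = 1; tick <= 16; ++tick) {
		if (fairq_tick_model(&gd, &td)) {
			printf("tick %d (~%d ms at hz=%d): quantum exhausted\n",
			       tick, tick * 1000 / HZ, HZ);
			td.fairq_accum = 0;	/* model: fresh quantum */
		}
	}
	return 0;
}

With a single runnable thread this trips every 8 ticks, i.e. every ~80ms, matching the TDFAIRQ_SCALE comment; a cpu-bound kernel thread therefore loses the cpu periodically instead of never.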
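Finally, the caller-side pattern. The ffs_rawread.c and vm_zeroidle.c hunks drop the ticks/hogticks and lwkt_check_resched() bookkeeping and simply call lwkt_yield()/lwkt_user_yield() on every pass, trusting them to be cheap until the quantum is gone. A stand-alone sketch of that calling pattern, with a stub standing in for the real yield and an invented threshold:

#include <stdio.h>

static int quantum_used;	/* models the fair-share accumulator */

/* stub for lwkt_yield(): cheap early-out in the common case */
static void
lwkt_yield_model(void)
{
	if (++quantum_used < 4)
		return;
	printf("quantum exhausted -> would switch to another thread\n");
	quantum_used = 0;	/* model: fresh quantum after the switch */
}

static void
zero_chunk(int chunk)
{
	printf("zeroing chunk %d\n", chunk);
}

int
main(void)
{
	int chunk;

	/*
	 * Old style needed "did enough ticks pass?" tests around
	 * uio_yield(); new style just calls the yield helper every
	 * iteration of the cpu-bound loop.
	 */
	for (chunk = 0; chunk < 10; ++chunk) {
		lwkt_yield_model();
		zero_chunk(chunk);
	}
	return 0;
}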