X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/blobdiff_plain/6f83ced90ea75526068ff4fbb95caabc32a7a3d8..03ac22da0ce225ca12694a9b80003ea708a64320:/sys/kern/lwkt_thread.c diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index 959549cc89..82014c3d1e 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved. + * Copyright (c) 2003-2010 The DragonFly Project. All rights reserved. * * This code is derived from software contributed to The DragonFly Project * by Matthew Dillon @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,9 @@ #include #include +#include + +#include #include #include @@ -72,14 +76,16 @@ #define KTR_CTXSW KTR_ALL #endif KTR_INFO_MASTER(ctxsw); -KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "sw %p > %p", 2 * sizeof(struct thread *)); -KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "pre %p > %p", 2 * sizeof(struct thread *)); +KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p", + sizeof(int) + sizeof(struct thread *)); +KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p", + sizeof(int) + sizeof(struct thread *)); +KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s", + sizeof (struct thread *) + sizeof(char *)); +KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = ", sizeof (struct thread *)); static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads"); -#ifdef SMP -static int mplock_countx = 0; -#endif #ifdef INVARIANTS static int panic_on_cscount = 0; #endif @@ -87,49 +93,20 @@ static __int64_t switch_count = 0; static __int64_t preempt_hit = 0; static __int64_t preempt_miss = 0; static __int64_t preempt_weird = 0; -static __int64_t token_contention_count = 0; -static __int64_t mplock_contention_count = 0; +static __int64_t token_contention_count __debugvar = 0; static int lwkt_use_spin_port; -#ifdef SMP -static int chain_mplock = 0; -static int bgl_yield = 10; -#endif static struct objcache *thread_cache; -volatile cpumask_t mp_lock_contention_mask; - #ifdef SMP static void lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame); #endif +static void lwkt_fairq_accumulate(globaldata_t gd, thread_t td); extern void cpu_heavy_restore(void); extern void cpu_lwkt_restore(void); extern void cpu_kthread_restore(void); extern void cpu_idle_restore(void); -#ifdef __x86_64__ - -static int -jg_tos_ok(struct thread *td) -{ - void *tos; - int tos_ok; - - if (td == NULL) { - return 1; - } - KKASSERT(td->td_sp != NULL); - tos = ((void **)td->td_sp)[0]; - tos_ok = 0; - if ((tos == cpu_heavy_restore) || (tos == cpu_lwkt_restore) || - (tos == cpu_kthread_restore) || (tos == cpu_idle_restore)) { - tos_ok = 1; - } - return tos_ok; -} - -#endif - /* * We can make all thread ports use the spin backend instead of the thread * backend. This should only be set to debug the spin backend. 
@@ -137,35 +114,42 @@ jg_tos_ok(struct thread *td) TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port); #ifdef INVARIANTS -SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, ""); +SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, + "Panic if attempting to switch lwkt's while mastering cpusync"); #endif -#ifdef SMP -SYSCTL_INT(_lwkt, OID_AUTO, chain_mplock, CTLFLAG_RW, &chain_mplock, 0, ""); -SYSCTL_INT(_lwkt, OID_AUTO, bgl_yield_delay, CTLFLAG_RW, &bgl_yield, 0, ""); -#endif -SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, ""); -SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, ""); -SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, ""); -SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, ""); +SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, + "Number of switched threads"); +SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, + "Successful preemption events"); +SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, + "Failed preemption events"); +SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, + "Number of preempted threads."); #ifdef INVARIANTS SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW, &token_contention_count, 0, "spinning due to token contention"); -SYSCTL_QUAD(_lwkt, OID_AUTO, mplock_contention_count, CTLFLAG_RW, - &mplock_contention_count, 0, "spinning due to MPLOCK contention"); #endif - -/* - * Kernel Trace - */ -#if !defined(KTR_GIANT_CONTENTION) -#define KTR_GIANT_CONTENTION KTR_ALL -#endif - -KTR_INFO_MASTER(giant); -KTR_INFO(KTR_GIANT_CONTENTION, giant, beg, 0, "thread=%p", sizeof(void *)); -KTR_INFO(KTR_GIANT_CONTENTION, giant, end, 1, "thread=%p", sizeof(void *)); - -#define loggiant(name) KTR_LOG(giant_ ## name, curthread) +static int fairq_enable = 1; +SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW, + &fairq_enable, 0, "Turn on fairq priority accumulators"); +static int lwkt_spin_loops = 10; +SYSCTL_INT(_lwkt, OID_AUTO, spin_loops, CTLFLAG_RW, + &lwkt_spin_loops, 0, ""); +static int lwkt_spin_delay = 1; +SYSCTL_INT(_lwkt, OID_AUTO, spin_delay, CTLFLAG_RW, + &lwkt_spin_delay, 0, "Scheduler spin delay in microseconds 0=auto"); +static int lwkt_spin_method = 1; +SYSCTL_INT(_lwkt, OID_AUTO, spin_method, CTLFLAG_RW, + &lwkt_spin_method, 0, "LWKT scheduler behavior when contended"); +static int lwkt_spin_fatal = 0; /* disabled */ +SYSCTL_INT(_lwkt, OID_AUTO, spin_fatal, CTLFLAG_RW, + &lwkt_spin_fatal, 0, "LWKT scheduler spin loops till fatal panic"); +static int preempt_enable = 1; +SYSCTL_INT(_lwkt, OID_AUTO, preempt_enable, CTLFLAG_RW, + &preempt_enable, 0, "Enable preemption"); + +static __cachealign int lwkt_cseq_rindex; +static __cachealign int lwkt_cseq_windex; /* * These helper procedures handle the runq, they can only be called from @@ -181,26 +165,45 @@ void _lwkt_dequeue(thread_t td) { if (td->td_flags & TDF_RUNQ) { - int nq = td->td_pri & TDPRI_MASK; struct globaldata *gd = td->td_gd; td->td_flags &= ~TDF_RUNQ; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq); - /* runqmask is passively cleaned up by the switcher */ + TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq); + gd->gd_fairq_total_pri -= td->td_pri; + if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL) + atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING); } } +/* + * Priority enqueue. 
+ * + * NOTE: There are a limited number of lwkt threads runnable since user + * processes only schedule one at a time per cpu. + */ static __inline void _lwkt_enqueue(thread_t td) { + thread_t xtd; + if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_BLOCKQ)) == 0) { - int nq = td->td_pri & TDPRI_MASK; struct globaldata *gd = td->td_gd; td->td_flags |= TDF_RUNQ; - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq); - gd->gd_runqmask |= 1 << nq; + xtd = TAILQ_FIRST(&gd->gd_tdrunq); + if (xtd == NULL) { + TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq); + atomic_set_int(&gd->gd_reqflags, RQF_RUNNING); + } else { + while (xtd && xtd->td_pri > td->td_pri) + xtd = TAILQ_NEXT(xtd, td_threadq); + if (xtd) + TAILQ_INSERT_BEFORE(xtd, td, td_threadq); + else + TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq); + } + gd->gd_fairq_total_pri += td->td_pri; } } @@ -250,8 +253,10 @@ lwkt_init(void) void lwkt_schedule_self(thread_t td) { + KKASSERT((td->td_flags & TDF_MIGRATING) == 0); crit_enter_quick(td); - KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!")); + KASSERT(td != &td->td_gd->gd_idlethread, + ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!")); KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); _lwkt_enqueue(td); crit_exit_quick(td); @@ -278,11 +283,7 @@ lwkt_deschedule_self(thread_t td) void lwkt_gdinit(struct globaldata *gd) { - int i; - - for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i) - TAILQ_INIT(&gd->gd_tdrunq[i]); - gd->gd_runqmask = 0; + TAILQ_INIT(&gd->gd_tdrunq); TAILQ_INIT(&gd->gd_tdallq); } @@ -306,10 +307,17 @@ lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags) * thread intact through the exit. */ if (td == NULL) { - if ((td = gd->gd_freetd) != NULL) + crit_enter_gd(gd); + if ((td = gd->gd_freetd) != NULL) { + KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK| + TDF_RUNQ)) == 0); gd->gd_freetd = NULL; - else + } else { td = objcache_get(thread_cache, M_WAITOK); + KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK| + TDF_RUNQ)) == 0); + } + crit_exit_gd(gd); KASSERT((td->td_flags & (TDF_ALLOCATED_THREAD|TDF_RUNNING)) == TDF_ALLOCATED_THREAD, ("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags)); @@ -326,7 +334,7 @@ lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags) } } if (stack == NULL) { - stack = (void *)kmem_alloc(&kernel_map, stksize); + stack = (void *)kmem_alloc_stack(&kernel_map, stksize); flags |= TDF_ALLOCATED_STACK; } if (cpu < 0) @@ -365,6 +373,11 @@ lwkt_init_thread_remote(void *arg) #endif +/* + * lwkt core thread structural initialization. + * + * NOTE: All threads are initialized as mpsafe threads. + */ void lwkt_init_thread(thread_t td, void *stack, int stksize, int flags, struct globaldata *gd) @@ -376,11 +389,9 @@ lwkt_init_thread(thread_t td, void *stack, int stksize, int flags, td->td_kstack_size = stksize; td->td_flags = flags; td->td_gd = gd; - td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT; -#ifdef SMP - if ((flags & TDF_MPSAFE) == 0) - td->td_mpcount = 1; -#endif + td->td_pri = TDPRI_KERN_DAEMON; + td->td_critcount = 1; + td->td_toks_stop = &td->td_toks_base; if (lwkt_use_spin_port) lwkt_initport_spin(&td->td_msgport); else @@ -405,6 +416,8 @@ lwkt_init_thread(thread_t td, void *stack, int stksize, int flags, TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq); crit_exit_gd(mygd); #endif + + dsched_new_thread(td); } void @@ -415,19 +428,20 @@ lwkt_set_comm(thread_t td, const char *ctl, ...) 
__va_start(va, ctl); kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va); __va_end(va); + KTR_LOG(ctxsw_newtd, td, &td->td_comm[0]); } void lwkt_hold(thread_t td) { - ++td->td_refs; + atomic_add_int(&td->td_refs, 1); } void lwkt_rele(thread_t td) { KKASSERT(td->td_refs > 0); - --td->td_refs; + atomic_add_int(&td->td_refs, -1); } void @@ -440,9 +454,8 @@ lwkt_wait_free(thread_t td) void lwkt_free_thread(thread_t td) { - KASSERT((td->td_flags & TDF_RUNNING) == 0, - ("lwkt_free_thread: did not exit! %p", td)); - + KKASSERT(td->td_refs == 0); + KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK|TDF_RUNQ)) == 0); if (td->td_flags & TDF_ALLOCATED_THREAD) { objcache_put(thread_cache, td); } else if (td->td_flags & TDF_ALLOCATED_STACK) { @@ -453,6 +466,7 @@ lwkt_free_thread(thread_t td) td->td_kstack = NULL; td->td_kstack_size = 0; } + KTR_LOG(ctxsw_deadtd, td); } @@ -471,16 +485,18 @@ lwkt_free_thread(thread_t td) * different beast and LWKT priorities should not be confused with * user process priorities. * - * The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch() - * cleans it up. Note that the td_switch() function cannot do anything that - * requires the MP lock since the MP lock will have already been setup for - * the target thread (not the current thread). It's nice to have a scheduler - * that does not need the MP lock to work because it allows us to do some - * really cool high-performance MP lock optimizations. - * * PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch() * is not called by the current thread in the preemption case, only when * the preempting thread blocks (in order to return to the original thread). + * + * SPECIAL NOTE ON SWITCH ATOMICY: Certain operations such as thread + * migration and tsleep deschedule the current lwkt thread and call + * lwkt_switch(). In particular, the target cpu of the migration fully + * expects the thread to become non-runnable and can deadlock against + * cpusync operations if we run any IPIs prior to switching the thread out. + * + * WE MUST BE VERY CAREFUL NOT TO RUN SPLZ DIRECTLY OR INDIRECTLY IF + * THE CURRENET THREAD HAS BEEN DESCHEDULED! 
*/ void lwkt_switch(void) @@ -488,9 +504,12 @@ lwkt_switch(void) globaldata_t gd = mycpu; thread_t td = gd->gd_curthread; thread_t ntd; -#ifdef SMP - int mpheld; -#endif + thread_t xtd; + int spinning = lwkt_spin_loops; /* loops before HLTing */ + int reqflags; + int cseq; + int oseq; + int fatal_count; /* * Switching from within a 'fast' (non thread switched) interrupt or IPI @@ -503,9 +522,11 @@ lwkt_switch(void) int savegdnest; int savegdtrap; - if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) { - panic("lwkt_switch: cannot switch from within " - "a fast interrupt, yet, td %p\n", td); + if (gd->gd_trap_nesting_level == 0 && panic_cpu_gd != mycpu) { + panic("lwkt_switch: Attempt to switch from a " + "a fast interrupt, ipi, or hard code section, " + "td %p\n", + td); } else { savegdnest = gd->gd_intr_nesting_level; savegdtrap = gd->gd_trap_nesting_level; @@ -513,9 +534,10 @@ lwkt_switch(void) gd->gd_trap_nesting_level = 0; if ((td->td_flags & TDF_PANICWARN) == 0) { td->td_flags |= TDF_PANICWARN; - kprintf("Warning: thread switch from interrupt or IPI, " + kprintf("Warning: thread switch from interrupt, IPI, " + "or hard code section.\n" "thread %p (%s)\n", td, td->td_comm); - print_backtrace(); + print_backtrace(-1); } lwkt_switch(); gd->gd_intr_nesting_level = savegdnest; @@ -537,31 +559,19 @@ lwkt_switch(void) td->td_release(td); crit_enter_gd(gd); - if (td->td_toks) + if (TD_TOKS_HELD(td)) lwkt_relalltokens(td); /* * We had better not be holding any spin locks, but don't get into an * endless panic loop. */ - KASSERT(gd->gd_spinlock_rd == NULL || panicstr != NULL, - ("lwkt_switch: still holding a shared spinlock %p!", - gd->gd_spinlock_rd)); KASSERT(gd->gd_spinlocks_wr == 0 || panicstr != NULL, ("lwkt_switch: still holding %d exclusive spinlocks!", gd->gd_spinlocks_wr)); #ifdef SMP - /* - * td_mpcount cannot be used to determine if we currently hold the - * MP lock because get_mplock() will increment it prior to attempting - * to get the lock, and switch out if it can't. Our ownership of - * the actual lock will remain stable while we are in a critical section - * (but, of course, another cpu may own or release the lock so the - * actual value of mp_lock is not stable). - */ - mpheld = MP_LOCK_HELD(); #ifdef INVARIANTS if (td->td_cscount) { kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n", @@ -571,28 +581,19 @@ lwkt_switch(void) } #endif #endif + + /* + * If we had preempted another thread on this cpu, resume the preempted + * thread. This occurs transparently, whether the preempted thread + * was scheduled or not (it may have been preempted after descheduling + * itself). + * + * We have to setup the MP lock for the original thread after backing + * out the adjustment that was made to curthread when the original + * was preempted. + */ if ((ntd = td->td_preempted) != NULL) { - /* - * We had preempted another thread on this cpu, resume the preempted - * thread. This occurs transparently, whether the preempted thread - * was scheduled or not (it may have been preempted after descheduling - * itself). - * - * We have to setup the MP lock for the original thread after backing - * out the adjustment that was made to curthread when the original - * was preempted. 
- */ KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK); -#ifdef SMP - if (ntd->td_mpcount && mpheld == 0) { - panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d", - td, ntd, td->td_mpcount, ntd->td_mpcount); - } - if (ntd->td_mpcount) { - td->td_mpcount -= ntd->td_mpcount; - KKASSERT(td->td_mpcount >= 0); - } -#endif ntd->td_flags |= TDF_PREEMPT_DONE; /* @@ -600,190 +601,334 @@ lwkt_switch(void) * set the reschedule flag if the originally interrupted thread is * at a lower priority. */ - if (gd->gd_runqmask > (2 << (ntd->td_pri & TDPRI_MASK)) - 1) + if (TAILQ_FIRST(&gd->gd_tdrunq) && + TAILQ_FIRST(&gd->gd_tdrunq)->td_pri > ntd->td_pri) { need_lwkt_resched(); + } /* YYY release mp lock on switchback if original doesn't need it */ - } else { + goto havethread_preempted; + } + + /* + * Implement round-robin fairq with priority insertion. The priority + * insertion is handled by _lwkt_enqueue() + * + * If we cannot obtain ownership of the tokens we cannot immediately + * schedule the target thread. + * + * Reminder: Again, we cannot afford to run any IPIs in this path if + * the current thread has been descheduled. + */ + for (;;) { /* - * Priority queue / round-robin at each priority. Note that user - * processes run at a fixed, low priority and the user process - * scheduler deals with interactions between user processes - * by scheduling and descheduling them from the LWKT queue as - * necessary. - * - * We have to adjust the MP lock for the target thread. If we - * need the MP lock and cannot obtain it we try to locate a - * thread that does not need the MP lock. If we cannot, we spin - * instead of HLT. - * - * A similar issue exists for the tokens held by the target thread. - * If we cannot obtain ownership of the tokens we cannot immediately - * schedule the thread. + * Clear RQF_AST_LWKT_RESCHED (we handle the reschedule request) + * and set RQF_WAKEUP (prevent unnecessary IPIs from being + * received). */ + for (;;) { + reqflags = gd->gd_reqflags; + if (atomic_cmpset_int(&gd->gd_reqflags, reqflags, + (reqflags & ~RQF_AST_LWKT_RESCHED) | + RQF_WAKEUP)) { + break; + } + } /* - * If an LWKT reschedule was requested, well that is what we are - * doing now so clear it. + * Hotpath - pull the head of the run queue and attempt to schedule + * it. Fairq exhaustion moves the task to the end of the list. If + * no threads are runnable we switch to the idle thread. */ - clear_lwkt_resched(); -again: - if (gd->gd_runqmask) { - int nq = bsrl(gd->gd_runqmask); - if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) { - gd->gd_runqmask &= ~(1 << nq); - goto again; - } + for (;;) { + ntd = TAILQ_FIRST(&gd->gd_tdrunq); + + if (ntd == NULL) { + /* + * Runq is empty, switch to idle and clear RQF_WAKEUP + * to allow it to halt. + */ + ntd = &gd->gd_idlethread; #ifdef SMP - /* - * THREAD SELECTION FOR AN SMP MACHINE BUILD - * - * If the target needs the MP lock and we couldn't get it, - * or if the target is holding tokens and we could not - * gain ownership of the tokens, continue looking for a - * thread to schedule and spin instead of HLT if we can't. - * - * NOTE: the mpheld variable invalid after this conditional, it - * can change due to both cpu_try_mplock() returning success - * AND interactions in lwkt_getalltokens() due to the fact that - * we are trying to check the mpcount of a thread other then - * the current thread. 
Because of this, if the current thread - * is not holding td_mpcount, an IPI indirectly run via - * lwkt_getalltokens() can obtain and release the MP lock and - * cause the core MP lock to be released. - */ - if ((ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) || - (ntd->td_toks && lwkt_getalltokens(ntd) == 0) - ) { - u_int32_t rqmask = gd->gd_runqmask; - - mpheld = MP_LOCK_HELD(); - ntd = NULL; - while (rqmask) { - TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) { - if (ntd->td_mpcount && !mpheld && !cpu_try_mplock()) { - /* spinning due to MP lock being held */ -#ifdef INVARIANTS - ++mplock_contention_count; + if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) + ASSERT_NO_TOKENS_HELD(ntd); #endif - /* mplock still not held, 'mpheld' still valid */ - continue; - } - - /* - * mpheld state invalid after getalltokens call returns - * failure, but the variable is only needed for - * the loop. - */ - if (ntd->td_toks && !lwkt_getalltokens(ntd)) { - /* spinning due to token contention */ -#ifdef INVARIANTS - ++token_contention_count; -#endif - mpheld = MP_LOCK_HELD(); - continue; - } - break; - } - if (ntd) - break; - rqmask &= ~(1 << nq); - nq = bsrl(rqmask); - - /* - * We have two choices. We can either refuse to run a - * user thread when a kernel thread needs the MP lock - * but could not get it, or we can allow it to run but - * then expect an IPI (hopefully) later on to force a - * reschedule when the MP lock might become available. - */ - if (nq < TDPRI_KERN_LPSCHED) { - if (chain_mplock == 0) - break; - atomic_set_int(&mp_lock_contention_mask, - gd->gd_cpumask); - /* continue loop, allow user threads to be scheduled */ - } - } - if (ntd == NULL) { - cpu_mplock_contested(); - ntd = &gd->gd_idlethread; - ntd->td_flags |= TDF_IDLE_NOHLT; - goto using_idle_thread; - } else { - ++gd->gd_cnt.v_swtch; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); - } - } else { - if (ntd->td_mpcount) - ++mplock_countx; - ++gd->gd_cnt.v_swtch; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); + cpu_time.cp_msg[0] = 0; + cpu_time.cp_stallpc = 0; + atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP); + goto haveidle; } -#else + + if (ntd->td_fairq_accum >= 0) + break; + + /*splz_check(); cannot do this here, see above */ + lwkt_fairq_accumulate(gd, ntd); + TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq); + TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq); + } + + /* + * Hotpath - schedule ntd. Leaves RQF_WAKEUP set to prevent + * unwanted decontention IPIs. + * + * NOTE: For UP there is no mplock and lwkt_getalltokens() + * always succeeds. + */ + if (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd)) + goto havethread; + + /* + * Coldpath (SMP only since tokens always succeed on UP) + * + * We had some contention on the thread we wanted to schedule. + * What we do now is try to find a thread that we can schedule + * in its stead until decontention reschedules on our cpu. + * + * The coldpath scan does NOT rearrange threads in the run list + * and it also ignores the accumulator. + * + * We do not immediately schedule a user priority thread, instead + * we record it in xtd and continue looking for kernel threads. + * A cpu can only have one user priority thread (normally) so just + * record the first one. + * + * NOTE: This scan will also include threads whos fairq's were + * accumulated in the first loop. 
+ */ + ++token_contention_count; + xtd = NULL; + while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) { /* - * THREAD SELECTION FOR A UP MACHINE BUILD. We don't have to - * worry about tokens or the BGL. However, we still have - * to call lwkt_getalltokens() in order to properly detect - * stale tokens. This call cannot fail for a UP build! + * Try to switch to this thread. If the thread is running at + * user priority we clear WAKEUP to allow decontention IPIs + * (since this thread is simply running until the one we wanted + * decontends), and we make sure that LWKT_RESCHED is not set. + * + * Otherwise for kernel threads we leave WAKEUP set to avoid + * unnecessary decontention IPIs. */ - lwkt_getalltokens(ntd); - ++gd->gd_cnt.v_swtch; - TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq); - TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq); -#endif - } else { + if (ntd->td_pri < TDPRI_KERN_LPSCHED) { + if (xtd == NULL) + xtd = ntd; + continue; + } + /* - * We have nothing to run but only let the idle loop halt - * the cpu if there are no pending interrupts. + * Do not let the fairq get too negative. Even though we are + * ignoring it atm once the scheduler decontends a very negative + * thread will get moved to the end of the queue. */ - ntd = &gd->gd_idlethread; - if (gd->gd_reqflags & RQF_IDLECHECK_MASK) - ntd->td_flags |= TDF_IDLE_NOHLT; -#ifdef SMP -using_idle_thread: + if (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd)) { + if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd)) + ntd->td_fairq_accum = -TDFAIRQ_MAX(gd); + goto havethread; + } + /* - * The idle thread should not be holding the MP lock unless we - * are trapping in the kernel or in a panic. Since we select the - * idle thread unconditionally when no other thread is available, - * if the MP lock is desired during a panic or kernel trap, we - * have to loop in the scheduler until we get it. + * Well fubar, this thread is contended as well, loop */ - if (ntd->td_mpcount) { - mpheld = MP_LOCK_HELD(); - if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) { - panic("Idle thread %p was holding the BGL!", ntd); - } else if (mpheld == 0) { - cpu_mplock_contested(); - goto again; + /* */ + } + + /* + * We exhausted the run list but we may have recorded a user + * thread to try. We have three choices based on + * lwkt.decontention_method. + * + * (0) Atomically clear RQF_WAKEUP in order to receive decontention + * IPIs (to interrupt the user process) and test + * RQF_AST_LWKT_RESCHED at the same time. + * + * This results in significant decontention IPI traffic but may + * be more responsive. + * + * (1) Leave RQF_WAKEUP set so we do not receive a decontention IPI. + * An automatic LWKT reschedule will occur on the next hardclock + * (typically 100hz). + * + * This results in no decontention IPI traffic but may be less + * responsive. This is the default. + * + * (2) Refuse to schedule the user process at this time. + * + * This is highly experimental and should not be used under + * normal circumstances. This can cause a user process to + * get starved out in situations where kernel threads are + * fighting each other for tokens. 
+ */ + if (xtd) { + ntd = xtd; + + switch(lwkt_spin_method) { + case 0: + for (;;) { + reqflags = gd->gd_reqflags; + if (atomic_cmpset_int(&gd->gd_reqflags, + reqflags, + reqflags & ~RQF_WAKEUP)) { + break; + } } + break; + case 1: + reqflags = gd->gd_reqflags; + break; + default: + goto skip; + break; + } + if ((reqflags & RQF_AST_LWKT_RESCHED) == 0 && + (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd)) + ) { + if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd)) + ntd->td_fairq_accum = -TDFAIRQ_MAX(gd); + goto havethread; } + +skip: + /* + * Make sure RQF_WAKEUP is set if we failed to schedule the + * user thread to prevent the idle thread from halting. + */ + atomic_set_int(&gd->gd_reqflags, RQF_WAKEUP); + } + + /* + * We exhausted the run list, meaning that all runnable threads + * are contended. + */ + cpu_pause(); + ntd = &gd->gd_idlethread; +#ifdef SMP + if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) + ASSERT_NO_TOKENS_HELD(ntd); + /* contention case, do not clear contention mask */ +#endif + + /* + * Ok, we might want to spin a few times as some tokens are held for + * very short periods of time and IPI overhead is 1uS or worse + * (meaning it is usually better to spin). Regardless we have to + * call splz_check() to be sure to service any interrupts blocked + * by our critical section, otherwise we could livelock e.g. IPIs. + * + * The IPI mechanic is really a last resort. In nearly all other + * cases RQF_WAKEUP is left set to prevent decontention IPIs. + * + * When we decide not to spin we clear RQF_WAKEUP and switch to + * the idle thread. Clearing RQF_WEAKEUP allows the idle thread + * to halt and decontended tokens will issue an IPI to us. The + * idle thread will check for pending reschedules already set + * (RQF_AST_LWKT_RESCHED) before actually halting so we don't have + * to here. + * + * Also, if TDF_RUNQ is not set the current thread is trying to + * deschedule, possibly in an atomic fashion. We cannot afford to + * stay here. + */ + if (spinning <= 0 || (td->td_flags & TDF_RUNQ) == 0) { + atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP); + goto haveidle; + } + --spinning; + + /* + * When spinning a delay is required both to avoid livelocks from + * token order reversals (a thread may be trying to acquire multiple + * tokens), and also to reduce cpu cache management traffic. + * + * In order to scale to a large number of CPUs we use a time slot + * resequencer to force contending cpus into non-contending + * time-slots. The scheduler may still contend with the lock holder + * but will not (generally) contend with all the other cpus trying + * trying to get the same token. + * + * The resequencer uses a FIFO counter mechanic. The owner of the + * rindex at the head of the FIFO is allowed to pull itself off + * the FIFO and fetchadd is used to enter into the FIFO. This bit + * of code is VERY cache friendly and forces all spinning schedulers + * into their own time slots. + * + * This code has been tested to 48-cpus and caps the cache + * contention load at ~1uS intervals regardless of the number of + * cpus. Scaling beyond 64 cpus might require additional smarts + * (such as separate FIFOs for specific token cases). + * + * WARNING! We can't call splz_check() or anything else here as + * it could cause a deadlock. 
+ */ +#if defined(INVARIANTS) && defined(__amd64__) + if ((read_rflags() & PSL_I) == 0) { + cpu_enable_intr(); + panic("lwkt_switch() called with interrupts disabled"); + } +#endif + cseq = atomic_fetchadd_int(&lwkt_cseq_windex, 1); + fatal_count = lwkt_spin_fatal; + while ((oseq = lwkt_cseq_rindex) != cseq) { + cpu_ccfence(); +#if !defined(_KERNEL_VIRTUAL) + if (cpu_mi_feature & CPU_MI_MONITOR) { + cpu_mmw_pause_int(&lwkt_cseq_rindex, oseq); + } else #endif + { + DELAY(1); + cpu_lfence(); + } + if (fatal_count && --fatal_count == 0) + panic("lwkt_switch: fatal spin wait"); } + cseq = lwkt_spin_delay; /* don't trust the system operator */ + cpu_ccfence(); + if (cseq < 1) + cseq = 1; + if (cseq > 1000) + cseq = 1000; + DELAY(cseq); + atomic_add_int(&lwkt_cseq_rindex, 1); + splz_check(); /* ok, we already checked that td is still scheduled */ + /* highest level for(;;) loop */ } - KASSERT(ntd->td_pri >= TDPRI_CRIT, - ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri)); +havethread: /* - * Do the actual switch. If the new target does not need the MP lock - * and we are holding it, release the MP lock. If the new target requires - * the MP lock we have already acquired it for the target. + * We must always decrement td_fairq_accum on non-idle threads just + * in case a thread never gets a tick due to being in a continuous + * critical section. The page-zeroing code does this, for example. + * + * If the thread we came up with is a higher or equal priority verses + * the thread at the head of the queue we move our thread to the + * front. This way we can always check the front of the queue. + * + * Clear gd_idle_repeat when doing a normal switch to a non-idle + * thread. */ -#ifdef SMP - if (ntd->td_mpcount == 0 ) { - if (MP_LOCK_HELD()) - cpu_rel_mplock(); - } else { - ASSERT_MP_LOCK_HELD(ntd); + ++gd->gd_cnt.v_swtch; + --ntd->td_fairq_accum; + ntd->td_wmesg = NULL; + xtd = TAILQ_FIRST(&gd->gd_tdrunq); + if (ntd != xtd && ntd->td_pri >= xtd->td_pri) { + TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq); + TAILQ_INSERT_HEAD(&gd->gd_tdrunq, ntd, td_threadq); } -#endif + gd->gd_idle_repeat = 0; + +havethread_preempted: + /* + * If the new target does not need the MP lock and we are holding it, + * release the MP lock. If the new target requires the MP lock we have + * already acquired it for the target. + */ + ; +haveidle: + KASSERT(ntd->td_critcount, + ("priority problem in lwkt_switch %d %d", + td->td_critcount, ntd->td_critcount)); + if (td != ntd) { ++switch_count; -#ifdef __x86_64__ - KKASSERT(jg_tos_ok(ntd)); -#endif - KTR_LOG(ctxsw_sw, td, ntd); + KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd); td->td_switch(ntd); } /* NOTE: current cpu may have changed after switch */ @@ -804,7 +949,7 @@ using_idle_thread: * * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically * this is called via lwkt_schedule() through the td_preemptable callback. - * critpri is the managed critical priority that we should ignore in order + * critcount is the managed critical priority that we should ignore in order * to determine whether preemption is possible (aka usually just the crit * priority of lwkt_schedule() itself). * @@ -815,29 +960,18 @@ using_idle_thread: * preempted source thread will be resumed the instant the target blocks * whether or not the source is scheduled (i.e. preemption is supposed to * be as transparent as possible). 
- * - * The target thread inherits our MP count (added to its own) for the - * duration of the preemption in order to preserve the atomicy of the - * MP lock during the preemption. Therefore, any preempting targets must be - * careful in regards to MP assertions. Note that the MP count may be - * out of sync with the physical mp_lock, but we do not have to preserve - * the original ownership of the lock if it was out of synch (that is, we - * can leave it synchronized on return). */ void -lwkt_preempt(thread_t ntd, int critpri) +lwkt_preempt(thread_t ntd, int critcount) { struct globaldata *gd = mycpu; thread_t td; -#ifdef SMP - int mpheld; - int savecnt; -#endif + int save_gd_intr_nesting_level; /* * The caller has put us in a critical section. We can only preempt * if the caller of the caller was not in a critical section (basically - * a local interrupt), as determined by the 'critpri' parameter. We + * a local interrupt), as determined by the 'critcount' parameter. We * also can't preempt if the caller is holding any spinlocks (even if * he isn't in a critical section). This also handles the tokens test. * @@ -846,14 +980,19 @@ lwkt_preempt(thread_t ntd, int critpri) * * Set need_lwkt_resched() unconditionally for now YYY. */ - KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri)); + KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri)); + + if (preempt_enable == 0) { + ++preempt_miss; + return; + } td = gd->gd_curthread; - if ((ntd->td_pri & TDPRI_MASK) <= (td->td_pri & TDPRI_MASK)) { + if (ntd->td_pri <= td->td_pri) { ++preempt_miss; return; } - if ((td->td_pri & ~TDPRI_MASK) > critpri) { + if (td->td_critcount > critcount) { ++preempt_miss; need_lwkt_resched(); return; @@ -866,24 +1005,16 @@ lwkt_preempt(thread_t ntd, int critpri) } #endif /* - * Take the easy way out and do not preempt if we are holding - * any spinlocks. We could test whether the thread(s) being - * preempted interlock against the target thread's tokens and whether - * we can get all the target thread's tokens, but this situation - * should not occur very often so its easier to simply not preempt. - * Also, plain spinlocks are impossible to figure out at this point so - * just don't preempt. + * We don't have to check spinlocks here as they will also bump + * td_critcount. * * Do not try to preempt if the target thread is holding any tokens. * We could try to acquire the tokens but this case is so rare there * is no need to support it. */ - if (gd->gd_spinlock_rd || gd->gd_spinlocks_wr) { - ++preempt_miss; - need_lwkt_resched(); - return; - } - if (ntd->td_toks) { + KKASSERT(gd->gd_spinlocks_wr == 0); + + if (TD_TOKS_HELD(ntd)) { ++preempt_miss; need_lwkt_resched(); return; @@ -898,57 +1029,33 @@ lwkt_preempt(thread_t ntd, int critpri) need_lwkt_resched(); return; } -#ifdef SMP - /* - * note: an interrupt might have occured just as we were transitioning - * to or from the MP lock. In this case td_mpcount will be pre-disposed - * (non-zero) but not actually synchronized with the actual state of the - * lock. We can use it to imply an MP lock requirement for the - * preemption but we cannot use it to test whether we hold the MP lock - * or not. - */ - savecnt = td->td_mpcount; - mpheld = MP_LOCK_HELD(); - ntd->td_mpcount += td->td_mpcount; - if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) { - ntd->td_mpcount -= td->td_mpcount; - ++preempt_miss; - need_lwkt_resched(); - return; - } -#endif /* * Since we are able to preempt the current thread, there is no need to * call need_lwkt_resched(). 
+ * + * We must temporarily clear gd_intr_nesting_level around the switch + * since switchouts from the target thread are allowed (they will just + * return to our thread), and since the target thread has its own stack. */ ++preempt_hit; ntd->td_preempted = td; td->td_flags |= TDF_PREEMPT_LOCK; - KTR_LOG(ctxsw_pre, td, ntd); + KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd); + save_gd_intr_nesting_level = gd->gd_intr_nesting_level; + gd->gd_intr_nesting_level = 0; td->td_switch(ntd); + gd->gd_intr_nesting_level = save_gd_intr_nesting_level; KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE)); -#ifdef SMP - KKASSERT(savecnt == td->td_mpcount); - mpheld = MP_LOCK_HELD(); - if (mpheld && td->td_mpcount == 0) - cpu_rel_mplock(); - else if (mpheld == 0 && td->td_mpcount) - panic("lwkt_preempt(): MP lock was not held through"); -#endif ntd->td_preempted = NULL; td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE); } /* * Conditionally call splz() if gd_reqflags indicates work is pending. - * - * td_nest_count prevents deep nesting via splz() or doreti() which - * might otherwise blow out the kernel stack. Note that except for - * this special case, we MUST call splz() here to handle any - * pending ints, particularly after we switch, or we might accidently - * halt the cpu with interrupts pending. + * This will work inside a critical section but not inside a hard code + * section. * * (self contained on a per cpu basis) */ @@ -958,28 +1065,39 @@ splz_check(void) globaldata_t gd = mycpu; thread_t td = gd->gd_curthread; - if (gd->gd_reqflags && td->td_nest_count < 2) + if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && + gd->gd_intr_nesting_level == 0 && + td->td_nest_count < 2) + { splz(); + } } /* - * This implements a normal yield which will yield to equal priority - * threads as well as higher priority threads. Note that gd_reqflags - * tests will be handled by the crit_exit() call in lwkt_switch(). + * This version is integrated into crit_exit, reqflags has already + * been tested but td_critcount has not. * - * (self contained on a per cpu basis) + * We only want to execute the splz() on the 1->0 transition of + * critcount and not in a hard code section or if too deeply nested. */ void -lwkt_yield(void) +lwkt_maybe_splz(thread_t td) { - lwkt_schedule_self(curthread); - lwkt_switch(); + globaldata_t gd = td->td_gd; + + if (td->td_critcount == 0 && + gd->gd_intr_nesting_level == 0 && + td->td_nest_count < 2) + { + splz(); + } } /* - * This function is used along with the lwkt_passive_recover() inline - * by the trap code to negotiate a passive release of the current - * process/lwp designation with the user scheduler. + * This function is used to negotiate a passive release of the current + * process/lwp designation with the user scheduler, allowing the user + * scheduler to schedule another user thread. The related kernel thread + * (curthread) continues running in the released state. */ void lwkt_passive_release(struct thread *td) @@ -991,10 +1109,43 @@ lwkt_passive_release(struct thread *td) lp->lwp_proc->p_usched->release_curproc(lp); } + /* - * Make a kernel thread act as if it were in user mode with regards - * to scheduling, to avoid becoming cpu-bound in the kernel. Kernel - * loops which may be potentially cpu-bound can call lwkt_user_yield(). + * This implements a normal yield. This routine is virtually a nop if + * there is nothing to yield to but it will always run any pending interrupts + * if called from a critical section. 
+ * + * This yield is designed for kernel threads without a user context. + * + * (self contained on a per cpu basis) + */ +void +lwkt_yield(void) +{ + globaldata_t gd = mycpu; + thread_t td = gd->gd_curthread; + thread_t xtd; + + if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) + splz(); + if (td->td_fairq_accum < 0) { + lwkt_schedule_self(curthread); + lwkt_switch(); + } else { + xtd = TAILQ_FIRST(&gd->gd_tdrunq); + if (xtd && xtd->td_pri > td->td_pri) { + lwkt_schedule_self(curthread); + lwkt_switch(); + } + } +} + +/* + * This yield is designed for kernel threads with a user context. + * + * The kernel acting on behalf of the user is potentially cpu-bound, + * this function will efficiently allow other threads to run and also + * switch to other processes by releasing. * * The lwkt_user_yield() function is designed to have very low overhead * if no yield is determined to be needed. @@ -1002,77 +1153,41 @@ lwkt_passive_release(struct thread *td) void lwkt_user_yield(void) { - thread_t td = curthread; - struct lwp *lp = td->td_lwp; + globaldata_t gd = mycpu; + thread_t td = gd->gd_curthread; -#ifdef SMP /* - * XXX SEVERE TEMPORARY HACK. A cpu-bound operation running in the - * kernel can prevent other cpus from servicing interrupt threads - * which still require the MP lock (which is a lot of them). This - * has a chaining effect since if the interrupt is blocked, so is - * the event, so normal scheduling will not pick up on the problem. + * Always run any pending interrupts in case we are in a critical + * section. */ - if (mplock_countx && td->td_mpcount) { - int savecnt = td->td_mpcount; - - td->td_mpcount = 1; - mplock_countx = 0; - rel_mplock(); - DELAY(bgl_yield); - get_mplock(); - td->td_mpcount = savecnt; - } -#endif + if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) + splz(); /* - * Another kernel thread wants the cpu + * Switch (which forces a release) if another kernel thread needs + * the cpu, if userland wants us to resched, or if our kernel + * quantum has run out. */ - if (lwkt_resched_wanted()) + if (lwkt_resched_wanted() || + user_resched_wanted() || + td->td_fairq_accum < 0) + { lwkt_switch(); - - /* - * If the user scheduler has asynchronously determined that the current - * process (when running in user mode) needs to lose the cpu then make - * sure we are released. - */ - if (user_resched_wanted()) { - if (td->td_release) - td->td_release(td); } +#if 0 /* - * If we are released reduce our priority + * Reacquire the current process if we are released. + * + * XXX not implemented atm. The kernel may be holding locks and such, + * so we want the thread to continue to receive cpu. */ - if (td->td_release == NULL) { - if (lwkt_check_resched(td) > 0) - lwkt_switch(); - if (lp) { - lp->lwp_proc->p_usched->acquire_curproc(lp); - td->td_release = lwkt_passive_release; - lwkt_setpri_self(TDPRI_USER_NORM); - } + if (td->td_release == NULL && lp) { + lp->lwp_proc->p_usched->acquire_curproc(lp); + td->td_release = lwkt_passive_release; + lwkt_setpri_self(TDPRI_USER_NORM); } -} - -/* - * Return 0 if no runnable threads are pending at the same or higher - * priority as the passed thread. - * - * Return 1 if runnable threads are pending at the same priority. - * - * Return 2 if runnable threads are pending at a higher priority. 
- */ -int -lwkt_check_resched(thread_t td) -{ - int pri = td->td_pri & TDPRI_MASK; - - if (td->td_gd->gd_runqmask > (2 << pri) - 1) - return(2); - if (TAILQ_NEXT(td, td_threadq)) - return(1); - return(0); +#endif } /* @@ -1096,18 +1211,31 @@ lwkt_check_resched(thread_t td) */ static __inline void -_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri, int reschedok) +_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount, int reschedok) { thread_t otd; if (ntd->td_flags & TDF_RUNQ) { if (ntd->td_preemptable && reschedok) { - ntd->td_preemptable(ntd, cpri); /* YYY +token */ + ntd->td_preemptable(ntd, ccount); /* YYY +token */ } else if (reschedok) { otd = curthread; - if ((ntd->td_pri & TDPRI_MASK) > (otd->td_pri & TDPRI_MASK)) + if (ntd->td_pri > otd->td_pri) need_lwkt_resched(); } + + /* + * Give the thread a little fair share scheduler bump if it + * has been asleep for a while. This is primarily to avoid + * a degenerate case for interrupt threads where accumulator + * crosses into negative territory unnecessarily. + */ + if (ntd->td_fairq_lticks != ticks) { + ntd->td_fairq_lticks = ticks; + ntd->td_fairq_accum += gd->gd_fairq_total_pri; + if (ntd->td_fairq_accum > TDFAIRQ_MAX(gd)) + ntd->td_fairq_accum = TDFAIRQ_MAX(gd); + } } } @@ -1117,7 +1245,9 @@ _lwkt_schedule(thread_t td, int reschedok) { globaldata_t mygd = mycpu; - KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule(): scheduling gd_idlethread is illegal!")); + KASSERT(td != &td->td_gd->gd_idlethread, + ("lwkt_schedule(): scheduling gd_idlethread is illegal!")); + KKASSERT((td->td_flags & TDF_MIGRATING) == 0); crit_enter_gd(mygd); KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); if (td == mygd->gd_curthread) { @@ -1131,13 +1261,13 @@ _lwkt_schedule(thread_t td, int reschedok) #ifdef SMP if (td->td_gd == mygd) { _lwkt_enqueue(td); - _lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok); + _lwkt_schedule_post(mygd, td, 1, reschedok); } else { lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0); } #else _lwkt_enqueue(td); - _lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok); + _lwkt_schedule_post(mygd, td, 1, reschedok); #endif } crit_exit_gd(mygd); @@ -1219,12 +1349,15 @@ lwkt_acquire(thread_t td) cpu_lfence(); KKASSERT((td->td_flags & TDF_RUNQ) == 0); crit_enter_gd(mygd); + DEBUG_PUSH_INFO("lwkt_acquire"); while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) { #ifdef SMP lwkt_process_ipiq(); #endif cpu_lfence(); } + DEBUG_POP_INFO(); + cpu_mfence(); td->td_gd = mygd; TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq); td->td_flags &= ~TDF_MIGRATING; @@ -1270,29 +1403,23 @@ lwkt_deschedule(thread_t td) * Set the target thread's priority. This routine does not automatically * switch to a higher priority thread, LWKT threads are not designed for * continuous priority changes. Yield if you want to switch. - * - * We have to retain the critical section count which uses the high bits - * of the td_pri field. The specified priority may also indicate zero or - * more critical sections by adding TDPRI_CRIT*N. - * - * Note that we requeue the thread whether it winds up on a different runq - * or not. uio_yield() depends on this and the routine is not normally - * called with the same priority otherwise. 
*/ void lwkt_setpri(thread_t td, int pri) { - KKASSERT(pri >= 0); KKASSERT(td->td_gd == mycpu); - crit_enter(); - if (td->td_flags & TDF_RUNQ) { - _lwkt_dequeue(td); - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; - _lwkt_enqueue(td); - } else { - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + if (td->td_pri != pri) { + KKASSERT(pri >= 0); + crit_enter(); + if (td->td_flags & TDF_RUNQ) { + _lwkt_dequeue(td); + td->td_pri = pri; + _lwkt_enqueue(td); + } else { + td->td_pri = pri; + } + crit_exit(); } - crit_exit(); } /* @@ -1309,7 +1436,7 @@ lwkt_setpri_initial(thread_t td, int pri) { KKASSERT(pri >= 0); KKASSERT((td->td_flags & TDF_RUNQ) == 0); - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + td->td_pri = pri; } void @@ -1321,14 +1448,49 @@ lwkt_setpri_self(int pri) crit_enter(); if (td->td_flags & TDF_RUNQ) { _lwkt_dequeue(td); - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + td->td_pri = pri; _lwkt_enqueue(td); } else { - td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri; + td->td_pri = pri; } crit_exit(); } +/* + * 1/hz tick (typically 10ms) x TDFAIRQ_SCALE (typ 8) = 80ms full cycle. + * + * Example: two competing threads, same priority N. decrement by (2*N) + * increment by N*8, each thread will get 4 ticks. + */ +void +lwkt_fairq_schedulerclock(thread_t td) +{ + globaldata_t gd; + + if (fairq_enable) { + while (td) { + gd = td->td_gd; + if (td != &gd->gd_idlethread) { + td->td_fairq_accum -= gd->gd_fairq_total_pri; + if (td->td_fairq_accum < -TDFAIRQ_MAX(gd)) + td->td_fairq_accum = -TDFAIRQ_MAX(gd); + if (td->td_fairq_accum < 0) + need_lwkt_resched(); + td->td_fairq_lticks = ticks; + } + td = td->td_preempted; + } + } +} + +static void +lwkt_fairq_accumulate(globaldata_t gd, thread_t td) +{ + td->td_fairq_accum += td->td_pri * TDFAIRQ_SCALE; + if (td->td_fairq_accum > TDFAIRQ_MAX(td->td_gd)) + td->td_fairq_accum = TDFAIRQ_MAX(td->td_gd); +} + /* * Migrate the current thread to the specified cpu. * @@ -1396,15 +1558,24 @@ lwkt_setcpu_remote(void *arg) { thread_t td = arg; globaldata_t gd = mycpu; + int retry = 10000000; + DEBUG_PUSH_INFO("lwkt_setcpu_remote"); while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) { #ifdef SMP lwkt_process_ipiq(); #endif cpu_lfence(); + cpu_pause(); + if (--retry == 0) { + kprintf("lwkt_setcpu_remote: td->td_flags %08x\n", + td->td_flags); + retry = 10000000; + } } + DEBUG_POP_INFO(); td->td_gd = gd; - cpu_sfence(); + cpu_mfence(); td->td_flags &= ~TDF_MIGRATING; KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); _lwkt_enqueue(td); @@ -1429,9 +1600,8 @@ lwkt_preempted_proc(void) * rel_mplock() at the start of the new thread. */ int -lwkt_create(void (*func)(void *), void *arg, - struct thread **tdp, thread_t template, int tdflags, int cpu, - const char *fmt, ...) +lwkt_create(void (*func)(void *), void *arg, struct thread **tdp, + thread_t template, int tdflags, int cpu, const char *fmt, ...) 
{ thread_t td; __va_list ap; @@ -1471,9 +1641,14 @@ lwkt_exit(void) thread_t std; globaldata_t gd; + /* + * Do any cleanup that might block here + */ if (td->td_flags & TDF_VERBOSE) kprintf("kthread %p %s has exited\n", td, td->td_comm); caps_exit(td); + biosched_done(td); + dsched_exit_thread(td); /* * Get us into a critical section to interlock gd_freetd and loop @@ -1485,19 +1660,26 @@ lwkt_exit(void) gd = mycpu; crit_enter_quick(td); while ((std = gd->gd_freetd) != NULL) { + KKASSERT((std->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) == 0); gd->gd_freetd = NULL; objcache_put(thread_cache, std); } /* * Remove thread resources from kernel lists and deschedule us for - * the last time. + * the last time. We cannot block after this point or we may end + * up with a stale td on the tsleepq. */ if (td->td_flags & TDF_TSLEEPQ) tsleep_remove(td); - biosched_done(td); lwkt_deschedule_self(td); lwkt_remove_tdallq(td); + KKASSERT(td->td_refs == 0); + + /* + * Final cleanup + */ + KKASSERT(gd->gd_freetd == NULL); if (td->td_flags & TDF_ALLOCATED_THREAD) gd->gd_freetd = td; cpu_thread_exit(); @@ -1510,14 +1692,33 @@ lwkt_remove_tdallq(thread_t td) TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq); } +/* + * Code reduction and branch prediction improvements. Call/return + * overhead on modern cpus often degenerates into 0 cycles due to + * the cpu's branch prediction hardware and return pc cache. We + * can take advantage of this by not inlining medium-complexity + * functions and we can also reduce the branch prediction impact + * by collapsing perfectly predictable branches into a single + * procedure instead of duplicating it. + * + * Is any of this noticeable? Probably not, so I'll take the + * smaller code size. + */ +void +crit_exit_wrapper(__DEBUG_CRIT_ARG__) +{ + _crit_exit(mycpu __DEBUG_CRIT_PASS_ARG__); +} + void crit_panic(void) { thread_t td = curthread; - int lpri = td->td_pri; + int lcrit = td->td_critcount; - td->td_pri = 0; - panic("td_pri is/would-go negative! %p %d", td, lpri); + td->td_critcount = 0; + panic("td_critcount is/would-go negative! %p %d", td, lcrit); + /* NOT REACHED */ } #ifdef SMP @@ -1546,73 +1747,4 @@ lwkt_smp_stopped(void) crit_exit_gd(gd); } -/* - * get_mplock() calls this routine if it is unable to obtain the MP lock. - * get_mplock() has already incremented td_mpcount. We must block and - * not return until giant is held. - * - * All we have to do is lwkt_switch() away. The LWKT scheduler will not - * reschedule the thread until it can obtain the giant lock for it. - */ -void -lwkt_mp_lock_contested(void) -{ - ++mplock_countx; - loggiant(beg); - lwkt_switch(); - loggiant(end); -} - -/* - * The rel_mplock() code will call this function after releasing the - * last reference on the MP lock if mp_lock_contention_mask is non-zero. - * - * We then chain an IPI to a single other cpu potentially needing the - * lock. This is a bit heuristical and we can wind up with IPIs flying - * all over the place. 
- */
-static void lwkt_mp_lock_uncontested_remote(void *arg __unused);
-
-void
-lwkt_mp_lock_uncontested(void)
-{
-	globaldata_t gd;
-	globaldata_t dgd;
-	cpumask_t mask;
-	cpumask_t tmpmask;
-	int cpuid;
-
-	if (chain_mplock) {
-		gd = mycpu;
-		atomic_clear_int(&mp_lock_contention_mask, gd->gd_cpumask);
-		mask = mp_lock_contention_mask;
-		tmpmask = ~((1 << gd->gd_cpuid) - 1);
-
-		if (mask) {
-			if (mask & tmpmask)
-				cpuid = bsfl(mask & tmpmask);
-			else
-				cpuid = bsfl(mask);
-			atomic_clear_int(&mp_lock_contention_mask, 1 << cpuid);
-			dgd = globaldata_find(cpuid);
-			lwkt_send_ipiq(dgd, lwkt_mp_lock_uncontested_remote, NULL);
-		}
-	}
-}
-
-/*
- * The idea is for this IPI to interrupt a potentially lower priority
- * thread, such as a user thread, to allow the scheduler to reschedule
- * a higher priority kernel thread that needs the MP lock.
- *
- * For now we set the LWKT reschedule flag which generates an AST in
- * doreti, though theoretically it is also possible to possibly preempt
- * here if the underlying thread was operating in user mode. Nah.
- */
-static void
-lwkt_mp_lock_uncontested_remote(void *arg __unused)
-{
-	need_lwkt_resched();
-}
-
 #endif
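The new _lwkt_enqueue() in this change replaces the old 32-queue/bitmask design with a single per-cpu run queue kept sorted by td_pri: the enqueue walks the TAILQ and inserts the thread before the first entry of equal or lower priority. A minimal user-space sketch of just that insertion policy, using the same sys/queue.h macros; the struct demo_thread type, demo_enqueue() helper, demo priorities and main() are illustrative only, not part of the kernel code.

#include <sys/queue.h>
#include <stdio.h>

struct demo_thread {
	int pri;				/* higher value = higher priority */
	TAILQ_ENTRY(demo_thread) entry;
};

TAILQ_HEAD(demo_runq, demo_thread);

/*
 * Insert 'td' before the first entry whose priority is not greater
 * than td->pri, mirroring the walk in _lwkt_enqueue().
 */
static void
demo_enqueue(struct demo_runq *rq, struct demo_thread *td)
{
	struct demo_thread *xtd = TAILQ_FIRST(rq);

	if (xtd == NULL) {
		TAILQ_INSERT_TAIL(rq, td, entry);
		return;
	}
	while (xtd && xtd->pri > td->pri)
		xtd = TAILQ_NEXT(xtd, entry);
	if (xtd)
		TAILQ_INSERT_BEFORE(xtd, td, entry);
	else
		TAILQ_INSERT_TAIL(rq, td, entry);
}

int
main(void)
{
	struct demo_runq rq = TAILQ_HEAD_INITIALIZER(rq);
	struct demo_thread t[4] = { {10}, {30}, {20}, {30} };
	struct demo_thread *td;
	int i;

	for (i = 0; i < 4; ++i)
		demo_enqueue(&rq, &t[i]);
	TAILQ_FOREACH(td, &rq, entry)
		printf("%d ", td->pri);		/* prints: 30 30 20 10 */
	printf("\n");
	return 0;
}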
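The time-slot resequencer described in the lwkt_switch() comment is essentially a ticket FIFO: each contending cpu takes a ticket with an atomic fetch-add on lwkt_cseq_windex, spins until lwkt_cseq_rindex reaches its ticket, performs its paced retry, then bumps rindex to release the next waiter. Below is a rough user-space analogue using C11 atomics and pthreads; the contender() function and the windex/rindex globals stand in for the kernel counters, and the kernel's MONITOR/MWAIT wait, DELAY() pacing and fatal-spin check are not modeled.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int windex;	/* analogue of lwkt_cseq_windex */
static atomic_int rindex;	/* analogue of lwkt_cseq_rindex */

static void *
contender(void *arg)
{
	int id = (int)(long)arg;
	/* take a ticket; only one waiter owns each sequence number */
	int cseq = atomic_fetch_add(&windex, 1);

	while (atomic_load(&rindex) != cseq)
		;			/* spin; the kernel waits with MWAIT or DELAY() */

	/* exclusive time slot: the contended retry would go here */
	printf("thread %d owns slot %d\n", id, cseq);
	usleep(1000);			/* stand-in for the DELAY() pacing */

	atomic_fetch_add(&rindex, 1);	/* release the next waiter */
	return NULL;
}

int
main(void)
{
	pthread_t tid[4];
	long i;

	for (i = 0; i < 4; ++i)
		pthread_create(&tid[i], NULL, contender, (void *)i);
	for (i = 0; i < 4; ++i)
		pthread_join(tid[i], NULL);
	return 0;
}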
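The comment above lwkt_fairq_schedulerclock() spells out the intended arithmetic: with two runnable threads of equal priority N, gd_fairq_total_pri is 2*N, every scheduler tick drains the running thread's td_fairq_accum by 2*N, and each requeue refills it by N*TDFAIRQ_SCALE (typically N*8), so in steady state each thread gets about four ticks per cycle. The stand-alone simulation below models only that drain/refill loop; the PRI_N, TOTAL_PRI and SCALE names and values are illustrative, and the extra per-switch decrement, the sleep bonus in _lwkt_schedule_post() and the TDFAIRQ_MAX clamp are ignored.

#include <stdio.h>

#define PRI_N		10		/* both threads run at priority N */
#define TOTAL_PRI	(2 * PRI_N)	/* gd_fairq_total_pri for two such threads */
#define SCALE		8		/* TDFAIRQ_SCALE */

int
main(void)
{
	int accum = 0;		/* td_fairq_accum of one of the two threads */
	int burst, cycle;

	for (cycle = 0; cycle < 5; ++cycle) {
		burst = 0;
		while (accum >= 0) {		/* head of the runq keeps running */
			accum -= TOTAL_PRI;	/* schedulerclock drain per tick */
			++burst;
		}
		accum += PRI_N * SCALE;		/* requeued: lwkt_fairq_accumulate */
		/* first cycle is short because accum starts at 0; steady state is 4 */
		printf("cycle %d: ran %d ticks\n", cycle, burst);
	}
	return 0;
}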