gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003-2010 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*/
	34
	35	/*
	36	* Each cpu in a system has its own self-contained light weight kernel
	37	* thread scheduler, which means that generally speaking we only need
	38	* to use a critical section to avoid problems. Foreign thread
	39	* scheduling is queued via (async) IPIs.
	40	*/
	41
	42	#include <sys/param.h>
	43	#include <sys/systm.h>
	44	#include <sys/kernel.h>
	45	#include <sys/proc.h>
	46	#include <sys/rtprio.h>
	47	#include <sys/kinfo.h>
	48	#include <sys/queue.h>
	49	#include <sys/sysctl.h>
	50	#include <sys/kthread.h>
	51	#include <machine/cpu.h>
	52	#include <sys/lock.h>
	53	#include <sys/caps.h>
	54	#include <sys/spinlock.h>
	55	#include <sys/ktr.h>
	56
	57	#include <sys/thread2.h>
	58	#include <sys/spinlock2.h>
	59	#include <sys/mplock2.h>
	60
	61	#include <sys/dsched.h>
	62
	63	#include <vm/vm.h>
	64	#include <vm/vm_param.h>
	65	#include <vm/vm_kern.h>
	66	#include <vm/vm_object.h>
	67	#include <vm/vm_page.h>
	68	#include <vm/vm_map.h>
	69	#include <vm/vm_pager.h>
	70	#include <vm/vm_extern.h>
	71
	72	#include <machine/stdarg.h>
	73	#include <machine/smp.h>
	74
	75	#if !defined(KTR_CTXSW)
	76	#define KTR_CTXSW KTR_ALL
	77	#endif
	78	KTR_INFO_MASTER(ctxsw);
	79	KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p",
	80	sizeof(int) + sizeof(struct thread *));
	81	KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p",
	82	sizeof(int) + sizeof(struct thread *));
	83	KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s",
	84	sizeof (struct thread ) + sizeof(char ));
	85	KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>", sizeof (struct thread *));
	86
	87	static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");
	88
	89	#ifdef INVARIANTS
	90	static int panic_on_cscount = 0;
	91	#endif
	92	static __int64_t switch_count = 0;
	93	static __int64_t preempt_hit = 0;
	94	static __int64_t preempt_miss = 0;
	95	static __int64_t preempt_weird = 0;
	96	static __int64_t token_contention_count __debugvar = 0;
	97	static int lwkt_use_spin_port;
	98	static struct objcache *thread_cache;
	99
	100	#ifdef SMP
	101	static void lwkt_schedule_remote(void arg, int arg2, struct intrframe frame);
	102	static void lwkt_setcpu_remote(void *arg);
	103	#endif
	104	static void lwkt_fairq_accumulate(globaldata_t gd, thread_t td);
	105
	106	extern void cpu_heavy_restore(void);
	107	extern void cpu_lwkt_restore(void);
	108	extern void cpu_kthread_restore(void);
	109	extern void cpu_idle_restore(void);
	110
	111	/*
	112	* We can make all thread ports use the spin backend instead of the thread
	113	* backend. This should only be set to debug the spin backend.
	114	*/
	115	TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);
	116
	117	#ifdef INVARIANTS
	118	SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0,
	119	"Panic if attempting to switch lwkt's while mastering cpusync");
	120	#endif
	121	SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0,
	122	"Number of switched threads");
	123	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0,
	124	"Successful preemption events");
	125	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0,
	126	"Failed preemption events");
	127	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0,
	128	"Number of preempted threads.");
	129	#ifdef INVARIANTS
	130	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
	131	&token_contention_count, 0, "spinning due to token contention");
	132	#endif
	133	static int fairq_enable = 1;
	134	SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW,
	135	&fairq_enable, 0, "Turn on fairq priority accumulators");
	136	static int lwkt_spin_loops = 10;
	137	SYSCTL_INT(_lwkt, OID_AUTO, spin_loops, CTLFLAG_RW,
	138	&lwkt_spin_loops, 0, "");
	139	static int lwkt_spin_delay = 1;
	140	SYSCTL_INT(_lwkt, OID_AUTO, spin_delay, CTLFLAG_RW,
	141	&lwkt_spin_delay, 0, "Scheduler spin delay in microseconds 0=auto");
	142	static int lwkt_spin_method = 1;
	143	SYSCTL_INT(_lwkt, OID_AUTO, spin_method, CTLFLAG_RW,
	144	&lwkt_spin_method, 0, "LWKT scheduler behavior when contended");
	145	static int lwkt_spin_fatal = 0; /* disabled */
	146	SYSCTL_INT(_lwkt, OID_AUTO, spin_fatal, CTLFLAG_RW,
	147	&lwkt_spin_fatal, 0, "LWKT scheduler spin loops till fatal panic");
	148	static int preempt_enable = 1;
	149	SYSCTL_INT(_lwkt, OID_AUTO, preempt_enable, CTLFLAG_RW,
	150	&preempt_enable, 0, "Enable preemption");
	151	static int lwkt_cache_threads = 32;
	152	SYSCTL_INT(_lwkt, OID_AUTO, cache_threads, CTLFLAG_RD,
	153	&lwkt_cache_threads, 0, "thread+kstack cache");
	154
	155	static __cachealign int lwkt_cseq_rindex;
	156	static __cachealign int lwkt_cseq_windex;
	157
	158	/*
	159	* These helper procedures handle the runq, they can only be called from
	160	* within a critical section.
	161	*
	162	* WARNING! Prior to SMP being brought up it is possible to enqueue and
	163	* dequeue threads belonging to other cpus, so be sure to use td->td_gd
	164	* instead of 'mycpu' when referencing the globaldata structure. Once
	165	* SMP live enqueuing and dequeueing only occurs on the current cpu.
	166	*/
	167	static __inline
	168	void
	169	_lwkt_dequeue(thread_t td)
	170	{
	171	if (td->td_flags & TDF_RUNQ) {
	172	struct globaldata *gd = td->td_gd;
	173
	174	td->td_flags &= ~TDF_RUNQ;
	175	TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
	176	gd->gd_fairq_total_pri -= td->td_pri;
	177	if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL)
	178	atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING);
	179	}
	180	}
	181
	182	/*
	183	* Priority enqueue.
	184	*
	185	* NOTE: There are a limited number of lwkt threads runnable since user
	186	* processes only schedule one at a time per cpu.
	187	*/
	188	static __inline
	189	void
	190	_lwkt_enqueue(thread_t td)
	191	{
	192	thread_t xtd;
	193
	194	if ((td->td_flags & (TDF_RUNQ\|TDF_MIGRATING\|TDF_BLOCKQ)) == 0) {
	195	struct globaldata *gd = td->td_gd;
	196
	197	td->td_flags \|= TDF_RUNQ;
	198	xtd = TAILQ_FIRST(&gd->gd_tdrunq);
	199	if (xtd == NULL) {
	200	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	201	atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
	202	} else {
	203	while (xtd && xtd->td_pri > td->td_pri)
	204	xtd = TAILQ_NEXT(xtd, td_threadq);
	205	if (xtd)
	206	TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
	207	else
	208	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	209	}
	210	gd->gd_fairq_total_pri += td->td_pri;
	211	}
	212	}
	213
	214	static __boolean_t
	215	_lwkt_thread_ctor(void obj, void privdata, int ocflags)
	216	{
	217	struct thread td = (struct thread )obj;
	218
	219	td->td_kstack = NULL;
	220	td->td_kstack_size = 0;
	221	td->td_flags = TDF_ALLOCATED_THREAD;
	222	return (1);
	223	}
	224
	225	static void
	226	_lwkt_thread_dtor(void obj, void privdata)
	227	{
	228	struct thread td = (struct thread )obj;
	229
	230	KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
	231	("_lwkt_thread_dtor: not allocated from objcache"));
	232	KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
	233	td->td_kstack_size > 0,
	234	("_lwkt_thread_dtor: corrupted stack"));
	235	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	236	}
	237
	238	/*
	239	* Initialize the lwkt s/system.
	240	*
	241	* Nominally cache up to 32 thread + kstack structures.
	242	*/
	243	void
	244	lwkt_init(void)
	245	{
	246	TUNABLE_INT("lwkt.cache_threads", &lwkt_cache_threads);
	247	thread_cache = objcache_create_mbacked(
	248	M_THREAD, sizeof(struct thread),
	249	NULL, lwkt_cache_threads,
	250	_lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
	251	}
	252
	253	/*
	254	* Schedule a thread to run. As the current thread we can always safely
	255	* schedule ourselves, and a shortcut procedure is provided for that
	256	* function.
	257	*
	258	* (non-blocking, self contained on a per cpu basis)
	259	*/
	260	void
	261	lwkt_schedule_self(thread_t td)
	262	{
	263	KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
	264	crit_enter_quick(td);
	265	KASSERT(td != &td->td_gd->gd_idlethread,
	266	("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
	267	KKASSERT(td->td_lwp == NULL \|\| (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
	268	_lwkt_enqueue(td);
	269	crit_exit_quick(td);
	270	}
	271
	272	/*
	273	* Deschedule a thread.
	274	*
	275	* (non-blocking, self contained on a per cpu basis)
	276	*/
	277	void
	278	lwkt_deschedule_self(thread_t td)
	279	{
	280	crit_enter_quick(td);
	281	_lwkt_dequeue(td);
	282	crit_exit_quick(td);
	283	}
	284
	285	/*
	286	* LWKTs operate on a per-cpu basis
	287	*
	288	* WARNING! Called from early boot, 'mycpu' may not work yet.
	289	*/
	290	void
	291	lwkt_gdinit(struct globaldata *gd)
	292	{
	293	TAILQ_INIT(&gd->gd_tdrunq);
	294	TAILQ_INIT(&gd->gd_tdallq);
	295	}
	296
	297	/*
	298	* Create a new thread. The thread must be associated with a process context
	299	* or LWKT start address before it can be scheduled. If the target cpu is
	300	* -1 the thread will be created on the current cpu.
	301	*
	302	* If you intend to create a thread without a process context this function
	303	* does everything except load the startup and switcher function.
	304	*/
	305	thread_t
	306	lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
	307	{
	308	globaldata_t gd = mycpu;
	309	void *stack;
	310
	311	/*
	312	* If static thread storage is not supplied allocate a thread. Reuse
	313	* a cached free thread if possible. gd_freetd is used to keep an exiting
	314	* thread intact through the exit.
	315	*/
	316	if (td == NULL) {
	317	crit_enter_gd(gd);
	318	if ((td = gd->gd_freetd) != NULL) {
	319	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK\|
	320	TDF_RUNQ)) == 0);
	321	gd->gd_freetd = NULL;
	322	} else {
	323	td = objcache_get(thread_cache, M_WAITOK);
	324	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK\|
	325	TDF_RUNQ)) == 0);
	326	}
	327	crit_exit_gd(gd);
	328	KASSERT((td->td_flags &
	329	(TDF_ALLOCATED_THREAD\|TDF_RUNNING)) == TDF_ALLOCATED_THREAD,
	330	("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
	331	flags \|= td->td_flags & (TDF_ALLOCATED_THREAD\|TDF_ALLOCATED_STACK);
	332	}
	333
	334	/*
	335	* Try to reuse cached stack.
	336	*/
	337	if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
	338	if (flags & TDF_ALLOCATED_STACK) {
	339	kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size);
	340	stack = NULL;
	341	}
	342	}
	343	if (stack == NULL) {
	344	stack = (void *)kmem_alloc_stack(&kernel_map, stksize);
	345	flags \|= TDF_ALLOCATED_STACK;
	346	}
	347	if (cpu < 0)
	348	lwkt_init_thread(td, stack, stksize, flags, gd);
	349	else
	350	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
	351	return(td);
	352	}
	353
	354	/*
	355	* Initialize a preexisting thread structure. This function is used by
	356	* lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
	357	*
	358	* All threads start out in a critical section at a priority of
	359	* TDPRI_KERN_DAEMON. Higher level code will modify the priority as
	360	* appropriate. This function may send an IPI message when the
	361	* requested cpu is not the current cpu and consequently gd_tdallq may
	362	* not be initialized synchronously from the point of view of the originating
	363	* cpu.
	364	*
	365	* NOTE! we have to be careful in regards to creating threads for other cpus
	366	* if SMP has not yet been activated.
	367	*/
	#ifdef SMP

	/*
	 * IPI callback used by lwkt_init_thread() to link a new thread onto the
	 * all-threads list of the cpu that owns it.  arg is the thread.
	 */
	static void
	lwkt_init_thread_remote(void *arg)
	{
		thread_t td;

		td = arg;

		/*
		 * Protected by critical section held by IPI dispatch
		 */
		TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
	}

	#endif
	382
	383	/*
	384	* lwkt core thread structural initialization.
	385	*
	386	* NOTE: All threads are initialized as mpsafe threads.
	387	*/
	388	void
	389	lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
	390	struct globaldata *gd)
	391	{
	392	globaldata_t mygd = mycpu;
	393
	394	bzero(td, sizeof(struct thread));
	395	td->td_kstack = stack;
	396	td->td_kstack_size = stksize;
	397	td->td_flags = flags;
	398	td->td_gd = gd;
	399	td->td_pri = TDPRI_KERN_DAEMON;
	400	td->td_critcount = 1;
	401	td->td_toks_stop = &td->td_toks_base;
	402	if (lwkt_use_spin_port)
	403	lwkt_initport_spin(&td->td_msgport);
	404	else
	405	lwkt_initport_thread(&td->td_msgport, td);
	406	pmap_init_thread(td);
	407	#ifdef SMP
	408	/*
	409	* Normally initializing a thread for a remote cpu requires sending an
	410	* IPI. However, the idlethread is setup before the other cpus are
	411	* activated so we have to treat it as a special case. XXX manipulation
	412	* of gd_tdallq requires the BGL.
	413	*/
	414	if (gd == mygd \|\| td == &gd->gd_idlethread) {
	415	crit_enter_gd(mygd);
	416	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	417	crit_exit_gd(mygd);
	418	} else {
	419	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
	420	}
	421	#else
	422	crit_enter_gd(mygd);
	423	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	424	crit_exit_gd(mygd);
	425	#endif
	426
	427	dsched_new_thread(td);
	428	}
	429
	430	void
	431	lwkt_set_comm(thread_t td, const char *ctl, ...)
	432	{
	433	__va_list va;
	434
	435	__va_start(va, ctl);
	436	kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
	437	__va_end(va);
	438	KTR_LOG(ctxsw_newtd, td, &td->td_comm[0]);
	439	}
	440
	441	void
	442	lwkt_hold(thread_t td)
	443	{
	444	atomic_add_int(&td->td_refs, 1);
	445	}
	446
	447	void
	448	lwkt_rele(thread_t td)
	449	{
	450	KKASSERT(td->td_refs > 0);
	451	atomic_add_int(&td->td_refs, -1);
	452	}
	453
	454	void
	455	lwkt_wait_free(thread_t td)
	456	{
	457	while (td->td_refs)
	458	tsleep(td, 0, "tdreap", hz);
	459	}
	460
	461	void
	462	lwkt_free_thread(thread_t td)
	463	{
	464	KKASSERT(td->td_refs == 0);
	465	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK\|TDF_RUNQ)) == 0);
	466	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	467	objcache_put(thread_cache, td);
	468	} else if (td->td_flags & TDF_ALLOCATED_STACK) {
	469	/* client-allocated struct with internally allocated stack */
	470	KASSERT(td->td_kstack && td->td_kstack_size > 0,
	471	("lwkt_free_thread: corrupted stack"));
	472	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	473	td->td_kstack = NULL;
	474	td->td_kstack_size = 0;
	475	}
	476	KTR_LOG(ctxsw_deadtd, td);
	477	}
	478
	479
	480	/*
	481	* Switch to the next runnable lwkt. If no LWKTs are runnable then
	482	* switch to the idlethread. Switching must occur within a critical
	483	* section to avoid races with the scheduling queue.
	484	*
	485	* We always have full control over our cpu's run queue. Other cpus
	486	* that wish to manipulate our queue must use the cpu_*msg() calls to
	487	* talk to our cpu, so a critical section is all that is needed and
	488	* the result is very, very fast thread switching.
	489	*
	490	* The LWKT scheduler uses a fixed priority model and round-robins at
	491	* each priority level. User process scheduling is a totally
	492	* different beast and LWKT priorities should not be confused with
	493	* user process priorities.
	494	*
	495	* PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
	496	* is not called by the current thread in the preemption case, only when
	497	* the preempting thread blocks (in order to return to the original thread).
	498	*
	499	* SPECIAL NOTE ON SWITCH ATOMICY: Certain operations such as thread
	500	* migration and tsleep deschedule the current lwkt thread and call
	501	* lwkt_switch(). In particular, the target cpu of the migration fully
	502	* expects the thread to become non-runnable and can deadlock against
	503	* cpusync operations if we run any IPIs prior to switching the thread out.
	504	*
	505	* WE MUST BE VERY CAREFUL NOT TO RUN SPLZ DIRECTLY OR INDIRECTLY IF
	506	* THE CURRENT THREAD HAS BEEN DESCHEDULED!
	507	*/
	508	void
	509	lwkt_switch(void)
	510	{
	511	globaldata_t gd = mycpu;
	512	thread_t td = gd->gd_curthread;
	513	thread_t ntd;
	514	thread_t xtd;
	515	int spinning = lwkt_spin_loops; /* loops before HLTing */
	516	int reqflags;
	517	int cseq;
	518	int oseq;
	519	int fatal_count;
	520
	521	KKASSERT(gd->gd_processing_ipiq == 0);
	522
	523	/*
	524	* Switching from within a 'fast' (non thread switched) interrupt or IPI
	525	* is illegal. However, we may have to do it anyway if we hit a fatal
	526	* kernel trap or we have paniced.
	527	*
	528	* If this case occurs save and restore the interrupt nesting level.
	529	*/
	530	if (gd->gd_intr_nesting_level) {
	531	int savegdnest;
	532	int savegdtrap;
	533
	534	if (gd->gd_trap_nesting_level == 0 && panic_cpu_gd != mycpu) {
	535	panic("lwkt_switch: Attempt to switch from a "
	536	"a fast interrupt, ipi, or hard code section, "
	537	"td %p\n",
	538	td);
	539	} else {
	540	savegdnest = gd->gd_intr_nesting_level;
	541	savegdtrap = gd->gd_trap_nesting_level;
	542	gd->gd_intr_nesting_level = 0;
	543	gd->gd_trap_nesting_level = 0;
	544	if ((td->td_flags & TDF_PANICWARN) == 0) {
	545	td->td_flags \|= TDF_PANICWARN;
	546	kprintf("Warning: thread switch from interrupt, IPI, "
	547	"or hard code section.\n"
	548	"thread %p (%s)\n", td, td->td_comm);
	549	print_backtrace(-1);
	550	}
	551	lwkt_switch();
	552	gd->gd_intr_nesting_level = savegdnest;
	553	gd->gd_trap_nesting_level = savegdtrap;
	554	return;
	555	}
	556	}
	557
	558	/*
	559	* Passive release (used to transition from user to kernel mode
	560	* when we block or switch rather then when we enter the kernel).
	561	* This function is NOT called if we are switching into a preemption
	562	* or returning from a preemption. Typically this causes us to lose
	563	* our current process designation (if we have one) and become a true
	564	* LWKT thread, and may also hand the current process designation to
	565	* another process and schedule thread.
	566	*/
	567	if (td->td_release)
	568	td->td_release(td);
	569
	570	crit_enter_gd(gd);
	571	if (TD_TOKS_HELD(td))
	572	lwkt_relalltokens(td);
	573
	574	/*
	575	* We had better not be holding any spin locks, but don't get into an
	576	* endless panic loop.
	577	*/
	578	KASSERT(gd->gd_spinlocks_wr == 0 \|\| panicstr != NULL,
	579	("lwkt_switch: still holding %d exclusive spinlocks!",
	580	gd->gd_spinlocks_wr));
	581
	582
	583	#ifdef SMP
	584	#ifdef INVARIANTS
	585	if (td->td_cscount) {
	586	kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
	587	td);
	588	if (panic_on_cscount)
	589	panic("switching while mastering cpusync");
	590	}
	591	#endif
	592	#endif
	593
	594	/*
	595	* If we had preempted another thread on this cpu, resume the preempted
	596	* thread. This occurs transparently, whether the preempted thread
	597	* was scheduled or not (it may have been preempted after descheduling
	598	* itself).
	599	*
	600	* We have to setup the MP lock for the original thread after backing
	601	* out the adjustment that was made to curthread when the original
	602	* was preempted.
	603	*/
	604	if ((ntd = td->td_preempted) != NULL) {
	605	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
	606	ntd->td_flags \|= TDF_PREEMPT_DONE;
	607
	608	/*
	609	* The interrupt may have woken a thread up, we need to properly
	610	* set the reschedule flag if the originally interrupted thread is
	611	* at a lower priority.
	612	*/
	613	if (TAILQ_FIRST(&gd->gd_tdrunq) &&
	614	TAILQ_FIRST(&gd->gd_tdrunq)->td_pri > ntd->td_pri) {
	615	need_lwkt_resched();
	616	}
	617	/* YYY release mp lock on switchback if original doesn't need it */
	618	goto havethread_preempted;
	619	}
	620
	621	/*
	622	* Implement round-robin fairq with priority insertion. The priority
	623	* insertion is handled by _lwkt_enqueue()
	624	*
	625	* If we cannot obtain ownership of the tokens we cannot immediately
	626	* schedule the target thread.
	627	*
	628	* Reminder: Again, we cannot afford to run any IPIs in this path if
	629	* the current thread has been descheduled.
	630	*/
	631	for (;;) {
	632	/*
	633	* Clear RQF_AST_LWKT_RESCHED (we handle the reschedule request)
	634	* and set RQF_WAKEUP (prevent unnecessary IPIs from being
	635	* received).
	636	*/
	637	for (;;) {
	638	reqflags = gd->gd_reqflags;
	639	if (atomic_cmpset_int(&gd->gd_reqflags, reqflags,
	640	(reqflags & ~RQF_AST_LWKT_RESCHED) \|
	641	RQF_WAKEUP)) {
	642	break;
	643	}
	644	}
	645
	646	/*
	647	* Hotpath - pull the head of the run queue and attempt to schedule
	648	* it. Fairq exhaustion moves the task to the end of the list. If
	649	* no threads are runnable we switch to the idle thread.
	650	*/
	651	for (;;) {
	652	ntd = TAILQ_FIRST(&gd->gd_tdrunq);
	653
	654	if (ntd == NULL) {
	655	/*
	656	* Runq is empty, switch to idle and clear RQF_WAKEUP
	657	* to allow it to halt.
	658	*/
	659	ntd = &gd->gd_idlethread;
	660	#ifdef SMP
	661	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	662	ASSERT_NO_TOKENS_HELD(ntd);
	663	#endif
	664	cpu_time.cp_msg[0] = 0;
	665	cpu_time.cp_stallpc = 0;
	666	atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP);
	667	goto haveidle;
	668	}
	669
	670	if (ntd->td_fairq_accum >= 0)
	671	break;
	672
	673	/splz_check(); cannot do this here, see above /
	674	lwkt_fairq_accumulate(gd, ntd);
	675	TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
	676	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq);
	677	}
	678
	679	/*
	680	* Hotpath - schedule ntd. Leaves RQF_WAKEUP set to prevent
	681	* unwanted decontention IPIs.
	682	*
	683	* NOTE: For UP there is no mplock and lwkt_getalltokens()
	684	* always succeeds.
	685	*/
	686	if (TD_TOKS_NOT_HELD(ntd) \|\| lwkt_getalltokens(ntd))
	687	goto havethread;
	688
	689	/*
	690	* Coldpath (SMP only since tokens always succeed on UP)
	691	*
	692	* We had some contention on the thread we wanted to schedule.
	693	* What we do now is try to find a thread that we can schedule
	694	* in its stead until decontention reschedules on our cpu.
	695	*
	696	* The coldpath scan does NOT rearrange threads in the run list
	697	* and it also ignores the accumulator.
	698	*
	699	* We do not immediately schedule a user priority thread, instead
	700	* we record it in xtd and continue looking for kernel threads.
	701	* A cpu can only have one user priority thread (normally) so just
	702	* record the first one.
	703	*
	704	* NOTE: This scan will also include threads whos fairq's were
	705	* accumulated in the first loop.
	706	*/
	707	++token_contention_count;
	708	xtd = NULL;
	709	while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
	710	/*
	711	* Try to switch to this thread. If the thread is running at
	712	* user priority we clear WAKEUP to allow decontention IPIs
	713	* (since this thread is simply running until the one we wanted
	714	* decontends), and we make sure that LWKT_RESCHED is not set.
	715	*
	716	* Otherwise for kernel threads we leave WAKEUP set to avoid
	717	* unnecessary decontention IPIs.
	718	*/
	719	if (ntd->td_pri < TDPRI_KERN_LPSCHED) {
	720	if (xtd == NULL)
	721	xtd = ntd;
	722	continue;
	723	}
	724
	725	/*
	726	* Do not let the fairq get too negative. Even though we are
	727	* ignoring it atm once the scheduler decontends a very negative
	728	* thread will get moved to the end of the queue.
	729	*/
	730	if (TD_TOKS_NOT_HELD(ntd) \|\| lwkt_getalltokens(ntd)) {
	731	if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd))
	732	ntd->td_fairq_accum = -TDFAIRQ_MAX(gd);
	733	goto havethread;
	734	}
	735
	736	/*
	737	* Well fubar, this thread is contended as well, loop
	738	*/
	739	/* */
	740	}
	741
	742	/*
	743	* We exhausted the run list but we may have recorded a user
	744	* thread to try. We have three choices based on
	745	* lwkt.decontention_method.
	746	*
	747	* (0) Atomically clear RQF_WAKEUP in order to receive decontention
	748	* IPIs (to interrupt the user process) and test
	749	* RQF_AST_LWKT_RESCHED at the same time.
	750	*
	751	* This results in significant decontention IPI traffic but may
	752	* be more responsive.
	753	*
	754	* (1) Leave RQF_WAKEUP set so we do not receive a decontention IPI.
	755	* An automatic LWKT reschedule will occur on the next hardclock
	756	* (typically 100hz).
	757	*
	758	* This results in no decontention IPI traffic but may be less
	759	* responsive. This is the default.
	760	*
	761	* (2) Refuse to schedule the user process at this time.
	762	*
	763	* This is highly experimental and should not be used under
	764	* normal circumstances. This can cause a user process to
	765	* get starved out in situations where kernel threads are
	766	* fighting each other for tokens.
	767	*/
	768	if (xtd) {
	769	ntd = xtd;
	770
	771	switch(lwkt_spin_method) {
	772	case 0:
	773	for (;;) {
	774	reqflags = gd->gd_reqflags;
	775	if (atomic_cmpset_int(&gd->gd_reqflags,
	776	reqflags,
	777	reqflags & ~RQF_WAKEUP)) {
	778	break;
	779	}
	780	}
	781	break;
	782	case 1:
	783	reqflags = gd->gd_reqflags;
	784	break;
	785	default:
	786	goto skip;
	787	break;
	788	}
	789	if ((reqflags & RQF_AST_LWKT_RESCHED) == 0 &&
	790	(TD_TOKS_NOT_HELD(ntd) \|\| lwkt_getalltokens(ntd))
	791	) {
	792	if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd))
	793	ntd->td_fairq_accum = -TDFAIRQ_MAX(gd);
	794	goto havethread;
	795	}
	796
	797	skip:
	798	/*
	799	* Make sure RQF_WAKEUP is set if we failed to schedule the
	800	* user thread to prevent the idle thread from halting.
	801	*/
	802	atomic_set_int(&gd->gd_reqflags, RQF_WAKEUP);
	803	}
	804
	805	/*
	806	* We exhausted the run list, meaning that all runnable threads
	807	* are contended.
	808	*/
	809	cpu_pause();
	810	ntd = &gd->gd_idlethread;
	811	#ifdef SMP
	812	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	813	ASSERT_NO_TOKENS_HELD(ntd);
	814	/* contention case, do not clear contention mask */
	815	#endif
	816
	817	/*
	818	* Ok, we might want to spin a few times as some tokens are held for
	819	* very short periods of time and IPI overhead is 1uS or worse
	820	* (meaning it is usually better to spin). Regardless we have to
	821	* call splz_check() to be sure to service any interrupts blocked
	822	* by our critical section, otherwise we could livelock e.g. IPIs.
	823	*
	824	* The IPI mechanic is really a last resort. In nearly all other
	825	* cases RQF_WAKEUP is left set to prevent decontention IPIs.
	826	*
	827	* When we decide not to spin we clear RQF_WAKEUP and switch to
	828	* the idle thread. Clearing RQF_WEAKEUP allows the idle thread
	829	* to halt and decontended tokens will issue an IPI to us. The
	830	* idle thread will check for pending reschedules already set
	831	* (RQF_AST_LWKT_RESCHED) before actually halting so we don't have
	832	* to here.
	833	*
	834	* Also, if TDF_RUNQ is not set the current thread is trying to
	835	* deschedule, possibly in an atomic fashion. We cannot afford to
	836	* stay here.
	837	*/
	838	if (spinning <= 0 \|\| (td->td_flags & TDF_RUNQ) == 0) {
	839	atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP);
	840	goto haveidle;
	841	}
	842	--spinning;
	843
	844	/*
	845	* When spinning a delay is required both to avoid livelocks from
	846	* token order reversals (a thread may be trying to acquire multiple
	847	* tokens), and also to reduce cpu cache management traffic.
	848	*
	849	* In order to scale to a large number of CPUs we use a time slot
	850	* resequencer to force contending cpus into non-contending
	851	* time-slots. The scheduler may still contend with the lock holder
	852	* but will not (generally) contend with all the other cpus trying
	853	* trying to get the same token.
	854	*
	855	* The resequencer uses a FIFO counter mechanic. The owner of the
	856	* rindex at the head of the FIFO is allowed to pull itself off
	857	* the FIFO and fetchadd is used to enter into the FIFO. This bit
	858	* of code is VERY cache friendly and forces all spinning schedulers
	859	* into their own time slots.
	860	*
	861	* This code has been tested to 48-cpus and caps the cache
	862	* contention load at ~1uS intervals regardless of the number of
	863	* cpus. Scaling beyond 64 cpus might require additional smarts
	864	* (such as separate FIFOs for specific token cases).
	865	*
	866	* WARNING! We can't call splz_check() or anything else here as
	867	* it could cause a deadlock.
	868	*/
	869	#if defined(INVARIANTS) && defined(__amd64__)
	870	if ((read_rflags() & PSL_I) == 0) {
	871	cpu_enable_intr();
	872	panic("lwkt_switch() called with interrupts disabled");
	873	}
	874	#endif
	875	cseq = atomic_fetchadd_int(&lwkt_cseq_windex, 1);
	876	fatal_count = lwkt_spin_fatal;
	877	while ((oseq = lwkt_cseq_rindex) != cseq) {
	878	cpu_ccfence();
	879	#if !defined(_KERNEL_VIRTUAL)
	880	if (cpu_mi_feature & CPU_MI_MONITOR) {
	881	cpu_mmw_pause_int(&lwkt_cseq_rindex, oseq);
	882	} else
	883	#endif
	884	{
	885	DELAY(1);
	886	cpu_lfence();
	887	}
	888	if (fatal_count && --fatal_count == 0)
	889	panic("lwkt_switch: fatal spin wait");
	890	}
	891	cseq = lwkt_spin_delay; /* don't trust the system operator */
	892	cpu_ccfence();
	893	if (cseq < 1)
	894	cseq = 1;
	895	if (cseq > 1000)
	896	cseq = 1000;
	897	DELAY(cseq);
	898	atomic_add_int(&lwkt_cseq_rindex, 1);
	899	splz_check(); /* ok, we already checked that td is still scheduled */
	900	/* highest level for(;;) loop */
	901	}
	902
	903	havethread:
	904	/*
	905	* We must always decrement td_fairq_accum on non-idle threads just
	906	* in case a thread never gets a tick due to being in a continuous
	907	* critical section. The page-zeroing code does this, for example.
	908	*
	909	* If the thread we came up with is a higher or equal priority verses
	910	* the thread at the head of the queue we move our thread to the
	911	* front. This way we can always check the front of the queue.
	912	*
	913	* Clear gd_idle_repeat when doing a normal switch to a non-idle
	914	* thread.
	915	*/
	916	++gd->gd_cnt.v_swtch;
	917	--ntd->td_fairq_accum;
	918	ntd->td_wmesg = NULL;
	919	xtd = TAILQ_FIRST(&gd->gd_tdrunq);
	920	if (ntd != xtd && ntd->td_pri >= xtd->td_pri) {
	921	TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
	922	TAILQ_INSERT_HEAD(&gd->gd_tdrunq, ntd, td_threadq);
	923	}
	924	gd->gd_idle_repeat = 0;
	925
	926	havethread_preempted:
	927	/*
	928	* If the new target does not need the MP lock and we are holding it,
	929	* release the MP lock. If the new target requires the MP lock we have
	930	* already acquired it for the target.
	931	*/
	932	;
	933	haveidle:
	934	KASSERT(ntd->td_critcount,
	935	("priority problem in lwkt_switch %d %d",
	936	td->td_critcount, ntd->td_critcount));
	937
	938	if (td != ntd) {
	939	/*
	940	* Execute the actual thread switch operation. This function
	941	* returns to the current thread and returns the previous thread
	942	* (which may be different from the thread we switched to).
	943	*
	944	* We are responsible for marking ntd as TDF_RUNNING.
	945	*/
	946	++switch_count;
	947	KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd);
	948	ntd->td_flags \|= TDF_RUNNING;
	949	lwkt_switch_return(td->td_switch(ntd));
	950	/* ntd invalid, td_switch() can return a different thread_t */
	951	}
	952	/* NOTE: current cpu may have changed after switch */
	953	crit_exit_quick(td);
	954	}
	955
	956	/*
	957	* Called by assembly in the td_switch (thread restore path) for thread
	958	* bootstrap cases which do not 'return' to lwkt_switch().
	959	*/
	960	void
	961	lwkt_switch_return(thread_t otd)
	962	{
	963	#ifdef SMP
	964	globaldata_t rgd;
	965
	966	/*
	967	* Check if otd was migrating. Now that we are on ntd we can finish
	968	* up the migration. This is a bit messy but it is the only place
	969	* where td is known to be fully descheduled.
	970	*
	971	* We can only activate the migration if otd was migrating but not
	972	* held on the cpu due to a preemption chain. We still have to
	973	* clear TDF_RUNNING on the old thread either way.
	974	*
	975	* We are responsible for clearing the previously running thread's
	976	* TDF_RUNNING.
	977	*/
	978	if ((rgd = otd->td_migrate_gd) != NULL &&
	979	(otd->td_flags & TDF_PREEMPT_LOCK) == 0) {
	980	KKASSERT((otd->td_flags & (TDF_MIGRATING \| TDF_RUNNING)) ==
	981	(TDF_MIGRATING \| TDF_RUNNING));
	982	otd->td_migrate_gd = NULL;
	983	otd->td_flags &= ~TDF_RUNNING;
	984	lwkt_send_ipiq(rgd, lwkt_setcpu_remote, otd);
	985	} else {
	986	otd->td_flags &= ~TDF_RUNNING;
	987	}
	988	#else
	989	otd->td_flags &= ~TDF_RUNNING;
	990	#endif
	991	}
	992
	993	/*
	994	* Request that the target thread preempt the current thread. Preemption
	995	* only works under a specific set of conditions:
	996	*
	997	* - We are not preempting ourselves
	998	* - The target thread is owned by the current cpu
	999	* - We are not currently being preempted
	1000	* - The target is not currently being preempted
	1001	* - We are not holding any spin locks
	1002	* - The target thread is not holding any tokens
	1003	* - We are able to satisfy the target's MP lock requirements (if any).
	1004	*
	1005	* THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
	1006	* this is called via lwkt_schedule() through the td_preemptable callback.
	1007	* critcount is the managed critical priority that we should ignore in order
	1008	* to determine whether preemption is possible (aka usually just the crit
	1009	* priority of lwkt_schedule() itself).
	1010	*
	1011	* XXX at the moment we run the target thread in a critical section during
	1012	* the preemption in order to prevent the target from taking interrupts
	1013	* that WE can't. Preemption is strictly limited to interrupt threads
	1014	* and interrupt-like threads, outside of a critical section, and the
	1015	* preempted source thread will be resumed the instant the target blocks
	1016	* whether or not the source is scheduled (i.e. preemption is supposed to
	1017	* be as transparent as possible).
	1018	*/
	1019	void
	1020	lwkt_preempt(thread_t ntd, int critcount)
	1021	{
	1022	struct globaldata *gd = mycpu;
	1023	thread_t xtd;
	1024	thread_t td;
	1025	int save_gd_intr_nesting_level;
	1026
	1027	/*
	1028	* The caller has put us in a critical section. We can only preempt
	1029	* if the caller of the caller was not in a critical section (basically
	1030	* a local interrupt), as determined by the 'critcount' parameter. We
	1031	* also can't preempt if the caller is holding any spinlocks (even if
	1032	* he isn't in a critical section). This also handles the tokens test.
	1033	*
	1034	* YYY The target thread must be in a critical section (else it must
	1035	* inherit our critical section? I dunno yet).
	1036	*
	1037	* Set need_lwkt_resched() unconditionally for now YYY.
	1038	*/
	1039	KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri));
	1040
	1041	if (preempt_enable == 0) {
	1042	++preempt_miss;
	1043	return;
	1044	}
	1045
	1046	td = gd->gd_curthread;
	1047	if (ntd->td_pri <= td->td_pri) {
	1048	++preempt_miss;
	1049	return;
	1050	}
	1051	if (td->td_critcount > critcount) {
	1052	++preempt_miss;
	1053	need_lwkt_resched();
	1054	return;
	1055	}
	1056	#ifdef SMP
	1057	if (ntd->td_gd != gd) {
	1058	++preempt_miss;
	1059	need_lwkt_resched();
	1060	return;
	1061	}
	1062	#endif
	1063	/*
	1064	* We don't have to check spinlocks here as they will also bump
	1065	* td_critcount.
	1066	*
	1067	* Do not try to preempt if the target thread is holding any tokens.
	1068	* We could try to acquire the tokens but this case is so rare there
	1069	* is no need to support it.
	1070	*/
	1071	KKASSERT(gd->gd_spinlocks_wr == 0);
	1072
	1073	if (TD_TOKS_HELD(ntd)) {
	1074	++preempt_miss;
	1075	need_lwkt_resched();
	1076	return;
	1077	}
	1078	if (td == ntd \|\| ((td->td_flags \| ntd->td_flags) & TDF_PREEMPT_LOCK)) {
	1079	++preempt_weird;
	1080	need_lwkt_resched();
	1081	return;
	1082	}
	1083	if (ntd->td_preempted) {
	1084	++preempt_hit;
	1085	need_lwkt_resched();
	1086	return;
	1087	}
	1088	KKASSERT(gd->gd_processing_ipiq == 0);
	1089
	1090	/*
	1091	* Since we are able to preempt the current thread, there is no need to
	1092	* call need_lwkt_resched().
	1093	*
	1094	* We must temporarily clear gd_intr_nesting_level around the switch
	1095	* since switchouts from the target thread are allowed (they will just
	1096	* return to our thread), and since the target thread has its own stack.
	1097	*
	1098	* A preemption must switch back to the original thread, assert the
	1099	* case.
	1100	*/
	1101	++preempt_hit;
	1102	ntd->td_preempted = td;
	1103	td->td_flags \|= TDF_PREEMPT_LOCK;
	1104	KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd);
	1105	save_gd_intr_nesting_level = gd->gd_intr_nesting_level;
	1106	gd->gd_intr_nesting_level = 0;
	1107	ntd->td_flags \|= TDF_RUNNING;
	1108	xtd = td->td_switch(ntd);
	1109	KKASSERT(xtd == ntd);
	1110	lwkt_switch_return(xtd);
	1111	gd->gd_intr_nesting_level = save_gd_intr_nesting_level;
	1112
	1113	KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
	1114	ntd->td_preempted = NULL;
	1115	td->td_flags &= ~(TDF_PREEMPT_LOCK\|TDF_PREEMPT_DONE);
	1116	}
	1117
	1118	/*
	1119	* Conditionally call splz() if gd_reqflags indicates work is pending.
	1120	* This will work inside a critical section but not inside a hard code
	1121	* section.
	1122	*
	1123	* (self contained on a per cpu basis)
	1124	*/
	1125	void
	1126	splz_check(void)
	1127	{
	1128	globaldata_t gd = mycpu;
	1129	thread_t td = gd->gd_curthread;
	1130
	1131	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) &&
	1132	gd->gd_intr_nesting_level == 0 &&
	1133	td->td_nest_count < 2)
	1134	{
	1135	splz();
	1136	}
	1137	}
	1138
	1139	/*
	1140	* This version is integrated into crit_exit, reqflags has already
	1141	* been tested but td_critcount has not.
	1142	*
	1143	* We only want to execute the splz() on the 1->0 transition of
	1144	* critcount and not in a hard code section or if too deeply nested.
	1145	*/
	1146	void
	1147	lwkt_maybe_splz(thread_t td)
	1148	{
	1149	globaldata_t gd = td->td_gd;
	1150
	1151	if (td->td_critcount == 0 &&
	1152	gd->gd_intr_nesting_level == 0 &&
	1153	td->td_nest_count < 2)
	1154	{
	1155	splz();
	1156	}
	1157	}
	1158
	1159	/*
	1160	* This function is used to negotiate a passive release of the current
	1161	* process/lwp designation with the user scheduler, allowing the user
	1162	* scheduler to schedule another user thread. The related kernel thread
	1163	* (curthread) continues running in the released state.
	1164	*/
	1165	void
	1166	lwkt_passive_release(struct thread *td)
	1167	{
	1168	struct lwp *lp = td->td_lwp;
	1169
	1170	td->td_release = NULL;
	1171	lwkt_setpri_self(TDPRI_KERN_USER);
	1172	lp->lwp_proc->p_usched->release_curproc(lp);
	1173	}
	1174
	1175
	1176	/*
	1177	* This implements a normal yield. This routine is virtually a nop if
	1178	* there is nothing to yield to but it will always run any pending interrupts
	1179	* if called from a critical section.
	1180	*
	1181	* This yield is designed for kernel threads without a user context.
	1182	*
	1183	* (self contained on a per cpu basis)
	1184	*/
	1185	void
	1186	lwkt_yield(void)
	1187	{
	1188	globaldata_t gd = mycpu;
	1189	thread_t td = gd->gd_curthread;
	1190	thread_t xtd;
	1191
	1192	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1193	splz();
	1194	if (td->td_fairq_accum < 0) {
	1195	lwkt_schedule_self(curthread);
	1196	lwkt_switch();
	1197	} else {
	1198	xtd = TAILQ_FIRST(&gd->gd_tdrunq);
	1199	if (xtd && xtd->td_pri > td->td_pri) {
	1200	lwkt_schedule_self(curthread);
	1201	lwkt_switch();
	1202	}
	1203	}
	1204	}
	1205
	1206	/*
	1207	* This yield is designed for kernel threads with a user context.
	1208	*
	1209	* The kernel acting on behalf of the user is potentially cpu-bound,
	1210	* this function will efficiently allow other threads to run and also
	1211	* switch to other processes by releasing.
	1212	*
	1213	* The lwkt_user_yield() function is designed to have very low overhead
	1214	* if no yield is determined to be needed.
	1215	*/
	1216	void
	1217	lwkt_user_yield(void)
	1218	{
	1219	globaldata_t gd = mycpu;
	1220	thread_t td = gd->gd_curthread;
	1221
	1222	/*
	1223	* Always run any pending interrupts in case we are in a critical
	1224	* section.
	1225	*/
	1226	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1227	splz();
	1228
	1229	/*
	1230	* Switch (which forces a release) if another kernel thread needs
	1231	* the cpu, if userland wants us to resched, or if our kernel
	1232	* quantum has run out.
	1233	*/
	1234	if (lwkt_resched_wanted() \|\|
	1235	user_resched_wanted() \|\|
	1236	td->td_fairq_accum < 0)
	1237	{
	1238	lwkt_switch();
	1239	}
	1240
	1241	#if 0
	1242	/*
	1243	* Reacquire the current process if we are released.
	1244	*
	1245	* XXX not implemented atm. The kernel may be holding locks and such,
	1246	* so we want the thread to continue to receive cpu.
	1247	*/
	1248	if (td->td_release == NULL && lp) {
	1249	lp->lwp_proc->p_usched->acquire_curproc(lp);
	1250	td->td_release = lwkt_passive_release;
	1251	lwkt_setpri_self(TDPRI_USER_NORM);
	1252	}
	1253	#endif
	1254	}
	1255
	1256	/*
	1257	* Generic schedule. Possibly schedule threads belonging to other cpus and
	1258	* deal with threads that might be blocked on a wait queue.
	1259	*
	1260	* We have a little helper inline function which does additional work after
	1261	* the thread has been enqueued, including dealing with preemption and
	1262	* setting need_lwkt_resched() (which prevents the kernel from returning
	1263	* to userland until it has processed higher priority threads).
	1264	*
	1265	* It is possible for this routine to be called after a failed _enqueue
	1266	* (due to the target thread migrating, sleeping, or otherwise blocked).
	1267	* We have to check that the thread is actually on the run queue!
	1268	*
	1269	* reschedok is an optimized constant propagated from lwkt_schedule() or
	1270	* lwkt_schedule_noresched(). By default it is non-zero, causing a
	1271	* reschedule to be requested if the target thread has a higher priority.
	1272	* The port messaging code will set MSG_NORESCHED and cause reschedok to
	1273	* be 0, prevented undesired reschedules.
	1274	*/
	1275	static __inline
	1276	void
	1277	_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount, int reschedok)
	1278	{
	1279	thread_t otd;
	1280
	1281	if (ntd->td_flags & TDF_RUNQ) {
	1282	if (ntd->td_preemptable && reschedok) {
	1283	ntd->td_preemptable(ntd, ccount); /* YYY +token */
	1284	} else if (reschedok) {
	1285	otd = curthread;
	1286	if (ntd->td_pri > otd->td_pri)
	1287	need_lwkt_resched();
	1288	}
	1289
	1290	/*
	1291	* Give the thread a little fair share scheduler bump if it
	1292	* has been asleep for a while. This is primarily to avoid
	1293	* a degenerate case for interrupt threads where accumulator
	1294	* crosses into negative territory unnecessarily.
	1295	*/
	1296	if (ntd->td_fairq_lticks != ticks) {
	1297	ntd->td_fairq_lticks = ticks;
	1298	ntd->td_fairq_accum += gd->gd_fairq_total_pri;
	1299	if (ntd->td_fairq_accum > TDFAIRQ_MAX(gd))
	1300	ntd->td_fairq_accum = TDFAIRQ_MAX(gd);
	1301	}
	1302	}
	1303	}
	1304
	1305	static __inline
	1306	void
	1307	_lwkt_schedule(thread_t td, int reschedok)
	1308	{
	1309	globaldata_t mygd = mycpu;
	1310
	1311	KASSERT(td != &td->td_gd->gd_idlethread,
	1312	("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
	1313	KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
	1314	crit_enter_gd(mygd);
	1315	KKASSERT(td->td_lwp == NULL \|\| (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
	1316	if (td == mygd->gd_curthread) {
	1317	_lwkt_enqueue(td);
	1318	} else {
	1319	/*
	1320	* If we own the thread, there is no race (since we are in a
	1321	* critical section). If we do not own the thread there might
	1322	* be a race but the target cpu will deal with it.
	1323	*/
	1324	#ifdef SMP
	1325	if (td->td_gd == mygd) {
	1326	_lwkt_enqueue(td);
	1327	_lwkt_schedule_post(mygd, td, 1, reschedok);
	1328	} else {
	1329	lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
	1330	}
	1331	#else
	1332	_lwkt_enqueue(td);
	1333	_lwkt_schedule_post(mygd, td, 1, reschedok);
	1334	#endif
	1335	}
	1336	crit_exit_gd(mygd);
	1337	}
	1338
	1339	void
	1340	lwkt_schedule(thread_t td)
	1341	{
	1342	_lwkt_schedule(td, 1);
	1343	}
	1344
	1345	void
	1346	lwkt_schedule_noresched(thread_t td)
	1347	{
	1348	_lwkt_schedule(td, 0);
	1349	}
	1350
	1351	#ifdef SMP
	1352
	1353	/*
	1354	* When scheduled remotely if frame != NULL the IPIQ is being
	1355	* run via doreti or an interrupt then preemption can be allowed.
	1356	*
	1357	* To allow preemption we have to drop the critical section so only
	1358	* one is present in _lwkt_schedule_post.
	1359	*/
	1360	static void
	1361	lwkt_schedule_remote(void arg, int arg2, struct intrframe frame)
	1362	{
	1363	thread_t td = curthread;
	1364	thread_t ntd = arg;
	1365
	1366	if (frame && ntd->td_preemptable) {
	1367	crit_exit_noyield(td);
	1368	_lwkt_schedule(ntd, 1);
	1369	crit_enter_quick(td);
	1370	} else {
	1371	_lwkt_schedule(ntd, 1);
	1372	}
	1373	}
	1374
	1375	/*
	1376	* Thread migration using a 'Pull' method. The thread may or may not be
	1377	* the current thread. It MUST be descheduled and in a stable state.
	1378	* lwkt_giveaway() must be called on the cpu owning the thread.
	1379	*
	1380	* At any point after lwkt_giveaway() is called, the target cpu may
	1381	* 'pull' the thread by calling lwkt_acquire().
	1382	*
	1383	* We have to make sure the thread is not sitting on a per-cpu tsleep
	1384	* queue or it will blow up when it moves to another cpu.
	1385	*
	1386	* MPSAFE - must be called under very specific conditions.
	1387	*/
	1388	void
	1389	lwkt_giveaway(thread_t td)
	1390	{
	1391	globaldata_t gd = mycpu;
	1392
	1393	crit_enter_gd(gd);
	1394	if (td->td_flags & TDF_TSLEEPQ)
	1395	tsleep_remove(td);
	1396	KKASSERT(td->td_gd == gd);
	1397	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	1398	td->td_flags \|= TDF_MIGRATING;
	1399	crit_exit_gd(gd);
	1400	}
	1401
	1402	void
	1403	lwkt_acquire(thread_t td)
	1404	{
	1405	globaldata_t gd;
	1406	globaldata_t mygd;
	1407	int retry = 10000000;
	1408
	1409	KKASSERT(td->td_flags & TDF_MIGRATING);
	1410	gd = td->td_gd;
	1411	mygd = mycpu;
	1412	if (gd != mycpu) {
	1413	cpu_lfence();
	1414	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1415	crit_enter_gd(mygd);
	1416	DEBUG_PUSH_INFO("lwkt_acquire");
	1417	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) {
	1418	#ifdef SMP
	1419	lwkt_process_ipiq();
	1420	#endif
	1421	cpu_lfence();
	1422	if (--retry == 0) {
	1423	kprintf("lwkt_acquire: stuck: td %p td->td_flags %08x\n",
	1424	td, td->td_flags);
	1425	retry = 10000000;
	1426	}
	1427	}
	1428	DEBUG_POP_INFO();
	1429	cpu_mfence();
	1430	td->td_gd = mygd;
	1431	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1432	td->td_flags &= ~TDF_MIGRATING;
	1433	crit_exit_gd(mygd);
	1434	} else {
	1435	crit_enter_gd(mygd);
	1436	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1437	td->td_flags &= ~TDF_MIGRATING;
	1438	crit_exit_gd(mygd);
	1439	}
	1440	}
	1441
	1442	#endif
	1443
	1444	/*
	1445	* Generic deschedule. Descheduling threads other then your own should be
	1446	* done only in carefully controlled circumstances. Descheduling is
	1447	* asynchronous.
	1448	*
	1449	* This function may block if the cpu has run out of messages.
	1450	*/
	1451	void
	1452	lwkt_deschedule(thread_t td)
	1453	{
	1454	crit_enter();
	1455	#ifdef SMP
	1456	if (td == curthread) {
	1457	_lwkt_dequeue(td);
	1458	} else {
	1459	if (td->td_gd == mycpu) {
	1460	_lwkt_dequeue(td);
	1461	} else {
	1462	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
	1463	}
	1464	}
	1465	#else
	1466	_lwkt_dequeue(td);
	1467	#endif
	1468	crit_exit();
	1469	}
	1470
	1471	/*
	1472	* Set the target thread's priority. This routine does not automatically
	1473	* switch to a higher priority thread, LWKT threads are not designed for
	1474	* continuous priority changes. Yield if you want to switch.
	1475	*/
	1476	void
	1477	lwkt_setpri(thread_t td, int pri)
	1478	{
	1479	KKASSERT(td->td_gd == mycpu);
	1480	if (td->td_pri != pri) {
	1481	KKASSERT(pri >= 0);
	1482	crit_enter();
	1483	if (td->td_flags & TDF_RUNQ) {
	1484	_lwkt_dequeue(td);
	1485	td->td_pri = pri;
	1486	_lwkt_enqueue(td);
	1487	} else {
	1488	td->td_pri = pri;
	1489	}
	1490	crit_exit();
	1491	}
	1492	}
	1493
	1494	/*
	1495	* Set the initial priority for a thread prior to it being scheduled for
	1496	* the first time. The thread MUST NOT be scheduled before or during
	1497	* this call. The thread may be assigned to a cpu other then the current
	1498	* cpu.
	1499	*
	1500	* Typically used after a thread has been created with TDF_STOPPREQ,
	1501	* and before the thread is initially scheduled.
	1502	*/
	1503	void
	1504	lwkt_setpri_initial(thread_t td, int pri)
	1505	{
	1506	KKASSERT(pri >= 0);
	1507	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1508	td->td_pri = pri;
	1509	}
	1510
	1511	void
	1512	lwkt_setpri_self(int pri)
	1513	{
	1514	thread_t td = curthread;
	1515
	1516	KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
	1517	crit_enter();
	1518	if (td->td_flags & TDF_RUNQ) {
	1519	_lwkt_dequeue(td);
	1520	td->td_pri = pri;
	1521	_lwkt_enqueue(td);
	1522	} else {
	1523	td->td_pri = pri;
	1524	}
	1525	crit_exit();
	1526	}
	1527
	1528	/*
	1529	* 1/hz tick (typically 10ms) x TDFAIRQ_SCALE (typ 8) = 80ms full cycle.
	1530	*
	1531	* Example: two competing threads, same priority N. decrement by (2*N)
	1532	* increment by N*8, each thread will get 4 ticks.
	1533	*/
	1534	void
	1535	lwkt_fairq_schedulerclock(thread_t td)
	1536	{
	1537	globaldata_t gd;
	1538
	1539	if (fairq_enable) {
	1540	while (td) {
	1541	gd = td->td_gd;
	1542	if (td != &gd->gd_idlethread) {
	1543	td->td_fairq_accum -= gd->gd_fairq_total_pri;
	1544	if (td->td_fairq_accum < -TDFAIRQ_MAX(gd))
	1545	td->td_fairq_accum = -TDFAIRQ_MAX(gd);
	1546	if (td->td_fairq_accum < 0)
	1547	need_lwkt_resched();
	1548	td->td_fairq_lticks = ticks;
	1549	}
	1550	td = td->td_preempted;
	1551	}
	1552	}
	1553	}
	1554
	1555	static void
	1556	lwkt_fairq_accumulate(globaldata_t gd, thread_t td)
	1557	{
	1558	td->td_fairq_accum += td->td_pri * TDFAIRQ_SCALE;
	1559	if (td->td_fairq_accum > TDFAIRQ_MAX(td->td_gd))
	1560	td->td_fairq_accum = TDFAIRQ_MAX(td->td_gd);
	1561	}
	1562
	1563	/*
	1564	* Migrate the current thread to the specified cpu.
	1565	*
	1566	* This is accomplished by descheduling ourselves from the current cpu
	1567	* and setting td_migrate_gd. The lwkt_switch() code will detect that the
	1568	* 'old' thread wants to migrate after it has been completely switched out
	1569	* and will complete the migration.
	1570	*
	1571	* TDF_MIGRATING prevents scheduling races while the thread is being migrated.
	1572	*
	1573	* We must be sure to release our current process designation (if a user
	1574	* process) before clearing out any tsleepq we are on because the release
	1575	* code may re-add us.
	1576	*
	1577	* We must be sure to remove ourselves from the current cpu's tsleepq
	1578	* before potentially moving to another queue. The thread can be on
	1579	* a tsleepq due to a left-over tsleep_interlock().
	1580	*/
	1581
	1582	void
	1583	lwkt_setcpu_self(globaldata_t rgd)
	1584	{
	1585	#ifdef SMP
	1586	thread_t td = curthread;
	1587
	1588	if (td->td_gd != rgd) {
	1589	crit_enter_quick(td);
	1590
	1591	if (td->td_release)
	1592	td->td_release(td);
	1593	if (td->td_flags & TDF_TSLEEPQ)
	1594	tsleep_remove(td);
	1595
	1596	/*
	1597	* Set TDF_MIGRATING to prevent a spurious reschedule while we are
	1598	* trying to deschedule ourselves and switch away, then deschedule
	1599	* ourself, remove us from tdallq, and set td_migrate_gd. Finally,
	1600	* call lwkt_switch() to complete the operation.
	1601	*/
	1602	td->td_flags \|= TDF_MIGRATING;
	1603	lwkt_deschedule_self(td);
	1604	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1605	td->td_migrate_gd = rgd;
	1606	lwkt_switch();
	1607
	1608	/*
	1609	* We are now on the target cpu
	1610	*/
	1611	KKASSERT(rgd == mycpu);
	1612	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
	1613	crit_exit_quick(td);
	1614	}
	1615	#endif
	1616	}
	1617
	1618	void
	1619	lwkt_migratecpu(int cpuid)
	1620	{
	1621	#ifdef SMP
	1622	globaldata_t rgd;
	1623
	1624	rgd = globaldata_find(cpuid);
	1625	lwkt_setcpu_self(rgd);
	1626	#endif
	1627	}
	1628
	1629	#ifdef SMP
	1630	/*
	1631	* Remote IPI for cpu migration (called while in a critical section so we
	1632	* do not have to enter another one).
	1633	*
	1634	* The thread (td) has already been completely descheduled from the
	1635	* originating cpu and we can simply assert the case. The thread is
	1636	* assigned to the new cpu and enqueued.
	1637	*
	1638	* The thread will re-add itself to tdallq when it resumes execution.
	1639	*/
	1640	static void
	1641	lwkt_setcpu_remote(void *arg)
	1642	{
	1643	thread_t td = arg;
	1644	globaldata_t gd = mycpu;
	1645
	1646	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) == 0);
	1647	td->td_gd = gd;
	1648	cpu_mfence();
	1649	td->td_flags &= ~TDF_MIGRATING;
	1650	KKASSERT(td->td_migrate_gd == NULL);
	1651	KKASSERT(td->td_lwp == NULL \|\| (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
	1652	_lwkt_enqueue(td);
	1653	}
	1654	#endif
	1655
	1656	struct lwp *
	1657	lwkt_preempted_proc(void)
	1658	{
	1659	thread_t td = curthread;
	1660	while (td->td_preempted)
	1661	td = td->td_preempted;
	1662	return(td->td_lwp);
	1663	}
	1664
	1665	/*
	1666	* Create a kernel process/thread/whatever. It shares it's address space
	1667	* with proc0 - ie: kernel only.
	1668	*
	1669	* NOTE! By default new threads are created with the MP lock held. A
	1670	* thread which does not require the MP lock should release it by calling
	1671	* rel_mplock() at the start of the new thread.
	1672	*/
	1673	int
	1674	lwkt_create(void (func)(void ), void arg, struct thread *tdp,
	1675	thread_t template, int tdflags, int cpu, const char *fmt, ...)
	1676	{
	1677	thread_t td;
	1678	__va_list ap;
	1679
	1680	td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
	1681	tdflags);
	1682	if (tdp)
	1683	*tdp = td;
	1684	cpu_set_thread_handler(td, lwkt_exit, func, arg);
	1685
	1686	/*
	1687	* Set up arg0 for 'ps' etc
	1688	*/
	1689	__va_start(ap, fmt);
	1690	kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
	1691	__va_end(ap);
	1692
	1693	/*
	1694	* Schedule the thread to run
	1695	*/
	1696	if ((td->td_flags & TDF_STOPREQ) == 0)
	1697	lwkt_schedule(td);
	1698	else
	1699	td->td_flags &= ~TDF_STOPREQ;
	1700	return 0;
	1701	}
	1702
	1703	/*
	1704	* Destroy an LWKT thread. Warning! This function is not called when
	1705	* a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
	1706	* uses a different reaping mechanism.
	1707	*/
	1708	void
	1709	lwkt_exit(void)
	1710	{
	1711	thread_t td = curthread;
	1712	thread_t std;
	1713	globaldata_t gd;
	1714
	1715	/*
	1716	* Do any cleanup that might block here
	1717	*/
	1718	if (td->td_flags & TDF_VERBOSE)
	1719	kprintf("kthread %p %s has exited\n", td, td->td_comm);
	1720	caps_exit(td);
	1721	biosched_done(td);
	1722	dsched_exit_thread(td);
	1723
	1724	/*
	1725	* Get us into a critical section to interlock gd_freetd and loop
	1726	* until we can get it freed.
	1727	*
	1728	* We have to cache the current td in gd_freetd because objcache_put()ing
	1729	* it would rip it out from under us while our thread is still active.
	1730	*/
	1731	gd = mycpu;
	1732	crit_enter_quick(td);
	1733	while ((std = gd->gd_freetd) != NULL) {
	1734	KKASSERT((std->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) == 0);
	1735	gd->gd_freetd = NULL;
	1736	objcache_put(thread_cache, std);
	1737	}
	1738
	1739	/*
	1740	* Remove thread resources from kernel lists and deschedule us for
	1741	* the last time. We cannot block after this point or we may end
	1742	* up with a stale td on the tsleepq.
	1743	*/
	1744	if (td->td_flags & TDF_TSLEEPQ)
	1745	tsleep_remove(td);
	1746	lwkt_deschedule_self(td);
	1747	lwkt_remove_tdallq(td);
	1748	KKASSERT(td->td_refs == 0);
	1749
	1750	/*
	1751	* Final cleanup
	1752	*/
	1753	KKASSERT(gd->gd_freetd == NULL);
	1754	if (td->td_flags & TDF_ALLOCATED_THREAD)
	1755	gd->gd_freetd = td;
	1756	cpu_thread_exit();
	1757	}
	1758
	1759	void
	1760	lwkt_remove_tdallq(thread_t td)
	1761	{
	1762	KKASSERT(td->td_gd == mycpu);
	1763	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1764	}
	1765
	1766	/*
	1767	* Code reduction and branch prediction improvements. Call/return
	1768	* overhead on modern cpus often degenerates into 0 cycles due to
	1769	* the cpu's branch prediction hardware and return pc cache. We
	1770	* can take advantage of this by not inlining medium-complexity
	1771	* functions and we can also reduce the branch prediction impact
	1772	* by collapsing perfectly predictable branches into a single
	1773	* procedure instead of duplicating it.
	1774	*
	1775	* Is any of this noticeable? Probably not, so I'll take the
	1776	* smaller code size.
	1777	*/
	1778	void
	1779	crit_exit_wrapper(__DEBUG_CRIT_ARG__)
	1780	{
	1781	_crit_exit(mycpu __DEBUG_CRIT_PASS_ARG__);
	1782	}
	1783
	1784	void
	1785	crit_panic(void)
	1786	{
	1787	thread_t td = curthread;
	1788	int lcrit = td->td_critcount;
	1789
	1790	td->td_critcount = 0;
	1791	panic("td_critcount is/would-go negative! %p %d", td, lcrit);
	1792	/* NOT REACHED */
	1793	}
	1794
	1795	#ifdef SMP
	1796
	1797	/*
	1798	* Called from debugger/panic on cpus which have been stopped. We must still
	1799	* process the IPIQ while stopped, even if we were stopped while in a critical
	1800	* section (XXX).
	1801	*
	1802	* If we are dumping also try to process any pending interrupts. This may
	1803	* or may not work depending on the state of the cpu at the point it was
	1804	* stopped.
	1805	*/
	1806	void
	1807	lwkt_smp_stopped(void)
	1808	{
	1809	globaldata_t gd = mycpu;
	1810
	1811	crit_enter_gd(gd);
	1812	if (dumping) {
	1813	lwkt_process_ipiq();
	1814	splz();
	1815	} else {
	1816	lwkt_process_ipiq();
	1817	}
	1818	crit_exit_gd(gd);
	1819	}
	1820
	1821	#endif