gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003-2011 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*/
	34
	35	/*
	36	* Each cpu in a system has its own self-contained light weight kernel
	37	* thread scheduler, which means that generally speaking we only need
	38	* to use a critical section to avoid problems. Foreign thread
	39	* scheduling is queued via (async) IPIs.
	40	*/
	41
	42	#include <sys/param.h>
	43	#include <sys/systm.h>
	44	#include <sys/kernel.h>
	45	#include <sys/proc.h>
	46	#include <sys/rtprio.h>
	47	#include <sys/kinfo.h>
	48	#include <sys/queue.h>
	49	#include <sys/sysctl.h>
	50	#include <sys/kthread.h>
	51	#include <machine/cpu.h>
	52	#include <sys/lock.h>
	53	#include <sys/spinlock.h>
	54	#include <sys/ktr.h>
	55
	56	#include <sys/thread2.h>
	57	#include <sys/spinlock2.h>
	58
	59	#include <sys/dsched.h>
	60
	61	#include <vm/vm.h>
	62	#include <vm/vm_param.h>
	63	#include <vm/vm_kern.h>
	64	#include <vm/vm_object.h>
	65	#include <vm/vm_page.h>
	66	#include <vm/vm_map.h>
	67	#include <vm/vm_pager.h>
	68	#include <vm/vm_extern.h>
	69
	70	#include <machine/stdarg.h>
	71	#include <machine/smp.h>
	72	#include <machine/clock.h>
	73
	74	#ifdef _KERNEL_VIRTUAL
	75	#include <pthread.h>
	76	#endif
	77
	78	#define LOOPMASK
	79
	80	#if !defined(KTR_CTXSW)
	81	#define KTR_CTXSW KTR_ALL
	82	#endif
	83	KTR_INFO_MASTER(ctxsw);
	84	KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p", int cpu, struct thread *td);
	85	KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p", int cpu, struct thread *td);
	86	KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s", struct thread td, char comm);
	87	KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>", struct thread *td);
	88
	89	static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");
	90
	91	#ifdef INVARIANTS
	92	static int panic_on_cscount = 0;
	93	#endif
	94	static int64_t switch_count = 0;
	95	static int64_t preempt_hit = 0;
	96	static int64_t preempt_miss = 0;
	97	static int64_t preempt_weird = 0;
	98	static int lwkt_use_spin_port;
	99	static struct objcache *thread_cache;
	100	int cpu_mwait_spin = 0;
	101
	102	static void lwkt_schedule_remote(void arg, int arg2, struct intrframe frame);
	103	static void lwkt_setcpu_remote(void *arg);
	104
	105	/*
	106	* We can make all thread ports use the spin backend instead of the thread
	107	* backend. This should only be set to debug the spin backend.
	108	*/
	109	TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);
	110
	111	#ifdef INVARIANTS
	112	SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0,
	113	"Panic if attempting to switch lwkt's while mastering cpusync");
	114	#endif
	115	SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0,
	116	"Number of switched threads");
	117	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0,
	118	"Successful preemption events");
	119	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0,
	120	"Failed preemption events");
	121	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0,
	122	"Number of preempted threads.");
	123	static int fairq_enable = 0;
	124	SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW,
	125	&fairq_enable, 0, "Turn on fairq priority accumulators");
	126	static int fairq_bypass = -1;
	127	SYSCTL_INT(_lwkt, OID_AUTO, fairq_bypass, CTLFLAG_RW,
	128	&fairq_bypass, 0, "Allow fairq to bypass td on token failure");
	129	extern int lwkt_sched_debug;
	130	int lwkt_sched_debug = 0;
	131	SYSCTL_INT(_lwkt, OID_AUTO, sched_debug, CTLFLAG_RW,
	132	&lwkt_sched_debug, 0, "Scheduler debug");
	133	static u_int lwkt_spin_loops = 10;
	134	SYSCTL_UINT(_lwkt, OID_AUTO, spin_loops, CTLFLAG_RW,
	135	&lwkt_spin_loops, 0, "Scheduler spin loops until sorted decon");
	136	static int preempt_enable = 1;
	137	SYSCTL_INT(_lwkt, OID_AUTO, preempt_enable, CTLFLAG_RW,
	138	&preempt_enable, 0, "Enable preemption");
	139	static int lwkt_cache_threads = 0;
	140	SYSCTL_INT(_lwkt, OID_AUTO, cache_threads, CTLFLAG_RD,
	141	&lwkt_cache_threads, 0, "thread+kstack cache");
	142
	143	/*
	144	* These helper procedures handle the runq, they can only be called from
	145	* within a critical section.
	146	*
	147	* WARNING! Prior to SMP being brought up it is possible to enqueue and
	148	* dequeue threads belonging to other cpus, so be sure to use td->td_gd
	149	* instead of 'mycpu' when referencing the globaldata structure. Once
	150	* SMP is live, enqueueing and dequeueing only occur on the current cpu.
	151	*/
	152	static __inline
	153	void
	154	_lwkt_dequeue(thread_t td)
	155	{
	156	if (td->td_flags & TDF_RUNQ) {
	157	struct globaldata *gd = td->td_gd;
	158
	159	td->td_flags &= ~TDF_RUNQ;
	160	TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
	161	--gd->gd_tdrunqcount;
	162	if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL)
	163	atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING);
	164	}
	165	}
	166
	167	/*
	168	* Priority enqueue.
	169	*
	170	* There are a limited number of lwkt threads runnable since user
	171	* processes only schedule one at a time per cpu. However, there can
	172	* be many user processes in kernel mode exiting from a tsleep() which
	173	* become runnable.
	174	*
	175	* We scan the queue in both directions to help deal with degenerate
	176	* situations when hundreds or thousands (or more) threads are runnable.
	177	*
	178	* NOTE: lwkt_schedulerclock() will force a round-robin based on td_pri and
	179	* will ignore user priority. This is to ensure that user threads in
	180	* kernel mode get cpu at some point regardless of what the user
	181	* scheduler thinks.
	182	*/
	183	static __inline
	184	void
	185	_lwkt_enqueue(thread_t td)
	186	{
	187	thread_t xtd; /* forward scan */
	188	thread_t rtd; /* reverse scan */
	189
	190	if ((td->td_flags & (TDF_RUNQ\|TDF_MIGRATING\|TDF_BLOCKQ)) == 0) {
	191	struct globaldata *gd = td->td_gd;
	192
	193	td->td_flags \|= TDF_RUNQ;
	194	xtd = TAILQ_FIRST(&gd->gd_tdrunq);
	195	if (xtd == NULL) {
	196	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	197	atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
	198	} else {
	199	/*
	200	* NOTE: td_upri - higher numbers more desireable, same sense
	201	* as td_pri (typically reversed from lwp_upri).
	202	*
	203	* In the equal priority case we want the best selection
	204	* at the beginning so the less desireable selections know
	205	* that they have to setrunqueue/go-to-another-cpu, even
	206	* though it means switching back to the 'best' selection.
	207	* This also avoids degenerate situations when many threads
	208	* are runnable or waking up at the same time.
	209	*
	210	* If upri matches exactly place at end/round-robin.
	211	*/
	212	rtd = TAILQ_LAST(&gd->gd_tdrunq, lwkt_queue);
	213
	214	while (xtd &&
	215	(xtd->td_pri > td->td_pri \|\|
	216	(xtd->td_pri == td->td_pri &&
	217	xtd->td_upri >= td->td_upri))) {
	218	xtd = TAILQ_NEXT(xtd, td_threadq);
	219
	220	/*
	221	* Doing a reverse scan at the same time is an optimization
	222	* for the insert-closer-to-tail case that avoids having to
	223	* scan the entire list. This situation can occur when
	224	* thousands of threads are woken up at the same time.
	225	*/
	226	if (rtd->td_pri > td->td_pri \|\|
	227	(rtd->td_pri == td->td_pri &&
	228	rtd->td_upri >= td->td_upri)) {
	229	TAILQ_INSERT_AFTER(&gd->gd_tdrunq, rtd, td, td_threadq);
	230	goto skip;
	231	}
	232	rtd = TAILQ_PREV(rtd, lwkt_queue, td_threadq);
	233	}
	234	if (xtd)
	235	TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
	236	else
	237	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	238	}
	239	skip:
	240	++gd->gd_tdrunqcount;
	241
	242	/*
	243	* Request a LWKT reschedule if we are now at the head of the queue.
	244	*/
	245	if (TAILQ_FIRST(&gd->gd_tdrunq) == td)
	246	need_lwkt_resched();
	247	}
	248	}
	249
	250	static boolean_t
	251	_lwkt_thread_ctor(void obj, void privdata, int ocflags)
	252	{
	253	struct thread td = (struct thread )obj;
	254
	255	td->td_kstack = NULL;
	256	td->td_kstack_size = 0;
	257	td->td_flags = TDF_ALLOCATED_THREAD;
	258	td->td_mpflags = 0;
	259	return (1);
	260	}
	261
	262	static void
	263	_lwkt_thread_dtor(void obj, void privdata)
	264	{
	265	struct thread td = (struct thread )obj;
	266
	267	KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
	268	("_lwkt_thread_dtor: not allocated from objcache"));
	269	KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
	270	td->td_kstack_size > 0,
	271	("_lwkt_thread_dtor: corrupted stack"));
	272	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	273	td->td_kstack = NULL;
	274	td->td_flags = 0;
	275	}
	276
	277	/*
	278	* Initialize the lwkt s/system.
	279	*
	280	* Nominally cache up to 32 thread + kstack structures. Cache more on
	281	* systems with a lot of cpu cores.
	282	*/
	283	static void
	284	lwkt_init(void)
	285	{
	286	TUNABLE_INT("lwkt.cache_threads", &lwkt_cache_threads);
	287	if (lwkt_cache_threads == 0) {
	288	lwkt_cache_threads = ncpus * 4;
	289	if (lwkt_cache_threads < 32)
	290	lwkt_cache_threads = 32;
	291	}
	292	thread_cache = objcache_create_mbacked(
	293	M_THREAD, sizeof(struct thread),
	294	0, lwkt_cache_threads,
	295	_lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
	296	}
	297	SYSINIT(lwkt_init, SI_BOOT2_LWKT_INIT, SI_ORDER_FIRST, lwkt_init, NULL);
	298
	299	/*
	300	* Schedule a thread to run. As the current thread we can always safely
	301	* schedule ourselves, and a shortcut procedure is provided for that
	302	* function.
	303	*
	304	* (non-blocking, self contained on a per cpu basis)
	305	*/
	306	void
	307	lwkt_schedule_self(thread_t td)
	308	{
	309	KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
	310	crit_enter_quick(td);
	311	KASSERT(td != &td->td_gd->gd_idlethread,
	312	("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
	313	KKASSERT(td->td_lwp == NULL \|\|
	314	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	315	_lwkt_enqueue(td);
	316	crit_exit_quick(td);
	317	}
	318
	319	/*
	320	* Deschedule a thread.
	321	*
	322	* (non-blocking, self contained on a per cpu basis)
	323	*/
	324	void
	325	lwkt_deschedule_self(thread_t td)
	326	{
	327	crit_enter_quick(td);
	328	_lwkt_dequeue(td);
	329	crit_exit_quick(td);
	330	}
	331
	332	/*
	333	* LWKTs operate on a per-cpu basis
	334	*
	335	* WARNING! Called from early boot, 'mycpu' may not work yet.
	336	*/
	337	void
	338	lwkt_gdinit(struct globaldata *gd)
	339	{
	340	TAILQ_INIT(&gd->gd_tdrunq);
	341	TAILQ_INIT(&gd->gd_tdallq);
	342	}
	343
	344	/*
	345	* Create a new thread. The thread must be associated with a process context
	346	* or LWKT start address before it can be scheduled. If the target cpu is
	347	* -1 the thread will be created on the current cpu.
	348	*
	349	* If you intend to create a thread without a process context this function
	350	* does everything except load the startup and switcher function.
	351	*/
	352	thread_t
	353	lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
	354	{
	355	static int cpu_rotator;
	356	globaldata_t gd = mycpu;
	357	void *stack;
	358
	359	/*
	360	* If static thread storage is not supplied allocate a thread. Reuse
	361	* a cached free thread if possible. gd_freetd is used to keep an exiting
	362	* thread intact through the exit.
	363	*/
	364	if (td == NULL) {
	365	crit_enter_gd(gd);
	366	if ((td = gd->gd_freetd) != NULL) {
	367	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK\|
	368	TDF_RUNQ)) == 0);
	369	gd->gd_freetd = NULL;
	370	} else {
	371	td = objcache_get(thread_cache, M_WAITOK);
	372	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK\|
	373	TDF_RUNQ)) == 0);
	374	}
	375	crit_exit_gd(gd);
	376	KASSERT((td->td_flags &
	377	(TDF_ALLOCATED_THREAD\|TDF_RUNNING\|TDF_PREEMPT_LOCK)) ==
	378	TDF_ALLOCATED_THREAD,
	379	("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
	380	flags \|= td->td_flags & (TDF_ALLOCATED_THREAD\|TDF_ALLOCATED_STACK);
	381	}
	382
	383	/*
	384	* Try to reuse cached stack.
	385	*/
	386	if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
	387	if (flags & TDF_ALLOCATED_STACK) {
	388	kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size);
	389	stack = NULL;
	390	}
	391	}
	392	if (stack == NULL) {
	393	if (cpu < 0)
	394	stack = (void *)kmem_alloc_stack(&kernel_map, stksize, 0);
	395	else
	396	stack = (void *)kmem_alloc_stack(&kernel_map, stksize,
	397	KM_CPU(cpu));
	398	flags \|= TDF_ALLOCATED_STACK;
	399	}
	400	if (cpu < 0) {
	401	cpu = ++cpu_rotator;
	402	cpu_ccfence();
	403	cpu %= ncpus;
	404	}
	405	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
	406	return(td);
	407	}
	408
	409	/*
	410	* Initialize a preexisting thread structure. This function is used by
	411	* lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
	412	*
	413	* All threads start out in a critical section at a priority of
	414	* TDPRI_KERN_DAEMON. Higher level code will modify the priority as
	415	* appropriate. This function may send an IPI message when the
	416	* requested cpu is not the current cpu and consequently gd_tdallq may
	417	* not be initialized synchronously from the point of view of the originating
	418	* cpu.
	419	*
	420	* NOTE! we have to be careful in regards to creating threads for other cpus
	421	* if SMP has not yet been activated.
	422	*/
	423	static void
	424	lwkt_init_thread_remote(void *arg)
	425	{
	426	thread_t td = arg;
	427
	428	/*
	429	* Protected by critical section held by IPI dispatch
	430	*/
	431	TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
	432	}
	433
	434	/*
	435	* lwkt core thread structural initialization.
	436	*
	437	* NOTE: All threads are initialized as mpsafe threads.
	438	*/
	439	void
	440	lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
	441	struct globaldata *gd)
	442	{
	443	globaldata_t mygd = mycpu;
	444
	445	bzero(td, sizeof(struct thread));
	446	td->td_kstack = stack;
	447	td->td_kstack_size = stksize;
	448	td->td_flags = flags;
	449	td->td_mpflags = 0;
	450	td->td_type = TD_TYPE_GENERIC;
	451	td->td_gd = gd;
	452	td->td_pri = TDPRI_KERN_DAEMON;
	453	td->td_critcount = 1;
	454	td->td_toks_have = NULL;
	455	td->td_toks_stop = &td->td_toks_base;
	456	if (lwkt_use_spin_port \|\| (flags & TDF_FORCE_SPINPORT)) {
	457	lwkt_initport_spin(&td->td_msgport, td,
	458	(flags & TDF_FIXEDCPU) ? TRUE : FALSE);
	459	} else {
	460	lwkt_initport_thread(&td->td_msgport, td);
	461	}
	462	pmap_init_thread(td);
	463	/*
	464	* Normally initializing a thread for a remote cpu requires sending an
	465	* IPI. However, the idlethread is setup before the other cpus are
	466	* activated so we have to treat it as a special case. XXX manipulation
	467	* of gd_tdallq requires the BGL.
	468	*/
	469	if (gd == mygd \|\| td == &gd->gd_idlethread) {
	470	crit_enter_gd(mygd);
	471	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	472	crit_exit_gd(mygd);
	473	} else {
	474	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
	475	}
	476	dsched_enter_thread(td);
	477	}
	478
	479	void
	480	lwkt_set_comm(thread_t td, const char *ctl, ...)
	481	{
	482	__va_list va;
	483
	484	__va_start(va, ctl);
	485	kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
	486	__va_end(va);
	487	KTR_LOG(ctxsw_newtd, td, td->td_comm);
	488	}
	489
	490	/*
	491	* Prevent the thread from getting destroyed. Note that unlike PHOLD/PRELE
	492	* this does not prevent the thread from migrating to another cpu so the
	493	* gd_tdallq state is not protected by this.
	494	*/
	495	void
	496	lwkt_hold(thread_t td)
	497	{
	498	atomic_add_int(&td->td_refs, 1);
	499	}
	500
	501	void
	502	lwkt_rele(thread_t td)
	503	{
	504	KKASSERT(td->td_refs > 0);
	505	atomic_add_int(&td->td_refs, -1);
	506	}
	507
	508	void
	509	lwkt_free_thread(thread_t td)
	510	{
	511	KKASSERT(td->td_refs == 0);
	512	KKASSERT((td->td_flags & (TDF_RUNNING \| TDF_PREEMPT_LOCK \|
	513	TDF_RUNQ \| TDF_TSLEEPQ)) == 0);
	514	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	515	objcache_put(thread_cache, td);
	516	} else if (td->td_flags & TDF_ALLOCATED_STACK) {
	517	/* client-allocated struct with internally allocated stack */
	518	KASSERT(td->td_kstack && td->td_kstack_size > 0,
	519	("lwkt_free_thread: corrupted stack"));
	520	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	521	td->td_kstack = NULL;
	522	td->td_kstack_size = 0;
	523	}
	524
	525	KTR_LOG(ctxsw_deadtd, td);
	526	}
	527
	528
	529	/*
	530	* Switch to the next runnable lwkt. If no LWKTs are runnable then
	531	* switch to the idlethread. Switching must occur within a critical
	532	* section to avoid races with the scheduling queue.
	533	*
	534	* We always have full control over our cpu's run queue. Other cpus
	535	* that wish to manipulate our queue must use the cpu_*msg() calls to
	536	* talk to our cpu, so a critical section is all that is needed and
	537	* the result is very, very fast thread switching.
	538	*
	539	* The LWKT scheduler uses a fixed priority model and round-robins at
	540	* each priority level. User process scheduling is a totally
	541	* different beast and LWKT priorities should not be confused with
	542	* user process priorities.
	543	*
	544	* PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
	545	* is not called by the current thread in the preemption case, only when
	546	* the preempting thread blocks (in order to return to the original thread).
	547	*
	548	* SPECIAL NOTE ON SWITCH ATOMICY: Certain operations such as thread
	549	* migration and tsleep deschedule the current lwkt thread and call
	550	* lwkt_switch(). In particular, the target cpu of the migration fully
	551	* expects the thread to become non-runnable and can deadlock against
	552	* cpusync operations if we run any IPIs prior to switching the thread out.
	553	*
	554	* WE MUST BE VERY CAREFUL NOT TO RUN SPLZ DIRECTLY OR INDIRECTLY IF
	555	* THE CURRENT THREAD HAS BEEN DESCHEDULED!
	556	*/
	557	void
	558	lwkt_switch(void)
	559	{
	560	globaldata_t gd = mycpu;
	561	thread_t td = gd->gd_curthread;
	562	thread_t ntd;
	563	int upri;
	564	#ifdef LOOPMASK
	565	uint64_t tsc_base = rdtsc();
	566	#endif
	567
	568	KKASSERT(gd->gd_processing_ipiq == 0);
	569	KKASSERT(td->td_flags & TDF_RUNNING);
	570
	571	/*
	572	* Switching from within a 'fast' (non thread switched) interrupt or IPI
	573	* is illegal. However, we may have to do it anyway if we hit a fatal
	574	* kernel trap or we have panicked.
	575	*
	576	* If this case occurs, zero both nesting levels, recurse, then restore.
	577	*/
	578	if (gd->gd_intr_nesting_level) {
	579	int savegdnest;
	580	int savegdtrap;
	581
	582	if (gd->gd_trap_nesting_level == 0 && panic_cpu_gd != mycpu) {
	583	panic("lwkt_switch: Attempt to switch from a "
	584	"fast interrupt, ipi, or hard code section, "
	585	"td %p\n",
	586	td);
	587	} else {
	588	savegdnest = gd->gd_intr_nesting_level;
	589	savegdtrap = gd->gd_trap_nesting_level;
	590	gd->gd_intr_nesting_level = 0;
	591	gd->gd_trap_nesting_level = 0;
	592	if ((td->td_flags & TDF_PANICWARN) == 0) {
	593	td->td_flags \|= TDF_PANICWARN;
	594	kprintf("Warning: thread switch from interrupt, IPI, "
	595	"or hard code section.\n"
	596	"thread %p (%s)\n", td, td->td_comm);
	597	print_backtrace(-1);
	598	}
	599	lwkt_switch();
	600	gd->gd_intr_nesting_level = savegdnest;
	601	gd->gd_trap_nesting_level = savegdtrap;
	602	return;
	603	}
	604	}
	605
	606	/*
	607	* Release our current user process designation if we are blocking
	608	* or if a user reschedule was requested.
	609	*
	610	* NOTE: This function is NOT called if we are switching into or
	611	* returning from a preemption.
	612	*
	613	* NOTE: Releasing our current user process designation may cause
	614	* it to be assigned to another thread, which in turn will
	615	* cause us to block in the usched acquire code when we attempt
	616	* to return to userland.
	617	*
	618	* NOTE: On SMP systems this can be very nasty when heavy token
	619	* contention is present so we want to be careful not to
	620	* release the designation gratuitously.
	621	*/
	622	if (td->td_release &&
	623	(user_resched_wanted() \|\| (td->td_flags & TDF_RUNQ) == 0)) {
	624	td->td_release(td);
	625	}
	626
	627	/*
	628	* Release all tokens. Once we do this we must remain in the critical
	629	* section and cannot run IPIs or other interrupts until we switch away
	630	* because they may implode if they try to get a token using our thread
	631	* context.
	632	*/
	633	crit_enter_gd(gd);
	634	if (TD_TOKS_HELD(td))
	635	lwkt_relalltokens(td);
	636
	637	/*
	638	* We had better not be holding any spin locks, but don't get into an
	639	* endless panic loop.
	640	*/
	641	KASSERT(gd->gd_spinlocks == 0 \|\| panicstr != NULL,
	642	("lwkt_switch: still holding %d exclusive spinlocks!",
	643	gd->gd_spinlocks));
	644
	645	#ifdef INVARIANTS
	646	if (td->td_cscount) {
	647	kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
	648	td);
	649	if (panic_on_cscount)
	650	panic("switching while mastering cpusync");
	651	}
	652	#endif
	653
	654	/*
	655	* If we had preempted another thread on this cpu, resume the preempted
	656	* thread. This occurs transparently, whether the preempted thread
	657	* was scheduled or not (it may have been preempted after descheduling
	658	* itself).
	659	*
	660	* The preempted thread is flagged TDF_PREEMPT_DONE and its contention
	661	* counter is reset before we jump to the switch code, bypassing the
	662	* run queue scan entirely.
	663	*/
	664	if ((ntd = td->td_preempted) != NULL) {
	665	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
	666	ntd->td_flags \|= TDF_PREEMPT_DONE;
	667	ntd->td_contended = 0; /* reset contended */
	668
	669	/*
	670	* The interrupt may have woken a thread up, we need to properly
	671	* set the reschedule flag if the thread we are returning to is
	672	* not the one at the head of the run queue.
	673	*
	674	* The interrupt may not have descheduled.
	675	*/
	676	if (TAILQ_FIRST(&gd->gd_tdrunq) != ntd)
	677	need_lwkt_resched();
	678	goto havethread_preempted;
	679	}
	680
	681	/*
	682	* Figure out switch target. If we cannot switch to our desired target
	683	* look for a thread that we can switch to.
	684	*
	685	* NOTE! The limited spin loop and related parameters are extremely
	686	* important for system performance, particularly for pipes and
	687	* concurrent conflicting VM faults.
	688	*/
	689	clear_lwkt_resched();
	690	ntd = TAILQ_FIRST(&gd->gd_tdrunq);
	691
	692	if (ntd) {
	693	do {
	694	if (TD_TOKS_NOT_HELD(ntd) \|\|
	695	lwkt_getalltokens(ntd, (ntd->td_contended > lwkt_spin_loops)))
	696	{
	697	goto havethread;
	698	}
	699	++gd->gd_cnt.v_lock_colls;
	700	++ntd->td_contended; /* overflow ok */
	701	#ifdef LOOPMASK
	702	if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
	703	kprintf("lwkt_switch: excessive contended %d "
	704	"thread %p\n", ntd->td_contended, ntd);
	705	tsc_base = rdtsc();
	706	}
	707	#endif
	708	} while (ntd->td_contended < (lwkt_spin_loops >> 1));
	709	upri = ntd->td_upri;
	710
	711	/*
	712	* Bleh, the thread we wanted to switch to has a contended token.
	713	* See if we can switch to another thread.
	714	*
	715	* We generally don't want to do this because it represents a
	716	* priority inversion. Stop scanning when the next candidate has
	717	* td_pri below TDPRI_KERN_LPSCHED AND a lower upri than the
	718	* previously examined thread.
	719	*/
	720	while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
	721	if (ntd->td_pri < TDPRI_KERN_LPSCHED && upri > ntd->td_upri)
	722	break;
	723	upri = ntd->td_upri;
	724
	725	/*
	726	* Try this one.
	727	*/
	728	if (TD_TOKS_NOT_HELD(ntd) \|\|
	729	lwkt_getalltokens(ntd, (ntd->td_contended > lwkt_spin_loops))) {
	730	goto havethread;
	731	}
	732	++ntd->td_contended; /* overflow ok */
	733	++gd->gd_cnt.v_lock_colls;
	734	}
	735
	736	/*
	737	* Fall through, switch to idle thread to get us out of the current
	738	* context. Since we were contended, prevent HLT by flagging a
	739	* LWKT reschedule.
	740	*/
	741	need_lwkt_resched();
	742	}
	743
	744	/*
	745	* We either contended on ntd or the runq is empty. We must switch
	746	* through the idle thread to get out of the current context.
	747	*/
	748	ntd = &gd->gd_idlethread;
	749	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	750	ASSERT_NO_TOKENS_HELD(ntd);
	751	cpu_time.cp_msg[0] = 0;
	752	goto haveidle;
	753
	754	havethread:
	755	/*
	756	* Normal switch to a non-idle thread: clear its wait message and
	757	* contention counter, count the switch, and clear gd_idle_repeat.
	758	*/
	759	ntd->td_wmesg = NULL;
	760	ntd->td_contended = 0; /* reset once scheduled */
	761	++gd->gd_cnt.v_swtch;
	762	gd->gd_idle_repeat = 0;
	763
	764	havethread_preempted:
	765	/*
	766	* Join point for resuming a preempted thread; it skips the havethread
	767	* bookkeeping above. No work is done here, the empty statement below
	768	* only gives the label something to attach to.
	769	*/
	770	;
	771	haveidle:
	772	KASSERT(ntd->td_critcount,
	773	("priority problem in lwkt_switch %d %d",
	774	td->td_critcount, ntd->td_critcount));
	775
	776	if (td != ntd) {
	777	/*
	778	* Execute the actual thread switch operation. This function
	779	* returns to the current thread and returns the previous thread
	780	* (which may be different from the thread we switched to).
	781	*
	782	* We are responsible for marking ntd as TDF_RUNNING.
	783	*/
	784	KKASSERT((ntd->td_flags & TDF_RUNNING) == 0);
	785	++switch_count;
	786	KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd);
	787	ntd->td_flags \|= TDF_RUNNING;
	788	lwkt_switch_return(td->td_switch(ntd));
	789	/* ntd invalid, td_switch() can return a different thread_t */
	790	}
	791
	792	/*
	793	* catch-all. XXX is this strictly needed?
	794	*/
	795	splz_check();
	796
	797	/* NOTE: current cpu may have changed after switch */
	798	crit_exit_quick(td);
	799	}
	800
	801	/*
	802	* Called by assembly in the td_switch (thread restore path) for thread
	803	* bootstrap cases which do not 'return' to lwkt_switch().
	804	*/
	805	void
	806	lwkt_switch_return(thread_t otd)
	807	{
	808	globaldata_t rgd;
	809	#ifdef LOOPMASK
	810	uint64_t tsc_base = rdtsc();
	811	#endif
	812	int exiting;
	813
	814	exiting = otd->td_flags & TDF_EXITING;
	815	cpu_ccfence();
	816
	817	/*
	818	* Check if otd was migrating. Now that we are on ntd we can finish
	819	* up the migration. This is a bit messy but it is the only place
	820	* where td is known to be fully descheduled.
	821	*
	822	* We can only activate the migration if otd was migrating but not
	823	* held on the cpu due to a preemption chain. We still have to
	824	* clear TDF_RUNNING on the old thread either way.
	825	*
	826	* We are responsible for clearing the previously running thread's
	827	* TDF_RUNNING.
	828	*/
	829	if ((rgd = otd->td_migrate_gd) != NULL &&
	830	(otd->td_flags & TDF_PREEMPT_LOCK) == 0) {
	831	KKASSERT((otd->td_flags & (TDF_MIGRATING \| TDF_RUNNING)) ==
	832	(TDF_MIGRATING \| TDF_RUNNING));
	833	otd->td_migrate_gd = NULL;
	834	otd->td_flags &= ~TDF_RUNNING;
	835	lwkt_send_ipiq(rgd, lwkt_setcpu_remote, otd);
	836	} else {
	837	otd->td_flags &= ~TDF_RUNNING;
	838	}
	839
	840	/*
	841	* Final exit validations (see lwp_wait()). Note that otd becomes
	842	* invalid the instant we set TDF_MP_EXITSIG.
	843	*
	844	* Use the EXITING status loaded from before we clear TDF_RUNNING,
	845	* because if it is not set otd becomes invalid the instant we clear
	846	* TDF_RUNNING on it (otherwise, if the system is fast enough, we
	847	* might 'steal' TDF_EXITING from another switch-return!).
	848	*/
	849	while (exiting) {
	850	u_int mpflags;
	851
	852	mpflags = otd->td_mpflags;
	853	cpu_ccfence();
	854
	855	if (mpflags & TDF_MP_EXITWAIT) {
	856	if (atomic_cmpset_int(&otd->td_mpflags, mpflags,
	857	mpflags \| TDF_MP_EXITSIG)) {
	858	wakeup(otd);
	859	break;
	860	}
	861	} else {
	862	if (atomic_cmpset_int(&otd->td_mpflags, mpflags,
	863	mpflags \| TDF_MP_EXITSIG)) {
	864	wakeup(otd);
	865	break;
	866	}
	867	}
	868
	869	#ifdef LOOPMASK
	870	if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
	871	kprintf("lwkt_switch_return: excessive TDF_EXITING "
	872	"thread %p\n", otd);
	873	tsc_base = rdtsc();
	874	}
	875	#endif
	876	}
	877	}
	878
	879	/*
	880	* Request that the target thread preempt the current thread. Preemption
	881	* can occur only:
	882	*
	883	* - If our critical section is the one that we were called with
	884	* - The relative priority of the target thread is higher
	885	* - The target is not excessively interrupt-nested via td_nest_count
	886	* - The target thread holds no tokens.
	887	* - The target thread is not already scheduled and belongs to the
	888	* current cpu.
	889	* - The current thread is not holding any spin-locks.
	890	*
	891	* THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
	892	* this is called via lwkt_schedule() through the td_preemptable callback.
	893	* critcount is the managed critical priority that we should ignore in order
	894	* to determine whether preemption is possible (aka usually just the crit
	895	* priority of lwkt_schedule() itself).
	896	*
	897	* Preemption is typically limited to interrupt threads.
	898	*
	899	* Operation works in a fairly straight-forward manner. The normal
	900	* scheduling code is bypassed and we switch directly to the target
	901	* thread. When the target thread attempts to block or switch away
	902	* code at the base of lwkt_switch() will switch directly back to our
	903	* thread. Our thread is able to retain whatever tokens it holds and
	904	* if the target needs one of them the target will switch back to us
	905	* and reschedule itself normally.
	906	*/
	907	void
	908	lwkt_preempt(thread_t ntd, int critcount)
	909	{
	910	struct globaldata *gd = mycpu;
	911	thread_t xtd;
	912	thread_t td;
	913	int save_gd_intr_nesting_level;
	914
	915	/*
	916	* The caller has put us in a critical section. We can only preempt
	917	* if the caller of the caller was not in a critical section (basically
	918	* a local interrupt), as determined by the 'critcount' parameter. We
	919	* also can't preempt if the caller is holding any spinlocks (even if
	920	* he isn't in a critical section). This also handles the tokens test.
	921	*
	922	* YYY The target thread must be in a critical section (else it must
	923	* inherit our critical section? I dunno yet).
	924	*/
	925	KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri));
	926
	927	td = gd->gd_curthread;
	928	if (preempt_enable == 0) {
	929	++preempt_miss;
	930	return;
	931	}
	932	if (ntd->td_pri <= td->td_pri) {
	933	++preempt_miss;
	934	return;
	935	}
	936	if (td->td_critcount > critcount) {
	937	++preempt_miss;
	938	return;
	939	}
	940	if (td->td_nest_count >= 2) {
	941	++preempt_miss;
	942	return;
	943	}
	944	if (td->td_cscount) {
	945	++preempt_miss;
	946	return;
	947	}
	948	if (ntd->td_gd != gd) {
	949	++preempt_miss;
	950	return;
	951	}
	952
	953	/*
	954	* We don't have to check spinlocks here as they will also bump
	955	* td_critcount.
	956	*
	957	* Do not try to preempt if the target thread is holding any tokens.
	958	* We could try to acquire the tokens but this case is so rare there
	959	* is no need to support it.
	960	*/
	961	KKASSERT(gd->gd_spinlocks == 0);
	962
	963	if (TD_TOKS_HELD(ntd)) {
	964	++preempt_miss;
	965	return;
	966	}
	967	if (td == ntd \|\| ((td->td_flags \| ntd->td_flags) & TDF_PREEMPT_LOCK)) {
	968	++preempt_weird;
	969	return;
	970	}
	971	if (ntd->td_preempted) {
	972	++preempt_hit;
	973	return;
	974	}
	975	KKASSERT(gd->gd_processing_ipiq == 0);
	976
	977	/*
	978	* Since we are able to preempt the current thread, there is no need to
	979	* call need_lwkt_resched().
	980	*
	981	* We must temporarily clear gd_intr_nesting_level around the switch
	982	* since switchouts from the target thread are allowed (they will just
	983	* return to our thread), and since the target thread has its own stack.
	984	*
	985	* A preemption must switch back to the original thread, assert the
	986	* case.
	987	*/
	988	++preempt_hit;
	989	ntd->td_preempted = td;
	990	td->td_flags \|= TDF_PREEMPT_LOCK;
	991	KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd);
	992	save_gd_intr_nesting_level = gd->gd_intr_nesting_level;
	993	gd->gd_intr_nesting_level = 0;
	994
	995	KKASSERT((ntd->td_flags & TDF_RUNNING) == 0);
	996	ntd->td_flags \|= TDF_RUNNING;
	997	xtd = td->td_switch(ntd);
	998	KKASSERT(xtd == ntd);
	999	lwkt_switch_return(xtd);
	1000	gd->gd_intr_nesting_level = save_gd_intr_nesting_level;
	1001
	1002	KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
	1003	ntd->td_preempted = NULL;
	1004	td->td_flags &= ~(TDF_PREEMPT_LOCK\|TDF_PREEMPT_DONE);
	1005	}
	1006
	1007	/*
	1008	* Conditionally call splz() if gd_reqflags indicates work is pending.
	1009	* This will work inside a critical section but not inside a hard code
	1010	* section.
	1011	*
	1012	* (self contained on a per cpu basis)
	1013	*/
	1014	void
	1015	splz_check(void)
	1016	{
	1017	globaldata_t gd = mycpu;
	1018	thread_t td = gd->gd_curthread;
	1019
	1020	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) &&
	1021	gd->gd_intr_nesting_level == 0 &&
	1022	td->td_nest_count < 2)
	1023	{
	1024	splz();
	1025	}
	1026	}
	1027
	1028	/*
	1029	* This version is integrated into crit_exit, reqflags has already
	1030	* been tested but td_critcount has not.
	1031	*
	1032	* We only want to execute the splz() on the 1->0 transition of
	1033	* critcount and not in a hard code section or if too deeply nested.
	1034	*
	1035	* NOTE: gd->gd_spinlocks is implied to be 0 when td_critcount is 0.
	1036	*/
	1037	void
	1038	lwkt_maybe_splz(thread_t td)
	1039	{
	1040	globaldata_t gd = td->td_gd;
	1041
	1042	if (td->td_critcount == 0 &&
	1043	gd->gd_intr_nesting_level == 0 &&
	1044	td->td_nest_count < 2)
	1045	{
	1046	splz();
	1047	}
	1048	}
	1049
	1050	/*
	1051	* Drivers which set up processing co-threads can call this function to
	1052	* run the co-thread at a higher priority and to allow it to preempt
	1053	* normal threads.
	1054	*/
	1055	void
	1056	lwkt_set_interrupt_support_thread(void)
	1057	{
	1058	thread_t td = curthread;
	1059
	1060	lwkt_setpri_self(TDPRI_INT_SUPPORT);
	1061	td->td_flags \|= TDF_INTTHREAD;
	1062	td->td_preemptable = lwkt_preempt;
	1063	}
	1064
	1065
	1066	/*
	1067	* This function is used to negotiate a passive release of the current
	1068	* process/lwp designation with the user scheduler, allowing the user
	1069	* scheduler to schedule another user thread. The related kernel thread
	1070	* (curthread) continues running in the released state.
	1071	*/
	1072	void
	1073	lwkt_passive_release(struct thread *td)
	1074	{
	1075	struct lwp *lp = td->td_lwp;
	1076
	1077	td->td_release = NULL;
	1078	lwkt_setpri_self(TDPRI_KERN_USER);
	1079
	1080	lp->lwp_proc->p_usched->release_curproc(lp);
	1081	}
	1082
	1083
	1084	/*
	1085	* This implements a LWKT yield, allowing a kernel thread to yield to other
	1086	* kernel threads at the same or higher priority. This function can be
	1087	* called in a tight loop and will typically only yield once per tick.
	1088	*
	1089	* Most kernel threads run at the same priority in order to allow equal
	1090	* sharing.
	1091	*
	1092	* (self contained on a per cpu basis)
	1093	*/
	1094	void
	1095	lwkt_yield(void)
	1096	{
	1097	globaldata_t gd = mycpu;
	1098	thread_t td = gd->gd_curthread;
	1099
	1100	/*
	1101	* Should never be called with spinlocks held but there is a path
	1102	* via ACPI where it might happen.
	1103	*/
	1104	if (gd->gd_spinlocks)
	1105	return;
	1106
	1107	/*
	1108	* Safe to call splz if we are not too-heavily nested.
	1109	*/
	1110	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1111	splz();
	1112
	1113	/*
	1114	* Caller allows switching
	1115	*/
	1116	if (lwkt_resched_wanted()) {
	1117	lwkt_schedule_self(curthread);
	1118	lwkt_switch();
	1119	}
	1120	}
	1121
	1122	/*
	1123	* The quick version processes pending interrupts and higher-priority
	1124	* LWKT threads but will not round-robin same-priority LWKT threads.
	1125	*
	1126	* When called while attempting to return to userland the only same-pri
	1127	* threads are the ones which have already tried to become the current
	1128	* user process.
	1129	*/
	1130	void
	1131	lwkt_yield_quick(void)
	1132	{
	1133	globaldata_t gd = mycpu;
	1134	thread_t td = gd->gd_curthread;
	1135
	1136	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1137	splz();
	1138	if (lwkt_resched_wanted()) {
	1139	crit_enter();
	1140	if (TAILQ_FIRST(&gd->gd_tdrunq) == td) {
	1141	clear_lwkt_resched();
	1142	} else {
	1143	lwkt_schedule_self(curthread);
	1144	lwkt_switch();
	1145	}
	1146	crit_exit();
	1147	}
	1148	}
	1149
	1150	/*
	1151	* This yield is designed for kernel threads with a user context.
	1152	*
	1153	* The kernel acting on behalf of the user is potentially cpu-bound,
	1154	* this function will efficiently allow other threads to run and also
	1155	* switch to other processes by releasing.
	1156	*
	1157	* The lwkt_user_yield() function is designed to have very low overhead
	1158	* if no yield is determined to be needed.
	1159	*/
	1160	void
	1161	lwkt_user_yield(void)
	1162	{
	1163	globaldata_t gd = mycpu;
	1164	thread_t td = gd->gd_curthread;
	1165
	1166	/*
	1167	* Should never be called with spinlocks held but there is a path
	1168	* via ACPI where it might happen.
	1169	*/
	1170	if (gd->gd_spinlocks)
	1171	return;
	1172
	1173	/*
	1174	* Always run any pending interrupts in case we are in a critical
	1175	* section.
	1176	*/
	1177	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1178	splz();
	1179
	1180	/*
	1181	* Switch (which forces a release) if another kernel thread needs
	1182	* the cpu, if userland wants us to resched, or if our kernel
	1183	* quantum has run out.
	1184	*/
	1185	if (lwkt_resched_wanted() \|\|
	1186	user_resched_wanted())
	1187	{
	1188	lwkt_switch();
	1189	}
	1190
	1191	#if 0
	1192	/*
	1193	* Reacquire the current process if we are released.
	1194	*
	1195	* XXX not implemented atm. The kernel may be holding locks and such,
	1196	* so we want the thread to continue to receive cpu.
	1197	*/
	1198	if (td->td_release == NULL && lp) {
	1199	lp->lwp_proc->p_usched->acquire_curproc(lp);
	1200	td->td_release = lwkt_passive_release;
	1201	lwkt_setpri_self(TDPRI_USER_NORM);
	1202	}
	1203	#endif
	1204	}
	1205
	1206	/*
	1207	* Generic schedule. Possibly schedule threads belonging to other cpus and
	1208	* deal with threads that might be blocked on a wait queue.
	1209	*
	1210	* We have a little helper inline function which does additional work after
	1211	* the thread has been enqueued, including dealing with preemption and
	1212	* setting need_lwkt_resched() (which prevents the kernel from returning
	1213	* to userland until it has processed higher priority threads).
	1214	*
	1215	* It is possible for this routine to be called after a failed _enqueue
	1216	* (due to the target thread migrating, sleeping, or otherwise blocked).
	1217	* We have to check that the thread is actually on the run queue!
	1218	*/
	1219	static __inline
	1220	void
	1221	_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount)
	1222	{
	1223	if (ntd->td_flags & TDF_RUNQ) {
	1224	if (ntd->td_preemptable) {
	1225	ntd->td_preemptable(ntd, ccount); /* YYY +token */
	1226	}
	1227	}
	1228	}
	1229
	1230	static __inline
	1231	void
	1232	_lwkt_schedule(thread_t td)
	1233	{
	1234	globaldata_t mygd = mycpu;
	1235
	1236	KASSERT(td != &td->td_gd->gd_idlethread,
	1237	("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
	1238	KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
	1239	crit_enter_gd(mygd);
	1240	KKASSERT(td->td_lwp == NULL \|\|
	1241	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	1242
	1243	if (td == mygd->gd_curthread) {
	1244	_lwkt_enqueue(td);
	1245	} else {
	1246	/*
	1247	* If we own the thread, there is no race (since we are in a
	1248	* critical section). If we do not own the thread there might
	1249	* be a race but the target cpu will deal with it.
	1250	*/
	1251	if (td->td_gd == mygd) {
	1252	_lwkt_enqueue(td);
	1253	_lwkt_schedule_post(mygd, td, 1);
	1254	} else {
	1255	lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
	1256	}
	1257	}
	1258	crit_exit_gd(mygd);
	1259	}
	1260
	1261	void
	1262	lwkt_schedule(thread_t td)
	1263	{
	1264	_lwkt_schedule(td);
	1265	}
	1266
	1267	void
	1268	lwkt_schedule_noresched(thread_t td) /* XXX not impl */
	1269	{
	1270	_lwkt_schedule(td);
	1271	}
	1272
	1273	/*
	1274	* When scheduled remotely if frame != NULL the IPIQ is being
	1275	* run via doreti or an interrupt then preemption can be allowed.
	1276	*
	1277	* To allow preemption we have to drop the critical section so only
	1278	* one is present in _lwkt_schedule_post.
	1279	*/
	1280	static void
	1281	lwkt_schedule_remote(void arg, int arg2, struct intrframe frame)
	1282	{
	1283	thread_t td = curthread;
	1284	thread_t ntd = arg;
	1285
	1286	if (frame && ntd->td_preemptable) {
	1287	crit_exit_noyield(td);
	1288	_lwkt_schedule(ntd);
	1289	crit_enter_quick(td);
	1290	} else {
	1291	_lwkt_schedule(ntd);
	1292	}
	1293	}
	1294
	1295	/*
	1296	* Thread migration using a 'Pull' method. The thread may or may not be
	1297	* the current thread. It MUST be descheduled and in a stable state.
	1298	* lwkt_giveaway() must be called on the cpu owning the thread.
	1299	*
	1300	* At any point after lwkt_giveaway() is called, the target cpu may
	1301	* 'pull' the thread by calling lwkt_acquire().
	1302	*
	1303	* We have to make sure the thread is not sitting on a per-cpu tsleep
	1304	* queue or it will blow up when it moves to another cpu.
	1305	*
	1306	* MPSAFE - must be called under very specific conditions.
	1307	*/
	1308	void
	1309	lwkt_giveaway(thread_t td)
	1310	{
	1311	globaldata_t gd = mycpu;
	1312
	1313	crit_enter_gd(gd);
	1314	if (td->td_flags & TDF_TSLEEPQ)
	1315	tsleep_remove(td);
	1316	KKASSERT(td->td_gd == gd);
	1317	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	1318	td->td_flags \|= TDF_MIGRATING;
	1319	crit_exit_gd(gd);
	1320	}
	1321
	1322	void
	1323	lwkt_acquire(thread_t td)
	1324	{
	1325	globaldata_t gd;
	1326	globaldata_t mygd;
	1327
	1328	KKASSERT(td->td_flags & TDF_MIGRATING);
	1329	gd = td->td_gd;
	1330	mygd = mycpu;
	1331	if (gd != mycpu) {
	1332	#ifdef LOOPMASK
	1333	uint64_t tsc_base = rdtsc();
	1334	#endif
	1335	cpu_lfence();
	1336	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1337	crit_enter_gd(mygd);
	1338	DEBUG_PUSH_INFO("lwkt_acquire");
	1339	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) {
	1340	lwkt_process_ipiq();
	1341	cpu_lfence();
	1342	#ifdef _KERNEL_VIRTUAL
	1343	pthread_yield();
	1344	#endif
	1345	#ifdef LOOPMASK
	1346	if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
	1347	kprintf("lwkt_acquire: stuck td %p td->td_flags %08x\n",
	1348	td, td->td_flags);
	1349	tsc_base = rdtsc();
	1350	}
	1351	#endif
	1352	}
	1353	DEBUG_POP_INFO();
	1354	cpu_mfence();
	1355	td->td_gd = mygd;
	1356	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1357	td->td_flags &= ~TDF_MIGRATING;
	1358	crit_exit_gd(mygd);
	1359	} else {
	1360	crit_enter_gd(mygd);
	1361	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1362	td->td_flags &= ~TDF_MIGRATING;
	1363	crit_exit_gd(mygd);
	1364	}
	1365	}
	1366
	1367	/*
	1368	* Generic deschedule. Descheduling threads other then your own should be
	1369	* done only in carefully controlled circumstances. Descheduling is
	1370	* asynchronous.
	1371	*
	1372	* This function may block if the cpu has run out of messages.
	1373	*/
	1374	void
	1375	lwkt_deschedule(thread_t td)
	1376	{
	1377	crit_enter();
	1378	if (td == curthread) {
	1379	_lwkt_dequeue(td);
	1380	} else {
	1381	if (td->td_gd == mycpu) {
	1382	_lwkt_dequeue(td);
	1383	} else {
	1384	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
	1385	}
	1386	}
	1387	crit_exit();
	1388	}
	1389
	1390	/*
	1391	* Set the target thread's priority. This routine does not automatically
	1392	* switch to a higher priority thread, LWKT threads are not designed for
	1393	* continuous priority changes. Yield if you want to switch.
	1394	*/
	1395	void
	1396	lwkt_setpri(thread_t td, int pri)
	1397	{
	1398	if (td->td_pri != pri) {
	1399	KKASSERT(pri >= 0);
	1400	crit_enter();
	1401	if (td->td_flags & TDF_RUNQ) {
	1402	KKASSERT(td->td_gd == mycpu);
	1403	_lwkt_dequeue(td);
	1404	td->td_pri = pri;
	1405	_lwkt_enqueue(td);
	1406	} else {
	1407	td->td_pri = pri;
	1408	}
	1409	crit_exit();
	1410	}
	1411	}
	1412
	1413	/*
	1414	* Set the initial priority for a thread prior to it being scheduled for
	1415	* the first time. The thread MUST NOT be scheduled before or during
	1416	* this call. The thread may be assigned to a cpu other then the current
	1417	* cpu.
	1418	*
	1419	* Typically used after a thread has been created with TDF_STOPPREQ,
	1420	* and before the thread is initially scheduled.
	1421	*/
	1422	void
	1423	lwkt_setpri_initial(thread_t td, int pri)
	1424	{
	1425	KKASSERT(pri >= 0);
	1426	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1427	td->td_pri = pri;
	1428	}
	1429
	1430	void
	1431	lwkt_setpri_self(int pri)
	1432	{
	1433	thread_t td = curthread;
	1434
	1435	KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
	1436	crit_enter();
	1437	if (td->td_flags & TDF_RUNQ) {
	1438	_lwkt_dequeue(td);
	1439	td->td_pri = pri;
	1440	_lwkt_enqueue(td);
	1441	} else {
	1442	td->td_pri = pri;
	1443	}
	1444	crit_exit();
	1445	}
	1446
	1447	/*
	1448	* hz tick scheduler clock for LWKT threads
	1449	*/
	1450	void
	1451	lwkt_schedulerclock(thread_t td)
	1452	{
	1453	globaldata_t gd = td->td_gd;
	1454	thread_t xtd;
	1455
	1456	if (TAILQ_FIRST(&gd->gd_tdrunq) == td) {
	1457	/*
	1458	* If the current thread is at the head of the runq shift it to the
	1459	* end of any equal-priority threads and request a LWKT reschedule
	1460	* if it moved.
	1461	*
	1462	* Ignore upri in this situation. There will only be one user thread
	1463	* in user mode, all others will be user threads running in kernel
	1464	* mode and we have to make sure they get some cpu.
	1465	*/
	1466	xtd = TAILQ_NEXT(td, td_threadq);
	1467	if (xtd && xtd->td_pri == td->td_pri) {
	1468	TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
	1469	while (xtd && xtd->td_pri == td->td_pri)
	1470	xtd = TAILQ_NEXT(xtd, td_threadq);
	1471	if (xtd)
	1472	TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
	1473	else
	1474	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	1475	need_lwkt_resched();
	1476	}
	1477	} else {
	1478	/*
	1479	* If we scheduled a thread other than the one at the head of the
	1480	* queue always request a reschedule every tick.
	1481	*/
	1482	need_lwkt_resched();
	1483	}
	1484	}
	1485
	1486	/*
	1487	* Migrate the current thread to the specified cpu.
	1488	*
	1489	* This is accomplished by descheduling ourselves from the current cpu
	1490	* and setting td_migrate_gd. The lwkt_switch() code will detect that the
	1491	* 'old' thread wants to migrate after it has been completely switched out
	1492	* and will complete the migration.
	1493	*
	1494	* TDF_MIGRATING prevents scheduling races while the thread is being migrated.
	1495	*
	1496	* We must be sure to release our current process designation (if a user
	1497	* process) before clearing out any tsleepq we are on because the release
	1498	* code may re-add us.
	1499	*
	1500	* We must be sure to remove ourselves from the current cpu's tsleepq
	1501	* before potentially moving to another queue. The thread can be on
	1502	* a tsleepq due to a left-over tsleep_interlock().
	1503	*/
	1504
	1505	void
	1506	lwkt_setcpu_self(globaldata_t rgd)
	1507	{
	1508	thread_t td = curthread;
	1509
	1510	if (td->td_gd != rgd) {
	1511	crit_enter_quick(td);
	1512
	1513	if (td->td_release)
	1514	td->td_release(td);
	1515	if (td->td_flags & TDF_TSLEEPQ)
	1516	tsleep_remove(td);
	1517
	1518	/*
	1519	* Set TDF_MIGRATING to prevent a spurious reschedule while we are
	1520	* trying to deschedule ourselves and switch away, then deschedule
	1521	* ourself, remove us from tdallq, and set td_migrate_gd. Finally,
	1522	* call lwkt_switch() to complete the operation.
	1523	*/
	1524	td->td_flags \|= TDF_MIGRATING;
	1525	lwkt_deschedule_self(td);
	1526	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1527	td->td_migrate_gd = rgd;
	1528	lwkt_switch();
	1529
	1530	/*
	1531	* We are now on the target cpu
	1532	*/
	1533	KKASSERT(rgd == mycpu);
	1534	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
	1535	crit_exit_quick(td);
	1536	}
	1537	}
	1538
	1539	void
	1540	lwkt_migratecpu(int cpuid)
	1541	{
	1542	globaldata_t rgd;
	1543
	1544	rgd = globaldata_find(cpuid);
	1545	lwkt_setcpu_self(rgd);
	1546	}
	1547
	1548	/*
	1549	* Remote IPI for cpu migration (called while in a critical section so we
	1550	* do not have to enter another one).
	1551	*
	1552	* The thread (td) has already been completely descheduled from the
	1553	* originating cpu and we can simply assert the case. The thread is
	1554	* assigned to the new cpu and enqueued.
	1555	*
	1556	* The thread will re-add itself to tdallq when it resumes execution.
	1557	*/
	1558	static void
	1559	lwkt_setcpu_remote(void *arg)
	1560	{
	1561	thread_t td = arg;
	1562	globaldata_t gd = mycpu;
	1563
	1564	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) == 0);
	1565	td->td_gd = gd;
	1566	cpu_mfence();
	1567	td->td_flags &= ~TDF_MIGRATING;
	1568	KKASSERT(td->td_migrate_gd == NULL);
	1569	KKASSERT(td->td_lwp == NULL \|\|
	1570	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	1571	_lwkt_enqueue(td);
	1572	}
	1573
	1574	struct lwp *
	1575	lwkt_preempted_proc(void)
	1576	{
	1577	thread_t td = curthread;
	1578	while (td->td_preempted)
	1579	td = td->td_preempted;
	1580	return(td->td_lwp);
	1581	}
	1582
	1583	/*
	1584	* Create a kernel process/thread/whatever. It shares it's address space
	1585	* with proc0 - ie: kernel only.
	1586	*
	1587	* If the cpu is not specified one will be selected. In the future
	1588	* specifying a cpu of -1 will enable kernel thread migration between
	1589	* cpus.
	1590	*/
	1591	int
	1592	lwkt_create(void (func)(void ), void arg, struct thread *tdp,
	1593	thread_t template, int tdflags, int cpu, const char *fmt, ...)
	1594	{
	1595	thread_t td;
	1596	__va_list ap;
	1597
	1598	td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
	1599	tdflags);
	1600	if (tdp)
	1601	*tdp = td;
	1602	cpu_set_thread_handler(td, lwkt_exit, func, arg);
	1603
	1604	/*
	1605	* Set up arg0 for 'ps' etc
	1606	*/
	1607	__va_start(ap, fmt);
	1608	kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
	1609	__va_end(ap);
	1610
	1611	/*
	1612	* Schedule the thread to run
	1613	*/
	1614	if (td->td_flags & TDF_NOSTART)
	1615	td->td_flags &= ~TDF_NOSTART;
	1616	else
	1617	lwkt_schedule(td);
	1618	return 0;
	1619	}
	1620
	1621	/*
	1622	* Destroy an LWKT thread. Warning! This function is not called when
	1623	* a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
	1624	* uses a different reaping mechanism.
	1625	*/
	1626	void
	1627	lwkt_exit(void)
	1628	{
	1629	thread_t td = curthread;
	1630	thread_t std;
	1631	globaldata_t gd;
	1632
	1633	/*
	1634	* Do any cleanup that might block here
	1635	*/
	1636	if (td->td_flags & TDF_VERBOSE)
	1637	kprintf("kthread %p %s has exited\n", td, td->td_comm);
	1638	biosched_done(td);
	1639	dsched_exit_thread(td);
	1640
	1641	/*
	1642	* Get us into a critical section to interlock gd_freetd and loop
	1643	* until we can get it freed.
	1644	*
	1645	* We have to cache the current td in gd_freetd because objcache_put()ing
	1646	* it would rip it out from under us while our thread is still active.
	1647	*
	1648	* We are the current thread so of course our own TDF_RUNNING bit will
	1649	* be set, so unlike the lwp reap code we don't wait for it to clear.
	1650	*/
	1651	gd = mycpu;
	1652	crit_enter_quick(td);
	1653	for (;;) {
	1654	if (td->td_refs) {
	1655	tsleep(td, 0, "tdreap", 1);
	1656	continue;
	1657	}
	1658	if ((std = gd->gd_freetd) != NULL) {
	1659	KKASSERT((std->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) == 0);
	1660	gd->gd_freetd = NULL;
	1661	objcache_put(thread_cache, std);
	1662	continue;
	1663	}
	1664	break;
	1665	}
	1666
	1667	/*
	1668	* Remove thread resources from kernel lists and deschedule us for
	1669	* the last time. We cannot block after this point or we may end
	1670	* up with a stale td on the tsleepq.
	1671	*
	1672	* None of this may block, the critical section is the only thing
	1673	* protecting tdallq and the only thing preventing new lwkt_hold()
	1674	* thread refs now.
	1675	*/
	1676	if (td->td_flags & TDF_TSLEEPQ)
	1677	tsleep_remove(td);
	1678	lwkt_deschedule_self(td);
	1679	lwkt_remove_tdallq(td);
	1680	KKASSERT(td->td_refs == 0);
	1681
	1682	/*
	1683	* Final cleanup
	1684	*/
	1685	KKASSERT(gd->gd_freetd == NULL);
	1686	if (td->td_flags & TDF_ALLOCATED_THREAD)
	1687	gd->gd_freetd = td;
	1688	cpu_thread_exit();
	1689	}
	1690
	1691	void
	1692	lwkt_remove_tdallq(thread_t td)
	1693	{
	1694	KKASSERT(td->td_gd == mycpu);
	1695	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1696	}
	1697
	1698	/*
	1699	* Code reduction and branch prediction improvements. Call/return
	1700	* overhead on modern cpus often degenerates into 0 cycles due to
	1701	* the cpu's branch prediction hardware and return pc cache. We
	1702	* can take advantage of this by not inlining medium-complexity
	1703	* functions and we can also reduce the branch prediction impact
	1704	* by collapsing perfectly predictable branches into a single
	1705	* procedure instead of duplicating it.
	1706	*
	1707	* Is any of this noticeable? Probably not, so I'll take the
	1708	* smaller code size.
	1709	*/
	1710	void
	1711	crit_exit_wrapper(__DEBUG_CRIT_ARG__)
	1712	{
	1713	_crit_exit(mycpu __DEBUG_CRIT_PASS_ARG__);
	1714	}
	1715
	1716	void
	1717	crit_panic(void)
	1718	{
	1719	thread_t td = curthread;
	1720	int lcrit = td->td_critcount;
	1721
	1722	td->td_critcount = 0;
	1723	panic("td_critcount is/would-go negative! %p %d", td, lcrit);
	1724	/* NOT REACHED */
	1725	}
	1726
	1727	/*
	1728	* Called from debugger/panic on cpus which have been stopped. We must still
	1729	* process the IPIQ while stopped.
	1730	*
	1731	* If we are dumping also try to process any pending interrupts. This may
	1732	* or may not work depending on the state of the cpu at the point it was
	1733	* stopped.
	1734	*/
	1735	void
	1736	lwkt_smp_stopped(void)
	1737	{
	1738	globaldata_t gd = mycpu;
	1739
	1740	if (dumping) {
	1741	lwkt_process_ipiq();
	1742	--gd->gd_intr_nesting_level;
	1743	splz();
	1744	++gd->gd_intr_nesting_level;
	1745	} else {
	1746	lwkt_process_ipiq();
	1747	}
	1748	cpu_smp_stopped();
	1749	}