gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003-2010 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*/
	34
	35	/*
	36	* Each cpu in a system has its own self-contained light weight kernel
	37	* thread scheduler, which means that generally speaking we only need
	38	* to use a critical section to avoid problems. Foreign thread
	39	* scheduling is queued via (async) IPIs.
	40	*/
	41
	42	#include <sys/param.h>
	43	#include <sys/systm.h>
	44	#include <sys/kernel.h>
	45	#include <sys/proc.h>
	46	#include <sys/rtprio.h>
	47	#include <sys/queue.h>
	48	#include <sys/sysctl.h>
	49	#include <sys/kthread.h>
	50	#include <machine/cpu.h>
	51	#include <sys/lock.h>
	52	#include <sys/caps.h>
	53	#include <sys/spinlock.h>
	54	#include <sys/ktr.h>
	55
	56	#include <sys/thread2.h>
	57	#include <sys/spinlock2.h>
	58	#include <sys/mplock2.h>
	59
	60	#include <sys/dsched.h>
	61
	62	#include <vm/vm.h>
	63	#include <vm/vm_param.h>
	64	#include <vm/vm_kern.h>
	65	#include <vm/vm_object.h>
	66	#include <vm/vm_page.h>
	67	#include <vm/vm_map.h>
	68	#include <vm/vm_pager.h>
	69	#include <vm/vm_extern.h>
	70
	71	#include <machine/stdarg.h>
	72	#include <machine/smp.h>
	73
	74	#if !defined(KTR_CTXSW)
	75	#define KTR_CTXSW KTR_ALL
	76	#endif
	77	KTR_INFO_MASTER(ctxsw);
	78	KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p",
	79	sizeof(int) + sizeof(struct thread *));
	80	KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p",
	81	sizeof(int) + sizeof(struct thread *));
	82	KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s",
	83	sizeof (struct thread ) + sizeof(char ));
	84	KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>", sizeof (struct thread *));
	85
	86	static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");
	87
	88	#ifdef INVARIANTS
	89	static int panic_on_cscount = 0;
	90	#endif
	91	static __int64_t switch_count = 0;
	92	static __int64_t preempt_hit = 0;
	93	static __int64_t preempt_miss = 0;
	94	static __int64_t preempt_weird = 0;
	95	static __int64_t token_contention_count __debugvar = 0;
	96	static int lwkt_use_spin_port;
	97	static struct objcache *thread_cache;
	98
	#ifdef SMP
	/* IPI callback; defined later in this file */
	static void lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame);
	#endif

	/*
	 * Restore entry points defined outside this file.  jg_tos_ok() compares
	 * the top-of-stack word of a thread against these addresses.
	 */
	extern void cpu_heavy_restore(void);
	extern void cpu_lwkt_restore(void);
	extern void cpu_kthread_restore(void);
	extern void cpu_idle_restore(void);
	107
	108	#ifdef __x86_64__
	109
	110	static int
	111	jg_tos_ok(struct thread *td)
	112	{
	113	void *tos;
	114	int tos_ok;
	115
	116	if (td == NULL) {
	117	return 1;
	118	}
	119	KKASSERT(td->td_sp != NULL);
	120	tos = ((void **)td->td_sp)[0];
	121	tos_ok = 0;
	122	if ((tos == cpu_heavy_restore) \|\| (tos == cpu_lwkt_restore) \|\|
	123	(tos == cpu_kthread_restore) \|\| (tos == cpu_idle_restore)) {
	124	tos_ok = 1;
	125	}
	126	return tos_ok;
	127	}
	128
	129	#endif
	130
	131	/*
	132	* We can make all thread ports use the spin backend instead of the thread
	133	* backend. This should only be set to debug the spin backend.
	134	*/
	135	TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);
	136
	137	#ifdef INVARIANTS
	138	SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, "");
	139	#endif
	140	SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, "");
	141	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, "");
	142	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, "");
	143	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, "");
	144	#ifdef INVARIANTS
	145	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
	146	&token_contention_count, 0, "spinning due to token contention");
	147	#endif
	148
	149	/*
	150	* These helper procedures handle the runq, they can only be called from
	151	* within a critical section.
	152	*
	153	* WARNING! Prior to SMP being brought up it is possible to enqueue and
	154	* dequeue threads belonging to other cpus, so be sure to use td->td_gd
	155	* instead of 'mycpu' when referencing the globaldata structure. Once
	156	* SMP live enqueuing and dequeueing only occurs on the current cpu.
	157	*/
	158	static __inline
	159	void
	160	_lwkt_dequeue(thread_t td)
	161	{
	162	if (td->td_flags & TDF_RUNQ) {
	163	int nq = td->td_pri & TDPRI_MASK;
	164	struct globaldata *gd = td->td_gd;
	165
	166	td->td_flags &= ~TDF_RUNQ;
	167	TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq);
	168	/* runqmask is passively cleaned up by the switcher */
	169	}
	170	}
	171
	172	static __inline
	173	void
	174	_lwkt_enqueue(thread_t td)
	175	{
	176	if ((td->td_flags & (TDF_RUNQ\|TDF_MIGRATING\|TDF_BLOCKQ)) == 0) {
	177	int nq = td->td_pri & TDPRI_MASK;
	178	struct globaldata *gd = td->td_gd;
	179
	180	td->td_flags \|= TDF_RUNQ;
	181	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq);
	182	gd->gd_runqmask \|= 1 << nq;
	183	}
	184	}
	185
	186	static __boolean_t
	187	_lwkt_thread_ctor(void obj, void privdata, int ocflags)
	188	{
	189	struct thread td = (struct thread )obj;
	190
	191	td->td_kstack = NULL;
	192	td->td_kstack_size = 0;
	193	td->td_flags = TDF_ALLOCATED_THREAD;
	194	return (1);
	195	}
	196
	197	static void
	198	_lwkt_thread_dtor(void obj, void privdata)
	199	{
	200	struct thread td = (struct thread )obj;
	201
	202	KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
	203	("_lwkt_thread_dtor: not allocated from objcache"));
	204	KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
	205	td->td_kstack_size > 0,
	206	("_lwkt_thread_dtor: corrupted stack"));
	207	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	208	}
	209
	210	/*
	211	* Initialize the lwkt s/system.
	212	*/
	213	void
	214	lwkt_init(void)
	215	{
	216	/* An objcache has 2 magazines per CPU so divide cache size by 2. */
	217	thread_cache = objcache_create_mbacked(M_THREAD, sizeof(struct thread),
	218	NULL, CACHE_NTHREADS/2,
	219	_lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
	220	}
	221
	222	/*
	223	* Schedule a thread to run. As the current thread we can always safely
	224	* schedule ourselves, and a shortcut procedure is provided for that
	225	* function.
	226	*
	227	* (non-blocking, self contained on a per cpu basis)
	228	*/
	229	void
	230	lwkt_schedule_self(thread_t td)
	231	{
	232	crit_enter_quick(td);
	233	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
	234	KKASSERT(td->td_lwp == NULL \|\| (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
	235	_lwkt_enqueue(td);
	236	crit_exit_quick(td);
	237	}
	238
	239	/*
	240	* Deschedule a thread.
	241	*
	242	* (non-blocking, self contained on a per cpu basis)
	243	*/
	244	void
	245	lwkt_deschedule_self(thread_t td)
	246	{
	247	crit_enter_quick(td);
	248	_lwkt_dequeue(td);
	249	crit_exit_quick(td);
	250	}
	251
	252	/*
	253	* LWKTs operate on a per-cpu basis
	254	*
	255	* WARNING! Called from early boot, 'mycpu' may not work yet.
	256	*/
	257	void
	258	lwkt_gdinit(struct globaldata *gd)
	259	{
	260	int i;
	261
	262	for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i)
	263	TAILQ_INIT(&gd->gd_tdrunq[i]);
	264	gd->gd_runqmask = 0;
	265	TAILQ_INIT(&gd->gd_tdallq);
	266	}
	267
	268	/*
	269	* Create a new thread. The thread must be associated with a process context
	270	* or LWKT start address before it can be scheduled. If the target cpu is
	271	* -1 the thread will be created on the current cpu.
	272	*
	273	* If you intend to create a thread without a process context this function
	274	* does everything except load the startup and switcher function.
	275	*/
	276	thread_t
	277	lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
	278	{
	279	globaldata_t gd = mycpu;
	280	void *stack;
	281
	282	/*
	283	* If static thread storage is not supplied allocate a thread. Reuse
	284	* a cached free thread if possible. gd_freetd is used to keep an exiting
	285	* thread intact through the exit.
	286	*/
	287	if (td == NULL) {
	288	if ((td = gd->gd_freetd) != NULL)
	289	gd->gd_freetd = NULL;
	290	else
	291	td = objcache_get(thread_cache, M_WAITOK);
	292	KASSERT((td->td_flags &
	293	(TDF_ALLOCATED_THREAD\|TDF_RUNNING)) == TDF_ALLOCATED_THREAD,
	294	("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
	295	flags \|= td->td_flags & (TDF_ALLOCATED_THREAD\|TDF_ALLOCATED_STACK);
	296	}
	297
	298	/*
	299	* Try to reuse cached stack.
	300	*/
	301	if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
	302	if (flags & TDF_ALLOCATED_STACK) {
	303	kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size);
	304	stack = NULL;
	305	}
	306	}
	307	if (stack == NULL) {
	308	stack = (void *)kmem_alloc(&kernel_map, stksize);
	309	flags \|= TDF_ALLOCATED_STACK;
	310	}
	311	if (cpu < 0)
	312	lwkt_init_thread(td, stack, stksize, flags, gd);
	313	else
	314	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
	315	return(td);
	316	}
	317
	/*
	 * Initialize a preexisting thread structure.  This function is used by
	 * lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
	 *
	 * All threads start out in a critical section at a priority of
	 * TDPRI_KERN_DAEMON.  Higher level code will modify the priority as
	 * appropriate.  This function may send an IPI message when the
	 * requested cpu is not the current cpu and consequently gd_tdallq may
	 * not be initialized synchronously from the point of view of the
	 * originating cpu.
	 *
	 * NOTE! we have to be careful in regards to creating threads for other
	 * cpus if SMP has not yet been activated.
	 */
	#ifdef SMP

	/*
	 * IPI handler run on the target cpu: link the new thread (passed as
	 * arg) onto the all-threads list of the cpu that owns it.
	 */
	static void
	lwkt_init_thread_remote(void *arg)
	{
	    thread_t newtd = arg;
	    struct globaldata *owner = newtd->td_gd;

	    /*
	     * Protected by critical section held by IPI dispatch
	     */
	    TAILQ_INSERT_TAIL(&owner->gd_tdallq, newtd, td_allq);
	}

	#endif
	346
	347	void
	348	lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
	349	struct globaldata *gd)
	350	{
	351	globaldata_t mygd = mycpu;
	352
	353	bzero(td, sizeof(struct thread));
	354	td->td_kstack = stack;
	355	td->td_kstack_size = stksize;
	356	td->td_flags = flags;
	357	td->td_gd = gd;
	358	td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT;
	359	td->td_toks_stop = &td->td_toks_base;
	360	#ifdef SMP
	361	if ((flags & TDF_MPSAFE) == 0)
	362	td->td_mpcount = 1;
	363	#endif
	364	if (lwkt_use_spin_port)
	365	lwkt_initport_spin(&td->td_msgport);
	366	else
	367	lwkt_initport_thread(&td->td_msgport, td);
	368	pmap_init_thread(td);
	369	#ifdef SMP
	370	/*
	371	* Normally initializing a thread for a remote cpu requires sending an
	372	* IPI. However, the idlethread is setup before the other cpus are
	373	* activated so we have to treat it as a special case. XXX manipulation
	374	* of gd_tdallq requires the BGL.
	375	*/
	376	if (gd == mygd \|\| td == &gd->gd_idlethread) {
	377	crit_enter_gd(mygd);
	378	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	379	crit_exit_gd(mygd);
	380	} else {
	381	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
	382	}
	383	#else
	384	crit_enter_gd(mygd);
	385	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	386	crit_exit_gd(mygd);
	387	#endif
	388
	389	dsched_new_thread(td);
	390	}
	391
	392	void
	393	lwkt_set_comm(thread_t td, const char *ctl, ...)
	394	{
	395	__va_list va;
	396
	397	__va_start(va, ctl);
	398	kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
	399	__va_end(va);
	400	KTR_LOG(ctxsw_newtd, td, &td->td_comm[0]);
	401	}
	402
	403	void
	404	lwkt_hold(thread_t td)
	405	{
	406	++td->td_refs;
	407	}
	408
	409	void
	410	lwkt_rele(thread_t td)
	411	{
	412	KKASSERT(td->td_refs > 0);
	413	--td->td_refs;
	414	}
	415
	416	void
	417	lwkt_wait_free(thread_t td)
	418	{
	419	while (td->td_refs)
	420	tsleep(td, 0, "tdreap", hz);
	421	}
	422
	423	void
	424	lwkt_free_thread(thread_t td)
	425	{
	426	KASSERT((td->td_flags & TDF_RUNNING) == 0,
	427	("lwkt_free_thread: did not exit! %p", td));
	428
	429	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	430	objcache_put(thread_cache, td);
	431	} else if (td->td_flags & TDF_ALLOCATED_STACK) {
	432	/* client-allocated struct with internally allocated stack */
	433	KASSERT(td->td_kstack && td->td_kstack_size > 0,
	434	("lwkt_free_thread: corrupted stack"));
	435	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	436	td->td_kstack = NULL;
	437	td->td_kstack_size = 0;
	438	}
	439	KTR_LOG(ctxsw_deadtd, td);
	440	}
	441
	442
	443	/*
	444	* Switch to the next runnable lwkt. If no LWKTs are runnable then
	445	* switch to the idlethread. Switching must occur within a critical
	446	* section to avoid races with the scheduling queue.
	447	*
	448	* We always have full control over our cpu's run queue. Other cpus
	449	* that wish to manipulate our queue must use the cpu_*msg() calls to
	450	* talk to our cpu, so a critical section is all that is needed and
	451	* the result is very, very fast thread switching.
	452	*
	453	* The LWKT scheduler uses a fixed priority model and round-robins at
	454	* each priority level. User process scheduling is a totally
	455	* different beast and LWKT priorities should not be confused with
	456	* user process priorities.
	457	*
	458	* The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch()
	459	* cleans it up. Note that the td_switch() function cannot do anything that
	460	* requires the MP lock since the MP lock will have already been setup for
	461	* the target thread (not the current thread). It's nice to have a scheduler
	462	* that does not need the MP lock to work because it allows us to do some
	463	* really cool high-performance MP lock optimizations.
	464	*
	465	* PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
	466	* is not called by the current thread in the preemption case, only when
	467	* the preempting thread blocks (in order to return to the original thread).
	468	*/
	469	void
	470	lwkt_switch(void)
	471	{
	472	globaldata_t gd = mycpu;
	473	thread_t td = gd->gd_curthread;
	474	thread_t ntd;
	475	#ifdef SMP
	476	int mpheld;
	477	#endif
	478
	479	/*
	480	* Switching from within a 'fast' (non thread switched) interrupt or IPI
	481	* is illegal. However, we may have to do it anyway if we hit a fatal
	482	* kernel trap or we have paniced.
	483	*
	484	* If this case occurs save and restore the interrupt nesting level.
	485	*/
	486	if (gd->gd_intr_nesting_level) {
	487	int savegdnest;
	488	int savegdtrap;
	489
	490	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) {
	491	panic("lwkt_switch: cannot switch from within "
	492	"a fast interrupt, yet, td %p\n", td);
	493	} else {
	494	savegdnest = gd->gd_intr_nesting_level;
	495	savegdtrap = gd->gd_trap_nesting_level;
	496	gd->gd_intr_nesting_level = 0;
	497	gd->gd_trap_nesting_level = 0;
	498	if ((td->td_flags & TDF_PANICWARN) == 0) {
	499	td->td_flags \|= TDF_PANICWARN;
	500	kprintf("Warning: thread switch from interrupt or IPI, "
	501	"thread %p (%s)\n", td, td->td_comm);
	502	print_backtrace(-1);
	503	}
	504	lwkt_switch();
	505	gd->gd_intr_nesting_level = savegdnest;
	506	gd->gd_trap_nesting_level = savegdtrap;
	507	return;
	508	}
	509	}
	510
	511	/*
	512	* Passive release (used to transition from user to kernel mode
	513	* when we block or switch rather then when we enter the kernel).
	514	* This function is NOT called if we are switching into a preemption
	515	* or returning from a preemption. Typically this causes us to lose
	516	* our current process designation (if we have one) and become a true
	517	* LWKT thread, and may also hand the current process designation to
	518	* another process and schedule thread.
	519	*/
	520	if (td->td_release)
	521	td->td_release(td);
	522
	523	crit_enter_gd(gd);
	524	if (TD_TOKS_HELD(td))
	525	lwkt_relalltokens(td);
	526
	527	/*
	528	* We had better not be holding any spin locks, but don't get into an
	529	* endless panic loop.
	530	*/
	531	KASSERT(gd->gd_spinlock_rd == NULL \|\| panicstr != NULL,
	532	("lwkt_switch: still holding a shared spinlock %p!",
	533	gd->gd_spinlock_rd));
	534	KASSERT(gd->gd_spinlocks_wr == 0 \|\| panicstr != NULL,
	535	("lwkt_switch: still holding %d exclusive spinlocks!",
	536	gd->gd_spinlocks_wr));
	537
	538
	539	#ifdef SMP
	540	/*
	541	* td_mpcount cannot be used to determine if we currently hold the
	542	* MP lock because get_mplock() will increment it prior to attempting
	543	* to get the lock, and switch out if it can't. Our ownership of
	544	* the actual lock will remain stable while we are in a critical section
	545	* (but, of course, another cpu may own or release the lock so the
	546	* actual value of mp_lock is not stable).
	547	*/
	548	mpheld = MP_LOCK_HELD();
	549	#ifdef INVARIANTS
	550	if (td->td_cscount) {
	551	kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
	552	td);
	553	if (panic_on_cscount)
	554	panic("switching while mastering cpusync");
	555	}
	556	#endif
	557	#endif
	558	if ((ntd = td->td_preempted) != NULL) {
	559	/*
	560	* We had preempted another thread on this cpu, resume the preempted
	561	* thread. This occurs transparently, whether the preempted thread
	562	* was scheduled or not (it may have been preempted after descheduling
	563	* itself).
	564	*
	565	* We have to setup the MP lock for the original thread after backing
	566	* out the adjustment that was made to curthread when the original
	567	* was preempted.
	568	*/
	569	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
	570	#ifdef SMP
	571	if (ntd->td_mpcount && mpheld == 0) {
	572	panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d",
	573	td, ntd, td->td_mpcount, ntd->td_mpcount);
	574	}
	575	if (ntd->td_mpcount) {
	576	td->td_mpcount -= ntd->td_mpcount;
	577	KKASSERT(td->td_mpcount >= 0);
	578	}
	579	#endif
	580	ntd->td_flags \|= TDF_PREEMPT_DONE;
	581
	582	/*
	583	* The interrupt may have woken a thread up, we need to properly
	584	* set the reschedule flag if the originally interrupted thread is
	585	* at a lower priority.
	586	*/
	587	if (gd->gd_runqmask > (2 << (ntd->td_pri & TDPRI_MASK)) - 1)
	588	need_lwkt_resched();
	589	/* YYY release mp lock on switchback if original doesn't need it */
	590	} else {
	591	/*
	592	* Priority queue / round-robin at each priority. Note that user
	593	* processes run at a fixed, low priority and the user process
	594	* scheduler deals with interactions between user processes
	595	* by scheduling and descheduling them from the LWKT queue as
	596	* necessary.
	597	*
	598	* We have to adjust the MP lock for the target thread. If we
	599	* need the MP lock and cannot obtain it we try to locate a
	600	* thread that does not need the MP lock. If we cannot, we spin
	601	* instead of HLT.
	602	*
	603	* A similar issue exists for the tokens held by the target thread.
	604	* If we cannot obtain ownership of the tokens we cannot immediately
	605	* schedule the thread.
	606	*/
	607
	608	/*
	609	* If an LWKT reschedule was requested, well that is what we are
	610	* doing now so clear it.
	611	*/
	612	clear_lwkt_resched();
	613	again:
	614	if (gd->gd_runqmask) {
	615	int nq = bsrl(gd->gd_runqmask);
	616	if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) {
	617	gd->gd_runqmask &= ~(1 << nq);
	618	goto again;
	619	}
	620	#ifdef SMP
	621	/*
	622	* THREAD SELECTION FOR AN SMP MACHINE BUILD
	623	*
	624	* If the target needs the MP lock and we couldn't get it,
	625	* or if the target is holding tokens and we could not
	626	* gain ownership of the tokens, continue looking for a
	627	* thread to schedule and spin instead of HLT if we can't.
	628	*
	629	* NOTE: the mpheld variable invalid after this conditional, it
	630	* can change due to both cpu_try_mplock() returning success
	631	* AND interactions in lwkt_getalltokens() due to the fact that
	632	* we are trying to check the mpcount of a thread other then
	633	* the current thread. Because of this, if the current thread
	634	* is not holding td_mpcount, an IPI indirectly run via
	635	* lwkt_getalltokens() can obtain and release the MP lock and
	636	* cause the core MP lock to be released.
	637	*/
	638	if ((ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) \|\|
	639	(TD_TOKS_HELD(ntd) && lwkt_getalltokens(ntd) == 0)
	640	) {
	641	u_int32_t rqmask = gd->gd_runqmask;
	642
	643	cpu_pause();
	644
	645	mpheld = MP_LOCK_HELD();
	646	ntd = NULL;
	647	while (rqmask) {
	648	TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) {
	649	if (ntd->td_mpcount && !mpheld && !cpu_try_mplock()) {
	650	/* spinning due to MP lock being held */
	651	continue;
	652	}
	653
	654	/*
	655	* mpheld state invalid after getalltokens call returns
	656	* failure, but the variable is only needed for
	657	* the loop.
	658	*/
	659	if (TD_TOKS_HELD(ntd) && !lwkt_getalltokens(ntd)) {
	660	/* spinning due to token contention */
	661	#ifdef INVARIANTS
	662	++token_contention_count;
	663	#endif
	664	mpheld = MP_LOCK_HELD();
	665	continue;
	666	}
	667	break;
	668	}
	669	if (ntd)
	670	break;
	671	rqmask &= ~(1 << nq);
	672	nq = bsrl(rqmask);
	673
	674	/*
	675	* We have two choices. We can either refuse to run a
	676	* user thread when a kernel thread needs the MP lock
	677	* but could not get it, or we can allow it to run but
	678	* then expect an IPI (hopefully) later on to force a
	679	* reschedule when the MP lock might become available.
	680	*/
	681	if (nq < TDPRI_KERN_LPSCHED) {
	682	break; /* for now refuse to run */
	683	#if 0
	684	if (chain_mplock == 0)
	685	break;
	686	/* continue loop, allow user threads to be scheduled */
	687	#endif
	688	}
	689	}
	690
	691	/*
	692	* Case where a (kernel) thread needed the MP lock and could
	693	* not get one, and we may or may not have found another
	694	* thread which does not need the MP lock to run while
	695	* we wait (ntd).
	696	*/
	697	if (ntd == NULL) {
	698	ntd = &gd->gd_idlethread;
	699	ntd->td_flags \|= TDF_IDLE_NOHLT;
	700	set_mplock_contention_mask(gd);
	701	cpu_mplock_contested();
	702	goto using_idle_thread;
	703	} else {
	704	clr_mplock_contention_mask(gd);
	705	++gd->gd_cnt.v_swtch;
	706	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	707	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	708	}
	709	} else {
	710	clr_mplock_contention_mask(gd);
	711	++gd->gd_cnt.v_swtch;
	712	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	713	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	714	}
	715	#else
	716	/*
	717	* THREAD SELECTION FOR A UP MACHINE BUILD. We don't have to
	718	* worry about tokens or the BGL. However, we still have
	719	* to call lwkt_getalltokens() in order to properly detect
	720	* stale tokens. This call cannot fail for a UP build!
	721	*/
	722	lwkt_getalltokens(ntd);
	723	++gd->gd_cnt.v_swtch;
	724	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	725	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	726	#endif
	727	} else {
	728	/*
	729	* We have nothing to run but only let the idle loop halt
	730	* the cpu if there are no pending interrupts.
	731	*/
	732	ntd = &gd->gd_idlethread;
	733	if (gd->gd_reqflags & RQF_IDLECHECK_MASK)
	734	ntd->td_flags \|= TDF_IDLE_NOHLT;
	735	#ifdef SMP
	736	using_idle_thread:
	737	/*
	738	* The idle thread should not be holding the MP lock unless we
	739	* are trapping in the kernel or in a panic. Since we select the
	740	* idle thread unconditionally when no other thread is available,
	741	* if the MP lock is desired during a panic or kernel trap, we
	742	* have to loop in the scheduler until we get it.
	743	*/
	744	if (ntd->td_mpcount) {
	745	mpheld = MP_LOCK_HELD();
	746	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	747	panic("Idle thread %p was holding the BGL!", ntd);
	748	if (mpheld == 0)
	749	goto again;
	750	}
	751	#endif
	752	}
	753	}
	754	KASSERT(ntd->td_pri >= TDPRI_CRIT,
	755	("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
	756
	757	/*
	758	* Do the actual switch. If the new target does not need the MP lock
	759	* and we are holding it, release the MP lock. If the new target requires
	760	* the MP lock we have already acquired it for the target.
	761	*/
	762	#ifdef SMP
	763	if (ntd->td_mpcount == 0 ) {
	764	if (MP_LOCK_HELD())
	765	cpu_rel_mplock();
	766	} else {
	767	ASSERT_MP_LOCK_HELD(ntd);
	768	}
	769	#endif
	770	if (td != ntd) {
	771	++switch_count;
	772	#ifdef __x86_64__
	773	{
	774	int tos_ok __debugvar = jg_tos_ok(ntd);
	775	KKASSERT(tos_ok);
	776	}
	777	#endif
	778	KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd);
	779	td->td_switch(ntd);
	780	}
	781	/* NOTE: current cpu may have changed after switch */
	782	crit_exit_quick(td);
	783	}
	784
	785	/*
	786	* Request that the target thread preempt the current thread. Preemption
	787	* only works under a specific set of conditions:
	788	*
	789	* - We are not preempting ourselves
	790	* - The target thread is owned by the current cpu
	791	* - We are not currently being preempted
	792	* - The target is not currently being preempted
	793	* - We are not holding any spin locks
	794	* - The target thread is not holding any tokens
	795	* - We are able to satisfy the target's MP lock requirements (if any).
	796	*
	797	* THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
	798	* this is called via lwkt_schedule() through the td_preemptable callback.
	799	* critpri is the managed critical priority that we should ignore in order
	800	* to determine whether preemption is possible (aka usually just the crit
	801	* priority of lwkt_schedule() itself).
	802	*
	803	* XXX at the moment we run the target thread in a critical section during
	804	* the preemption in order to prevent the target from taking interrupts
	805	* that WE can't. Preemption is strictly limited to interrupt threads
	806	* and interrupt-like threads, outside of a critical section, and the
	807	* preempted source thread will be resumed the instant the target blocks
	808	* whether or not the source is scheduled (i.e. preemption is supposed to
	809	* be as transparent as possible).
	810	*
	811	* The target thread inherits our MP count (added to its own) for the
	812	* duration of the preemption in order to preserve the atomicy of the
	813	* MP lock during the preemption. Therefore, any preempting targets must be
	814	* careful in regards to MP assertions. Note that the MP count may be
	815	* out of sync with the physical mp_lock, but we do not have to preserve
	816	* the original ownership of the lock if it was out of synch (that is, we
	817	* can leave it synchronized on return).
	818	*/
	819	void
	820	lwkt_preempt(thread_t ntd, int critpri)
	821	{
	822	struct globaldata *gd = mycpu;
	823	thread_t td;
	824	#ifdef SMP
	825	int mpheld;
	826	int savecnt;
	827	#endif
	828
	829	/*
	830	* The caller has put us in a critical section. We can only preempt
	831	* if the caller of the caller was not in a critical section (basically
	832	* a local interrupt), as determined by the 'critpri' parameter. We
	833	* also can't preempt if the caller is holding any spinlocks (even if
	834	* he isn't in a critical section). This also handles the tokens test.
	835	*
	836	* YYY The target thread must be in a critical section (else it must
	837	* inherit our critical section? I dunno yet).
	838	*
	839	* Set need_lwkt_resched() unconditionally for now YYY.
	840	*/
	841	KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri));
	842
	843	td = gd->gd_curthread;
	844	if ((ntd->td_pri & TDPRI_MASK) <= (td->td_pri & TDPRI_MASK)) {
	845	++preempt_miss;
	846	return;
	847	}
	848	if ((td->td_pri & ~TDPRI_MASK) > critpri) {
	849	++preempt_miss;
	850	need_lwkt_resched();
	851	return;
	852	}
	853	#ifdef SMP
	854	if (ntd->td_gd != gd) {
	855	++preempt_miss;
	856	need_lwkt_resched();
	857	return;
	858	}
	859	#endif
	860	/*
	861	* Take the easy way out and do not preempt if we are holding
	862	* any spinlocks. We could test whether the thread(s) being
	863	* preempted interlock against the target thread's tokens and whether
	864	* we can get all the target thread's tokens, but this situation
	865	* should not occur very often so its easier to simply not preempt.
	866	* Also, plain spinlocks are impossible to figure out at this point so
	867	* just don't preempt.
	868	*
	869	* Do not try to preempt if the target thread is holding any tokens.
	870	* We could try to acquire the tokens but this case is so rare there
	871	* is no need to support it.
	872	*/
	873	if (gd->gd_spinlock_rd \|\| gd->gd_spinlocks_wr) {
	874	++preempt_miss;
	875	need_lwkt_resched();
	876	return;
	877	}
	878	if (TD_TOKS_HELD(ntd)) {
	879	++preempt_miss;
	880	need_lwkt_resched();
	881	return;
	882	}
	883	if (td == ntd \|\| ((td->td_flags \| ntd->td_flags) & TDF_PREEMPT_LOCK)) {
	884	++preempt_weird;
	885	need_lwkt_resched();
	886	return;
	887	}
	888	if (ntd->td_preempted) {
	889	++preempt_hit;
	890	need_lwkt_resched();
	891	return;
	892	}
	893	#ifdef SMP
	894	/*
	895	* note: an interrupt might have occured just as we were transitioning
	896	* to or from the MP lock. In this case td_mpcount will be pre-disposed
	897	* (non-zero) but not actually synchronized with the actual state of the
	898	* lock. We can use it to imply an MP lock requirement for the
	899	* preemption but we cannot use it to test whether we hold the MP lock
	900	* or not.
	901	*/
	902	savecnt = td->td_mpcount;
	903	mpheld = MP_LOCK_HELD();
	904	ntd->td_mpcount += td->td_mpcount;
	905	if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) {
	906	ntd->td_mpcount -= td->td_mpcount;
	907	++preempt_miss;
	908	need_lwkt_resched();
	909	return;
	910	}
	911	#endif
	912
	913	/*
	914	* Since we are able to preempt the current thread, there is no need to
	915	* call need_lwkt_resched().
	916	*/
	917	++preempt_hit;
	918	ntd->td_preempted = td;
	919	td->td_flags \|= TDF_PREEMPT_LOCK;
	920	KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd);
	921	td->td_switch(ntd);
	922
	923	KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
	924	#ifdef SMP
	925	KKASSERT(savecnt == td->td_mpcount);
	926	mpheld = MP_LOCK_HELD();
	927	if (mpheld && td->td_mpcount == 0)
	928	cpu_rel_mplock();
	929	else if (mpheld == 0 && td->td_mpcount)
	930	panic("lwkt_preempt(): MP lock was not held through");
	931	#endif
	932	ntd->td_preempted = NULL;
	933	td->td_flags &= ~(TDF_PREEMPT_LOCK\|TDF_PREEMPT_DONE);
	934	}
	935
	936	/*
	937	* Conditionally call splz() if gd_reqflags indicates work is pending.
	938	*
	939	* td_nest_count prevents deep nesting via splz() or doreti() which
	940	* might otherwise blow out the kernel stack. Note that except for
	941	* this special case, we MUST call splz() here to handle any
	942	* pending ints, particularly after we switch, or we might accidently
	943	* halt the cpu with interrupts pending.
	944	*
	945	* (self contained on a per cpu basis)
	946	*/
	947	void
	948	splz_check(void)
	949	{
	950	globaldata_t gd = mycpu;
	951	thread_t td = gd->gd_curthread;
	952
	953	if (gd->gd_reqflags && td->td_nest_count < 2)
	954	splz();
	955	}
	956
	957	/*
	958	* This implements a normal yield which will yield to equal priority
	959	* threads as well as higher priority threads. Note that gd_reqflags
	960	* tests will be handled by the crit_exit() call in lwkt_switch().
	961	*
	962	* (self contained on a per cpu basis)
	963	*/
	964	void
	965	lwkt_yield(void)
	966	{
	967	lwkt_schedule_self(curthread);
	968	lwkt_switch();
	969	}
	970
	971	/*
	972	* This function is used along with the lwkt_passive_recover() inline
	973	* by the trap code to negotiate a passive release of the current
	974	* process/lwp designation with the user scheduler.
	975	*/
	976	void
	977	lwkt_passive_release(struct thread *td)
	978	{
	979	struct lwp *lp = td->td_lwp;
	980
	981	td->td_release = NULL;
	982	lwkt_setpri_self(TDPRI_KERN_USER);
	983	lp->lwp_proc->p_usched->release_curproc(lp);
	984	}
	985
	986	/*
	987	* Make a kernel thread act as if it were in user mode with regards
	988	* to scheduling, to avoid becoming cpu-bound in the kernel. Kernel
	989	* loops which may be potentially cpu-bound can call lwkt_user_yield().
	990	*
	991	* The lwkt_user_yield() function is designed to have very low overhead
	992	* if no yield is determined to be needed.
	993	*/
	994	void
	995	lwkt_user_yield(void)
	996	{
	997	thread_t td = curthread;
	998	struct lwp *lp = td->td_lwp;
	999
	1000	#ifdef SMP
	1001	/*
	1002	* XXX SEVERE TEMPORARY HACK. A cpu-bound operation running in the
	1003	* kernel can prevent other cpus from servicing interrupt threads
	1004	* which still require the MP lock (which is a lot of them). This
	1005	* has a chaining effect since if the interrupt is blocked, so is
	1006	* the event, so normal scheduling will not pick up on the problem.
	1007	*/
	1008	if (mp_lock_contention_mask && td->td_mpcount) {
	1009	yield_mplock(td);
	1010	}
	1011	#endif
	1012
	1013	/*
	1014	* Another kernel thread wants the cpu
	1015	*/
	1016	if (lwkt_resched_wanted())
	1017	lwkt_switch();
	1018
	1019	/*
	1020	* If the user scheduler has asynchronously determined that the current
	1021	* process (when running in user mode) needs to lose the cpu then make
	1022	* sure we are released.
	1023	*/
	1024	if (user_resched_wanted()) {
	1025	if (td->td_release)
	1026	td->td_release(td);
	1027	}
	1028
	1029	/*
	1030	* If we are released reduce our priority
	1031	*/
	1032	if (td->td_release == NULL) {
	1033	if (lwkt_check_resched(td) > 0)
	1034	lwkt_switch();
	1035	if (lp) {
	1036	lp->lwp_proc->p_usched->acquire_curproc(lp);
	1037	td->td_release = lwkt_passive_release;
	1038	lwkt_setpri_self(TDPRI_USER_NORM);
	1039	}
	1040	}
	1041	}
	1042
	1043	/*
	1044	* Return 0 if no runnable threads are pending at the same or higher
	1045	* priority as the passed thread.
	1046	*
	1047	* Return 1 if runnable threads are pending at the same priority.
	1048	*
	1049	* Return 2 if runnable threads are pending at a higher priority.
	1050	*/
	1051	int
	1052	lwkt_check_resched(thread_t td)
	1053	{
	1054	int pri = td->td_pri & TDPRI_MASK;
	1055
	1056	if (td->td_gd->gd_runqmask > (2 << pri) - 1)
	1057	return(2);
	1058	if (TAILQ_NEXT(td, td_threadq))
	1059	return(1);
	1060	return(0);
	1061	}
	1062
	1063	/*
	1064	* Generic schedule. Possibly schedule threads belonging to other cpus and
	1065	* deal with threads that might be blocked on a wait queue.
	1066	*
	1067	* We have a little helper inline function which does additional work after
	1068	* the thread has been enqueued, including dealing with preemption and
	1069	* setting need_lwkt_resched() (which prevents the kernel from returning
	1070	* to userland until it has processed higher priority threads).
	1071	*
	1072	* It is possible for this routine to be called after a failed _enqueue
	1073	* (due to the target thread migrating, sleeping, or otherwise blocked).
	1074	* We have to check that the thread is actually on the run queue!
	1075	*
	1076	* reschedok is an optimized constant propagated from lwkt_schedule() or
	1077	* lwkt_schedule_noresched(). By default it is non-zero, causing a
	1078	* reschedule to be requested if the target thread has a higher priority.
	1079	* The port messaging code will set MSG_NORESCHED and cause reschedok to
	1080	* be 0, prevented undesired reschedules.
	1081	*/
	1082	static __inline
	1083	void
	1084	_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri, int reschedok)
	1085	{
	1086	thread_t otd;
	1087
	1088	if (ntd->td_flags & TDF_RUNQ) {
	1089	if (ntd->td_preemptable && reschedok) {
	1090	ntd->td_preemptable(ntd, cpri); /* YYY +token */
	1091	} else if (reschedok) {
	1092	otd = curthread;
	1093	if ((ntd->td_pri & TDPRI_MASK) > (otd->td_pri & TDPRI_MASK))
	1094	need_lwkt_resched();
	1095	}
	1096	}
	1097	}
	1098
	1099	static __inline
	1100	void
	1101	_lwkt_schedule(thread_t td, int reschedok)
	1102	{
	1103	globaldata_t mygd = mycpu;
	1104
	1105	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
	1106	crit_enter_gd(mygd);
	1107	KKASSERT(td->td_lwp == NULL \|\| (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
	1108	if (td == mygd->gd_curthread) {
	1109	_lwkt_enqueue(td);
	1110	} else {
	1111	/*
	1112	* If we own the thread, there is no race (since we are in a
	1113	* critical section). If we do not own the thread there might
	1114	* be a race but the target cpu will deal with it.
	1115	*/
	1116	#ifdef SMP
	1117	if (td->td_gd == mygd) {
	1118	_lwkt_enqueue(td);
	1119	_lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok);
	1120	} else {
	1121	lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
	1122	}
	1123	#else
	1124	_lwkt_enqueue(td);
	1125	_lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok);
	1126	#endif
	1127	}
	1128	crit_exit_gd(mygd);
	1129	}
	1130
	1131	void
	1132	lwkt_schedule(thread_t td)
	1133	{
	1134	_lwkt_schedule(td, 1);
	1135	}
	1136
	1137	void
	1138	lwkt_schedule_noresched(thread_t td)
	1139	{
	1140	_lwkt_schedule(td, 0);
	1141	}
	1142
	1143	#ifdef SMP
	1144
	1145	/*
	1146	* When scheduled remotely if frame != NULL the IPIQ is being
	1147	* run via doreti or an interrupt then preemption can be allowed.
	1148	*
	1149	* To allow preemption we have to drop the critical section so only
	1150	* one is present in _lwkt_schedule_post.
	1151	*/
	1152	static void
	1153	lwkt_schedule_remote(void arg, int arg2, struct intrframe frame)
	1154	{
	1155	thread_t td = curthread;
	1156	thread_t ntd = arg;
	1157
	1158	if (frame && ntd->td_preemptable) {
	1159	crit_exit_noyield(td);
	1160	_lwkt_schedule(ntd, 1);
	1161	crit_enter_quick(td);
	1162	} else {
	1163	_lwkt_schedule(ntd, 1);
	1164	}
	1165	}
	1166
	1167	/*
	1168	* Thread migration using a 'Pull' method. The thread may or may not be
	1169	* the current thread. It MUST be descheduled and in a stable state.
	1170	* lwkt_giveaway() must be called on the cpu owning the thread.
	1171	*
	1172	* At any point after lwkt_giveaway() is called, the target cpu may
	1173	* 'pull' the thread by calling lwkt_acquire().
	1174	*
	1175	* We have to make sure the thread is not sitting on a per-cpu tsleep
	1176	* queue or it will blow up when it moves to another cpu.
	1177	*
	1178	* MPSAFE - must be called under very specific conditions.
	1179	*/
	1180	void
	1181	lwkt_giveaway(thread_t td)
	1182	{
	1183	globaldata_t gd = mycpu;
	1184
	1185	crit_enter_gd(gd);
	1186	if (td->td_flags & TDF_TSLEEPQ)
	1187	tsleep_remove(td);
	1188	KKASSERT(td->td_gd == gd);
	1189	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	1190	td->td_flags \|= TDF_MIGRATING;
	1191	crit_exit_gd(gd);
	1192	}
	1193
	1194	void
	1195	lwkt_acquire(thread_t td)
	1196	{
	1197	globaldata_t gd;
	1198	globaldata_t mygd;
	1199
	1200	KKASSERT(td->td_flags & TDF_MIGRATING);
	1201	gd = td->td_gd;
	1202	mygd = mycpu;
	1203	if (gd != mycpu) {
	1204	cpu_lfence();
	1205	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1206	crit_enter_gd(mygd);
	1207	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) {
	1208	#ifdef SMP
	1209	lwkt_process_ipiq();
	1210	#endif
	1211	cpu_lfence();
	1212	}
	1213	td->td_gd = mygd;
	1214	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1215	td->td_flags &= ~TDF_MIGRATING;
	1216	crit_exit_gd(mygd);
	1217	} else {
	1218	crit_enter_gd(mygd);
	1219	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1220	td->td_flags &= ~TDF_MIGRATING;
	1221	crit_exit_gd(mygd);
	1222	}
	1223	}
	1224
	1225	#endif
	1226
	1227	/*
	1228	* Generic deschedule. Descheduling threads other then your own should be
	1229	* done only in carefully controlled circumstances. Descheduling is
	1230	* asynchronous.
	1231	*
	1232	* This function may block if the cpu has run out of messages.
	1233	*/
	1234	void
	1235	lwkt_deschedule(thread_t td)
	1236	{
	1237	crit_enter();
	1238	#ifdef SMP
	1239	if (td == curthread) {
	1240	_lwkt_dequeue(td);
	1241	} else {
	1242	if (td->td_gd == mycpu) {
	1243	_lwkt_dequeue(td);
	1244	} else {
	1245	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
	1246	}
	1247	}
	1248	#else
	1249	_lwkt_dequeue(td);
	1250	#endif
	1251	crit_exit();
	1252	}
	1253
	1254	/*
	1255	* Set the target thread's priority. This routine does not automatically
	1256	* switch to a higher priority thread, LWKT threads are not designed for
	1257	* continuous priority changes. Yield if you want to switch.
	1258	*
	1259	* We have to retain the critical section count which uses the high bits
	1260	* of the td_pri field. The specified priority may also indicate zero or
	1261	* more critical sections by adding TDPRI_CRIT*N.
	1262	*
	1263	* Note that we requeue the thread whether it winds up on a different runq
	1264	* or not. uio_yield() depends on this and the routine is not normally
	1265	* called with the same priority otherwise.
	1266	*/
	1267	void
	1268	lwkt_setpri(thread_t td, int pri)
	1269	{
	1270	KKASSERT(pri >= 0);
	1271	KKASSERT(td->td_gd == mycpu);
	1272	crit_enter();
	1273	if (td->td_flags & TDF_RUNQ) {
	1274	_lwkt_dequeue(td);
	1275	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1276	_lwkt_enqueue(td);
	1277	} else {
	1278	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1279	}
	1280	crit_exit();
	1281	}
	1282
	1283	/*
	1284	* Set the initial priority for a thread prior to it being scheduled for
	1285	* the first time. The thread MUST NOT be scheduled before or during
	1286	* this call. The thread may be assigned to a cpu other then the current
	1287	* cpu.
	1288	*
	1289	* Typically used after a thread has been created with TDF_STOPPREQ,
	1290	* and before the thread is initially scheduled.
	1291	*/
	1292	void
	1293	lwkt_setpri_initial(thread_t td, int pri)
	1294	{
	1295	KKASSERT(pri >= 0);
	1296	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1297	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1298	}
	1299
	1300	void
	1301	lwkt_setpri_self(int pri)
	1302	{
	1303	thread_t td = curthread;
	1304
	1305	KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
	1306	crit_enter();
	1307	if (td->td_flags & TDF_RUNQ) {
	1308	_lwkt_dequeue(td);
	1309	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1310	_lwkt_enqueue(td);
	1311	} else {
	1312	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1313	}
	1314	crit_exit();
	1315	}
	1316
	1317	/*
	1318	* Migrate the current thread to the specified cpu.
	1319	*
	1320	* This is accomplished by descheduling ourselves from the current cpu,
	1321	* moving our thread to the tdallq of the target cpu, IPI messaging the
	1322	* target cpu, and switching out. TDF_MIGRATING prevents scheduling
	1323	* races while the thread is being migrated.
	1324	*
	1325	* We must be sure to remove ourselves from the current cpu's tsleepq
	1326	* before potentially moving to another queue. The thread can be on
	1327	* a tsleepq due to a left-over tsleep_interlock().
	1328	*/
	1329	#ifdef SMP
	1330	static void lwkt_setcpu_remote(void *arg);
	1331	#endif
	1332
	1333	void
	1334	lwkt_setcpu_self(globaldata_t rgd)
	1335	{
	1336	#ifdef SMP
	1337	thread_t td = curthread;
	1338
	1339	if (td->td_gd != rgd) {
	1340	crit_enter_quick(td);
	1341	if (td->td_flags & TDF_TSLEEPQ)
	1342	tsleep_remove(td);
	1343	td->td_flags \|= TDF_MIGRATING;
	1344	lwkt_deschedule_self(td);
	1345	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1346	lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td);
	1347	lwkt_switch();
	1348	/* we are now on the target cpu */
	1349	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
	1350	crit_exit_quick(td);
	1351	}
	1352	#endif
	1353	}
	1354
	/*
	 * Migrate the current thread to the cpu with the given id.  The id is
	 * resolved with globaldata_find() and handed to lwkt_setcpu_self().
	 * Does nothing on non-SMP builds.
	 */
	void
	lwkt_migratecpu(int cpuid)
	{
	#ifdef SMP
		lwkt_setcpu_self(globaldata_find(cpuid));
	#endif
	}
	1365
	/*
	 * Remote IPI for cpu migration (called while in a critical section so
	 * we do not have to enter another one).  We must wait for the thread to
	 * be completely switched out on the originating cpu before we schedule
	 * it on ours or the stack state may be corrupt.  We clear TDF_MIGRATING
	 * after flushing the GD change to main memory.
	 *
	 * XXX The use of TDF_MIGRATING might not be sufficient to avoid races
	 * against wakeups.  It is best if this interface is used only when
	 * there are no pending events that might try to schedule the thread.
	 */
	#ifdef SMP
	static void
	lwkt_setcpu_remote(void *arg)
	{
		thread_t td = arg;
		globaldata_t gd = mycpu;

		/* spin, servicing IPIs, until the old cpu is done with td */
		for (;;) {
			if ((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) == 0)
				break;
			lwkt_process_ipiq();
			cpu_lfence();
		}

		/* publish the new owner before dropping the migration flag */
		td->td_gd = gd;
		cpu_sfence();
		td->td_flags &= ~TDF_MIGRATING;
		KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
		_lwkt_enqueue(td);
	}
	#endif
	1398
	1399	struct lwp *
	1400	lwkt_preempted_proc(void)
	1401	{
	1402	thread_t td = curthread;
	1403	while (td->td_preempted)
	1404	td = td->td_preempted;
	1405	return(td->td_lwp);
	1406	}
	1407
	1408	/*
	1409	* Create a kernel process/thread/whatever. It shares it's address space
	1410	* with proc0 - ie: kernel only.
	1411	*
	1412	* NOTE! By default new threads are created with the MP lock held. A
	1413	* thread which does not require the MP lock should release it by calling
	1414	* rel_mplock() at the start of the new thread.
	1415	*/
	1416	int
	1417	lwkt_create(void (func)(void ), void *arg,
	1418	struct thread **tdp, thread_t template, int tdflags, int cpu,
	1419	const char *fmt, ...)
	1420	{
	1421	thread_t td;
	1422	__va_list ap;
	1423
	1424	td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
	1425	tdflags);
	1426	if (tdp)
	1427	*tdp = td;
	1428	cpu_set_thread_handler(td, lwkt_exit, func, arg);
	1429
	1430	/*
	1431	* Set up arg0 for 'ps' etc
	1432	*/
	1433	__va_start(ap, fmt);
	1434	kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
	1435	__va_end(ap);
	1436
	1437	/*
	1438	* Schedule the thread to run
	1439	*/
	1440	if ((td->td_flags & TDF_STOPREQ) == 0)
	1441	lwkt_schedule(td);
	1442	else
	1443	td->td_flags &= ~TDF_STOPREQ;
	1444	return 0;
	1445	}
	1446
	1447	/*
	1448	* Destroy an LWKT thread. Warning! This function is not called when
	1449	* a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
	1450	* uses a different reaping mechanism.
	1451	*/
	1452	void
	1453	lwkt_exit(void)
	1454	{
	1455	thread_t td = curthread;
	1456	thread_t std;
	1457	globaldata_t gd;
	1458
	1459	if (td->td_flags & TDF_VERBOSE)
	1460	kprintf("kthread %p %s has exited\n", td, td->td_comm);
	1461	caps_exit(td);
	1462
	1463	/*
	1464	* Get us into a critical section to interlock gd_freetd and loop
	1465	* until we can get it freed.
	1466	*
	1467	* We have to cache the current td in gd_freetd because objcache_put()ing
	1468	* it would rip it out from under us while our thread is still active.
	1469	*/
	1470	gd = mycpu;
	1471	crit_enter_quick(td);
	1472	while ((std = gd->gd_freetd) != NULL) {
	1473	gd->gd_freetd = NULL;
	1474	objcache_put(thread_cache, std);
	1475	}
	1476
	1477	/*
	1478	* Remove thread resources from kernel lists and deschedule us for
	1479	* the last time.
	1480	*/
	1481	if (td->td_flags & TDF_TSLEEPQ)
	1482	tsleep_remove(td);
	1483	biosched_done(td);
	1484	dsched_exit_thread(td);
	1485	lwkt_deschedule_self(td);
	1486	lwkt_remove_tdallq(td);
	1487	if (td->td_flags & TDF_ALLOCATED_THREAD)
	1488	gd->gd_freetd = td;
	1489	cpu_thread_exit();
	1490	}
	1491
	1492	void
	1493	lwkt_remove_tdallq(thread_t td)
	1494	{
	1495	KKASSERT(td->td_gd == mycpu);
	1496	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1497	}
	1498
	1499	void
	1500	crit_panic(void)
	1501	{
	1502	thread_t td = curthread;
	1503	int lpri = td->td_pri;
	1504
	1505	td->td_pri = 0;
	1506	panic("td_pri is/would-go negative! %p %d", td, lpri);
	1507	}
	1508
	1509	#ifdef SMP
	1510
	1511	/*
	1512	* Called from debugger/panic on cpus which have been stopped. We must still
	1513	* process the IPIQ while stopped, even if we were stopped while in a critical
	1514	* section (XXX).
	1515	*
	1516	* If we are dumping also try to process any pending interrupts. This may
	1517	* or may not work depending on the state of the cpu at the point it was
	1518	* stopped.
	1519	*/
	1520	void
	1521	lwkt_smp_stopped(void)
	1522	{
	1523	globaldata_t gd = mycpu;
	1524
	1525	crit_enter_gd(gd);
	1526	if (dumping) {
	1527	lwkt_process_ipiq();
	1528	splz();
	1529	} else {
	1530	lwkt_process_ipiq();
	1531	}
	1532	crit_exit_gd(gd);
	1533	}
	1534
	1535	#endif