gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*/
	34
	35	/*
	36	* Each cpu in a system has its own self-contained light weight kernel
	37	* thread scheduler, which means that generally speaking we only need
	38	* to use a critical section to avoid problems. Foreign thread
	39	* scheduling is queued via (async) IPIs.
	40	*/
	41
	42	#include <sys/param.h>
	43	#include <sys/systm.h>
	44	#include <sys/kernel.h>
	45	#include <sys/proc.h>
	46	#include <sys/rtprio.h>
	47	#include <sys/queue.h>
	48	#include <sys/sysctl.h>
	49	#include <sys/kthread.h>
	50	#include <machine/cpu.h>
	51	#include <sys/lock.h>
	52	#include <sys/caps.h>
	53	#include <sys/spinlock.h>
	54	#include <sys/ktr.h>
	55
	56	#include <sys/thread2.h>
	57	#include <sys/spinlock2.h>
	58
	59	#include <vm/vm.h>
	60	#include <vm/vm_param.h>
	61	#include <vm/vm_kern.h>
	62	#include <vm/vm_object.h>
	63	#include <vm/vm_page.h>
	64	#include <vm/vm_map.h>
	65	#include <vm/vm_pager.h>
	66	#include <vm/vm_extern.h>
	67
	68	#include <machine/stdarg.h>
	69	#include <machine/smp.h>
	70
	71
	72	static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");
	73
	74	#ifdef SMP
	75	static int mplock_countx = 0;
	76	#endif
	77	#ifdef INVARIANTS
	78	static int panic_on_cscount = 0;
	79	#endif
	80	static __int64_t switch_count = 0;
	81	static __int64_t preempt_hit = 0;
	82	static __int64_t preempt_miss = 0;
	83	static __int64_t preempt_weird = 0;
	84	static __int64_t token_contention_count = 0;
	85	static __int64_t mplock_contention_count = 0;
	86	static int lwkt_use_spin_port;
	87	#ifdef SMP
	88	static int chain_mplock = 0;
	89	#endif
	90	static struct objcache *thread_cache;
	91
	92	volatile cpumask_t mp_lock_contention_mask;
	93
	94	extern void cpu_heavy_restore(void);
	95	extern void cpu_lwkt_restore(void);
	96	extern void cpu_kthread_restore(void);
	97	extern void cpu_idle_restore(void);
	98
	99	#ifdef __amd64__
	100
	101	static int
	102	jg_tos_ok(struct thread *td)
	103	{
	104	void *tos;
	105	int tos_ok;
	106
	107	if (td == NULL) {
	108	return 1;
	109	}
	110	KKASSERT(td->td_sp != NULL);
	111	tos = ((void **)td->td_sp)[0];
	112	tos_ok = 0;
	113	if ((tos == cpu_heavy_restore) \|\| (tos == cpu_lwkt_restore) \|\|
	114	(tos == cpu_kthread_restore) \|\| (tos == cpu_idle_restore)) {
	115	tos_ok = 1;
	116	}
	117	return tos_ok;
	118	}
	119
	120	#endif
	121
	122	/*
	123	* We can make all thread ports use the spin backend instead of the thread
	124	* backend. This should only be set to debug the spin backend.
	125	*/
	126	TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);
	127
	128	#ifdef INVARIANTS
	129	SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, "");
	130	#endif
	131	#ifdef SMP
	132	SYSCTL_INT(_lwkt, OID_AUTO, chain_mplock, CTLFLAG_RW, &chain_mplock, 0, "");
	133	#endif
	134	SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, "");
	135	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, "");
	136	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, "");
	137	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, "");
	138	#ifdef INVARIANTS
	139	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
	140	&token_contention_count, 0, "spinning due to token contention");
	141	SYSCTL_QUAD(_lwkt, OID_AUTO, mplock_contention_count, CTLFLAG_RW,
	142	&mplock_contention_count, 0, "spinning due to MPLOCK contention");
	143	#endif
	144
	145	/*
	146	* Kernel Trace
	147	*/
	148	#if !defined(KTR_GIANT_CONTENTION)
	149	#define KTR_GIANT_CONTENTION KTR_ALL
	150	#endif
	151
	152	KTR_INFO_MASTER(giant);
	153	KTR_INFO(KTR_GIANT_CONTENTION, giant, beg, 0, "thread=%p", sizeof(void *));
	154	KTR_INFO(KTR_GIANT_CONTENTION, giant, end, 1, "thread=%p", sizeof(void *));
	155
	156	#define loggiant(name) KTR_LOG(giant_ ## name, curthread)
	157
	158	/*
	159	* These helper procedures handle the runq, they can only be called from
	160	* within a critical section.
	161	*
	162	* WARNING! Prior to SMP being brought up it is possible to enqueue and
	163	* dequeue threads belonging to other cpus, so be sure to use td->td_gd
	164	* instead of 'mycpu' when referencing the globaldata structure. Once
	165	* SMP is live, enqueuing and dequeuing only occur on the current cpu.
	166	*/
	167	static __inline
	168	void
	169	_lwkt_dequeue(thread_t td)
	170	{
	171	if (td->td_flags & TDF_RUNQ) {
	172	int nq = td->td_pri & TDPRI_MASK;
	173	struct globaldata *gd = td->td_gd;
	174
	175	td->td_flags &= ~TDF_RUNQ;
	176	TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq);
	177	/* runqmask is passively cleaned up by the switcher */
	178	}
	179	}
	180
	181	static __inline
	182	void
	183	_lwkt_enqueue(thread_t td)
	184	{
	185	if ((td->td_flags & (TDF_RUNQ\|TDF_MIGRATING\|TDF_BLOCKQ)) == 0) {
	186	int nq = td->td_pri & TDPRI_MASK;
	187	struct globaldata *gd = td->td_gd;
	188
	189	td->td_flags \|= TDF_RUNQ;
	190	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq);
	191	gd->gd_runqmask \|= 1 << nq;
	192	}
	193	}
	194
	195	static __boolean_t
	196	_lwkt_thread_ctor(void obj, void privdata, int ocflags)
	197	{
	198	struct thread td = (struct thread )obj;
	199
	200	td->td_kstack = NULL;
	201	td->td_kstack_size = 0;
	202	td->td_flags = TDF_ALLOCATED_THREAD;
	203	return (1);
	204	}
	205
	206	static void
	207	_lwkt_thread_dtor(void obj, void privdata)
	208	{
	209	struct thread td = (struct thread )obj;
	210
	211	KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
	212	("_lwkt_thread_dtor: not allocated from objcache"));
	213	KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
	214	td->td_kstack_size > 0,
	215	("_lwkt_thread_dtor: corrupted stack"));
	216	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	217	}
	218
	219	/*
	220	* Initialize the LWKT subsystem.
	221	*/
	222	void
	223	lwkt_init(void)
	224	{
	225	/* An objcache has 2 magazines per CPU so divide cache size by 2. */
	226	thread_cache = objcache_create_mbacked(M_THREAD, sizeof(struct thread),
	227	NULL, CACHE_NTHREADS/2,
	228	_lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
	229	}
	230
	231	/*
	232	* Schedule a thread to run. As the current thread we can always safely
	233	* schedule ourselves, and a shortcut procedure is provided for that
	234	* function.
	235	*
	236	* (non-blocking, self contained on a per cpu basis)
	237	*/
	238	void
	239	lwkt_schedule_self(thread_t td)
	240	{
	241	crit_enter_quick(td);
	242	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
	243	KKASSERT(td->td_lwp == NULL \|\| (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
	244	_lwkt_enqueue(td);
	245	crit_exit_quick(td);
	246	}
	247
	248	/*
	249	* Deschedule a thread.
	250	*
	251	* (non-blocking, self contained on a per cpu basis)
	252	*/
	253	void
	254	lwkt_deschedule_self(thread_t td)
	255	{
	256	crit_enter_quick(td);
	257	_lwkt_dequeue(td);
	258	crit_exit_quick(td);
	259	}
	260
	261	/*
	262	* LWKTs operate on a per-cpu basis
	263	*
	264	* WARNING! Called from early boot, 'mycpu' may not work yet.
	265	*/
	266	void
	267	lwkt_gdinit(struct globaldata *gd)
	268	{
	269	int i;
	270
	271	for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i)
	272	TAILQ_INIT(&gd->gd_tdrunq[i]);
	273	gd->gd_runqmask = 0;
	274	TAILQ_INIT(&gd->gd_tdallq);
	275	}
	276
	277	/*
	278	* Create a new thread. The thread must be associated with a process context
	279	* or LWKT start address before it can be scheduled. If the target cpu is
	280	* -1 the thread will be created on the current cpu.
	281	*
	282	* If you intend to create a thread without a process context this function
	283	* does everything except load the startup and switcher function.
	284	*/
	285	thread_t
	286	lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
	287	{
	288	globaldata_t gd = mycpu;
	289	void *stack;
	290
	291	/*
	292	* If static thread storage is not supplied allocate a thread. Reuse
	293	* a cached free thread if possible. gd_freetd is used to keep an exiting
	294	* thread intact through the exit.
	295	*/
	296	if (td == NULL) {
	297	if ((td = gd->gd_freetd) != NULL)
	298	gd->gd_freetd = NULL;
	299	else
	300	td = objcache_get(thread_cache, M_WAITOK);
	301	KASSERT((td->td_flags &
	302	(TDF_ALLOCATED_THREAD\|TDF_RUNNING)) == TDF_ALLOCATED_THREAD,
	303	("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
	304	flags \|= td->td_flags & (TDF_ALLOCATED_THREAD\|TDF_ALLOCATED_STACK);
	305	}
	306
	307	/*
	308	* Try to reuse cached stack.
	309	*/
	310	if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
	311	if (flags & TDF_ALLOCATED_STACK) {
	312	kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size);
	313	stack = NULL;
	314	}
	315	}
	316	if (stack == NULL) {
	317	stack = (void *)kmem_alloc(&kernel_map, stksize);
	318	flags \|= TDF_ALLOCATED_STACK;
	319	}
	320	if (cpu < 0)
	321	lwkt_init_thread(td, stack, stksize, flags, gd);
	322	else
	323	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
	324	return(td);
	325	}
	326
	327	/*
	328	* Initialize a preexisting thread structure. This function is used by
	329	* lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
	330	*
	331	* All threads start out in a critical section at a priority of
	332	* TDPRI_KERN_DAEMON. Higher level code will modify the priority as
	333	* appropriate. This function may send an IPI message when the
	334	* requested cpu is not the current cpu and consequently gd_tdallq may
	335	* not be initialized synchronously from the point of view of the originating
	336	* cpu.
	337	*
	338	* NOTE! we have to be careful in regards to creating threads for other cpus
	339	* if SMP has not yet been activated.
	340	*/
	341	#ifdef SMP
	342
	343	static void
	344	lwkt_init_thread_remote(void *arg)
	345	{
	346	thread_t td = arg;
	347
	348	/*
	349	* Protected by critical section held by IPI dispatch
	350	*/
	351	TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
	352	}
	353
	354	#endif
	355
	356	void
	357	lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
	358	struct globaldata *gd)
	359	{
	360	globaldata_t mygd = mycpu;
	361
	362	bzero(td, sizeof(struct thread));
	363	td->td_kstack = stack;
	364	td->td_kstack_size = stksize;
	365	td->td_flags = flags;
	366	td->td_gd = gd;
	367	td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT;
	368	#ifdef SMP
	369	if ((flags & TDF_MPSAFE) == 0)
	370	td->td_mpcount = 1;
	371	#endif
	372	if (lwkt_use_spin_port)
	373	lwkt_initport_spin(&td->td_msgport);
	374	else
	375	lwkt_initport_thread(&td->td_msgport, td);
	376	pmap_init_thread(td);
	377	#ifdef SMP
	378	/*
	379	* Normally initializing a thread for a remote cpu requires sending an
	380	* IPI. However, the idlethread is setup before the other cpus are
	381	* activated so we have to treat it as a special case. XXX manipulation
	382	* of gd_tdallq requires the BGL.
	383	*/
	384	if (gd == mygd \|\| td == &gd->gd_idlethread) {
	385	crit_enter_gd(mygd);
	386	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	387	crit_exit_gd(mygd);
	388	} else {
	389	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
	390	}
	391	#else
	392	crit_enter_gd(mygd);
	393	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	394	crit_exit_gd(mygd);
	395	#endif
	396	}
	397
	398	void
	399	lwkt_set_comm(thread_t td, const char *ctl, ...)
	400	{
	401	__va_list va;
	402
	403	__va_start(va, ctl);
	404	kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
	405	__va_end(va);
	406	}
	407
	408	void
	409	lwkt_hold(thread_t td)
	410	{
	411	++td->td_refs;
	412	}
	413
	414	void
	415	lwkt_rele(thread_t td)
	416	{
	417	KKASSERT(td->td_refs > 0);
	418	--td->td_refs;
	419	}
	420
	421	void
	422	lwkt_wait_free(thread_t td)
	423	{
	424	while (td->td_refs)
	425	tsleep(td, 0, "tdreap", hz);
	426	}
	427
	428	void
	429	lwkt_free_thread(thread_t td)
	430	{
	431	KASSERT((td->td_flags & TDF_RUNNING) == 0,
	432	("lwkt_free_thread: did not exit! %p", td));
	433
	434	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	435	objcache_put(thread_cache, td);
	436	} else if (td->td_flags & TDF_ALLOCATED_STACK) {
	437	/* client-allocated struct with internally allocated stack */
	438	KASSERT(td->td_kstack && td->td_kstack_size > 0,
	439	("lwkt_free_thread: corrupted stack"));
	440	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	441	td->td_kstack = NULL;
	442	td->td_kstack_size = 0;
	443	}
	444	}
	445
	446
	447	/*
	448	* Switch to the next runnable lwkt. If no LWKTs are runnable then
	449	* switch to the idlethread. Switching must occur within a critical
	450	* section to avoid races with the scheduling queue.
	451	*
	452	* We always have full control over our cpu's run queue. Other cpus
	453	* that wish to manipulate our queue must use the cpu_*msg() calls to
	454	* talk to our cpu, so a critical section is all that is needed and
	455	* the result is very, very fast thread switching.
	456	*
	457	* The LWKT scheduler uses a fixed priority model and round-robins at
	458	* each priority level. User process scheduling is a totally
	459	* different beast and LWKT priorities should not be confused with
	460	* user process priorities.
	461	*
	462	* The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch()
	463	* cleans it up. Note that the td_switch() function cannot do anything that
	464	* requires the MP lock since the MP lock will have already been setup for
	465	* the target thread (not the current thread). It's nice to have a scheduler
	466	* that does not need the MP lock to work because it allows us to do some
	467	* really cool high-performance MP lock optimizations.
	468	*
	469	* PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
	470	* is not called by the current thread in the preemption case, only when
	471	* the preempting thread blocks (in order to return to the original thread).
	472	*/
	473	void
	474	lwkt_switch(void)
	475	{
	476	globaldata_t gd = mycpu;
	477	thread_t td = gd->gd_curthread;
	478	thread_t ntd;
	479	#ifdef SMP
	480	int mpheld;
	481	#endif
	482
	483	/*
	484	* Switching from within a 'fast' (non thread switched) interrupt or IPI
	485	* is illegal. However, we may have to do it anyway if we hit a fatal
	486	* kernel trap or we have paniced.
	487	*
	488	* If this case occurs save and restore the interrupt nesting level.
	489	*/
	490	if (gd->gd_intr_nesting_level) {
	491	int savegdnest;
	492	int savegdtrap;
	493
	494	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) {
	495	panic("lwkt_switch: cannot switch from within "
	496	"a fast interrupt, yet, td %p\n", td);
	497	} else {
	498	savegdnest = gd->gd_intr_nesting_level;
	499	savegdtrap = gd->gd_trap_nesting_level;
	500	gd->gd_intr_nesting_level = 0;
	501	gd->gd_trap_nesting_level = 0;
	502	if ((td->td_flags & TDF_PANICWARN) == 0) {
	503	td->td_flags \|= TDF_PANICWARN;
	504	kprintf("Warning: thread switch from interrupt or IPI, "
	505	"thread %p (%s)\n", td, td->td_comm);
	506	print_backtrace();
	507	}
	508	lwkt_switch();
	509	gd->gd_intr_nesting_level = savegdnest;
	510	gd->gd_trap_nesting_level = savegdtrap;
	511	return;
	512	}
	513	}
	514
	515	/*
	516	* Passive release (used to transition from user to kernel mode
	517	* when we block or switch rather then when we enter the kernel).
	518	* This function is NOT called if we are switching into a preemption
	519	* or returning from a preemption. Typically this causes us to lose
	520	* our current process designation (if we have one) and become a true
	521	* LWKT thread, and may also hand the current process designation to
	522	* another process and schedule thread.
	523	*/
	524	if (td->td_release)
	525	td->td_release(td);
	526
	527	crit_enter_gd(gd);
	528	if (td->td_toks)
	529	lwkt_relalltokens(td);
	530
	531	/*
	532	* We had better not be holding any spin locks, but don't get into an
	533	* endless panic loop.
	534	*/
	535	KASSERT(gd->gd_spinlock_rd == NULL \|\| panicstr != NULL,
	536	("lwkt_switch: still holding a shared spinlock %p!",
	537	gd->gd_spinlock_rd));
	538	KASSERT(gd->gd_spinlocks_wr == 0 \|\| panicstr != NULL,
	539	("lwkt_switch: still holding %d exclusive spinlocks!",
	540	gd->gd_spinlocks_wr));
	541
	542
	543	#ifdef SMP
	544	/*
	545	* td_mpcount cannot be used to determine if we currently hold the
	546	* MP lock because get_mplock() will increment it prior to attempting
	547	* to get the lock, and switch out if it can't. Our ownership of
	548	* the actual lock will remain stable while we are in a critical section
	549	* (but, of course, another cpu may own or release the lock so the
	550	* actual value of mp_lock is not stable).
	551	*/
	552	mpheld = MP_LOCK_HELD();
	553	#ifdef INVARIANTS
	554	if (td->td_cscount) {
	555	kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
	556	td);
	557	if (panic_on_cscount)
	558	panic("switching while mastering cpusync");
	559	}
	560	#endif
	561	#endif
	562	if ((ntd = td->td_preempted) != NULL) {
	563	/*
	564	* We had preempted another thread on this cpu, resume the preempted
	565	* thread. This occurs transparently, whether the preempted thread
	566	* was scheduled or not (it may have been preempted after descheduling
	567	* itself).
	568	*
	569	* We have to setup the MP lock for the original thread after backing
	570	* out the adjustment that was made to curthread when the original
	571	* was preempted.
	572	*/
	573	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
	574	#ifdef SMP
	575	if (ntd->td_mpcount && mpheld == 0) {
	576	panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d",
	577	td, ntd, td->td_mpcount, ntd->td_mpcount);
	578	}
	579	if (ntd->td_mpcount) {
	580	td->td_mpcount -= ntd->td_mpcount;
	581	KKASSERT(td->td_mpcount >= 0);
	582	}
	583	#endif
	584	ntd->td_flags \|= TDF_PREEMPT_DONE;
	585
	586	/*
	587	* The interrupt may have woken a thread up, we need to properly
	588	* set the reschedule flag if the originally interrupted thread is
	589	* at a lower priority.
	590	*/
	591	if (gd->gd_runqmask > (2 << (ntd->td_pri & TDPRI_MASK)) - 1)
	592	need_lwkt_resched();
	593	/* YYY release mp lock on switchback if original doesn't need it */
	594	} else {
	595	/*
	596	* Priority queue / round-robin at each priority. Note that user
	597	* processes run at a fixed, low priority and the user process
	598	* scheduler deals with interactions between user processes
	599	* by scheduling and descheduling them from the LWKT queue as
	600	* necessary.
	601	*
	602	* We have to adjust the MP lock for the target thread. If we
	603	* need the MP lock and cannot obtain it we try to locate a
	604	* thread that does not need the MP lock. If we cannot, we spin
	605	* instead of HLT.
	606	*
	607	* A similar issue exists for the tokens held by the target thread.
	608	* If we cannot obtain ownership of the tokens we cannot immediately
	609	* schedule the thread.
	610	*/
	611
	612	/*
	613	* If an LWKT reschedule was requested, well that is what we are
	614	* doing now so clear it.
	615	*/
	616	clear_lwkt_resched();
	617	again:
	618	if (gd->gd_runqmask) {
	619	int nq = bsrl(gd->gd_runqmask);
	620	if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) {
	621	gd->gd_runqmask &= ~(1 << nq);
	622	goto again;
	623	}
	624	#ifdef SMP
	625	/*
	626	* THREAD SELECTION FOR AN SMP MACHINE BUILD
	627	*
	628	* If the target needs the MP lock and we couldn't get it,
	629	* or if the target is holding tokens and we could not
	630	* gain ownership of the tokens, continue looking for a
	631	* thread to schedule and spin instead of HLT if we can't.
	632	*
	633	* NOTE: the mpheld variable invalid after this conditional, it
	634	* can change due to both cpu_try_mplock() returning success
	635	* AND interactions in lwkt_getalltokens() due to the fact that
	636	* we are trying to check the mpcount of a thread other then
	637	* the current thread. Because of this, if the current thread
	638	* is not holding td_mpcount, an IPI indirectly run via
	639	* lwkt_getalltokens() can obtain and release the MP lock and
	640	* cause the core MP lock to be released.
	641	*/
	642	if ((ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) \|\|
	643	(ntd->td_toks && lwkt_getalltokens(ntd) == 0)
	644	) {
	645	u_int32_t rqmask = gd->gd_runqmask;
	646
	647	mpheld = MP_LOCK_HELD();
	648	ntd = NULL;
	649	while (rqmask) {
	650	TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) {
	651	if (ntd->td_mpcount && !mpheld && !cpu_try_mplock()) {
	652	/* spinning due to MP lock being held */
	653	#ifdef INVARIANTS
	654	++mplock_contention_count;
	655	#endif
	656	/* mplock still not held, 'mpheld' still valid */
	657	continue;
	658	}
	659
	660	/*
	661	* mpheld state invalid after getalltokens call returns
	662	* failure, but the variable is only needed for
	663	* the loop.
	664	*/
	665	if (ntd->td_toks && !lwkt_getalltokens(ntd)) {
	666	/* spinning due to token contention */
	667	#ifdef INVARIANTS
	668	++token_contention_count;
	669	#endif
	670	mpheld = MP_LOCK_HELD();
	671	continue;
	672	}
	673	break;
	674	}
	675	if (ntd)
	676	break;
	677	rqmask &= ~(1 << nq);
	678	nq = bsrl(rqmask);
	679
	680	/*
	681	* We have two choices. We can either refuse to run a
	682	* user thread when a kernel thread needs the MP lock
	683	* but could not get it, or we can allow it to run but
	684	* then expect an IPI (hopefully) later on to force a
	685	* reschedule when the MP lock might become available.
	686	*/
	687	if (nq < TDPRI_KERN_LPSCHED) {
	688	if (chain_mplock == 0)
	689	break;
	690	atomic_set_int(&mp_lock_contention_mask,
	691	gd->gd_cpumask);
	692	/* continue loop, allow user threads to be scheduled */
	693	}
	694	}
	695	if (ntd == NULL) {
	696	cpu_mplock_contested();
	697	ntd = &gd->gd_idlethread;
	698	ntd->td_flags \|= TDF_IDLE_NOHLT;
	699	goto using_idle_thread;
	700	} else {
	701	++gd->gd_cnt.v_swtch;
	702	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	703	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	704	}
	705	} else {
	706	if (ntd->td_mpcount)
	707	++mplock_countx;
	708	++gd->gd_cnt.v_swtch;
	709	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	710	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	711	}
	712	#else
	713	/*
	714	* THREAD SELECTION FOR A UP MACHINE BUILD. We don't have to
	715	* worry about tokens or the BGL. However, we still have
	716	* to call lwkt_getalltokens() in order to properly detect
	717	* stale tokens. This call cannot fail for a UP build!
	718	*/
	719	lwkt_getalltokens(ntd);
	720	++gd->gd_cnt.v_swtch;
	721	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	722	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	723	#endif
	724	} else {
	725	/*
	726	* We have nothing to run but only let the idle loop halt
	727	* the cpu if there are no pending interrupts.
	728	*/
	729	ntd = &gd->gd_idlethread;
	730	if (gd->gd_reqflags & RQF_IDLECHECK_MASK)
	731	ntd->td_flags \|= TDF_IDLE_NOHLT;
	732	#ifdef SMP
	733	using_idle_thread:
	734	/*
	735	* The idle thread should not be holding the MP lock unless we
	736	* are trapping in the kernel or in a panic. Since we select the
	737	* idle thread unconditionally when no other thread is available,
	738	* if the MP lock is desired during a panic or kernel trap, we
	739	* have to loop in the scheduler until we get it.
	740	*/
	741	if (ntd->td_mpcount) {
	742	mpheld = MP_LOCK_HELD();
	743	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) {
	744	panic("Idle thread %p was holding the BGL!", ntd);
	745	} else if (mpheld == 0) {
	746	cpu_mplock_contested();
	747	goto again;
	748	}
	749	}
	750	#endif
	751	}
	752	}
	753	KASSERT(ntd->td_pri >= TDPRI_CRIT,
	754	("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
	755
	756	/*
	757	* Do the actual switch. If the new target does not need the MP lock
	758	* and we are holding it, release the MP lock. If the new target requires
	759	* the MP lock we have already acquired it for the target.
	760	*/
	761	#ifdef SMP
	762	if (ntd->td_mpcount == 0 ) {
	763	if (MP_LOCK_HELD())
	764	cpu_rel_mplock();
	765	} else {
	766	ASSERT_MP_LOCK_HELD(ntd);
	767	}
	768	#endif
	769	if (td != ntd) {
	770	++switch_count;
	771	#ifdef __amd64__
	772	KKASSERT(jg_tos_ok(ntd));
	773	#endif
	774	td->td_switch(ntd);
	775	}
	776	/* NOTE: current cpu may have changed after switch */
	777	crit_exit_quick(td);
	778	}
	779
	780	/*
	781	* Request that the target thread preempt the current thread. Preemption
	782	* only works under a specific set of conditions:
	783	*
	784	* - We are not preempting ourselves
	785	* - The target thread is owned by the current cpu
	786	* - We are not currently being preempted
	787	* - The target is not currently being preempted
	788	* - We are not holding any spin locks
	789	* - The target thread is not holding any tokens
	790	* - We are able to satisfy the target's MP lock requirements (if any).
	791	*
	792	* THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
	793	* this is called via lwkt_schedule() through the td_preemptable callback.
	794	* critpri is the managed critical priority that we should ignore in order
	795	* to determine whether preemption is possible (aka usually just the crit
	796	* priority of lwkt_schedule() itself).
	797	*
	798	* XXX at the moment we run the target thread in a critical section during
	799	* the preemption in order to prevent the target from taking interrupts
	800	* that WE can't. Preemption is strictly limited to interrupt threads
	801	* and interrupt-like threads, outside of a critical section, and the
	802	* preempted source thread will be resumed the instant the target blocks
	803	* whether or not the source is scheduled (i.e. preemption is supposed to
	804	* be as transparent as possible).
	805	*
	806	* The target thread inherits our MP count (added to its own) for the
	807	* duration of the preemption in order to preserve the atomicity of the
	808	* MP lock during the preemption. Therefore, any preempting targets must be
	809	* careful in regards to MP assertions. Note that the MP count may be
	810	* out of sync with the physical mp_lock, but we do not have to preserve
	811	* the original ownership of the lock if it was out of sync (that is, we
	812	* can leave it synchronized on return).
	813	*/
	814	void
	815	lwkt_preempt(thread_t ntd, int critpri)
	816	{
	817	struct globaldata *gd = mycpu;
	818	thread_t td;
	819	#ifdef SMP
	820	int mpheld;
	821	int savecnt;
	822	#endif
	823
	824	/*
	825	* The caller has put us in a critical section. We can only preempt
	826	* if the caller of the caller was not in a critical section (basically
	827	* a local interrupt), as determined by the 'critpri' parameter. We
	828	* also can't preempt if the caller is holding any spinlocks (even if
	829	* he isn't in a critical section). This also handles the tokens test.
	830	*
	831	* YYY The target thread must be in a critical section (else it must
	832	* inherit our critical section? I dunno yet).
	833	*
	834	* Set need_lwkt_resched() unconditionally for now YYY.
	835	*/
	836	KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri));
	837
	838	td = gd->gd_curthread;
	839	if ((ntd->td_pri & TDPRI_MASK) <= (td->td_pri & TDPRI_MASK)) {
	840	++preempt_miss;
	841	return;
	842	}
	843	if ((td->td_pri & ~TDPRI_MASK) > critpri) {
	844	++preempt_miss;
	845	need_lwkt_resched();
	846	return;
	847	}
	848	#ifdef SMP
	849	if (ntd->td_gd != gd) {
	850	++preempt_miss;
	851	need_lwkt_resched();
	852	return;
	853	}
	854	#endif
	855	/*
	856	* Take the easy way out and do not preempt if we are holding
	857	* any spinlocks. We could test whether the thread(s) being
	858	* preempted interlock against the target thread's tokens and whether
	859	* we can get all the target thread's tokens, but this situation
	860	* should not occur very often so its easier to simply not preempt.
	861	* Also, plain spinlocks are impossible to figure out at this point so
	862	* just don't preempt.
	863	*
	864	* Do not try to preempt if the target thread is holding any tokens.
	865	* We could try to acquire the tokens but this case is so rare there
	866	* is no need to support it.
	867	*/
	868	if (gd->gd_spinlock_rd \|\| gd->gd_spinlocks_wr) {
	869	++preempt_miss;
	870	need_lwkt_resched();
	871	return;
	872	}
	873	if (ntd->td_toks) {
	874	++preempt_miss;
	875	need_lwkt_resched();
	876	return;
	877	}
	878	if (td == ntd \|\| ((td->td_flags \| ntd->td_flags) & TDF_PREEMPT_LOCK)) {
	879	++preempt_weird;
	880	need_lwkt_resched();
	881	return;
	882	}
	883	if (ntd->td_preempted) {
	884	++preempt_hit;
	885	need_lwkt_resched();
	886	return;
	887	}
	888	#ifdef SMP
	889	/*
	890	* note: an interrupt might have occured just as we were transitioning
	891	* to or from the MP lock. In this case td_mpcount will be pre-disposed
	892	* (non-zero) but not actually synchronized with the actual state of the
	893	* lock. We can use it to imply an MP lock requirement for the
	894	* preemption but we cannot use it to test whether we hold the MP lock
	895	* or not.
	896	*/
	897	savecnt = td->td_mpcount;
	898	mpheld = MP_LOCK_HELD();
	899	ntd->td_mpcount += td->td_mpcount;
	900	if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) {
	901	ntd->td_mpcount -= td->td_mpcount;
	902	++preempt_miss;
	903	need_lwkt_resched();
	904	return;
	905	}
	906	#endif
	907
	908	/*
	909	* Since we are able to preempt the current thread, there is no need to
	910	* call need_lwkt_resched().
	911	*/
	912	++preempt_hit;
	913	ntd->td_preempted = td;
	914	td->td_flags \|= TDF_PREEMPT_LOCK;
	915	td->td_switch(ntd);
	916
	917	KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
	918	#ifdef SMP
	919	KKASSERT(savecnt == td->td_mpcount);
	920	mpheld = MP_LOCK_HELD();
	921	if (mpheld && td->td_mpcount == 0)
	922	cpu_rel_mplock();
	923	else if (mpheld == 0 && td->td_mpcount)
	924	panic("lwkt_preempt(): MP lock was not held through");
	925	#endif
	926	ntd->td_preempted = NULL;
	927	td->td_flags &= ~(TDF_PREEMPT_LOCK\|TDF_PREEMPT_DONE);
	928	}
	929
	930	/*
	931	* Conditionally call splz() if gd_reqflags indicates work is pending.
	932	*
	933	* td_nest_count prevents deep nesting via splz() or doreti() which
	934	* might otherwise blow out the kernel stack. Note that except for
	935	* this special case, we MUST call splz() here to handle any
	936	* pending ints, particularly after we switch, or we might accidentally
	937	* halt the cpu with interrupts pending.
	938	*
	939	* (self contained on a per cpu basis)
	940	*/
	941	void
	942	splz_check(void)
	943	{
	944	globaldata_t gd = mycpu;
	945	thread_t td = gd->gd_curthread;
	946
	947	if (gd->gd_reqflags && td->td_nest_count < 2)
	948	splz();
	949	}
	950
	951	/*
	952	* This implements a normal yield which will yield to equal priority
	953	* threads as well as higher priority threads. Note that gd_reqflags
	954	* tests will be handled by the crit_exit() call in lwkt_switch().
	955	*
	956	* (self contained on a per cpu basis)
	957	*/
	958	void
	959	lwkt_yield(void)
	960	{
	961	lwkt_schedule_self(curthread);
	962	lwkt_switch();
	963	}
	964
	965	/*
	966	* This function is used along with the lwkt_passive_recover() inline
	967	* by the trap code to negotiate a passive release of the current
	968	* process/lwp designation with the user scheduler.
	969	*/
	970	void
	971	lwkt_passive_release(struct thread *td)
	972	{
	973	struct lwp *lp = td->td_lwp;
	974
	975	td->td_release = NULL;
	976	lwkt_setpri_self(TDPRI_KERN_USER);
	977	lp->lwp_proc->p_usched->release_curproc(lp);
	978	}
	979
	980	/*
	981	* Make a kernel thread act as if it were in user mode with regards
	982	* to scheduling, to avoid becoming cpu-bound in the kernel. Kernel
	983	* loops which may be potentially cpu-bound can call lwkt_user_yield().
	984	*
	985	* The lwkt_user_yield() function is designed to have very low overhead
	986	* if no yield is determined to be needed.
	987	*/
	988	void
	989	lwkt_user_yield(void)
	990	{
	991	thread_t td = curthread;
	992	struct lwp *lp = td->td_lwp;
	993
	994	#ifdef SMP
	995	/*
	996	* XXX SEVERE TEMPORARY HACK. A cpu-bound operation running in the
	997	* kernel can prevent other cpus from servicing interrupt threads
	998	* which still require the MP lock (which is a lot of them). This
	999	* has a chaining effect since if the interrupt is blocked, so is
	1000	* the event, so normal scheduling will not pick up on the problem.
	1001	*/
	1002	if (mplock_countx && td->td_mpcount) {
	1003	int savecnt = td->td_mpcount;
	1004
	1005	td->td_mpcount = 1;
	1006	rel_mplock();
	1007	DELAY(10);
	1008	get_mplock();
	1009	td->td_mpcount = savecnt;
	1010	mplock_countx = 0;
	1011	}
	1012	#endif
	1013
	1014	/*
	1015	* Another kernel thread wants the cpu
	1016	*/
	1017	if (lwkt_resched_wanted())
	1018	lwkt_switch();
	1019
	1020	/*
	1021	* If the user scheduler has asynchronously determined that the current
	1022	* process (when running in user mode) needs to lose the cpu then make
	1023	* sure we are released.
	1024	*/
	1025	if (user_resched_wanted()) {
	1026	if (td->td_release)
	1027	td->td_release(td);
	1028	}
	1029
	1030	/*
	1031	* If we are released reduce our priority
	1032	*/
	1033	if (td->td_release == NULL) {
	1034	if (lwkt_check_resched(td) > 0)
	1035	lwkt_switch();
	1036	lp->lwp_proc->p_usched->acquire_curproc(lp);
	1037	td->td_release = lwkt_passive_release;
	1038	lwkt_setpri_self(TDPRI_USER_NORM);
	1039	}
	1040	}
	1041
	1042	/*
	1043	* Return 0 if no runnable threads are pending at the same or higher
	1044	* priority as the passed thread.
	1045	*
	1046	* Return 1 if runnable threads are pending at the same priority.
	1047	*
	1048	* Return 2 if runnable threads are pending at a higher priority.
	1049	*/
	1050	int
	1051	lwkt_check_resched(thread_t td)
	1052	{
	1053	int pri = td->td_pri & TDPRI_MASK;
	1054
	1055	if (td->td_gd->gd_runqmask > (2 << pri) - 1)
	1056	return(2);
	1057	if (TAILQ_NEXT(td, td_threadq))
	1058	return(1);
	1059	return(0);
	1060	}
	1061
	1062	/*
	1063	* Generic schedule. Possibly schedule threads belonging to other cpus and
	1064	* deal with threads that might be blocked on a wait queue.
	1065	*
	1066	* We have a little helper inline function which does additional work after
	1067	* the thread has been enqueued, including dealing with preemption and
	1068	* setting need_lwkt_resched() (which prevents the kernel from returning
	1069	* to userland until it has processed higher priority threads).
	1070	*
	1071	* It is possible for this routine to be called after a failed _enqueue
	1072	* (due to the target thread migrating, sleeping, or otherwise blocked).
	1073	* We have to check that the thread is actually on the run queue!
	1074	*
	1075	* reschedok is an optimized constant propagated from lwkt_schedule() or
	1076	* lwkt_schedule_noresched(). By default it is non-zero, causing a
	1077	* reschedule to be requested if the target thread has a higher priority.
	1078	* The port messaging code will set MSG_NORESCHED and cause reschedok to
	1079	* be 0, prevented undesired reschedules.
	1080	*/
	1081	static __inline
	1082	void
	1083	_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri, int reschedok)
	1084	{
	1085	thread_t otd;
	1086
	1087	if (ntd->td_flags & TDF_RUNQ) {
	1088	if (ntd->td_preemptable && reschedok) {
	1089	ntd->td_preemptable(ntd, cpri); /* YYY +token */
	1090	} else if (reschedok) {
	1091	otd = curthread;
	1092	if ((ntd->td_pri & TDPRI_MASK) > (otd->td_pri & TDPRI_MASK))
	1093	need_lwkt_resched();
	1094	}
	1095	}
	1096	}
	1097
	1098	static __inline
	1099	void
	1100	_lwkt_schedule(thread_t td, int reschedok)
	1101	{
	1102	globaldata_t mygd = mycpu;
	1103
	1104	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
	1105	crit_enter_gd(mygd);
	1106	KKASSERT(td->td_lwp == NULL \|\| (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
	1107	if (td == mygd->gd_curthread) {
	1108	_lwkt_enqueue(td);
	1109	} else {
	1110	/*
	1111	* If we own the thread, there is no race (since we are in a
	1112	* critical section). If we do not own the thread there might
	1113	* be a race but the target cpu will deal with it.
	1114	*/
	1115	#ifdef SMP
	1116	if (td->td_gd == mygd) {
	1117	_lwkt_enqueue(td);
	1118	_lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok);
	1119	} else {
	1120	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_schedule, td);
	1121	}
	1122	#else
	1123	_lwkt_enqueue(td);
	1124	_lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok);
	1125	#endif
	1126	}
	1127	crit_exit_gd(mygd);
	1128	}
	1129
	1130	void
	1131	lwkt_schedule(thread_t td)
	1132	{
	1133	_lwkt_schedule(td, 1);
	1134	}
	1135
	1136	void
	1137	lwkt_schedule_noresched(thread_t td)
	1138	{
	1139	_lwkt_schedule(td, 0);
	1140	}
	1141
	1142	#ifdef SMP
	1143
	1144	/*
	1145	* Thread migration using a 'Pull' method. The thread may or may not be
	1146	* the current thread. It MUST be descheduled and in a stable state.
	1147	* lwkt_giveaway() must be called on the cpu owning the thread.
	1148	*
	1149	* At any point after lwkt_giveaway() is called, the target cpu may
	1150	* 'pull' the thread by calling lwkt_acquire().
	1151	*
	1152	* We have to make sure the thread is not sitting on a per-cpu tsleep
	1153	* queue or it will blow up when it moves to another cpu.
	1154	*
	1155	* MPSAFE - must be called under very specific conditions.
	1156	*/
	1157	void
	1158	lwkt_giveaway(thread_t td)
	1159	{
	1160	globaldata_t gd = mycpu;
	1161
	1162	crit_enter_gd(gd);
	1163	if (td->td_flags & TDF_TSLEEPQ)
	1164	tsleep_remove(td);
	1165	KKASSERT(td->td_gd == gd);
	1166	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	1167	td->td_flags \|= TDF_MIGRATING;
	1168	crit_exit_gd(gd);
	1169	}
	1170
	1171	void
	1172	lwkt_acquire(thread_t td)
	1173	{
	1174	globaldata_t gd;
	1175	globaldata_t mygd;
	1176
	1177	KKASSERT(td->td_flags & TDF_MIGRATING);
	1178	gd = td->td_gd;
	1179	mygd = mycpu;
	1180	if (gd != mycpu) {
	1181	cpu_lfence();
	1182	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1183	crit_enter_gd(mygd);
	1184	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) {
	1185	#ifdef SMP
	1186	lwkt_process_ipiq();
	1187	#endif
	1188	cpu_lfence();
	1189	}
	1190	td->td_gd = mygd;
	1191	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1192	td->td_flags &= ~TDF_MIGRATING;
	1193	crit_exit_gd(mygd);
	1194	} else {
	1195	crit_enter_gd(mygd);
	1196	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1197	td->td_flags &= ~TDF_MIGRATING;
	1198	crit_exit_gd(mygd);
	1199	}
	1200	}
	1201
	1202	#endif
	1203
	1204	/*
	1205	* Generic deschedule. Descheduling threads other then your own should be
	1206	* done only in carefully controlled circumstances. Descheduling is
	1207	* asynchronous.
	1208	*
	1209	* This function may block if the cpu has run out of messages.
	1210	*/
	1211	void
	1212	lwkt_deschedule(thread_t td)
	1213	{
	1214	crit_enter();
	1215	#ifdef SMP
	1216	if (td == curthread) {
	1217	_lwkt_dequeue(td);
	1218	} else {
	1219	if (td->td_gd == mycpu) {
	1220	_lwkt_dequeue(td);
	1221	} else {
	1222	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
	1223	}
	1224	}
	1225	#else
	1226	_lwkt_dequeue(td);
	1227	#endif
	1228	crit_exit();
	1229	}
	1230
	1231	/*
	1232	* Set the target thread's priority. This routine does not automatically
	1233	* switch to a higher priority thread, LWKT threads are not designed for
	1234	* continuous priority changes. Yield if you want to switch.
	1235	*
	1236	* We have to retain the critical section count which uses the high bits
	1237	* of the td_pri field. The specified priority may also indicate zero or
	1238	* more critical sections by adding TDPRI_CRIT*N.
	1239	*
	1240	* Note that we requeue the thread whether it winds up on a different runq
	1241	* or not. uio_yield() depends on this and the routine is not normally
	1242	* called with the same priority otherwise.
	1243	*/
	1244	void
	1245	lwkt_setpri(thread_t td, int pri)
	1246	{
	1247	KKASSERT(pri >= 0);
	1248	KKASSERT(td->td_gd == mycpu);
	1249	crit_enter();
	1250	if (td->td_flags & TDF_RUNQ) {
	1251	_lwkt_dequeue(td);
	1252	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1253	_lwkt_enqueue(td);
	1254	} else {
	1255	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1256	}
	1257	crit_exit();
	1258	}
	1259
	1260	void
	1261	lwkt_setpri_self(int pri)
	1262	{
	1263	thread_t td = curthread;
	1264
	1265	KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
	1266	crit_enter();
	1267	if (td->td_flags & TDF_RUNQ) {
	1268	_lwkt_dequeue(td);
	1269	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1270	_lwkt_enqueue(td);
	1271	} else {
	1272	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1273	}
	1274	crit_exit();
	1275	}
	1276
	1277	/*
	1278	* Migrate the current thread to the specified cpu.
	1279	*
	1280	* This is accomplished by descheduling ourselves from the current cpu,
	1281	* moving our thread to the tdallq of the target cpu, IPI messaging the
	1282	* target cpu, and switching out. TDF_MIGRATING prevents scheduling
	1283	* races while the thread is being migrated.
	1284	*
	1285	* We must be sure to remove ourselves from the current cpu's tsleepq
	1286	* before potentially moving to another queue. The thread can be on
	1287	* a tsleepq due to a left-over tsleep_interlock().
	1288	*/
	1289	#ifdef SMP
	1290	static void lwkt_setcpu_remote(void *arg);
	1291	#endif
	1292
	1293	void
	1294	lwkt_setcpu_self(globaldata_t rgd)
	1295	{
	1296	#ifdef SMP
	1297	thread_t td = curthread;
	1298
	1299	if (td->td_gd != rgd) {
	1300	crit_enter_quick(td);
	1301	if (td->td_flags & TDF_TSLEEPQ)
	1302	tsleep_remove(td);
	1303	td->td_flags \|= TDF_MIGRATING;
	1304	lwkt_deschedule_self(td);
	1305	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1306	lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td);
	1307	lwkt_switch();
	1308	/* we are now on the target cpu */
	1309	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
	1310	crit_exit_quick(td);
	1311	}
	1312	#endif
	1313	}
	1314
	1315	void
	1316	lwkt_migratecpu(int cpuid)
	1317	{
	1318	#ifdef SMP
	1319	globaldata_t rgd;
	1320
	1321	rgd = globaldata_find(cpuid);
	1322	lwkt_setcpu_self(rgd);
	1323	#endif
	1324	}
	1325
	1326	/*
	1327	* Remote IPI for cpu migration (called while in a critical section so we
	1328	* do not have to enter another one). The thread has already been moved to
	1329	* our cpu's allq, but we must wait for the thread to be completely switched
	1330	* out on the originating cpu before we schedule it on ours or the stack
	1331	* state may be corrupt. We clear TDF_MIGRATING after flushing the GD
	1332	* change to main memory.
	1333	*
	1334	* XXX The use of TDF_MIGRATING might not be sufficient to avoid races
	1335	* against wakeups. It is best if this interface is used only when there
	1336	* are no pending events that might try to schedule the thread.
	1337	*/
	1338	#ifdef SMP
	1339	static void
	1340	lwkt_setcpu_remote(void *arg)
	1341	{
	1342	thread_t td = arg;
	1343	globaldata_t gd = mycpu;
	1344
	1345	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) {
	1346	#ifdef SMP
	1347	lwkt_process_ipiq();
	1348	#endif
	1349	cpu_lfence();
	1350	}
	1351	td->td_gd = gd;
	1352	cpu_sfence();
	1353	td->td_flags &= ~TDF_MIGRATING;
	1354	KKASSERT(td->td_lwp == NULL \|\| (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
	1355	_lwkt_enqueue(td);
	1356	}
	1357	#endif
	1358
	1359	struct lwp *
	1360	lwkt_preempted_proc(void)
	1361	{
	1362	thread_t td = curthread;
	1363	while (td->td_preempted)
	1364	td = td->td_preempted;
	1365	return(td->td_lwp);
	1366	}
	1367
	1368	/*
	1369	* Create a kernel process/thread/whatever. It shares it's address space
	1370	* with proc0 - ie: kernel only.
	1371	*
	1372	* NOTE! By default new threads are created with the MP lock held. A
	1373	* thread which does not require the MP lock should release it by calling
	1374	* rel_mplock() at the start of the new thread.
	1375	*/
	1376	int
	1377	lwkt_create(void (func)(void ), void *arg,
	1378	struct thread **tdp, thread_t template, int tdflags, int cpu,
	1379	const char *fmt, ...)
	1380	{
	1381	thread_t td;
	1382	__va_list ap;
	1383
	1384	td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
	1385	tdflags);
	1386	if (tdp)
	1387	*tdp = td;
	1388	cpu_set_thread_handler(td, lwkt_exit, func, arg);
	1389
	1390	/*
	1391	* Set up arg0 for 'ps' etc
	1392	*/
	1393	__va_start(ap, fmt);
	1394	kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
	1395	__va_end(ap);
	1396
	1397	/*
	1398	* Schedule the thread to run
	1399	*/
	1400	if ((td->td_flags & TDF_STOPREQ) == 0)
	1401	lwkt_schedule(td);
	1402	else
	1403	td->td_flags &= ~TDF_STOPREQ;
	1404	return 0;
	1405	}
	1406
	1407	/*
	1408	* Destroy an LWKT thread. Warning! This function is not called when
	1409	* a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
	1410	* uses a different reaping mechanism.
	1411	*/
	1412	void
	1413	lwkt_exit(void)
	1414	{
	1415	thread_t td = curthread;
	1416	thread_t std;
	1417	globaldata_t gd;
	1418
	1419	if (td->td_flags & TDF_VERBOSE)
	1420	kprintf("kthread %p %s has exited\n", td, td->td_comm);
	1421	caps_exit(td);
	1422
	1423	/*
	1424	* Get us into a critical section to interlock gd_freetd and loop
	1425	* until we can get it freed.
	1426	*
	1427	* We have to cache the current td in gd_freetd because objcache_put()ing
	1428	* it would rip it out from under us while our thread is still active.
	1429	*/
	1430	gd = mycpu;
	1431	crit_enter_quick(td);
	1432	while ((std = gd->gd_freetd) != NULL) {
	1433	gd->gd_freetd = NULL;
	1434	objcache_put(thread_cache, std);
	1435	}
	1436
	1437	/*
	1438	* Remove thread resources from kernel lists and deschedule us for
	1439	* the last time.
	1440	*/
	1441	if (td->td_flags & TDF_TSLEEPQ)
	1442	tsleep_remove(td);
	1443	lwkt_deschedule_self(td);
	1444	lwkt_remove_tdallq(td);
	1445	if (td->td_flags & TDF_ALLOCATED_THREAD)
	1446	gd->gd_freetd = td;
	1447	cpu_thread_exit();
	1448	}
	1449
	1450	void
	1451	lwkt_remove_tdallq(thread_t td)
	1452	{
	1453	KKASSERT(td->td_gd == mycpu);
	1454	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1455	}
	1456
	1457	void
	1458	crit_panic(void)
	1459	{
	1460	thread_t td = curthread;
	1461	int lpri = td->td_pri;
	1462
	1463	td->td_pri = 0;
	1464	panic("td_pri is/would-go negative! %p %d", td, lpri);
	1465	}
	1466
	1467	#ifdef SMP
	1468
	1469	/*
	1470	* Called from debugger/panic on cpus which have been stopped. We must still
	1471	* process the IPIQ while stopped, even if we were stopped while in a critical
	1472	* section (XXX).
	1473	*
	1474	* If we are dumping also try to process any pending interrupts. This may
	1475	* or may not work depending on the state of the cpu at the point it was
	1476	* stopped.
	1477	*/
	1478	void
	1479	lwkt_smp_stopped(void)
	1480	{
	1481	globaldata_t gd = mycpu;
	1482
	1483	crit_enter_gd(gd);
	1484	if (dumping) {
	1485	lwkt_process_ipiq();
	1486	splz();
	1487	} else {
	1488	lwkt_process_ipiq();
	1489	}
	1490	crit_exit_gd(gd);
	1491	}
	1492
	1493	/*
	1494	* get_mplock() calls this routine if it is unable to obtain the MP lock.
	1495	* get_mplock() has already incremented td_mpcount. We must block and
	1496	* not return until giant is held.
	1497	*
	1498	* All we have to do is lwkt_switch() away. The LWKT scheduler will not
	1499	* reschedule the thread until it can obtain the giant lock for it.
	1500	*/
	1501	void
	1502	lwkt_mp_lock_contested(void)
	1503	{
	1504	++mplock_countx;
	1505	loggiant(beg);
	1506	lwkt_switch();
	1507	loggiant(end);
	1508	}
	1509
	1510	/*
	1511	* The rel_mplock() code will call this function after releasing the
	1512	* last reference on the MP lock if mp_lock_contention_mask is non-zero.
	1513	*
	1514	* We then chain an IPI to a single other cpu potentially needing the
	1515	* lock. This is a bit heuristical and we can wind up with IPIs flying
	1516	* all over the place.
	1517	*/
	1518	static void lwkt_mp_lock_uncontested_remote(void *arg __unused);
	1519
	1520	void
	1521	lwkt_mp_lock_uncontested(void)
	1522	{
	1523	globaldata_t gd;
	1524	globaldata_t dgd;
	1525	cpumask_t mask;
	1526	cpumask_t tmpmask;
	1527	int cpuid;
	1528
	1529	if (chain_mplock) {
	1530	gd = mycpu;
	1531	atomic_clear_int(&mp_lock_contention_mask, gd->gd_cpumask);
	1532	mask = mp_lock_contention_mask;
	1533	tmpmask = ~((1 << gd->gd_cpuid) - 1);
	1534
	1535	if (mask) {
	1536	if (mask & tmpmask)
	1537	cpuid = bsfl(mask & tmpmask);
	1538	else
	1539	cpuid = bsfl(mask);
	1540	atomic_clear_int(&mp_lock_contention_mask, 1 << cpuid);
	1541	dgd = globaldata_find(cpuid);
	1542	lwkt_send_ipiq(dgd, lwkt_mp_lock_uncontested_remote, NULL);
	1543	}
	1544	}
	1545	}
	1546
	1547	/*
	1548	* The idea is for this IPI to interrupt a potentially lower priority
	1549	* thread, such as a user thread, to allow the scheduler to reschedule
	1550	* a higher priority kernel thread that needs the MP lock.
	1551	*
	1552	* For now we set the LWKT reschedule flag which generates an AST in
	1553	* doreti, though theoretically it is also possible to possibly preempt
	1554	* here if the underlying thread was operating in user mode. Nah.
	1555	*/
	1556	static void
	1557	lwkt_mp_lock_uncontested_remote(void *arg __unused)
	1558	{
	1559	need_lwkt_resched();
	1560	}
	1561
	1562	#endif