top(1): Silence gcc warnings.
[dragonfly.git] / sys / kern / lwkt_thread.c
1/*
2 * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35/*
36 * Each cpu in a system has its own self-contained light weight kernel
37 * thread scheduler, which means that generally speaking we only need
38 * to use a critical section to avoid problems. Foreign thread
39 * scheduling is queued via (async) IPIs.
40 */
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/kernel.h>
45#include <sys/proc.h>
46#include <sys/rtprio.h>
47#include <sys/queue.h>
48#include <sys/sysctl.h>
49#include <sys/kthread.h>
50#include <machine/cpu.h>
51#include <sys/lock.h>
52#include <sys/caps.h>
53#include <sys/spinlock.h>
54#include <sys/ktr.h>
55
56#include <sys/thread2.h>
57#include <sys/spinlock2.h>
58
59#include <vm/vm.h>
60#include <vm/vm_param.h>
61#include <vm/vm_kern.h>
62#include <vm/vm_object.h>
63#include <vm/vm_page.h>
64#include <vm/vm_map.h>
65#include <vm/vm_pager.h>
66#include <vm/vm_extern.h>
67
68#include <machine/stdarg.h>
69#include <machine/smp.h>
70
71
72static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");
73
74#ifdef SMP
75static int mplock_countx = 0;
76#endif
77#ifdef INVARIANTS
78static int panic_on_cscount = 0;
79#endif
80static __int64_t switch_count = 0;
81static __int64_t preempt_hit = 0;
82static __int64_t preempt_miss = 0;
83static __int64_t preempt_weird = 0;
84static __int64_t token_contention_count = 0;
85static __int64_t mplock_contention_count = 0;
86static int lwkt_use_spin_port;
87#ifdef SMP
88static int chain_mplock = 0;
89static int bgl_yield = 10;
90#endif
91static struct objcache *thread_cache;
92
93volatile cpumask_t mp_lock_contention_mask;
94
95static void lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame);
96
97extern void cpu_heavy_restore(void);
98extern void cpu_lwkt_restore(void);
99extern void cpu_kthread_restore(void);
100extern void cpu_idle_restore(void);
101
102#ifdef __amd64__
103
104static int
105jg_tos_ok(struct thread *td)
106{
107 void *tos;
108 int tos_ok;
109
110 if (td == NULL) {
111 return 1;
112 }
113 KKASSERT(td->td_sp != NULL);
114 tos = ((void **)td->td_sp)[0];
115 tos_ok = 0;
116 if ((tos == cpu_heavy_restore) || (tos == cpu_lwkt_restore) ||
117 (tos == cpu_kthread_restore) || (tos == cpu_idle_restore)) {
118 tos_ok = 1;
119 }
120 return tos_ok;
121}
122
123#endif
124
125/*
126 * We can make all thread ports use the spin backend instead of the thread
127 * backend. This should only be set to debug the spin backend.
128 */
129TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);
130
131#ifdef INVARIANTS
132SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, "");
133#endif
134#ifdef SMP
135SYSCTL_INT(_lwkt, OID_AUTO, chain_mplock, CTLFLAG_RW, &chain_mplock, 0, "");
136SYSCTL_INT(_lwkt, OID_AUTO, bgl_yield_delay, CTLFLAG_RW, &bgl_yield, 0, "");
137#endif
138SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, "");
139SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, "");
140SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, "");
141SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, "");
142#ifdef INVARIANTS
143SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
144 &token_contention_count, 0, "spinning due to token contention");
145SYSCTL_QUAD(_lwkt, OID_AUTO, mplock_contention_count, CTLFLAG_RW,
146 &mplock_contention_count, 0, "spinning due to MPLOCK contention");
147#endif
148
149/*
150 * Kernel Trace
151 */
152#if !defined(KTR_GIANT_CONTENTION)
153#define KTR_GIANT_CONTENTION KTR_ALL
154#endif
155
156KTR_INFO_MASTER(giant);
157KTR_INFO(KTR_GIANT_CONTENTION, giant, beg, 0, "thread=%p", sizeof(void *));
158KTR_INFO(KTR_GIANT_CONTENTION, giant, end, 1, "thread=%p", sizeof(void *));
159
160#define loggiant(name) KTR_LOG(giant_ ## name, curthread)
161
162/*
163 * These helper procedures handle the runq, they can only be called from
164 * within a critical section.
165 *
166 * WARNING! Prior to SMP being brought up it is possible to enqueue and
167 * dequeue threads belonging to other cpus, so be sure to use td->td_gd
168 * instead of 'mycpu' when referencing the globaldata structure. Once
169 * SMP is live, enqueuing and dequeuing only occur on the current cpu.
170 */
171static __inline
172void
173_lwkt_dequeue(thread_t td)
174{
175 if (td->td_flags & TDF_RUNQ) {
176 int nq = td->td_pri & TDPRI_MASK;
177 struct globaldata *gd = td->td_gd;
178
179 td->td_flags &= ~TDF_RUNQ;
180 TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq);
181 /* runqmask is passively cleaned up by the switcher */
182 }
183}
184
185static __inline
186void
187_lwkt_enqueue(thread_t td)
188{
189 if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_BLOCKQ)) == 0) {
190 int nq = td->td_pri & TDPRI_MASK;
191 struct globaldata *gd = td->td_gd;
192
193 td->td_flags |= TDF_RUNQ;
194 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq);
195 gd->gd_runqmask |= 1 << nq;
196 }
197}
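/*
 * Example: gd_runqmask keeps one bit per priority queue.  With threads
 * queued at (masked) priorities 4 and 9 the mask is (1 << 4) | (1 << 9)
 * == 0x210, and the bsrl(0x210) == 9 done by lwkt_switch() picks the
 * highest priority non-empty queue first.
 */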
198
199static __boolean_t
200_lwkt_thread_ctor(void *obj, void *privdata, int ocflags)
201{
202 struct thread *td = (struct thread *)obj;
203
204 td->td_kstack = NULL;
205 td->td_kstack_size = 0;
206 td->td_flags = TDF_ALLOCATED_THREAD;
207 return (1);
208}
209
210static void
211_lwkt_thread_dtor(void *obj, void *privdata)
212{
213 struct thread *td = (struct thread *)obj;
214
215 KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
216 ("_lwkt_thread_dtor: not allocated from objcache"));
217 KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
218 td->td_kstack_size > 0,
219 ("_lwkt_thread_dtor: corrupted stack"));
220 kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
221}
222
223/*
224 * Initialize the lwkt subsystem.
225 */
226void
227lwkt_init(void)
228{
229 /* An objcache has 2 magazines per CPU so divide cache size by 2. */
230 thread_cache = objcache_create_mbacked(M_THREAD, sizeof(struct thread),
231 NULL, CACHE_NTHREADS/2,
232 _lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
233}
234
235/*
236 * Schedule a thread to run. As the current thread we can always safely
237 * schedule ourselves, and a shortcut procedure is provided for that
238 * function.
239 *
240 * (non-blocking, self contained on a per cpu basis)
241 */
242void
243lwkt_schedule_self(thread_t td)
244{
245 crit_enter_quick(td);
246 KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
247 KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
248 _lwkt_enqueue(td);
249 crit_exit_quick(td);
250}
251
252/*
253 * Deschedule a thread.
254 *
255 * (non-blocking, self contained on a per cpu basis)
256 */
257void
258lwkt_deschedule_self(thread_t td)
259{
260 crit_enter_quick(td);
261 _lwkt_dequeue(td);
262 crit_exit_quick(td);
263}
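/*
 * Usage sketch (hypothetical caller, not taken from this file): the
 * typical LWKT blocking sequence is to deschedule ourselves inside a
 * critical section and switch away; whoever later lwkt_schedule()s the
 * thread makes it runnable again:
 *
 *	crit_enter();
 *	lwkt_deschedule_self(curthread);
 *	... record curthread somewhere a waker can find it ...
 *	lwkt_switch();
 *	crit_exit();
 */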
264
265/*
266 * LWKTs operate on a per-cpu basis
267 *
268 * WARNING! Called from early boot, 'mycpu' may not work yet.
269 */
270void
271lwkt_gdinit(struct globaldata *gd)
272{
273 int i;
274
275 for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i)
276 TAILQ_INIT(&gd->gd_tdrunq[i]);
277 gd->gd_runqmask = 0;
278 TAILQ_INIT(&gd->gd_tdallq);
279}
280
281/*
282 * Create a new thread. The thread must be associated with a process context
283 * or LWKT start address before it can be scheduled. If the target cpu is
284 * -1 the thread will be created on the current cpu.
285 *
286 * If you intend to create a thread without a process context this function
287 * does everything except load the startup and switcher function.
288 */
289thread_t
290lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
291{
292 globaldata_t gd = mycpu;
293 void *stack;
294
295 /*
296 * If static thread storage is not supplied allocate a thread. Reuse
297 * a cached free thread if possible. gd_freetd is used to keep an exiting
298 * thread intact through the exit.
299 */
300 if (td == NULL) {
301 if ((td = gd->gd_freetd) != NULL)
302 gd->gd_freetd = NULL;
303 else
304 td = objcache_get(thread_cache, M_WAITOK);
305 KASSERT((td->td_flags &
306 (TDF_ALLOCATED_THREAD|TDF_RUNNING)) == TDF_ALLOCATED_THREAD,
307 ("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
308 flags |= td->td_flags & (TDF_ALLOCATED_THREAD|TDF_ALLOCATED_STACK);
309 }
310
311 /*
312 * Try to reuse cached stack.
313 */
314 if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
315 if (flags & TDF_ALLOCATED_STACK) {
316 kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size);
317 stack = NULL;
318 }
319 }
320 if (stack == NULL) {
321 stack = (void *)kmem_alloc(&kernel_map, stksize);
322 flags |= TDF_ALLOCATED_STACK;
323 }
324 if (cpu < 0)
325 lwkt_init_thread(td, stack, stksize, flags, gd);
326 else
327 lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
328 return(td);
329}
330
331/*
332 * Initialize a preexisting thread structure. This function is used by
333 * lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
334 *
335 * All threads start out in a critical section at a priority of
336 * TDPRI_KERN_DAEMON. Higher level code will modify the priority as
337 * appropriate. This function may send an IPI message when the
338 * requested cpu is not the current cpu and consequently gd_tdallq may
339 * not be initialized synchronously from the point of view of the originating
340 * cpu.
341 *
342 * NOTE! We have to be careful with regard to creating threads for other cpus
343 * if SMP has not yet been activated.
344 */
345#ifdef SMP
346
347static void
348lwkt_init_thread_remote(void *arg)
349{
350 thread_t td = arg;
351
352 /*
353 * Protected by critical section held by IPI dispatch
354 */
355 TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
356}
357
358#endif
359
360void
361lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
362 struct globaldata *gd)
363{
364 globaldata_t mygd = mycpu;
365
366 bzero(td, sizeof(struct thread));
367 td->td_kstack = stack;
368 td->td_kstack_size = stksize;
369 td->td_flags = flags;
370 td->td_gd = gd;
371 td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT;
372#ifdef SMP
373 if ((flags & TDF_MPSAFE) == 0)
374 td->td_mpcount = 1;
375#endif
376 if (lwkt_use_spin_port)
377 lwkt_initport_spin(&td->td_msgport);
378 else
379 lwkt_initport_thread(&td->td_msgport, td);
380 pmap_init_thread(td);
381#ifdef SMP
382 /*
383 * Normally initializing a thread for a remote cpu requires sending an
384 * IPI. However, the idlethread is setup before the other cpus are
385 * activated so we have to treat it as a special case. XXX manipulation
386 * of gd_tdallq requires the BGL.
387 */
388 if (gd == mygd || td == &gd->gd_idlethread) {
389 crit_enter_gd(mygd);
390 TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
391 crit_exit_gd(mygd);
392 } else {
393 lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
394 }
395#else
396 crit_enter_gd(mygd);
397 TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
398 crit_exit_gd(mygd);
399#endif
400}
401
402void
403lwkt_set_comm(thread_t td, const char *ctl, ...)
404{
405 __va_list va;
406
407 __va_start(va, ctl);
408 kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
409 __va_end(va);
410}
411
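/*
 * Thread reference counting.  lwkt_hold() and lwkt_rele() adjust td_refs;
 * lwkt_wait_free() sleeps until all references have been dropped so the
 * thread structure can be safely reaped.
 */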
412void
413lwkt_hold(thread_t td)
414{
415 ++td->td_refs;
416}
417
418void
419lwkt_rele(thread_t td)
420{
421 KKASSERT(td->td_refs > 0);
422 --td->td_refs;
423}
424
425void
426lwkt_wait_free(thread_t td)
427{
428 while (td->td_refs)
429 tsleep(td, 0, "tdreap", hz);
430}
431
432void
433lwkt_free_thread(thread_t td)
434{
435 KASSERT((td->td_flags & TDF_RUNNING) == 0,
436 ("lwkt_free_thread: did not exit! %p", td));
437
438 if (td->td_flags & TDF_ALLOCATED_THREAD) {
439 objcache_put(thread_cache, td);
440 } else if (td->td_flags & TDF_ALLOCATED_STACK) {
441 /* client-allocated struct with internally allocated stack */
442 KASSERT(td->td_kstack && td->td_kstack_size > 0,
443 ("lwkt_free_thread: corrupted stack"));
444 kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
445 td->td_kstack = NULL;
446 td->td_kstack_size = 0;
447 }
448}
449
450
451/*
452 * Switch to the next runnable lwkt. If no LWKTs are runnable then
453 * switch to the idlethread. Switching must occur within a critical
454 * section to avoid races with the scheduling queue.
455 *
456 * We always have full control over our cpu's run queue. Other cpus
457 * that wish to manipulate our queue must use the cpu_*msg() calls to
458 * talk to our cpu, so a critical section is all that is needed and
459 * the result is very, very fast thread switching.
460 *
461 * The LWKT scheduler uses a fixed priority model and round-robins at
462 * each priority level. User process scheduling is a totally
463 * different beast and LWKT priorities should not be confused with
464 * user process priorities.
465 *
466 * The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch()
467 * cleans it up. Note that the td_switch() function cannot do anything that
468 * requires the MP lock since the MP lock will have already been setup for
469 * the target thread (not the current thread). It's nice to have a scheduler
470 * that does not need the MP lock to work because it allows us to do some
471 * really cool high-performance MP lock optimizations.
472 *
473 * PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
474 * is not called by the current thread in the preemption case, only when
475 * the preempting thread blocks (in order to return to the original thread).
476 */
477void
478lwkt_switch(void)
479{
480 globaldata_t gd = mycpu;
481 thread_t td = gd->gd_curthread;
482 thread_t ntd;
483#ifdef SMP
484 int mpheld;
485#endif
486
487 /*
488 * Switching from within a 'fast' (non thread switched) interrupt or IPI
489 * is illegal. However, we may have to do it anyway if we hit a fatal
490 * kernel trap or we have panicked.
491 *
492 * If this case occurs save and restore the interrupt nesting level.
493 */
494 if (gd->gd_intr_nesting_level) {
495 int savegdnest;
496 int savegdtrap;
497
498 if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) {
499 panic("lwkt_switch: cannot switch from within "
500 "a fast interrupt, yet, td %p\n", td);
501 } else {
502 savegdnest = gd->gd_intr_nesting_level;
503 savegdtrap = gd->gd_trap_nesting_level;
504 gd->gd_intr_nesting_level = 0;
505 gd->gd_trap_nesting_level = 0;
506 if ((td->td_flags & TDF_PANICWARN) == 0) {
507 td->td_flags |= TDF_PANICWARN;
508 kprintf("Warning: thread switch from interrupt or IPI, "
509 "thread %p (%s)\n", td, td->td_comm);
510 print_backtrace();
511 }
512 lwkt_switch();
513 gd->gd_intr_nesting_level = savegdnest;
514 gd->gd_trap_nesting_level = savegdtrap;
515 return;
516 }
517 }
518
519 /*
520 * Passive release (used to transition from user to kernel mode
521 * when we block or switch rather than when we enter the kernel).
522 * This function is NOT called if we are switching into a preemption
523 * or returning from a preemption. Typically this causes us to lose
524 * our current process designation (if we have one) and become a true
525 * LWKT thread, and may also hand the current process designation to
526 * another process and schedule its thread.
527 */
528 if (td->td_release)
529 td->td_release(td);
530
531 crit_enter_gd(gd);
532 if (td->td_toks)
533 lwkt_relalltokens(td);
534
535 /*
536 * We had better not be holding any spin locks, but don't get into an
537 * endless panic loop.
538 */
539 KASSERT(gd->gd_spinlock_rd == NULL || panicstr != NULL,
540 ("lwkt_switch: still holding a shared spinlock %p!",
541 gd->gd_spinlock_rd));
542 KASSERT(gd->gd_spinlocks_wr == 0 || panicstr != NULL,
543 ("lwkt_switch: still holding %d exclusive spinlocks!",
544 gd->gd_spinlocks_wr));
545
546
547#ifdef SMP
548 /*
549 * td_mpcount cannot be used to determine if we currently hold the
550 * MP lock because get_mplock() will increment it prior to attempting
551 * to get the lock, and switch out if it can't. Our ownership of
552 * the actual lock will remain stable while we are in a critical section
553 * (but, of course, another cpu may own or release the lock so the
554 * actual value of mp_lock is not stable).
555 */
556 mpheld = MP_LOCK_HELD();
557#ifdef INVARIANTS
558 if (td->td_cscount) {
559 kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
560 td);
561 if (panic_on_cscount)
562 panic("switching while mastering cpusync");
563 }
564#endif
565#endif
566 if ((ntd = td->td_preempted) != NULL) {
567 /*
568 * We had preempted another thread on this cpu, resume the preempted
569 * thread. This occurs transparently, whether the preempted thread
570 * was scheduled or not (it may have been preempted after descheduling
571 * itself).
572 *
573 * We have to setup the MP lock for the original thread after backing
574 * out the adjustment that was made to curthread when the original
575 * was preempted.
576 */
577 KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
578#ifdef SMP
579 if (ntd->td_mpcount && mpheld == 0) {
580 panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d",
581 td, ntd, td->td_mpcount, ntd->td_mpcount);
582 }
583 if (ntd->td_mpcount) {
584 td->td_mpcount -= ntd->td_mpcount;
585 KKASSERT(td->td_mpcount >= 0);
586 }
587#endif
588 ntd->td_flags |= TDF_PREEMPT_DONE;
589
590 /*
591 * The interrupt may have woken a thread up, we need to properly
592 * set the reschedule flag if the originally interrupted thread is
593 * at a lower priority.
594 */
595 if (gd->gd_runqmask > (2 << (ntd->td_pri & TDPRI_MASK)) - 1)
596 need_lwkt_resched();
597 /* YYY release mp lock on switchback if original doesn't need it */
598 } else {
599 /*
600 * Priority queue / round-robin at each priority. Note that user
601 * processes run at a fixed, low priority and the user process
602 * scheduler deals with interactions between user processes
603 * by scheduling and descheduling them from the LWKT queue as
604 * necessary.
605 *
606 * We have to adjust the MP lock for the target thread. If we
607 * need the MP lock and cannot obtain it we try to locate a
608 * thread that does not need the MP lock. If we cannot, we spin
609 * instead of HLT.
610 *
611 * A similar issue exists for the tokens held by the target thread.
612 * If we cannot obtain ownership of the tokens we cannot immediately
613 * schedule the thread.
614 */
615
616 /*
617 * If an LWKT reschedule was requested, well that is what we are
618 * doing now so clear it.
619 */
620 clear_lwkt_resched();
621again:
622 if (gd->gd_runqmask) {
623 int nq = bsrl(gd->gd_runqmask);
624 if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) {
625 gd->gd_runqmask &= ~(1 << nq);
626 goto again;
627 }
628#ifdef SMP
629 /*
630 * THREAD SELECTION FOR AN SMP MACHINE BUILD
631 *
632 * If the target needs the MP lock and we couldn't get it,
633 * or if the target is holding tokens and we could not
634 * gain ownership of the tokens, continue looking for a
635 * thread to schedule and spin instead of HLT if we can't.
636 *
637 * NOTE: the mpheld variable is invalid after this conditional; it
638 * can change due to both cpu_try_mplock() returning success
639 * AND interactions in lwkt_getalltokens() due to the fact that
640 * we are trying to check the mpcount of a thread other than
641 * the current thread. Because of this, if the current thread
642 * is not holding td_mpcount, an IPI indirectly run via
643 * lwkt_getalltokens() can obtain and release the MP lock and
644 * cause the core MP lock to be released.
645 */
646 if ((ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) ||
647 (ntd->td_toks && lwkt_getalltokens(ntd) == 0)
648 ) {
649 u_int32_t rqmask = gd->gd_runqmask;
650
651 mpheld = MP_LOCK_HELD();
652 ntd = NULL;
653 while (rqmask) {
654 TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) {
655 if (ntd->td_mpcount && !mpheld && !cpu_try_mplock()) {
656 /* spinning due to MP lock being held */
657#ifdef INVARIANTS
658 ++mplock_contention_count;
659#endif
660 /* mplock still not held, 'mpheld' still valid */
661 continue;
662 }
663
664 /*
665 * mpheld state invalid after getalltokens call returns
666 * failure, but the variable is only needed for
667 * the loop.
668 */
669 if (ntd->td_toks && !lwkt_getalltokens(ntd)) {
670 /* spinning due to token contention */
671#ifdef INVARIANTS
672 ++token_contention_count;
673#endif
674 mpheld = MP_LOCK_HELD();
675 continue;
676 }
677 break;
678 }
679 if (ntd)
680 break;
681 rqmask &= ~(1 << nq);
682 nq = bsrl(rqmask);
683
684 /*
685 * We have two choices. We can either refuse to run a
686 * user thread when a kernel thread needs the MP lock
687 * but could not get it, or we can allow it to run but
688 * then expect an IPI (hopefully) later on to force a
689 * reschedule when the MP lock might become available.
690 */
691 if (nq < TDPRI_KERN_LPSCHED) {
692 if (chain_mplock == 0)
693 break;
694 atomic_set_int(&mp_lock_contention_mask,
695 gd->gd_cpumask);
696 /* continue loop, allow user threads to be scheduled */
697 }
698 }
699 if (ntd == NULL) {
700 cpu_mplock_contested();
701 ntd = &gd->gd_idlethread;
702 ntd->td_flags |= TDF_IDLE_NOHLT;
703 goto using_idle_thread;
704 } else {
705 ++gd->gd_cnt.v_swtch;
706 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
707 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
708 }
709 } else {
710 if (ntd->td_mpcount)
711 ++mplock_countx;
712 ++gd->gd_cnt.v_swtch;
713 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
714 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
715 }
716#else
717 /*
718 * THREAD SELECTION FOR A UP MACHINE BUILD. We don't have to
719 * worry about tokens or the BGL. However, we still have
720 * to call lwkt_getalltokens() in order to properly detect
721 * stale tokens. This call cannot fail for a UP build!
722 */
723 lwkt_getalltokens(ntd);
724 ++gd->gd_cnt.v_swtch;
725 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
726 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
727#endif
728 } else {
729 /*
730 * We have nothing to run but only let the idle loop halt
731 * the cpu if there are no pending interrupts.
732 */
733 ntd = &gd->gd_idlethread;
734 if (gd->gd_reqflags & RQF_IDLECHECK_MASK)
735 ntd->td_flags |= TDF_IDLE_NOHLT;
736#ifdef SMP
737using_idle_thread:
738 /*
739 * The idle thread should not be holding the MP lock unless we
740 * are trapping in the kernel or in a panic. Since we select the
741 * idle thread unconditionally when no other thread is available,
742 * if the MP lock is desired during a panic or kernel trap, we
743 * have to loop in the scheduler until we get it.
744 */
745 if (ntd->td_mpcount) {
746 mpheld = MP_LOCK_HELD();
747 if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) {
748 panic("Idle thread %p was holding the BGL!", ntd);
749 } else if (mpheld == 0) {
750 cpu_mplock_contested();
751 goto again;
752 }
753 }
754#endif
755 }
756 }
757 KASSERT(ntd->td_pri >= TDPRI_CRIT,
758 ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
759
760 /*
761 * Do the actual switch. If the new target does not need the MP lock
762 * and we are holding it, release the MP lock. If the new target requires
763 * the MP lock we have already acquired it for the target.
764 */
765#ifdef SMP
766 if (ntd->td_mpcount == 0 ) {
767 if (MP_LOCK_HELD())
768 cpu_rel_mplock();
769 } else {
770 ASSERT_MP_LOCK_HELD(ntd);
771 }
772#endif
773 if (td != ntd) {
774 ++switch_count;
775#ifdef __amd64__
776 KKASSERT(jg_tos_ok(ntd));
777#endif
778 td->td_switch(ntd);
779 }
780 /* NOTE: current cpu may have changed after switch */
781 crit_exit_quick(td);
782}
783
784/*
785 * Request that the target thread preempt the current thread. Preemption
786 * only works under a specific set of conditions:
787 *
788 * - We are not preempting ourselves
789 * - The target thread is owned by the current cpu
790 * - We are not currently being preempted
791 * - The target is not currently being preempted
792 * - We are not holding any spin locks
793 * - The target thread is not holding any tokens
794 * - We are able to satisfy the target's MP lock requirements (if any).
795 *
796 * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
797 * this is called via lwkt_schedule() through the td_preemptable callback.
798 * critpri is the managed critical priority that we should ignore in order
799 * to determine whether preemption is possible (aka usually just the crit
800 * priority of lwkt_schedule() itself).
801 *
802 * XXX at the moment we run the target thread in a critical section during
803 * the preemption in order to prevent the target from taking interrupts
804 * that *WE* can't. Preemption is strictly limited to interrupt threads
805 * and interrupt-like threads, outside of a critical section, and the
806 * preempted source thread will be resumed the instant the target blocks
807 * whether or not the source is scheduled (i.e. preemption is supposed to
808 * be as transparent as possible).
809 *
810 * The target thread inherits our MP count (added to its own) for the
811 * duration of the preemption in order to preserve the atomicity of the
812 * MP lock during the preemption. Therefore, any preempting targets must be
813 * careful with regard to MP assertions. Note that the MP count may be
814 * out of sync with the physical mp_lock, but we do not have to preserve
815 * the original ownership of the lock if it was out of sync (that is, we
816 * can leave it synchronized on return).
817 */
818void
819lwkt_preempt(thread_t ntd, int critpri)
820{
821 struct globaldata *gd = mycpu;
822 thread_t td;
823#ifdef SMP
824 int mpheld;
825 int savecnt;
826#endif
827
828 /*
829 * The caller has put us in a critical section. We can only preempt
830 * if the caller of the caller was not in a critical section (basically
831 * a local interrupt), as determined by the 'critpri' parameter. We
832 * also can't preempt if the caller is holding any spinlocks (even if
833 * he isn't in a critical section). This also handles the tokens test.
834 *
835 * YYY The target thread must be in a critical section (else it must
836 * inherit our critical section? I dunno yet).
837 *
838 * Set need_lwkt_resched() unconditionally for now YYY.
839 */
840 KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri));
841
842 td = gd->gd_curthread;
843 if ((ntd->td_pri & TDPRI_MASK) <= (td->td_pri & TDPRI_MASK)) {
844 ++preempt_miss;
845 return;
846 }
847 if ((td->td_pri & ~TDPRI_MASK) > critpri) {
848 ++preempt_miss;
849 need_lwkt_resched();
850 return;
851 }
852#ifdef SMP
853 if (ntd->td_gd != gd) {
854 ++preempt_miss;
855 need_lwkt_resched();
856 return;
857 }
858#endif
859 /*
860 * Take the easy way out and do not preempt if we are holding
861 * any spinlocks. We could test whether the thread(s) being
862 * preempted interlock against the target thread's tokens and whether
863 * we can get all the target thread's tokens, but this situation
864 * should not occur very often so it's easier to simply not preempt.
865 * Also, plain spinlocks are impossible to figure out at this point so
866 * just don't preempt.
867 *
868 * Do not try to preempt if the target thread is holding any tokens.
869 * We could try to acquire the tokens but this case is so rare there
870 * is no need to support it.
871 */
872 if (gd->gd_spinlock_rd || gd->gd_spinlocks_wr) {
873 ++preempt_miss;
874 need_lwkt_resched();
875 return;
876 }
877 if (ntd->td_toks) {
878 ++preempt_miss;
879 need_lwkt_resched();
880 return;
881 }
882 if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) {
883 ++preempt_weird;
884 need_lwkt_resched();
885 return;
886 }
887 if (ntd->td_preempted) {
888 ++preempt_hit;
889 need_lwkt_resched();
890 return;
891 }
892#ifdef SMP
893 /*
894 * note: an interrupt might have occurred just as we were transitioning
895 * to or from the MP lock. In this case td_mpcount will be pre-disposed
896 * (non-zero) but not actually synchronized with the actual state of the
897 * lock. We can use it to imply an MP lock requirement for the
898 * preemption but we cannot use it to test whether we hold the MP lock
899 * or not.
900 */
901 savecnt = td->td_mpcount;
902 mpheld = MP_LOCK_HELD();
903 ntd->td_mpcount += td->td_mpcount;
904 if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) {
905 ntd->td_mpcount -= td->td_mpcount;
906 ++preempt_miss;
907 need_lwkt_resched();
908 return;
909 }
910#endif
911
912 /*
913 * Since we are able to preempt the current thread, there is no need to
914 * call need_lwkt_resched().
915 */
916 ++preempt_hit;
917 ntd->td_preempted = td;
918 td->td_flags |= TDF_PREEMPT_LOCK;
919 td->td_switch(ntd);
920
921 KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
922#ifdef SMP
923 KKASSERT(savecnt == td->td_mpcount);
924 mpheld = MP_LOCK_HELD();
925 if (mpheld && td->td_mpcount == 0)
926 cpu_rel_mplock();
927 else if (mpheld == 0 && td->td_mpcount)
928 panic("lwkt_preempt(): MP lock was not held through");
929#endif
930 ntd->td_preempted = NULL;
931 td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE);
932}
933
934/*
935 * Conditionally call splz() if gd_reqflags indicates work is pending.
936 *
937 * td_nest_count prevents deep nesting via splz() or doreti() which
938 * might otherwise blow out the kernel stack. Note that except for
939 * this special case, we MUST call splz() here to handle any
940 * pending ints, particularly after we switch, or we might accidentally
941 * halt the cpu with interrupts pending.
942 *
943 * (self contained on a per cpu basis)
944 */
945void
946splz_check(void)
947{
948 globaldata_t gd = mycpu;
949 thread_t td = gd->gd_curthread;
950
951 if (gd->gd_reqflags && td->td_nest_count < 2)
952 splz();
953}
954
955/*
956 * This implements a normal yield which will yield to equal priority
957 * threads as well as higher priority threads. Note that gd_reqflags
958 * tests will be handled by the crit_exit() call in lwkt_switch().
959 *
960 * (self contained on a per cpu basis)
961 */
962void
963lwkt_yield(void)
964{
965 lwkt_schedule_self(curthread);
966 lwkt_switch();
967}
968
969/*
970 * This function is used along with the lwkt_passive_recover() inline
971 * by the trap code to negotiate a passive release of the current
972 * process/lwp designation with the user scheduler.
973 */
974void
975lwkt_passive_release(struct thread *td)
976{
977 struct lwp *lp = td->td_lwp;
978
979 td->td_release = NULL;
980 lwkt_setpri_self(TDPRI_KERN_USER);
981 lp->lwp_proc->p_usched->release_curproc(lp);
982}
983
984/*
985 * Make a kernel thread act as if it were in user mode with regard
986 * to scheduling, to avoid becoming cpu-bound in the kernel. Kernel
987 * loops which may be potentially cpu-bound can call lwkt_user_yield().
988 *
989 * The lwkt_user_yield() function is designed to have very low overhead
990 * if no yield is determined to be needed.
991 */
992void
993lwkt_user_yield(void)
994{
995 thread_t td = curthread;
996 struct lwp *lp = td->td_lwp;
997
998#ifdef SMP
999 /*
1000 * XXX SEVERE TEMPORARY HACK. A cpu-bound operation running in the
1001 * kernel can prevent other cpus from servicing interrupt threads
1002 * which still require the MP lock (which is a lot of them). This
1003 * has a chaining effect since if the interrupt is blocked, so is
1004 * the event, so normal scheduling will not pick up on the problem.
1005 */
1006 if (mplock_countx && td->td_mpcount) {
1007 int savecnt = td->td_mpcount;
1008
1009 td->td_mpcount = 1;
1010 mplock_countx = 0;
1011 rel_mplock();
1012 DELAY(bgl_yield);
1013 get_mplock();
1014 td->td_mpcount = savecnt;
1015 }
1016#endif
1017
1018 /*
1019 * Another kernel thread wants the cpu
1020 */
1021 if (lwkt_resched_wanted())
1022 lwkt_switch();
1023
1024 /*
1025 * If the user scheduler has asynchronously determined that the current
1026 * process (when running in user mode) needs to lose the cpu then make
1027 * sure we are released.
1028 */
1029 if (user_resched_wanted()) {
1030 if (td->td_release)
1031 td->td_release(td);
1032 }
1033
1034 /*
1035 * If we are released reduce our priority
1036 */
1037 if (td->td_release == NULL) {
1038 if (lwkt_check_resched(td) > 0)
1039 lwkt_switch();
1040 if (lp) {
1041 lp->lwp_proc->p_usched->acquire_curproc(lp);
1042 td->td_release = lwkt_passive_release;
1043 lwkt_setpri_self(TDPRI_USER_NORM);
1044 }
1045 }
1046}
1047
1048/*
1049 * Return 0 if no runnable threads are pending at the same or higher
1050 * priority as the passed thread.
1051 *
1052 * Return 1 if runnable threads are pending at the same priority.
1053 *
1054 * Return 2 if runnable threads are pending at a higher priority.
1055 */
1056int
1057lwkt_check_resched(thread_t td)
1058{
1059 int pri = td->td_pri & TDPRI_MASK;
1060
1061 if (td->td_gd->gd_runqmask > (2 << pri) - 1)
1062 return(2);
1063 if (TAILQ_NEXT(td, td_threadq))
1064 return(1);
1065 return(0);
1066}
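/*
 * Example: for pri == 5, (2 << 5) - 1 == 0x3f covers priorities 0-5, so a
 * gd_runqmask of 0x210 (runnable threads queued at priorities 4 and 9)
 * exceeds it and lwkt_check_resched() returns 2.
 */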
1067
1068/*
1069 * Generic schedule. Possibly schedule threads belonging to other cpus and
1070 * deal with threads that might be blocked on a wait queue.
1071 *
1072 * We have a little helper inline function which does additional work after
1073 * the thread has been enqueued, including dealing with preemption and
1074 * setting need_lwkt_resched() (which prevents the kernel from returning
1075 * to userland until it has processed higher priority threads).
1076 *
1077 * It is possible for this routine to be called after a failed _enqueue
1078 * (due to the target thread migrating, sleeping, or otherwise blocked).
1079 * We have to check that the thread is actually on the run queue!
1080 *
1081 * reschedok is an optimized constant propagated from lwkt_schedule() or
1082 * lwkt_schedule_noresched(). By default it is non-zero, causing a
1083 * reschedule to be requested if the target thread has a higher priority.
1084 * The port messaging code will set MSG_NORESCHED and cause reschedok to
1085 * be 0, preventing undesired reschedules.
1086 */
1087static __inline
1088void
1089_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri, int reschedok)
1090{
1091 thread_t otd;
1092
1093 if (ntd->td_flags & TDF_RUNQ) {
1094 if (ntd->td_preemptable && reschedok) {
1095 ntd->td_preemptable(ntd, cpri); /* YYY +token */
1096 } else if (reschedok) {
1097 otd = curthread;
1098 if ((ntd->td_pri & TDPRI_MASK) > (otd->td_pri & TDPRI_MASK))
1099 need_lwkt_resched();
1100 }
1101 }
1102}
1103
1104static __inline
1105void
1106_lwkt_schedule(thread_t td, int reschedok)
1107{
1108 globaldata_t mygd = mycpu;
1109
1110 KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
1111 crit_enter_gd(mygd);
1112 KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
1113 if (td == mygd->gd_curthread) {
1114 _lwkt_enqueue(td);
1115 } else {
1116 /*
1117 * If we own the thread, there is no race (since we are in a
1118 * critical section). If we do not own the thread there might
1119 * be a race but the target cpu will deal with it.
1120 */
1121#ifdef SMP
1122 if (td->td_gd == mygd) {
1123 _lwkt_enqueue(td);
1124 _lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok);
1125 } else {
1126 lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
1127 }
1128#else
1129 _lwkt_enqueue(td);
1130 _lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok);
1131#endif
1132 }
1133 crit_exit_gd(mygd);
1134}
1135
1136void
1137lwkt_schedule(thread_t td)
1138{
1139 _lwkt_schedule(td, 1);
1140}
1141
1142void
1143lwkt_schedule_noresched(thread_t td)
1144{
1145 _lwkt_schedule(td, 0);
1146}
1147
1148/*
1149 * When scheduled remotely, if frame != NULL the IPIQ is being
1150 * run via doreti or an interrupt and preemption can be allowed.
1151 *
1152 * To allow preemption we have to drop the critical section so only
1153 * one is present in _lwkt_schedule_post.
1154 */
1155static void
1156lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame)
1157{
1158 thread_t td = curthread;
1159 thread_t ntd = arg;
1160
1161 if (frame && ntd->td_preemptable) {
1162 crit_exit_noyield(td);
1163 _lwkt_schedule(ntd, 1);
1164 crit_enter_quick(td);
1165 } else {
1166 _lwkt_schedule(ntd, 1);
1167 }
1168}
1169
1170#ifdef SMP
1171
1172/*
1173 * Thread migration using a 'Pull' method. The thread may or may not be
1174 * the current thread. It MUST be descheduled and in a stable state.
1175 * lwkt_giveaway() must be called on the cpu owning the thread.
1176 *
1177 * At any point after lwkt_giveaway() is called, the target cpu may
1178 * 'pull' the thread by calling lwkt_acquire().
1179 *
1180 * We have to make sure the thread is not sitting on a per-cpu tsleep
1181 * queue or it will blow up when it moves to another cpu.
1182 *
1183 * MPSAFE - must be called under very specific conditions.
1184 */
1185void
1186lwkt_giveaway(thread_t td)
1187{
1188 globaldata_t gd = mycpu;
1189
1190 crit_enter_gd(gd);
1191 if (td->td_flags & TDF_TSLEEPQ)
1192 tsleep_remove(td);
1193 KKASSERT(td->td_gd == gd);
1194 TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
1195 td->td_flags |= TDF_MIGRATING;
1196 crit_exit_gd(gd);
1197}
1198
1199void
1200lwkt_acquire(thread_t td)
1201{
1202 globaldata_t gd;
1203 globaldata_t mygd;
1204
1205 KKASSERT(td->td_flags & TDF_MIGRATING);
1206 gd = td->td_gd;
1207 mygd = mycpu;
1208 if (gd != mycpu) {
1209 cpu_lfence();
1210 KKASSERT((td->td_flags & TDF_RUNQ) == 0);
1211 crit_enter_gd(mygd);
1212 while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) {
1213#ifdef SMP
1214 lwkt_process_ipiq();
1215#endif
1216 cpu_lfence();
1217 }
1218 td->td_gd = mygd;
1219 TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
1220 td->td_flags &= ~TDF_MIGRATING;
1221 crit_exit_gd(mygd);
1222 } else {
1223 crit_enter_gd(mygd);
1224 TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
1225 td->td_flags &= ~TDF_MIGRATING;
1226 crit_exit_gd(mygd);
1227 }
1228}
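/*
 * Usage sketch (hypothetical caller, not taken from this file): migrating
 * a descheduled thread 'td' with the pull method.  The owning cpu gives
 * the thread away and the target cpu later pulls it in and reschedules it:
 *
 *	(owning cpu)			(target cpu)
 *	lwkt_giveaway(td);
 *					lwkt_acquire(td);
 *					lwkt_schedule(td);
 */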
1229
1230#endif
1231
1232/*
1233 * Generic deschedule. Descheduling threads other than your own should be
1234 * done only in carefully controlled circumstances. Descheduling is
1235 * asynchronous.
1236 *
1237 * This function may block if the cpu has run out of messages.
1238 */
1239void
1240lwkt_deschedule(thread_t td)
1241{
1242 crit_enter();
1243#ifdef SMP
1244 if (td == curthread) {
1245 _lwkt_dequeue(td);
1246 } else {
1247 if (td->td_gd == mycpu) {
1248 _lwkt_dequeue(td);
1249 } else {
1250 lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
1251 }
1252 }
1253#else
1254 _lwkt_dequeue(td);
1255#endif
1256 crit_exit();
1257}
1258
1259/*
1260 * Set the target thread's priority. This routine does not automatically
1261 * switch to a higher priority thread; LWKT threads are not designed for
1262 * continuous priority changes. Yield if you want to switch.
1263 *
1264 * We have to retain the critical section count which uses the high bits
1265 * of the td_pri field. The specified priority may also indicate zero or
1266 * more critical sections by adding TDPRI_CRIT*N.
1267 *
1268 * Note that we requeue the thread whether it winds up on a different runq
1269 * or not. uio_yield() depends on this and the routine is not normally
1270 * called with the same priority otherwise.
1271 */
1272void
1273lwkt_setpri(thread_t td, int pri)
1274{
1275 KKASSERT(pri >= 0);
1276 KKASSERT(td->td_gd == mycpu);
1277 crit_enter();
1278 if (td->td_flags & TDF_RUNQ) {
1279 _lwkt_dequeue(td);
1280 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
1281 _lwkt_enqueue(td);
1282 } else {
1283 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
1284 }
1285 crit_exit();
1286}
1287
1288void
1289lwkt_setpri_self(int pri)
1290{
1291 thread_t td = curthread;
1292
1293 KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
1294 crit_enter();
1295 if (td->td_flags & TDF_RUNQ) {
1296 _lwkt_dequeue(td);
1297 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
1298 _lwkt_enqueue(td);
1299 } else {
1300 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
1301 }
1302 crit_exit();
1303}
1304
1305/*
1306 * Migrate the current thread to the specified cpu.
1307 *
1308 * This is accomplished by descheduling ourselves from the current cpu,
1309 * moving our thread to the tdallq of the target cpu, IPI messaging the
1310 * target cpu, and switching out. TDF_MIGRATING prevents scheduling
1311 * races while the thread is being migrated.
1312 *
1313 * We must be sure to remove ourselves from the current cpu's tsleepq
1314 * before potentially moving to another queue. The thread can be on
1315 * a tsleepq due to a left-over tsleep_interlock().
1316 */
1317#ifdef SMP
1318static void lwkt_setcpu_remote(void *arg);
1319#endif
1320
1321void
1322lwkt_setcpu_self(globaldata_t rgd)
1323{
1324#ifdef SMP
1325 thread_t td = curthread;
1326
1327 if (td->td_gd != rgd) {
1328 crit_enter_quick(td);
1329 if (td->td_flags & TDF_TSLEEPQ)
1330 tsleep_remove(td);
1331 td->td_flags |= TDF_MIGRATING;
1332 lwkt_deschedule_self(td);
1333 TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
1334 lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td);
1335 lwkt_switch();
1336 /* we are now on the target cpu */
1337 TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
1338 crit_exit_quick(td);
1339 }
1340#endif
1341}
1342
1343void
1344lwkt_migratecpu(int cpuid)
1345{
1346#ifdef SMP
1347 globaldata_t rgd;
1348
1349 rgd = globaldata_find(cpuid);
1350 lwkt_setcpu_self(rgd);
1351#endif
1352}
1353
1354/*
1355 * Remote IPI for cpu migration (called while in a critical section so we
1356 * do not have to enter another one). The thread has already been moved to
1357 * our cpu's allq, but we must wait for the thread to be completely switched
1358 * out on the originating cpu before we schedule it on ours or the stack
1359 * state may be corrupt. We clear TDF_MIGRATING after flushing the GD
1360 * change to main memory.
1361 *
1362 * XXX The use of TDF_MIGRATING might not be sufficient to avoid races
1363 * against wakeups. It is best if this interface is used only when there
1364 * are no pending events that might try to schedule the thread.
1365 */
1366#ifdef SMP
1367static void
1368lwkt_setcpu_remote(void *arg)
1369{
1370 thread_t td = arg;
1371 globaldata_t gd = mycpu;
1372
1373 while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) {
1374#ifdef SMP
1375 lwkt_process_ipiq();
1376#endif
1377 cpu_lfence();
1378 }
1379 td->td_gd = gd;
1380 cpu_sfence();
1381 td->td_flags &= ~TDF_MIGRATING;
1382 KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
1383 _lwkt_enqueue(td);
1384}
1385#endif
1386
1387struct lwp *
1388lwkt_preempted_proc(void)
1389{
1390 thread_t td = curthread;
1391 while (td->td_preempted)
1392 td = td->td_preempted;
1393 return(td->td_lwp);
1394}
1395
1396/*
1397 * Create a kernel process/thread/whatever. It shares its address space
1398 * with proc0 - ie: kernel only.
1399 *
1400 * NOTE! By default new threads are created with the MP lock held. A
1401 * thread which does not require the MP lock should release it by calling
1402 * rel_mplock() at the start of the new thread.
1403 */
1404int
1405lwkt_create(void (*func)(void *), void *arg,
1406 struct thread **tdp, thread_t template, int tdflags, int cpu,
1407 const char *fmt, ...)
1408{
1409 thread_t td;
1410 __va_list ap;
1411
1412 td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
1413 tdflags);
1414 if (tdp)
1415 *tdp = td;
1416 cpu_set_thread_handler(td, lwkt_exit, func, arg);
1417
1418 /*
1419 * Set up arg0 for 'ps' etc
1420 */
1421 __va_start(ap, fmt);
1422 kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
1423 __va_end(ap);
1424
1425 /*
1426 * Schedule the thread to run
1427 */
1428 if ((td->td_flags & TDF_STOPREQ) == 0)
1429 lwkt_schedule(td);
1430 else
1431 td->td_flags &= ~TDF_STOPREQ;
1432 return 0;
1433}
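/*
 * Illustrative sketch (hypothetical names, not part of this file): create
 * a simple kernel thread on the current cpu with lwkt_create().  New
 * threads are created with the MP lock held by default (see the NOTE
 * above); the thread falls into lwkt_exit() if its function ever returns.
 */
#if 0
static struct thread *example_td;

static void
example_thread(void *arg __unused)
{
	for (;;) {
		/* ... periodic work ... */
		tsleep(&example_td, 0, "examp", hz);
	}
}

static void
example_init(void)
{
	lwkt_create(example_thread, NULL, &example_td, NULL, 0, -1,
		    "example");
}
#endif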
1434
1435/*
1436 * Destroy an LWKT thread. Warning! This function is not called when
1437 * a process exits; cpu_proc_exit() directly calls cpu_thread_exit() and
1438 * uses a different reaping mechanism.
1439 */
1440void
1441lwkt_exit(void)
1442{
1443 thread_t td = curthread;
1444 thread_t std;
1445 globaldata_t gd;
1446
1447 if (td->td_flags & TDF_VERBOSE)
1448 kprintf("kthread %p %s has exited\n", td, td->td_comm);
1449 caps_exit(td);
1450
1451 /*
1452 * Get us into a critical section to interlock gd_freetd and loop
1453 * until we can get it freed.
1454 *
1455 * We have to cache the current td in gd_freetd because objcache_put()ing
1456 * it would rip it out from under us while our thread is still active.
1457 */
1458 gd = mycpu;
1459 crit_enter_quick(td);
1460 while ((std = gd->gd_freetd) != NULL) {
1461 gd->gd_freetd = NULL;
1462 objcache_put(thread_cache, std);
1463 }
1464
1465 /*
1466 * Remove thread resources from kernel lists and deschedule us for
1467 * the last time.
1468 */
1469 if (td->td_flags & TDF_TSLEEPQ)
1470 tsleep_remove(td);
1471 lwkt_deschedule_self(td);
1472 lwkt_remove_tdallq(td);
1473 if (td->td_flags & TDF_ALLOCATED_THREAD)
1474 gd->gd_freetd = td;
1475 cpu_thread_exit();
1476}
1477
1478void
1479lwkt_remove_tdallq(thread_t td)
1480{
1481 KKASSERT(td->td_gd == mycpu);
1482 TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
1483}
1484
1485void
1486crit_panic(void)
1487{
1488 thread_t td = curthread;
1489 int lpri = td->td_pri;
1490
1491 td->td_pri = 0;
1492 panic("td_pri is/would-go negative! %p %d", td, lpri);
1493}
1494
1495#ifdef SMP
1496
1497/*
1498 * Called from debugger/panic on cpus which have been stopped. We must still
1499 * process the IPIQ while stopped, even if we were stopped while in a critical
1500 * section (XXX).
1501 *
1502 * If we are dumping also try to process any pending interrupts. This may
1503 * or may not work depending on the state of the cpu at the point it was
1504 * stopped.
1505 */
1506void
1507lwkt_smp_stopped(void)
1508{
1509 globaldata_t gd = mycpu;
1510
1511 crit_enter_gd(gd);
1512 if (dumping) {
1513 lwkt_process_ipiq();
1514 splz();
1515 } else {
1516 lwkt_process_ipiq();
1517 }
1518 crit_exit_gd(gd);
1519}
1520
1521/*
1522 * get_mplock() calls this routine if it is unable to obtain the MP lock.
1523 * get_mplock() has already incremented td_mpcount. We must block and
1524 * not return until giant is held.
1525 *
1526 * All we have to do is lwkt_switch() away. The LWKT scheduler will not
1527 * reschedule the thread until it can obtain the giant lock for it.
1528 */
1529void
1530lwkt_mp_lock_contested(void)
1531{
1532 ++mplock_countx;
1533 loggiant(beg);
1534 lwkt_switch();
1535 loggiant(end);
1536}
1537
1538/*
1539 * The rel_mplock() code will call this function after releasing the
1540 * last reference on the MP lock if mp_lock_contention_mask is non-zero.
1541 *
1542 * We then chain an IPI to a single other cpu potentially needing the
1543 * lock. This is somewhat heuristic and we can wind up with IPIs flying
1544 * all over the place.
1545 */
1546static void lwkt_mp_lock_uncontested_remote(void *arg __unused);
1547
1548void
1549lwkt_mp_lock_uncontested(void)
1550{
1551 globaldata_t gd;
1552 globaldata_t dgd;
1553 cpumask_t mask;
1554 cpumask_t tmpmask;
1555 int cpuid;
1556
1557 if (chain_mplock) {
1558 gd = mycpu;
1559 atomic_clear_int(&mp_lock_contention_mask, gd->gd_cpumask);
1560 mask = mp_lock_contention_mask;
1561 tmpmask = ~((1 << gd->gd_cpuid) - 1);
1562
1563 if (mask) {
1564 if (mask & tmpmask)
1565 cpuid = bsfl(mask & tmpmask);
1566 else
1567 cpuid = bsfl(mask);
1568 atomic_clear_int(&mp_lock_contention_mask, 1 << cpuid);
1569 dgd = globaldata_find(cpuid);
1570 lwkt_send_ipiq(dgd, lwkt_mp_lock_uncontested_remote, NULL);
1571 }
1572 }
1573}
1574
1575/*
1576 * The idea is for this IPI to interrupt a potentially lower priority
1577 * thread, such as a user thread, to allow the scheduler to reschedule
1578 * a higher priority kernel thread that needs the MP lock.
1579 *
1580 * For now we set the LWKT reschedule flag which generates an AST in
1581 * doreti, though theoretically it is also possible to preempt
1582 * here if the underlying thread was operating in user mode. Nah.
1583 */
1584static void
1585lwkt_mp_lock_uncontested_remote(void *arg __unused)
1586{
1587 need_lwkt_resched();
1588}
1589
1590#endif