gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003-2010 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*/
	34
	35	/*
	36	* Each cpu in a system has its own self-contained light weight kernel
	37	* thread scheduler, which means that generally speaking we only need
	38	* to use a critical section to avoid problems. Foreign thread
	39	* scheduling is queued via (async) IPIs.
	40	*/
	41
	42	#include <sys/param.h>
	43	#include <sys/systm.h>
	44	#include <sys/kernel.h>
	45	#include <sys/proc.h>
	46	#include <sys/rtprio.h>
	47	#include <sys/queue.h>
	48	#include <sys/sysctl.h>
	49	#include <sys/kthread.h>
	50	#include <machine/cpu.h>
	51	#include <sys/lock.h>
	52	#include <sys/caps.h>
	53	#include <sys/spinlock.h>
	54	#include <sys/ktr.h>
	55
	56	#include <sys/thread2.h>
	57	#include <sys/spinlock2.h>
	58	#include <sys/mplock2.h>
	59
	60	#include <sys/dsched.h>
	61
	62	#include <vm/vm.h>
	63	#include <vm/vm_param.h>
	64	#include <vm/vm_kern.h>
	65	#include <vm/vm_object.h>
	66	#include <vm/vm_page.h>
	67	#include <vm/vm_map.h>
	68	#include <vm/vm_pager.h>
	69	#include <vm/vm_extern.h>
	70
	71	#include <machine/stdarg.h>
	72	#include <machine/smp.h>
	73
	74	#if !defined(KTR_CTXSW)
	75	#define KTR_CTXSW KTR_ALL
	76	#endif
	77	KTR_INFO_MASTER(ctxsw);
	78	KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p",
	79	sizeof(int) + sizeof(struct thread *));
	80	KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p",
	81	sizeof(int) + sizeof(struct thread *));
	82	KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s",
	83	sizeof (struct thread ) + sizeof(char ));
	84	KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>", sizeof (struct thread *));
	85
	86	static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");
	87
	88	#ifdef INVARIANTS
	89	static int panic_on_cscount = 0;
	90	#endif
	91	static __int64_t switch_count = 0;
	92	static __int64_t preempt_hit = 0;
	93	static __int64_t preempt_miss = 0;
	94	static __int64_t preempt_weird = 0;
	95	static __int64_t token_contention_count __debugvar = 0;
	96	static int lwkt_use_spin_port;
	97	static struct objcache *thread_cache;
	98
	99	#ifdef SMP
	100	static void lwkt_schedule_remote(void arg, int arg2, struct intrframe frame);
	101	#endif
	102
	103	extern void cpu_heavy_restore(void);
	104	extern void cpu_lwkt_restore(void);
	105	extern void cpu_kthread_restore(void);
	106	extern void cpu_idle_restore(void);
	107
	108	#ifdef __x86_64__
	109
	110	static int
	111	jg_tos_ok(struct thread *td)
	112	{
	113	void *tos;
	114	int tos_ok;
	115
	116	if (td == NULL) {
	117	return 1;
	118	}
	119	KKASSERT(td->td_sp != NULL);
	120	tos = ((void **)td->td_sp)[0];
	121	tos_ok = 0;
	122	if ((tos == cpu_heavy_restore) \|\| (tos == cpu_lwkt_restore) \|\|
	123	(tos == cpu_kthread_restore) \|\| (tos == cpu_idle_restore)) {
	124	tos_ok = 1;
	125	}
	126	return tos_ok;
	127	}
	128
	129	#endif
	130
	131	/*
	132	* We can make all thread ports use the spin backend instead of the thread
	133	* backend. This should only be set to debug the spin backend.
	134	*/
	135	TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);
	136
	137	#ifdef INVARIANTS
	138	SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, "");
	139	#endif
	140	SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, "");
	141	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, "");
	142	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, "");
	143	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, "");
	144	#ifdef INVARIANTS
	145	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
	146	&token_contention_count, 0, "spinning due to token contention");
	147	#endif
	148
	149	/*
	150	* These helper procedures handle the runq, they can only be called from
	151	* within a critical section.
	152	*
	153	* WARNING! Prior to SMP being brought up it is possible to enqueue and
	154	* dequeue threads belonging to other cpus, so be sure to use td->td_gd
	155	* instead of 'mycpu' when referencing the globaldata structure. Once
	156	* SMP live enqueuing and dequeueing only occurs on the current cpu.
	157	*/
	158	static __inline
	159	void
	160	_lwkt_dequeue(thread_t td)
	161	{
	162	if (td->td_flags & TDF_RUNQ) {
	163	int nq = td->td_pri & TDPRI_MASK;
	164	struct globaldata *gd = td->td_gd;
	165
	166	td->td_flags &= ~TDF_RUNQ;
	167	TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq);
	168	/* runqmask is passively cleaned up by the switcher */
	169	}
	170	}
	171
	172	static __inline
	173	void
	174	_lwkt_enqueue(thread_t td)
	175	{
	176	if ((td->td_flags & (TDF_RUNQ\|TDF_MIGRATING\|TDF_BLOCKQ)) == 0) {
	177	int nq = td->td_pri & TDPRI_MASK;
	178	struct globaldata *gd = td->td_gd;
	179
	180	td->td_flags \|= TDF_RUNQ;
	181	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq);
	182	gd->gd_runqmask \|= 1 << nq;
	183	}
	184	}
	185
	186	static __boolean_t
	187	_lwkt_thread_ctor(void obj, void privdata, int ocflags)
	188	{
	189	struct thread td = (struct thread )obj;
	190
	191	td->td_kstack = NULL;
	192	td->td_kstack_size = 0;
	193	td->td_flags = TDF_ALLOCATED_THREAD;
	194	return (1);
	195	}
	196
	197	static void
	198	_lwkt_thread_dtor(void obj, void privdata)
	199	{
	200	struct thread td = (struct thread )obj;
	201
	202	KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
	203	("_lwkt_thread_dtor: not allocated from objcache"));
	204	KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
	205	td->td_kstack_size > 0,
	206	("_lwkt_thread_dtor: corrupted stack"));
	207	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	208	}
	209
	210	/*
	211	* Initialize the lwkt s/system.
	212	*/
	213	void
	214	lwkt_init(void)
	215	{
	216	/* An objcache has 2 magazines per CPU so divide cache size by 2. */
	217	thread_cache = objcache_create_mbacked(M_THREAD, sizeof(struct thread),
	218	NULL, CACHE_NTHREADS/2,
	219	_lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
	220	}
	221
	222	/*
	223	* Schedule a thread to run. As the current thread we can always safely
	224	* schedule ourselves, and a shortcut procedure is provided for that
	225	* function.
	226	*
	227	* (non-blocking, self contained on a per cpu basis)
	228	*/
	229	void
	230	lwkt_schedule_self(thread_t td)
	231	{
	232	crit_enter_quick(td);
	233	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
	234	KKASSERT(td->td_lwp == NULL \|\| (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
	235	_lwkt_enqueue(td);
	236	crit_exit_quick(td);
	237	}
	238
	239	/*
	240	* Deschedule a thread.
	241	*
	242	* (non-blocking, self contained on a per cpu basis)
	243	*/
	244	void
	245	lwkt_deschedule_self(thread_t td)
	246	{
	247	crit_enter_quick(td);
	248	_lwkt_dequeue(td);
	249	crit_exit_quick(td);
	250	}
	251
	252	/*
	253	* LWKTs operate on a per-cpu basis
	254	*
	255	* WARNING! Called from early boot, 'mycpu' may not work yet.
	256	*/
	257	void
	258	lwkt_gdinit(struct globaldata *gd)
	259	{
	260	int i;
	261
	262	for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i)
	263	TAILQ_INIT(&gd->gd_tdrunq[i]);
	264	gd->gd_runqmask = 0;
	265	TAILQ_INIT(&gd->gd_tdallq);
	266	}
	267
	268	/*
	269	* Create a new thread. The thread must be associated with a process context
	270	* or LWKT start address before it can be scheduled. If the target cpu is
	271	* -1 the thread will be created on the current cpu.
	272	*
	273	* If you intend to create a thread without a process context this function
	274	* does everything except load the startup and switcher function.
	275	*/
	276	thread_t
	277	lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
	278	{
	279	globaldata_t gd = mycpu;
	280	void *stack;
	281
	282	/*
	283	* If static thread storage is not supplied allocate a thread. Reuse
	284	* a cached free thread if possible. gd_freetd is used to keep an exiting
	285	* thread intact through the exit.
	286	*/
	287	if (td == NULL) {
	288	if ((td = gd->gd_freetd) != NULL)
	289	gd->gd_freetd = NULL;
	290	else
	291	td = objcache_get(thread_cache, M_WAITOK);
	292	KASSERT((td->td_flags &
	293	(TDF_ALLOCATED_THREAD\|TDF_RUNNING)) == TDF_ALLOCATED_THREAD,
	294	("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
	295	flags \|= td->td_flags & (TDF_ALLOCATED_THREAD\|TDF_ALLOCATED_STACK);
	296	}
	297
	298	/*
	299	* Try to reuse cached stack.
	300	*/
	301	if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
	302	if (flags & TDF_ALLOCATED_STACK) {
	303	kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size);
	304	stack = NULL;
	305	}
	306	}
	307	if (stack == NULL) {
	308	stack = (void *)kmem_alloc(&kernel_map, stksize);
	309	flags \|= TDF_ALLOCATED_STACK;
	310	}
	311	if (cpu < 0)
	312	lwkt_init_thread(td, stack, stksize, flags, gd);
	313	else
	314	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
	315	return(td);
	316	}
	317
	318	/*
	319	* Initialize a preexisting thread structure. This function is used by
	320	* lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
	321	*
	322	* All threads start out in a critical section at a priority of
	323	* TDPRI_KERN_DAEMON. Higher level code will modify the priority as
	324	* appropriate. This function may send an IPI message when the
	325	* requested cpu is not the current cpu and consequently gd_tdallq may
	326	* not be initialized synchronously from the point of view of the originating
	327	* cpu.
	328	*
	329	* NOTE! we have to be careful in regards to creating threads for other cpus
	330	* if SMP has not yet been activated.
	331	*/
	332	#ifdef SMP
	333
	334	static void
	335	lwkt_init_thread_remote(void *arg)
	336	{
	337	thread_t td = arg;
	338
	339	/*
	340	* Protected by critical section held by IPI dispatch
	341	*/
	342	TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
	343	}
	344
	345	#endif
	346
	347	void
	348	lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
	349	struct globaldata *gd)
	350	{
	351	globaldata_t mygd = mycpu;
	352
	353	bzero(td, sizeof(struct thread));
	354	td->td_kstack = stack;
	355	td->td_kstack_size = stksize;
	356	td->td_flags = flags;
	357	td->td_gd = gd;
	358	td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT;
	359	td->td_toks_stop = &td->td_toks_base;
	360	#ifdef SMP
	361	if ((flags & TDF_MPSAFE) == 0)
	362	td->td_mpcount = 1;
	363	#endif
	364	if (lwkt_use_spin_port)
	365	lwkt_initport_spin(&td->td_msgport);
	366	else
	367	lwkt_initport_thread(&td->td_msgport, td);
	368	pmap_init_thread(td);
	369	#ifdef SMP
	370	/*
	371	* Normally initializing a thread for a remote cpu requires sending an
	372	* IPI. However, the idlethread is setup before the other cpus are
	373	* activated so we have to treat it as a special case. XXX manipulation
	374	* of gd_tdallq requires the BGL.
	375	*/
	376	if (gd == mygd \|\| td == &gd->gd_idlethread) {
	377	crit_enter_gd(mygd);
	378	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	379	crit_exit_gd(mygd);
	380	} else {
	381	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
	382	}
	383	#else
	384	crit_enter_gd(mygd);
	385	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	386	crit_exit_gd(mygd);
	387	#endif
	388
	389	dsched_new_thread(td);
	390	}
	391
	392	void
	393	lwkt_set_comm(thread_t td, const char *ctl, ...)
	394	{
	395	__va_list va;
	396
	397	__va_start(va, ctl);
	398	kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
	399	__va_end(va);
	400	KTR_LOG(ctxsw_newtd, td, &td->td_comm[0]);
	401	}
	402
	403	void
	404	lwkt_hold(thread_t td)
	405	{
	406	++td->td_refs;
	407	}
	408
	409	void
	410	lwkt_rele(thread_t td)
	411	{
	412	KKASSERT(td->td_refs > 0);
	413	--td->td_refs;
	414	}
	415
	416	void
	417	lwkt_wait_free(thread_t td)
	418	{
	419	while (td->td_refs)
	420	tsleep(td, 0, "tdreap", hz);
	421	}
	422
	423	void
	424	lwkt_free_thread(thread_t td)
	425	{
	426	KASSERT((td->td_flags & TDF_RUNNING) == 0,
	427	("lwkt_free_thread: did not exit! %p", td));
	428
	429	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	430	objcache_put(thread_cache, td);
	431	} else if (td->td_flags & TDF_ALLOCATED_STACK) {
	432	/* client-allocated struct with internally allocated stack */
	433	KASSERT(td->td_kstack && td->td_kstack_size > 0,
	434	("lwkt_free_thread: corrupted stack"));
	435	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	436	td->td_kstack = NULL;
	437	td->td_kstack_size = 0;
	438	}
	439	KTR_LOG(ctxsw_deadtd, td);
	440	}
	441
	442
	443	/*
	444	* Switch to the next runnable lwkt. If no LWKTs are runnable then
	445	* switch to the idlethread. Switching must occur within a critical
	446	* section to avoid races with the scheduling queue.
	447	*
	448	* We always have full control over our cpu's run queue. Other cpus
	449	* that wish to manipulate our queue must use the cpu_*msg() calls to
	450	* talk to our cpu, so a critical section is all that is needed and
	451	* the result is very, very fast thread switching.
	452	*
	453	* The LWKT scheduler uses a fixed priority model and round-robins at
	454	* each priority level. User process scheduling is a totally
	455	* different beast and LWKT priorities should not be confused with
	456	* user process priorities.
	457	*
	458	* The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch()
	459	* cleans it up. Note that the td_switch() function cannot do anything that
	460	* requires the MP lock since the MP lock will have already been setup for
	461	* the target thread (not the current thread). It's nice to have a scheduler
	462	* that does not need the MP lock to work because it allows us to do some
	463	* really cool high-performance MP lock optimizations.
	464	*
	465	* PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
	466	* is not called by the current thread in the preemption case, only when
	467	* the preempting thread blocks (in order to return to the original thread).
	468	*/
	469	void
	470	lwkt_switch(void)
	471	{
	472	globaldata_t gd = mycpu;
	473	thread_t td = gd->gd_curthread;
	474	thread_t ntd;
	475	#ifdef SMP
	476	int mpheld;
	477	#endif
	478
	479	/*
	480	* Switching from within a 'fast' (non thread switched) interrupt or IPI
	481	* is illegal. However, we may have to do it anyway if we hit a fatal
	482	* kernel trap or we have panicked.
	483	*
	484	* If this case occurs save and restore the interrupt nesting level.
	485	*/
	486	if (gd->gd_intr_nesting_level) {
	487	int savegdnest;
	488	int savegdtrap;
	489
	490	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) {
	491	panic("lwkt_switch: cannot switch from within "
	492	"a fast interrupt, yet, td %p\n", td);
	493	} else {
	494	savegdnest = gd->gd_intr_nesting_level;
	495	savegdtrap = gd->gd_trap_nesting_level;
	496	gd->gd_intr_nesting_level = 0;
	497	gd->gd_trap_nesting_level = 0;
		/* Warn only once per thread; TDF_PANICWARN remembers that we did. */
	498	if ((td->td_flags & TDF_PANICWARN) == 0) {
	499	td->td_flags \|= TDF_PANICWARN;
	500	kprintf("Warning: thread switch from interrupt or IPI, "
	501	"thread %p (%s)\n", td, td->td_comm);
	502	print_backtrace(-1);
	503	}
		/* Re-enter with both nesting levels zeroed so the check above is not taken again. */
	504	lwkt_switch();
	505	gd->gd_intr_nesting_level = savegdnest;
	506	gd->gd_trap_nesting_level = savegdtrap;
	507	return;
	508	}
	509	}
	510
	511	/*
	512	* Passive release (used to transition from user to kernel mode
	513	* when we block or switch rather than when we enter the kernel).
	514	* This function is NOT called if we are switching into a preemption
	515	* or returning from a preemption. Typically this causes us to lose
	516	* our current process designation (if we have one) and become a true
	517	* LWKT thread, and may also hand the current process designation to
	518	* another process and schedule thread.
	519	*/
	520	if (td->td_release)
	521	td->td_release(td);
	522
		/* This critical section is exited by crit_exit_quick() at the end of the function. */
	523	crit_enter_gd(gd);
	524	if (TD_TOKS_HELD(td))
	525	lwkt_relalltokens(td);
	526
	527	/*
	528	* We had better not be holding any spin locks, but don't get into an
	529	* endless panic loop.
	530	*/
	531	KASSERT(gd->gd_spinlock_rd == NULL \|\| panicstr != NULL,
	532	("lwkt_switch: still holding a shared spinlock %p!",
	533	gd->gd_spinlock_rd));
	534	KASSERT(gd->gd_spinlocks_wr == 0 \|\| panicstr != NULL,
	535	("lwkt_switch: still holding %d exclusive spinlocks!",
	536	gd->gd_spinlocks_wr));
	537
	538
	539	#ifdef SMP
	540	/*
	541	* td_mpcount cannot be used to determine if we currently hold the
	542	* MP lock because get_mplock() will increment it prior to attempting
	543	* to get the lock, and switch out if it can't. Our ownership of
	544	* the actual lock will remain stable while we are in a critical section
	545	* (but, of course, another cpu may own or release the lock so the
	546	* actual value of mp_lock is not stable).
	547	*/
	548	mpheld = MP_LOCK_HELD();
	549	#ifdef INVARIANTS
	550	if (td->td_cscount) {
	551	kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
	552	td);
	553	if (panic_on_cscount)
	554	panic("switching while mastering cpusync");
	555	}
	556	#endif
	557	#endif
	558	if ((ntd = td->td_preempted) != NULL) {
	559	/*
	560	* We had preempted another thread on this cpu, resume the preempted
	561	* thread. This occurs transparently, whether the preempted thread
	562	* was scheduled or not (it may have been preempted after descheduling
	563	* itself).
	564	*
	565	* We have to setup the MP lock for the original thread after backing
	566	* out the adjustment that was made to curthread when the original
	567	* was preempted.
	568	*/
	569	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
	570	#ifdef SMP
	571	if (ntd->td_mpcount && mpheld == 0) {
	572	panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d",
	573	td, ntd, td->td_mpcount, ntd->td_mpcount);
	574	}
	575	if (ntd->td_mpcount) {
	576	td->td_mpcount -= ntd->td_mpcount;
	577	KKASSERT(td->td_mpcount >= 0);
	578	}
	579	#endif
		/* lwkt_preempt() asserts TDF_PREEMPT_DONE on the preempted thread once it resumes. */
	580	ntd->td_flags \|= TDF_PREEMPT_DONE;
	581
	582	/*
	583	* The interrupt may have woken a thread up, we need to properly
	584	* set the reschedule flag if the originally interrupted thread is
	585	* at a lower priority.
	586	*/
		/* (2 << pri) - 1 covers mask bits 0..pri; a larger runqmask has a bit set above ntd's priority. */
	587	if (gd->gd_runqmask > (2 << (ntd->td_pri & TDPRI_MASK)) - 1)
	588	need_lwkt_resched();
	589	/* YYY release mp lock on switchback if original doesn't need it */
	590	} else {
	591	/*
	592	* Priority queue / round-robin at each priority. Note that user
	593	* processes run at a fixed, low priority and the user process
	594	* scheduler deals with interactions between user processes
	595	* by scheduling and descheduling them from the LWKT queue as
	596	* necessary.
	597	*
	598	* We have to adjust the MP lock for the target thread. If we
	599	* need the MP lock and cannot obtain it we try to locate a
	600	* thread that does not need the MP lock. If we cannot, we spin
	601	* instead of HLT.
	602	*
	603	* A similar issue exists for the tokens held by the target thread.
	604	* If we cannot obtain ownership of the tokens we cannot immediately
	605	* schedule the thread.
	606	*/
	607
	608	/*
	609	* If an LWKT reschedule was requested, well that is what we are
	610	* doing now so clear it.
	611	*/
	612	clear_lwkt_resched();
	613	again:
	614	if (gd->gd_runqmask) {
		/* NOTE(review): bsrl() presumably yields the index of the highest set bit, i.e. the highest flagged priority queue -- confirm. */
	615	int nq = bsrl(gd->gd_runqmask);
	616	if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) {
		/* Stale mask bit: the queue is empty, so clear the bit and rescan. */
	617	gd->gd_runqmask &= ~(1 << nq);
	618	goto again;
	619	}
	620	#ifdef SMP
	621	/*
	622	* THREAD SELECTION FOR AN SMP MACHINE BUILD
	623	*
	624	* If the target needs the MP lock and we couldn't get it,
	625	* or if the target is holding tokens and we could not
	626	* gain ownership of the tokens, continue looking for a
	627	* thread to schedule and spin instead of HLT if we can't.
	628	*
	629	* NOTE: the mpheld variable is invalid after this conditional, it
	630	* can change due to both cpu_try_mplock() returning success
	631	* AND interactions in lwkt_getalltokens() due to the fact that
	632	* we are trying to check the mpcount of a thread other than
	633	* the current thread. Because of this, if the current thread
	634	* is not holding td_mpcount, an IPI indirectly run via
	635	* lwkt_getalltokens() can obtain and release the MP lock and
	636	* cause the core MP lock to be released.
	637	*/
	638	if ((ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) \|\|
	639	(TD_TOKS_HELD(ntd) && lwkt_getalltokens(ntd) == 0)
	640	) {
	641	u_int32_t rqmask = gd->gd_runqmask;
	642
	643	mpheld = MP_LOCK_HELD();
	644	ntd = NULL;
		/* Scan the queues from nq downward using a private copy of the mask. */
	645	while (rqmask) {
	646	TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) {
	647	if (ntd->td_mpcount && !mpheld && !cpu_try_mplock()) {
	648	/* spinning due to MP lock being held */
	649	continue;
	650	}
	651
	652	/*
	653	* mpheld state invalid after getalltokens call returns
	654	* failure, but the variable is only needed for
	655	* the loop.
	656	*/
	657	if (TD_TOKS_HELD(ntd) && !lwkt_getalltokens(ntd)) {
	658	/* spinning due to token contention */
	659	#ifdef INVARIANTS
	660	++token_contention_count;
	661	#endif
	662	mpheld = MP_LOCK_HELD();
	663	continue;
	664	}
	665	break;
	666	}
		/* ntd is non-NULL only if the scan above stopped on a usable thread. */
	667	if (ntd)
	668	break;
	669	rqmask &= ~(1 << nq);
		/* NOTE(review): rqmask may be 0 here, so bsrl() can be applied to 0; the loop then ends either via the break below or via the while condition. */
	670	nq = bsrl(rqmask);
	671
	672	/*
	673	* We have two choices. We can either refuse to run a
	674	* user thread when a kernel thread needs the MP lock
	675	* but could not get it, or we can allow it to run but
	676	* then expect an IPI (hopefully) later on to force a
	677	* reschedule when the MP lock might become available.
	678	*/
	679	if (nq < TDPRI_KERN_LPSCHED) {
	680	break; /* for now refuse to run */
	681	#if 0
	682	if (chain_mplock == 0)
	683	break;
	684	/* continue loop, allow user threads to be scheduled */
	685	#endif
	686	}
	687	}
	688
	689	/*
	690	* Case where a (kernel) thread needed the MP lock and could
	691	* not get one, and we may or may not have found another
	692	* thread which does not need the MP lock to run while
	693	* we wait (ntd).
	694	*/
	695	if (ntd == NULL) {
	696	ntd = &gd->gd_idlethread;
	697	ntd->td_flags \|= TDF_IDLE_NOHLT;
	698	set_mplock_contention_mask(gd);
	699	cpu_mplock_contested();
	700	goto using_idle_thread;
	701	} else {
	702	clr_mplock_contention_mask(gd);
	703	++gd->gd_cnt.v_swtch;
		/* Move the chosen thread to the tail of its queue (round-robin within the priority). */
	704	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	705	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	706	}
	707	} else {
	708	clr_mplock_contention_mask(gd);
	709	++gd->gd_cnt.v_swtch;
		/* Move the chosen thread to the tail of its queue (round-robin within the priority). */
	710	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	711	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	712	}
	713	#else
	714	/*
	715	* THREAD SELECTION FOR A UP MACHINE BUILD. We don't have to
	716	* worry about tokens or the BGL. However, we still have
	717	* to call lwkt_getalltokens() in order to properly detect
	718	* stale tokens. This call cannot fail for a UP build!
	719	*/
	720	lwkt_getalltokens(ntd);
	721	++gd->gd_cnt.v_swtch;
	722	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	723	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	724	#endif
	725	} else {
	726	/*
	727	* We have nothing to run but only let the idle loop halt
	728	* the cpu if there are no pending interrupts.
	729	*/
	730	ntd = &gd->gd_idlethread;
	731	if (gd->gd_reqflags & RQF_IDLECHECK_MASK)
	732	ntd->td_flags \|= TDF_IDLE_NOHLT;
	733	#ifdef SMP
	734	using_idle_thread:
	735	/*
	736	* The idle thread should not be holding the MP lock unless we
	737	* are trapping in the kernel or in a panic. Since we select the
	738	* idle thread unconditionally when no other thread is available,
	739	* if the MP lock is desired during a panic or kernel trap, we
	740	* have to loop in the scheduler until we get it.
	741	*/
	742	if (ntd->td_mpcount) {
	743	mpheld = MP_LOCK_HELD();
	744	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	745	panic("Idle thread %p was holding the BGL!", ntd);
	746	if (mpheld == 0)
	747	goto again;
	748	}
	749	#endif
	750	}
	751	}
	752	KASSERT(ntd->td_pri >= TDPRI_CRIT,
	753	("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
	754
	755	/*
	756	* Do the actual switch. If the new target does not need the MP lock
	757	* and we are holding it, release the MP lock. If the new target requires
	758	* the MP lock we have already acquired it for the target.
	759	*/
	760	#ifdef SMP
	761	if (ntd->td_mpcount == 0 ) {
	762	if (MP_LOCK_HELD())
	763	cpu_rel_mplock();
	764	} else {
	765	ASSERT_MP_LOCK_HELD(ntd);
	766	}
	767	#endif
		/* Selecting ourselves again needs no context switch and is not counted. */
	768	if (td != ntd) {
	769	++switch_count;
	770	#ifdef __x86_64__
	771	{
	772	int tos_ok __debugvar = jg_tos_ok(ntd);
	773	KKASSERT(tos_ok);
	774	}
	775	#endif
	776	KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd);
	777	td->td_switch(ntd);
	778	}
	779	/* NOTE: current cpu may have changed after switch */
	780	crit_exit_quick(td);
	781	}
	782
	783	/*
	784	* Request that the target thread preempt the current thread. Preemption
	785	* only works under a specific set of conditions:
	786	*
	787	* - We are not preempting ourselves
	788	* - The target thread is owned by the current cpu
	789	* - We are not currently being preempted
	790	* - The target is not currently being preempted
	791	* - We are not holding any spin locks
	792	* - The target thread is not holding any tokens
	793	* - We are able to satisfy the target's MP lock requirements (if any).
	794	*
	795	* THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
	796	* this is called via lwkt_schedule() through the td_preemptable callback.
	797	* critpri is the managed critical priority that we should ignore in order
	798	* to determine whether preemption is possible (aka usually just the crit
	799	* priority of lwkt_schedule() itself).
	800	*
	801	* XXX at the moment we run the target thread in a critical section during
	802	* the preemption in order to prevent the target from taking interrupts
	803	* that WE can't. Preemption is strictly limited to interrupt threads
	804	* and interrupt-like threads, outside of a critical section, and the
	805	* preempted source thread will be resumed the instant the target blocks
	806	* whether or not the source is scheduled (i.e. preemption is supposed to
	807	* be as transparent as possible).
	808	*
	809	* The target thread inherits our MP count (added to its own) for the
	810	* duration of the preemption in order to preserve the atomicy of the
	811	* MP lock during the preemption. Therefore, any preempting targets must be
	812	* careful in regards to MP assertions. Note that the MP count may be
	813	* out of sync with the physical mp_lock, but we do not have to preserve
	814	* the original ownership of the lock if it was out of synch (that is, we
	815	* can leave it synchronized on return).
	816	*/
	817	void
	818	lwkt_preempt(thread_t ntd, int critpri)
	819	{
	820	struct globaldata *gd = mycpu;
	821	thread_t td;
	822	#ifdef SMP
	823	int mpheld;
	824	int savecnt;
	825	#endif
	826
	827	/*
	828	* The caller has put us in a critical section. We can only preempt
	829	* if the caller of the caller was not in a critical section (basically
	830	* a local interrupt), as determined by the 'critpri' parameter. We
	831	* also can't preempt if the caller is holding any spinlocks (even if
	832	* he isn't in a critical section). This also handles the tokens test.
	833	*
	834	* YYY The target thread must be in a critical section (else it must
	835	* inherit our critical section? I dunno yet).
	836	*
	837	* Set need_lwkt_resched() unconditionally for now YYY.
	838	*/
	839	KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri));
	840
	841	td = gd->gd_curthread;
	842	if ((ntd->td_pri & TDPRI_MASK) <= (td->td_pri & TDPRI_MASK)) {
	843	++preempt_miss;
	844	return;
	845	}
	846	if ((td->td_pri & ~TDPRI_MASK) > critpri) {
	847	++preempt_miss;
	848	need_lwkt_resched();
	849	return;
	850	}
	851	#ifdef SMP
	852	if (ntd->td_gd != gd) {
	853	++preempt_miss;
	854	need_lwkt_resched();
	855	return;
	856	}
	857	#endif
	858	/*
	859	* Take the easy way out and do not preempt if we are holding
	860	* any spinlocks. We could test whether the thread(s) being
	861	* preempted interlock against the target thread's tokens and whether
	862	* we can get all the target thread's tokens, but this situation
	863	* should not occur very often so its easier to simply not preempt.
	864	* Also, plain spinlocks are impossible to figure out at this point so
	865	* just don't preempt.
	866	*
	867	* Do not try to preempt if the target thread is holding any tokens.
	868	* We could try to acquire the tokens but this case is so rare there
	869	* is no need to support it.
	870	*/
	871	if (gd->gd_spinlock_rd \|\| gd->gd_spinlocks_wr) {
	872	++preempt_miss;
	873	need_lwkt_resched();
	874	return;
	875	}
	876	if (TD_TOKS_HELD(ntd)) {
	877	++preempt_miss;
	878	need_lwkt_resched();
	879	return;
	880	}
	881	if (td == ntd \|\| ((td->td_flags \| ntd->td_flags) & TDF_PREEMPT_LOCK)) {
	882	++preempt_weird;
	883	need_lwkt_resched();
	884	return;
	885	}
	886	if (ntd->td_preempted) {
	887	++preempt_hit;
	888	need_lwkt_resched();
	889	return;
	890	}
	891	#ifdef SMP
	892	/*
	893	* note: an interrupt might have occured just as we were transitioning
	894	* to or from the MP lock. In this case td_mpcount will be pre-disposed
	895	* (non-zero) but not actually synchronized with the actual state of the
	896	* lock. We can use it to imply an MP lock requirement for the
	897	* preemption but we cannot use it to test whether we hold the MP lock
	898	* or not.
	899	*/
	900	savecnt = td->td_mpcount;
	901	mpheld = MP_LOCK_HELD();
	902	ntd->td_mpcount += td->td_mpcount;
	903	if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) {
	904	ntd->td_mpcount -= td->td_mpcount;
	905	++preempt_miss;
	906	need_lwkt_resched();
	907	return;
	908	}
	909	#endif
	910
	911	/*
	912	* Since we are able to preempt the current thread, there is no need to
	913	* call need_lwkt_resched().
	914	*/
	915	++preempt_hit;
	916	ntd->td_preempted = td;
	917	td->td_flags \|= TDF_PREEMPT_LOCK;
	918	KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd);
	919	td->td_switch(ntd);
	920
	921	KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
	922	#ifdef SMP
	923	KKASSERT(savecnt == td->td_mpcount);
	924	mpheld = MP_LOCK_HELD();
	925	if (mpheld && td->td_mpcount == 0)
	926	cpu_rel_mplock();
	927	else if (mpheld == 0 && td->td_mpcount)
	928	panic("lwkt_preempt(): MP lock was not held through");
	929	#endif
	930	ntd->td_preempted = NULL;
	931	td->td_flags &= ~(TDF_PREEMPT_LOCK\|TDF_PREEMPT_DONE);
	932	}
	933
	934	/*
	935	* Conditionally call splz() if gd_reqflags indicates work is pending.
	936	*
	937	* td_nest_count prevents deep nesting via splz() or doreti() which
	938	* might otherwise blow out the kernel stack. Note that except for
	939	* this special case, we MUST call splz() here to handle any
	940	* pending ints, particularly after we switch, or we might accidently
	941	* halt the cpu with interrupts pending.
	942	*
	943	* (self contained on a per cpu basis)
	944	*/
	945	void
	946	splz_check(void)
	947	{
	948	globaldata_t gd = mycpu;
	949	thread_t td = gd->gd_curthread;
	950
	951	if (gd->gd_reqflags && td->td_nest_count < 2)
	952	splz();
	953	}
	954
	955	/*
	956	* This implements a normal yield which will yield to equal priority
	957	* threads as well as higher priority threads. Note that gd_reqflags
	958	* tests will be handled by the crit_exit() call in lwkt_switch().
	959	*
	960	* (self contained on a per cpu basis)
	961	*/
	962	void
	963	lwkt_yield(void)
	964	{
	965	lwkt_schedule_self(curthread);
	966	lwkt_switch();
	967	}
	968
	969	/*
	970	* This function is used along with the lwkt_passive_recover() inline
	971	* by the trap code to negotiate a passive release of the current
	972	* process/lwp designation with the user scheduler.
	973	*/
	974	void
	975	lwkt_passive_release(struct thread *td)
	976	{
	977	struct lwp *lp = td->td_lwp;
	978
	979	td->td_release = NULL;
	980	lwkt_setpri_self(TDPRI_KERN_USER);
	981	lp->lwp_proc->p_usched->release_curproc(lp);
	982	}
	983
	984	/*
	985	* Make a kernel thread act as if it were in user mode with regards
	986	* to scheduling, to avoid becoming cpu-bound in the kernel. Kernel
	987	* loops which may be potentially cpu-bound can call lwkt_user_yield().
	988	*
	989	* The lwkt_user_yield() function is designed to have very low overhead
	990	* if no yield is determined to be needed.
	991	*/
	992	void
	993	lwkt_user_yield(void)
	994	{
	995	thread_t td = curthread;
	996	struct lwp *lp = td->td_lwp;
	997
	998	#ifdef SMP
	999	/*
	1000	* XXX SEVERE TEMPORARY HACK. A cpu-bound operation running in the
	1001	* kernel can prevent other cpus from servicing interrupt threads
	1002	* which still require the MP lock (which is a lot of them). This
	1003	* has a chaining effect since if the interrupt is blocked, so is
	1004	* the event, so normal scheduling will not pick up on the problem.
	1005	*/
	1006	if (mp_lock_contention_mask && td->td_mpcount) {
	1007	yield_mplock(td);
	1008	}
	1009	#endif
	1010
	1011	/*
	1012	* Another kernel thread wants the cpu
	1013	*/
	1014	if (lwkt_resched_wanted())
	1015	lwkt_switch();
	1016
	1017	/*
	1018	* If the user scheduler has asynchronously determined that the current
	1019	* process (when running in user mode) needs to lose the cpu then make
	1020	* sure we are released.
	1021	*/
	1022	if (user_resched_wanted()) {
	1023	if (td->td_release)
	1024	td->td_release(td);
	1025	}
	1026
	1027	/*
	1028	* If we are released reduce our priority
	1029	*/
	1030	if (td->td_release == NULL) {
	1031	if (lwkt_check_resched(td) > 0)
	1032	lwkt_switch();
	1033	if (lp) {
	1034	lp->lwp_proc->p_usched->acquire_curproc(lp);
	1035	td->td_release = lwkt_passive_release;
	1036	lwkt_setpri_self(TDPRI_USER_NORM);
	1037	}
	1038	}
	1039	}
	1040
	1041	/*
	1042	* Return 0 if no runnable threads are pending at the same or higher
	1043	* priority as the passed thread.
	1044	*
	1045	* Return 1 if runnable threads are pending at the same priority.
	1046	*
	1047	* Return 2 if runnable threads are pending at a higher priority.
	1048	*/
	1049	int
	1050	lwkt_check_resched(thread_t td)
	1051	{
	1052	int pri = td->td_pri & TDPRI_MASK;
	1053
	1054	if (td->td_gd->gd_runqmask > (2 << pri) - 1)
	1055	return(2);
	1056	if (TAILQ_NEXT(td, td_threadq))
	1057	return(1);
	1058	return(0);
	1059	}
	1060
	1061	/*
	1062	* Generic schedule. Possibly schedule threads belonging to other cpus and
	1063	* deal with threads that might be blocked on a wait queue.
	1064	*
	1065	* We have a little helper inline function which does additional work after
	1066	* the thread has been enqueued, including dealing with preemption and
	1067	* setting need_lwkt_resched() (which prevents the kernel from returning
	1068	* to userland until it has processed higher priority threads).
	1069	*
	1070	* It is possible for this routine to be called after a failed _enqueue
	1071	* (due to the target thread migrating, sleeping, or otherwise blocked).
	1072	* We have to check that the thread is actually on the run queue!
	1073	*
	1074	* reschedok is an optimized constant propagated from lwkt_schedule() or
	1075	* lwkt_schedule_noresched(). By default it is non-zero, causing a
	1076	* reschedule to be requested if the target thread has a higher priority.
	1077	* The port messaging code will set MSG_NORESCHED and cause reschedok to
	1078	* be 0, prevented undesired reschedules.
	1079	*/
	1080	static __inline
	1081	void
	1082	_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri, int reschedok)
	1083	{
	1084	thread_t otd;
	1085
	1086	if (ntd->td_flags & TDF_RUNQ) {
	1087	if (ntd->td_preemptable && reschedok) {
	1088	ntd->td_preemptable(ntd, cpri); /* YYY +token */
	1089	} else if (reschedok) {
	1090	otd = curthread;
	1091	if ((ntd->td_pri & TDPRI_MASK) > (otd->td_pri & TDPRI_MASK))
	1092	need_lwkt_resched();
	1093	}
	1094	}
	1095	}
	1096
	1097	static __inline
	1098	void
	1099	_lwkt_schedule(thread_t td, int reschedok)
	1100	{
	1101	globaldata_t mygd = mycpu;
	1102
	1103	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
	1104	crit_enter_gd(mygd);
	1105	KKASSERT(td->td_lwp == NULL \|\| (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
	1106	if (td == mygd->gd_curthread) {
	1107	_lwkt_enqueue(td);
	1108	} else {
	1109	/*
	1110	* If we own the thread, there is no race (since we are in a
	1111	* critical section). If we do not own the thread there might
	1112	* be a race but the target cpu will deal with it.
	1113	*/
	1114	#ifdef SMP
	1115	if (td->td_gd == mygd) {
	1116	_lwkt_enqueue(td);
	1117	_lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok);
	1118	} else {
	1119	lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
	1120	}
	1121	#else
	1122	_lwkt_enqueue(td);
	1123	_lwkt_schedule_post(mygd, td, TDPRI_CRIT, reschedok);
	1124	#endif
	1125	}
	1126	crit_exit_gd(mygd);
	1127	}
	1128
	1129	void
	1130	lwkt_schedule(thread_t td)
	1131	{
	1132	_lwkt_schedule(td, 1);
	1133	}
	1134
	1135	void
	1136	lwkt_schedule_noresched(thread_t td)
	1137	{
	1138	_lwkt_schedule(td, 0);
	1139	}
	1140
	1141	#ifdef SMP
	1142
	1143	/*
	1144	* When scheduled remotely if frame != NULL the IPIQ is being
	1145	* run via doreti or an interrupt then preemption can be allowed.
	1146	*
	1147	* To allow preemption we have to drop the critical section so only
	1148	* one is present in _lwkt_schedule_post.
	1149	*/
	1150	static void
	1151	lwkt_schedule_remote(void arg, int arg2, struct intrframe frame)
	1152	{
	1153	thread_t td = curthread;
	1154	thread_t ntd = arg;
	1155
	1156	if (frame && ntd->td_preemptable) {
	1157	crit_exit_noyield(td);
	1158	_lwkt_schedule(ntd, 1);
	1159	crit_enter_quick(td);
	1160	} else {
	1161	_lwkt_schedule(ntd, 1);
	1162	}
	1163	}
	1164
	1165	/*
	1166	* Thread migration using a 'Pull' method. The thread may or may not be
	1167	* the current thread. It MUST be descheduled and in a stable state.
	1168	* lwkt_giveaway() must be called on the cpu owning the thread.
	1169	*
	1170	* At any point after lwkt_giveaway() is called, the target cpu may
	1171	* 'pull' the thread by calling lwkt_acquire().
	1172	*
	1173	* We have to make sure the thread is not sitting on a per-cpu tsleep
	1174	* queue or it will blow up when it moves to another cpu.
	1175	*
	1176	* MPSAFE - must be called under very specific conditions.
	1177	*/
	1178	void
	1179	lwkt_giveaway(thread_t td)
	1180	{
	1181	globaldata_t gd = mycpu;
	1182
	1183	crit_enter_gd(gd);
	1184	if (td->td_flags & TDF_TSLEEPQ)
	1185	tsleep_remove(td);
	1186	KKASSERT(td->td_gd == gd);
	1187	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	1188	td->td_flags \|= TDF_MIGRATING;
	1189	crit_exit_gd(gd);
	1190	}
	1191
	1192	void
	1193	lwkt_acquire(thread_t td)
	1194	{
	1195	globaldata_t gd;
	1196	globaldata_t mygd;
	1197
	1198	KKASSERT(td->td_flags & TDF_MIGRATING);
	1199	gd = td->td_gd;
	1200	mygd = mycpu;
	1201	if (gd != mycpu) {
	1202	cpu_lfence();
	1203	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1204	crit_enter_gd(mygd);
	1205	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) {
	1206	#ifdef SMP
	1207	lwkt_process_ipiq();
	1208	#endif
	1209	cpu_lfence();
	1210	}
	1211	td->td_gd = mygd;
	1212	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1213	td->td_flags &= ~TDF_MIGRATING;
	1214	crit_exit_gd(mygd);
	1215	} else {
	1216	crit_enter_gd(mygd);
	1217	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1218	td->td_flags &= ~TDF_MIGRATING;
	1219	crit_exit_gd(mygd);
	1220	}
	1221	}
	1222
	1223	#endif
	1224
	1225	/*
	1226	* Generic deschedule. Descheduling threads other then your own should be
	1227	* done only in carefully controlled circumstances. Descheduling is
	1228	* asynchronous.
	1229	*
	1230	* This function may block if the cpu has run out of messages.
	1231	*/
	1232	void
	1233	lwkt_deschedule(thread_t td)
	1234	{
	1235	crit_enter();
	1236	#ifdef SMP
	1237	if (td == curthread) {
	1238	_lwkt_dequeue(td);
	1239	} else {
	1240	if (td->td_gd == mycpu) {
	1241	_lwkt_dequeue(td);
	1242	} else {
	1243	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
	1244	}
	1245	}
	1246	#else
	1247	_lwkt_dequeue(td);
	1248	#endif
	1249	crit_exit();
	1250	}
	1251
	1252	/*
	1253	* Set the target thread's priority. This routine does not automatically
	1254	* switch to a higher priority thread, LWKT threads are not designed for
	1255	* continuous priority changes. Yield if you want to switch.
	1256	*
	1257	* We have to retain the critical section count which uses the high bits
	1258	* of the td_pri field. The specified priority may also indicate zero or
	1259	* more critical sections by adding TDPRI_CRIT*N.
	1260	*
	1261	* Note that we requeue the thread whether it winds up on a different runq
	1262	* or not. uio_yield() depends on this and the routine is not normally
	1263	* called with the same priority otherwise.
	1264	*/
	1265	void
	1266	lwkt_setpri(thread_t td, int pri)
	1267	{
	1268	KKASSERT(pri >= 0);
	1269	KKASSERT(td->td_gd == mycpu);
	1270	crit_enter();
	1271	if (td->td_flags & TDF_RUNQ) {
	1272	_lwkt_dequeue(td);
	1273	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1274	_lwkt_enqueue(td);
	1275	} else {
	1276	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1277	}
	1278	crit_exit();
	1279	}
	1280
	1281	/*
	1282	* Set the initial priority for a thread prior to it being scheduled for
	1283	* the first time. The thread MUST NOT be scheduled before or during
	1284	* this call. The thread may be assigned to a cpu other then the current
	1285	* cpu.
	1286	*
	1287	* Typically used after a thread has been created with TDF_STOPPREQ,
	1288	* and before the thread is initially scheduled.
	1289	*/
	1290	void
	1291	lwkt_setpri_initial(thread_t td, int pri)
	1292	{
	1293	KKASSERT(pri >= 0);
	1294	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1295	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1296	}
	1297
	1298	void
	1299	lwkt_setpri_self(int pri)
	1300	{
	1301	thread_t td = curthread;
	1302
	1303	KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
	1304	crit_enter();
	1305	if (td->td_flags & TDF_RUNQ) {
	1306	_lwkt_dequeue(td);
	1307	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1308	_lwkt_enqueue(td);
	1309	} else {
	1310	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1311	}
	1312	crit_exit();
	1313	}
	1314
	1315	/*
	1316	* Migrate the current thread to the specified cpu.
	1317	*
	1318	* This is accomplished by descheduling ourselves from the current cpu,
	1319	* moving our thread to the tdallq of the target cpu, IPI messaging the
	1320	* target cpu, and switching out. TDF_MIGRATING prevents scheduling
	1321	* races while the thread is being migrated.
	1322	*
	1323	* We must be sure to remove ourselves from the current cpu's tsleepq
	1324	* before potentially moving to another queue. The thread can be on
	1325	* a tsleepq due to a left-over tsleep_interlock().
	1326	*/
	1327	#ifdef SMP
	1328	static void lwkt_setcpu_remote(void *arg);
	1329	#endif
	1330
	1331	void
	1332	lwkt_setcpu_self(globaldata_t rgd)
	1333	{
	1334	#ifdef SMP
	1335	thread_t td = curthread;
	1336
	1337	if (td->td_gd != rgd) {
	1338	crit_enter_quick(td);
	1339	if (td->td_flags & TDF_TSLEEPQ)
	1340	tsleep_remove(td);
	1341	td->td_flags \|= TDF_MIGRATING;
	1342	lwkt_deschedule_self(td);
	1343	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1344	lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td);
	1345	lwkt_switch();
	1346	/* we are now on the target cpu */
	1347	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
	1348	crit_exit_quick(td);
	1349	}
	1350	#endif
	1351	}
	1352
	/*
	 * Migrate the current thread to the cpu identified by cpuid.  The id
	 * is translated with globaldata_find() and handed to
	 * lwkt_setcpu_self().  A no-op on non-SMP builds.
	 */
	void
	lwkt_migratecpu(int cpuid)
	{
	#ifdef SMP
		lwkt_setcpu_self(globaldata_find(cpuid));
	#endif
	}
	1363
	/*
	 * Remote IPI for cpu migration (called while in a critical section so
	 * we do not have to enter another one).  We must wait for the thread
	 * to be completely switched out on the originating cpu before we
	 * schedule it on ours or the stack state may be corrupt.  We clear
	 * TDF_MIGRATING after flushing the GD change to main memory.
	 *
	 * arg is the migrating thread (thread_t).
	 *
	 * XXX The use of TDF_MIGRATING might not be sufficient to avoid races
	 * against wakeups.  It is best if this interface is used only when
	 * there are no pending events that might try to schedule the thread.
	 */
	#ifdef SMP
	static void
	lwkt_setcpu_remote(void *arg)
	{
		thread_t td = arg;
		globaldata_t gd = mycpu;

		/*
		 * Spin, servicing our own IPI queue, until the originating cpu
		 * has let go of the thread.
		 */
		while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) {
			lwkt_process_ipiq();
			cpu_lfence();
		}
		td->td_gd = gd;
		/* Order the td_gd store ahead of clearing TDF_MIGRATING. */
		cpu_sfence();
		td->td_flags &= ~TDF_MIGRATING;
		KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
		_lwkt_enqueue(td);
	}
	#endif
	1396
	1397	struct lwp *
	1398	lwkt_preempted_proc(void)
	1399	{
	1400	thread_t td = curthread;
	1401	while (td->td_preempted)
	1402	td = td->td_preempted;
	1403	return(td->td_lwp);
	1404	}
	1405
	1406	/*
	1407	* Create a kernel process/thread/whatever. It shares it's address space
	1408	* with proc0 - ie: kernel only.
	1409	*
	1410	* NOTE! By default new threads are created with the MP lock held. A
	1411	* thread which does not require the MP lock should release it by calling
	1412	* rel_mplock() at the start of the new thread.
	1413	*/
	1414	int
	1415	lwkt_create(void (func)(void ), void *arg,
	1416	struct thread **tdp, thread_t template, int tdflags, int cpu,
	1417	const char *fmt, ...)
	1418	{
	1419	thread_t td;
	1420	__va_list ap;
	1421
	1422	td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
	1423	tdflags);
	1424	if (tdp)
	1425	*tdp = td;
	1426	cpu_set_thread_handler(td, lwkt_exit, func, arg);
	1427
	1428	/*
	1429	* Set up arg0 for 'ps' etc
	1430	*/
	1431	__va_start(ap, fmt);
	1432	kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
	1433	__va_end(ap);
	1434
	1435	/*
	1436	* Schedule the thread to run
	1437	*/
	1438	if ((td->td_flags & TDF_STOPREQ) == 0)
	1439	lwkt_schedule(td);
	1440	else
	1441	td->td_flags &= ~TDF_STOPREQ;
	1442	return 0;
	1443	}
	1444
	1445	/*
	1446	* Destroy an LWKT thread. Warning! This function is not called when
	1447	* a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
	1448	* uses a different reaping mechanism.
	1449	*/
	1450	void
	1451	lwkt_exit(void)
	1452	{
	1453	thread_t td = curthread;
	1454	thread_t std;
	1455	globaldata_t gd;
	1456
	1457	if (td->td_flags & TDF_VERBOSE)
	1458	kprintf("kthread %p %s has exited\n", td, td->td_comm);
	1459	caps_exit(td);
	1460
	1461	/*
	1462	* Get us into a critical section to interlock gd_freetd and loop
	1463	* until we can get it freed.
	1464	*
	1465	* We have to cache the current td in gd_freetd because objcache_put()ing
	1466	* it would rip it out from under us while our thread is still active.
	1467	*/
	1468	gd = mycpu;
	1469	crit_enter_quick(td);
	1470	while ((std = gd->gd_freetd) != NULL) {
	1471	gd->gd_freetd = NULL;
	1472	objcache_put(thread_cache, std);
	1473	}
	1474
	1475	/*
	1476	* Remove thread resources from kernel lists and deschedule us for
	1477	* the last time.
	1478	*/
	1479	if (td->td_flags & TDF_TSLEEPQ)
	1480	tsleep_remove(td);
	1481	biosched_done(td);
	1482	dsched_exit_thread(td);
	1483	lwkt_deschedule_self(td);
	1484	lwkt_remove_tdallq(td);
	1485	if (td->td_flags & TDF_ALLOCATED_THREAD)
	1486	gd->gd_freetd = td;
	1487	cpu_thread_exit();
	1488	}
	1489
	1490	void
	1491	lwkt_remove_tdallq(thread_t td)
	1492	{
	1493	KKASSERT(td->td_gd == mycpu);
	1494	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1495	}
	1496
	1497	void
	1498	crit_panic(void)
	1499	{
	1500	thread_t td = curthread;
	1501	int lpri = td->td_pri;
	1502
	1503	td->td_pri = 0;
	1504	panic("td_pri is/would-go negative! %p %d", td, lpri);
	1505	}
	1506
	1507	#ifdef SMP
	1508
	1509	/*
	1510	* Called from debugger/panic on cpus which have been stopped. We must still
	1511	* process the IPIQ while stopped, even if we were stopped while in a critical
	1512	* section (XXX).
	1513	*
	1514	* If we are dumping also try to process any pending interrupts. This may
	1515	* or may not work depending on the state of the cpu at the point it was
	1516	* stopped.
	1517	*/
	1518	void
	1519	lwkt_smp_stopped(void)
	1520	{
	1521	globaldata_t gd = mycpu;
	1522
	1523	crit_enter_gd(gd);
	1524	if (dumping) {
	1525	lwkt_process_ipiq();
	1526	splz();
	1527	} else {
	1528	lwkt_process_ipiq();
	1529	}
	1530	crit_exit_gd(gd);
	1531	}
	1532
	1533	#endif