gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/kern/lwkt_thread.c,v 1.99 2006/06/01 05:38:45 dillon Exp $
	35	*/
	36
	37	/*
	38	* Each cpu in a system has its own self-contained light weight kernel
	39	* thread scheduler, which means that generally speaking we only need
	40	* to use a critical section to avoid problems. Foreign thread
	41	* scheduling is queued via (async) IPIs.
	42	*/
	43
	44	#ifdef _KERNEL
	45
	46	#include <sys/param.h>
	47	#include <sys/systm.h>
	48	#include <sys/kernel.h>
	49	#include <sys/proc.h>
	50	#include <sys/rtprio.h>
	51	#include <sys/queue.h>
	52	#include <sys/sysctl.h>
	53	#include <sys/kthread.h>
	54	#include <machine/cpu.h>
	55	#include <sys/lock.h>
	56	#include <sys/caps.h>
	57	#include <sys/spinlock.h>
	58	#include <sys/ktr.h>
	59
	60	#include <sys/thread2.h>
	61	#include <sys/spinlock2.h>
	62
	63	#include <vm/vm.h>
	64	#include <vm/vm_param.h>
	65	#include <vm/vm_kern.h>
	66	#include <vm/vm_object.h>
	67	#include <vm/vm_page.h>
	68	#include <vm/vm_map.h>
	69	#include <vm/vm_pager.h>
	70	#include <vm/vm_extern.h>
	71	#include <vm/vm_zone.h>
	72
	73	#include <machine/stdarg.h>
	74	#include <machine/ipl.h>
	75	#include <machine/smp.h>
	76
	77	#else
	78
	79	#include <sys/stdint.h>
	80	#include <libcaps/thread.h>
	81	#include <sys/thread.h>
	82	#include <sys/msgport.h>
	83	#include <sys/errno.h>
	84	#include <libcaps/globaldata.h>
	85	#include <machine/cpufunc.h>
	86	#include <sys/thread2.h>
	87	#include <sys/msgport2.h>
	88	#include <stdio.h>
	89	#include <stdlib.h>
	90	#include <string.h>
	91	#include <machine/lock.h>
	92	#include <machine/atomic.h>
	93	#include <machine/cpu.h>
	94
	95	#endif
	96
	97	static int untimely_switch = 0;	/* nonzero lets lwkt_yield_quick() force a thread switch */
	98	#ifdef INVARIANTS
	99	static int panic_on_cscount = 0;	/* nonzero: lwkt_switch() panics, rather than only warns, when td_cscount is set */
	100	#endif
	101	static __int64_t switch_count = 0;	/* lwkt_switch() calls that switched to a different thread */
	102	static __int64_t preempt_hit = 0;	/* bumped by lwkt_preempt() when it preempts (and when the target is already preempting) */
	103	static __int64_t preempt_miss = 0;	/* lwkt_preempt() requests that were refused */
	104	static __int64_t preempt_weird = 0;	/* lwkt_preempt() on self, or with TDF_PREEMPT_LOCK already set on either thread */
	105	static __int64_t token_contention_count = 0;	/* lwkt_switch() passed over a thread whose tokens could not be acquired (INVARIANTS only) */
	106	static __int64_t mplock_contention_count = 0;	/* lwkt_switch() passed over a thread because the MP lock was unavailable (INVARIANTS only) */
	107
	108	#ifdef _KERNEL
	109
		/*
		 * Export the tunables and counters declared above under the _lwkt
		 * sysctl node.  All of them are registered CTLFLAG_RW.
		 */
	110	SYSCTL_INT(_lwkt, OID_AUTO, untimely_switch, CTLFLAG_RW, &untimely_switch, 0, "");
	111	#ifdef INVARIANTS
	112	SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, "");
	113	#endif
	114	SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, "");
	115	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, "");
	116	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, "");
	117	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, "");
		/* The contention counters are only exported when INVARIANTS is defined. */
	118	#ifdef INVARIANTS
	119	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
	120	&token_contention_count, 0, "spinning due to token contention");
	121	SYSCTL_QUAD(_lwkt, OID_AUTO, mplock_contention_count, CTLFLAG_RW,
	122	&mplock_contention_count, 0, "spinning due to MPLOCK contention");
	123	#endif
	124	#endif
	125
	126	/*
	127	* Kernel Trace
	128	*/
	129	#ifdef _KERNEL
	130
		/* KTR_GIANT_CONTENTION falls back to KTR_ALL unless it was already defined. */
	131	#if !defined(KTR_GIANT_CONTENTION)
	132	#define KTR_GIANT_CONTENTION KTR_ALL
	133	#endif
	134
		/*
		 * Two trace records, 'beg' and 'end', are declared under the 'giant'
		 * master; each carries one pointer-sized argument printed as
		 * "thread=%p".
		 */
	135	KTR_INFO_MASTER(giant);
	136	KTR_INFO(KTR_GIANT_CONTENTION, giant, beg, 0, "thread=%p", sizeof(void *));
	137	KTR_INFO(KTR_GIANT_CONTENTION, giant, end, 1, "thread=%p", sizeof(void *));
	138
		/* loggiant(beg) / loggiant(end) log the matching record with curthread as the argument. */
	139	#define loggiant(name) KTR_LOG(giant_ ## name, curthread)
	140
	141	#endif
	142
	143	/*
	144	* These helper procedures handle the runq, they can only be called from
	145	* within a critical section.
	146	*
	147	* WARNING! Prior to SMP being brought up it is possible to enqueue and
	148	* dequeue threads belonging to other cpus, so be sure to use td->td_gd
	149	* instead of 'mycpu' when referencing the globaldata structure. Once
	150	* SMP is live, enqueueing and dequeueing only occur on the current cpu.
	151	*/
	152	static __inline
	153	void
	154	_lwkt_dequeue(thread_t td)
	155	{
	156	if (td->td_flags & TDF_RUNQ) {
	157	int nq = td->td_pri & TDPRI_MASK;
	158	struct globaldata *gd = td->td_gd;
	159
	160	td->td_flags &= ~TDF_RUNQ;
	161	TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq);
	162	/* runqmask is passively cleaned up by the switcher */
	163	}
	164	}
	165
	166	static __inline
	167	void
	168	_lwkt_enqueue(thread_t td)
	169	{
	170	if ((td->td_flags & (TDF_RUNQ\|TDF_MIGRATING\|TDF_TSLEEPQ\|TDF_BLOCKQ)) == 0) {
	171	int nq = td->td_pri & TDPRI_MASK;
	172	struct globaldata *gd = td->td_gd;
	173
	174	td->td_flags \|= TDF_RUNQ;
	175	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq);
	176	gd->gd_runqmask \|= 1 << nq;
	177	}
	178	}
	179
	180	/*
	181	* Schedule a thread to run. As the current thread we can always safely
	182	* schedule ourselves, and a shortcut procedure is provided for that
	183	* function.
	184	*
	185	* (non-blocking, self contained on a per cpu basis)
	186	*/
	187	void
	188	lwkt_schedule_self(thread_t td)
	189	{
	190	crit_enter_quick(td);
	191	KASSERT(td->td_wait == NULL, ("lwkt_schedule_self(): td_wait not NULL!"));
	192	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
	193	KKASSERT(td->td_proc == NULL \|\| (td->td_proc->p_flag & P_ONRUNQ) == 0);
	194	_lwkt_enqueue(td);
	195	crit_exit_quick(td);
	196	}
	197
	198	/*
	199	* Deschedule a thread.
	200	*
	201	* (non-blocking, self contained on a per cpu basis)
	202	*/
	203	void
	204	lwkt_deschedule_self(thread_t td)
	205	{
	206	crit_enter_quick(td);
	207	KASSERT(td->td_wait == NULL, ("lwkt_schedule_self(): td_wait not NULL!"));
	208	_lwkt_dequeue(td);
	209	crit_exit_quick(td);
	210	}
	211
	212	#ifdef _KERNEL
	213
	214	/*
	215	* LWKTs operate on a per-cpu basis
	216	*
	217	* WARNING! Called from early boot, 'mycpu' may not work yet.
	218	*/
	219	void
	220	lwkt_gdinit(struct globaldata *gd)
	221	{
	222	int i;
	223
	224	for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i)
	225	TAILQ_INIT(&gd->gd_tdrunq[i]);
	226	gd->gd_runqmask = 0;
	227	TAILQ_INIT(&gd->gd_tdallq);
	228	}
	229
	230	#endif /* _KERNEL */
	231
	232	/*
	233	* Initialize a thread wait structure prior to first use.
	234	*
	235	* NOTE! called from low level boot code, we cannot do anything fancy!
	236	*/
	237	void
	238	lwkt_wait_init(lwkt_wait_t w)
	239	{
	240	spin_init(&w->wa_spinlock);
	241	TAILQ_INIT(&w->wa_waitq);
	242	w->wa_gen = 0;
	243	w->wa_count = 0;
	244	}
	245
	246	/*
	247	* Create a new thread. The thread must be associated with a process context
	248	* or LWKT start address before it can be scheduled. If the target cpu is
	249	* -1 the thread will be created on the current cpu.
	250	*
	251	* If you intend to create a thread without a process context this function
	252	* does everything except load the startup and switcher function.
	253	*/
	254	thread_t
	255	lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
	256	{
	257	void *stack;
	258	globaldata_t gd = mycpu;
	259
	260	if (td == NULL) {
	261	crit_enter_gd(gd);
	262	if (gd->gd_tdfreecount > 0) {
	263	--gd->gd_tdfreecount;
	264	td = TAILQ_FIRST(&gd->gd_tdfreeq);
	265	KASSERT(td != NULL && (td->td_flags & TDF_RUNNING) == 0,
	266	("lwkt_alloc_thread: unexpected NULL or corrupted td"));
	267	TAILQ_REMOVE(&gd->gd_tdfreeq, td, td_threadq);
	268	crit_exit_gd(gd);
	269	flags \|= td->td_flags & (TDF_ALLOCATED_STACK\|TDF_ALLOCATED_THREAD);
	270	} else {
	271	crit_exit_gd(gd);
	272	#ifdef _KERNEL
	273	td = zalloc(thread_zone);
	274	#else
	275	td = malloc(sizeof(struct thread));
	276	#endif
	277	td->td_kstack = NULL;
	278	td->td_kstack_size = 0;
	279	flags \|= TDF_ALLOCATED_THREAD;
	280	}
	281	}
	282	if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
	283	if (flags & TDF_ALLOCATED_STACK) {
	284	#ifdef _KERNEL
	285	kmem_free(kernel_map, (vm_offset_t)stack, td->td_kstack_size);
	286	#else
	287	libcaps_free_stack(stack, td->td_kstack_size);
	288	#endif
	289	stack = NULL;
	290	}
	291	}
	292	if (stack == NULL) {
	293	#ifdef _KERNEL
	294	stack = (void *)kmem_alloc(kernel_map, stksize);
	295	#else
	296	stack = libcaps_alloc_stack(stksize);
	297	#endif
	298	flags \|= TDF_ALLOCATED_STACK;
	299	}
	300	if (cpu < 0)
	301	lwkt_init_thread(td, stack, stksize, flags, mycpu);
	302	else
	303	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
	304	return(td);
	305	}
	306
	307	#ifdef _KERNEL
	308
	309	/*
	310	* Initialize a preexisting thread structure. This function is used by
	311	* lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
	312	*
	313	* All threads start out in a critical section at a priority of
	314	* TDPRI_KERN_DAEMON. Higher level code will modify the priority as
	315	* appropriate. This function may send an IPI message when the
	316	* requested cpu is not the current cpu and consequently gd_tdallq may
	317	* not be initialized synchronously from the point of view of the originating
	318	* cpu.
	319	*
	320	* NOTE! we have to be careful in regards to creating threads for other cpus
	321	* if SMP has not yet been activated.
	322	*/
	323	#ifdef SMP
	324
	325	static void
	326	lwkt_init_thread_remote(void *arg)
	327	{
	328	thread_t td = arg;
	329
	330	/*
	331	* Protected by critical section held by IPI dispatch
	332	*/
	333	TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
	334	}
	335
	336	#endif
	337
	338	void
	339	lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
	340	struct globaldata *gd)
	341	{
	342	globaldata_t mygd = mycpu;
	343
	344	bzero(td, sizeof(struct thread));
	345	td->td_kstack = stack;
	346	td->td_kstack_size = stksize;
	347	td->td_flags = flags;
	348	td->td_gd = gd;
	349	td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT;
	350	#ifdef SMP
	351	if ((flags & TDF_MPSAFE) == 0)
	352	td->td_mpcount = 1;
	353	#endif
	354	lwkt_initport(&td->td_msgport, td);
	355	pmap_init_thread(td);
	356	#ifdef SMP
	357	/*
	358	* Normally initializing a thread for a remote cpu requires sending an
	359	* IPI. However, the idlethread is setup before the other cpus are
	360	* activated so we have to treat it as a special case. XXX manipulation
	361	* of gd_tdallq requires the BGL.
	362	*/
	363	if (gd == mygd \|\| td == &gd->gd_idlethread) {
	364	crit_enter_gd(mygd);
	365	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	366	crit_exit_gd(mygd);
	367	} else {
	368	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
	369	}
	370	#else
	371	crit_enter_gd(mygd);
	372	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	373	crit_exit_gd(mygd);
	374	#endif
	375	}
	376
	377	#endif /* _KERNEL */
	378
	379	void
	380	lwkt_set_comm(thread_t td, const char *ctl, ...)
	381	{
	382	__va_list va;
	383
	384	__va_start(va, ctl);
	385	vsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
	386	__va_end(va);
	387	}
	388
	389	void
	390	lwkt_hold(thread_t td)
	391	{
	392	++td->td_refs;
	393	}
	394
	395	void
	396	lwkt_rele(thread_t td)
	397	{
	398	KKASSERT(td->td_refs > 0);
	399	--td->td_refs;
	400	}
	401
	402	#ifdef _KERNEL
	403
	404	void
	405	lwkt_wait_free(thread_t td)
	406	{
	407	while (td->td_refs)
	408	tsleep(td, 0, "tdreap", hz);
	409	}
	410
	411	#endif
	412
	413	void
	414	lwkt_free_thread(thread_t td)
	415	{
	416	struct globaldata *gd = mycpu;
	417
	418	KASSERT((td->td_flags & TDF_RUNNING) == 0,
	419	("lwkt_free_thread: did not exit! %p", td));
	420
	421	crit_enter_gd(gd);
	422	if (gd->gd_tdfreecount < CACHE_NTHREADS &&
	423	(td->td_flags & TDF_ALLOCATED_THREAD)
	424	) {
	425	++gd->gd_tdfreecount;
	426	TAILQ_INSERT_HEAD(&gd->gd_tdfreeq, td, td_threadq);
	427	crit_exit_gd(gd);
	428	} else {
	429	crit_exit_gd(gd);
	430	if (td->td_kstack && (td->td_flags & TDF_ALLOCATED_STACK)) {
	431	#ifdef _KERNEL
	432	kmem_free(kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	433	#else
	434	libcaps_free_stack(td->td_kstack, td->td_kstack_size);
	435	#endif
	436	/* gd invalid */
	437	td->td_kstack = NULL;
	438	td->td_kstack_size = 0;
	439	}
	440	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	441	#ifdef _KERNEL
	442	zfree(thread_zone, td);
	443	#else
	444	free(td);
	445	#endif
	446	}
	447	}
	448	}
	449
	450
	451	/*
	452	* Switch to the next runnable lwkt. If no LWKTs are runnable then
	453	* switch to the idlethread. Switching must occur within a critical
	454	* section to avoid races with the scheduling queue.
	455	*
	456	* We always have full control over our cpu's run queue. Other cpus
	457	* that wish to manipulate our queue must use the cpu_*msg() calls to
	458	* talk to our cpu, so a critical section is all that is needed and
	459	* the result is very, very fast thread switching.
	460	*
	461	* The LWKT scheduler uses a fixed priority model and round-robins at
	462	* each priority level. User process scheduling is a totally
	463	* different beast and LWKT priorities should not be confused with
	464	* user process priorities.
	465	*
	466	* The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch()
	467	* cleans it up. Note that the td_switch() function cannot do anything that
	468	* requires the MP lock since the MP lock will have already been setup for
	469	* the target thread (not the current thread). It's nice to have a scheduler
	470	* that does not need the MP lock to work because it allows us to do some
	471	* really cool high-performance MP lock optimizations.
	472	*
	473	* PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
	474	* is not called by the current thread in the preemption case, only when
	475	* the preempting thread blocks (in order to return to the original thread).
	476	*/
	477	void
	478	lwkt_switch(void)
	479	{
	480	globaldata_t gd = mycpu;
	481	thread_t td = gd->gd_curthread;
	482	thread_t ntd;
	483	#ifdef SMP
	484	int mpheld;
	485	#endif
	486
	487	/*
	488	* Switching from within a 'fast' (non thread switched) interrupt or IPI
	489	* is illegal. However, we may have to do it anyway if we hit a fatal
	490	* kernel trap or we have panicked.
	491	*
	492	* If this case occurs save and restore the interrupt nesting level.
	493	*/
	494	if (gd->gd_intr_nesting_level) {
	495	int savegdnest;
	496	int savegdtrap;
	497
	498	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) {
	499	panic("lwkt_switch: cannot switch from within "
	500	"a fast interrupt, yet, td %p\n", td);
	501	} else {
	502	savegdnest = gd->gd_intr_nesting_level;
	503	savegdtrap = gd->gd_trap_nesting_level;
	504	gd->gd_intr_nesting_level = 0;
	505	gd->gd_trap_nesting_level = 0;
		/* Warn only once per thread; TDF_PANICWARN records that we did. */
	506	if ((td->td_flags & TDF_PANICWARN) == 0) {
	507	td->td_flags \|= TDF_PANICWARN;
	508	printf("Warning: thread switch from interrupt or IPI, "
	509	"thread %p (%s)\n", td, td->td_comm);
	510	#ifdef DDB
	511	db_print_backtrace();
	512	#endif
	513	}
		/*
		 * Recurse with both nesting levels zeroed so this block is not
		 * entered again, then put the saved levels back.
		 */
	514	lwkt_switch();
	515	gd->gd_intr_nesting_level = savegdnest;
	516	gd->gd_trap_nesting_level = savegdtrap;
	517	return;
	518	}
	519	}
	520
	521	/*
	522	* Passive release (used to transition from user to kernel mode
	523	* when we block or switch rather than when we enter the kernel).
	524	* This function is NOT called if we are switching into a preemption
	525	* or returning from a preemption. Typically this causes us to lose
	526	* our current process designation (if we have one) and become a true
	527	* LWKT thread, and may also hand the current process designation to
	528	* another process and schedule thread.
	529	*/
	530	if (td->td_release)
	531	td->td_release(td);
	532
		/* This critical section is held across the switch; it is exited at the very end. */
	533	crit_enter_gd(gd);
	534	#ifdef SMP
	535	if (td->td_toks)
	536	lwkt_relalltokens(td);
	537	#endif
	538
	539	/*
	540	* We had better not be holding any spin locks, but don't get into an
	541	* endless panic loop.
	542	*/
	543	KASSERT(gd->gd_spinlocks_rd == 0 \|\| panicstr != NULL,
	544	("lwkt_switch: still holding %d shared spinlocks!",
	545	gd->gd_spinlocks_rd));
	546	KASSERT(gd->gd_spinlocks_wr == 0 \|\| panicstr != NULL,
	547	("lwkt_switch: still holding %d exclusive spinlocks!",
	548	gd->gd_spinlocks_wr));
	549
	550
	551	#ifdef SMP
	552	/*
	553	* td_mpcount cannot be used to determine if we currently hold the
	554	* MP lock because get_mplock() will increment it prior to attempting
	555	* to get the lock, and switch out if it can't. Our ownership of
	556	* the actual lock will remain stable while we are in a critical section
	557	* (but, of course, another cpu may own or release the lock so the
	558	* actual value of mp_lock is not stable).
	559	*/
	560	mpheld = MP_LOCK_HELD();
	561	#ifdef INVARIANTS
	562	if (td->td_cscount) {
	563	printf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
	564	td);
	565	if (panic_on_cscount)
	566	panic("switching while mastering cpusync");
	567	}
	568	#endif
	569	#endif
	570	if ((ntd = td->td_preempted) != NULL) {
	571	/*
	572	* We had preempted another thread on this cpu, resume the preempted
	573	* thread. This occurs transparently, whether the preempted thread
	574	* was scheduled or not (it may have been preempted after descheduling
	575	* itself).
	576	*
	577	* We have to setup the MP lock for the original thread after backing
	578	* out the adjustment that was made to curthread when the original
	579	* was preempted.
	580	*/
	581	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
	582	#ifdef SMP
	583	if (ntd->td_mpcount && mpheld == 0) {
	584	panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d",
	585	td, ntd, td->td_mpcount, ntd->td_mpcount);
	586	}
	587	if (ntd->td_mpcount) {
	588	td->td_mpcount -= ntd->td_mpcount;
	589	KKASSERT(td->td_mpcount >= 0);
	590	}
	591	#endif
		/* lwkt_preempt() asserts this flag once the preempted thread resumes. */
	592	ntd->td_flags \|= TDF_PREEMPT_DONE;
	593
	594	/*
	595	* XXX. The interrupt may have woken a thread up, we need to properly
	596	* set the reschedule flag if the originally interrupted thread is at
	597	* a lower priority.
	598	*/
	599	if (gd->gd_runqmask > (2 << (ntd->td_pri & TDPRI_MASK)) - 1)
	600	need_lwkt_resched();
	601	/* YYY release mp lock on switchback if original doesn't need it */
	602	} else {
	603	/*
	604	* Priority queue / round-robin at each priority. Note that user
	605	* processes run at a fixed, low priority and the user process
	606	* scheduler deals with interactions between user processes
	607	* by scheduling and descheduling them from the LWKT queue as
	608	* necessary.
	609	*
	610	* We have to adjust the MP lock for the target thread. If we
	611	* need the MP lock and cannot obtain it we try to locate a
	612	* thread that does not need the MP lock. If we cannot, we spin
	613	* instead of HLT.
	614	*
	615	* A similar issue exists for the tokens held by the target thread.
	616	* If we cannot obtain ownership of the tokens we cannot immediately
	617	* schedule the thread.
	618	*/
	619
	620	/*
	621	* If an LWKT reschedule was requested, well that is what we are
	622	* doing now so clear it.
	623	*/
	624	clear_lwkt_resched();
	625	again:
	626	if (gd->gd_runqmask) {
		/*
		 * bsrl() presumably yields the index of the highest set bit, i.e.
		 * the highest priority queue flagged in the mask (TODO confirm).
		 * A flagged queue that turns out to be empty has its stale bit
		 * cleared and the scan is retried.
		 */
	627	int nq = bsrl(gd->gd_runqmask);
	628	if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) {
	629	gd->gd_runqmask &= ~(1 << nq);
	630	goto again;
	631	}
	632	#ifdef SMP
	633	/*
	634	* THREAD SELECTION FOR AN SMP MACHINE BUILD
	635	*
	636	* If the target needs the MP lock and we couldn't get it,
	637	* or if the target is holding tokens and we could not
	638	* gain ownership of the tokens, continue looking for a
	639	* thread to schedule and spin instead of HLT if we can't.
	640	*
	641	* NOTE: the mpheld variable is invalid after this conditional, it
	642	* can change due to both cpu_try_mplock() returning success
	643	* AND interactions in lwkt_getalltokens() due to the fact that
	644	* we are trying to check the mpcount of a thread other than
	645	* the current thread. Because of this, if the current thread
	646	* is not holding td_mpcount, an IPI indirectly run via
	647	* lwkt_getalltokens() can obtain and release the MP lock and
	648	* cause the core MP lock to be released.
	649	*/
	650	if ((ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) \|\|
	651	(ntd->td_toks && lwkt_getalltokens(ntd) == 0)
	652	) {
	653	u_int32_t rqmask = gd->gd_runqmask;
	654
	655	mpheld = MP_LOCK_HELD();
	656	ntd = NULL;
		/*
		 * Walk the flagged queues starting at nq, using a local copy of
		 * the mask, until a thread is found whose MP lock and token
		 * requirements can be met.
		 */
	657	while (rqmask) {
	658	TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) {
	659	if (ntd->td_mpcount && !mpheld && !cpu_try_mplock()) {
	660	/* spinning due to MP lock being held */
	661	#ifdef INVARIANTS
	662	++mplock_contention_count;
	663	#endif
	664	/* mplock still not held, 'mpheld' still valid */
	665	continue;
	666	}
	667
	668	/*
	669	* mpheld state invalid after getalltokens call returns
	670	* failure, but the variable is only needed for
	671	* the loop.
	672	*/
	673	if (ntd->td_toks && !lwkt_getalltokens(ntd)) {
	674	/* spinning due to token contention */
	675	#ifdef INVARIANTS
	676	++token_contention_count;
	677	#endif
	678	mpheld = MP_LOCK_HELD();
	679	continue;
	680	}
	681	break;
	682	}
	683	if (ntd)
	684	break;
	685	rqmask &= ~(1 << nq);
	686	nq = bsrl(rqmask);
	687	}
		/* Nothing runnable could be satisfied: run the idle thread but forbid it to halt. */
	688	if (ntd == NULL) {
	689	ntd = &gd->gd_idlethread;
	690	ntd->td_flags \|= TDF_IDLE_NOHLT;
	691	goto using_idle_thread;
	692	} else {
		/* Rotate the chosen thread to the tail of its queue (round-robin). */
	693	++gd->gd_cnt.v_swtch;
	694	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	695	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	696	}
	697	} else {
	698	++gd->gd_cnt.v_swtch;
	699	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	700	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	701	}
	702	#else
	703	/*
	704	* THREAD SELECTION FOR A UP MACHINE BUILD. We don't have to
	705	* worry about tokens or the BGL.
	706	*/
	707	++gd->gd_cnt.v_swtch;
	708	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	709	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	710	#endif
	711	} else {
	712	/*
	713	* We have nothing to run but only let the idle loop halt
	714	* the cpu if there are no pending interrupts.
	715	*/
	716	ntd = &gd->gd_idlethread;
	717	if (gd->gd_reqflags & RQF_IDLECHECK_MASK)
	718	ntd->td_flags \|= TDF_IDLE_NOHLT;
	719	#ifdef SMP
	720	using_idle_thread:
	721	/*
	722	* The idle thread should not be holding the MP lock unless we
	723	* are trapping in the kernel or in a panic. Since we select the
	724	* idle thread unconditionally when no other thread is available,
	725	* if the MP lock is desired during a panic or kernel trap, we
	726	* have to loop in the scheduler until we get it.
	727	*/
	728	if (ntd->td_mpcount) {
	729	mpheld = MP_LOCK_HELD();
	730	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	731	panic("Idle thread %p was holding the BGL!", ntd);
	732	else if (mpheld == 0)
	733	goto again;
	734	}
	735	#endif
	736	}
	737	}
	738	KASSERT(ntd->td_pri >= TDPRI_CRIT,
	739	("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
	740
	741	/*
	742	* Do the actual switch. If the new target does not need the MP lock
	743	* and we are holding it, release the MP lock. If the new target requires
	744	* the MP lock we have already acquired it for the target.
	745	*/
	746	#ifdef SMP
	747	if (ntd->td_mpcount == 0 ) {
	748	if (MP_LOCK_HELD())
	749	cpu_rel_mplock();
	750	} else {
	751	ASSERT_MP_LOCK_HELD(ntd);
	752	}
	753	#endif
		/* Selecting ourselves is not a switch: it is neither counted nor performed. */
	754	if (td != ntd) {
	755	++switch_count;
	756	td->td_switch(ntd);
	757	}
	758	/* NOTE: current cpu may have changed after switch */
	759	crit_exit_quick(td);
	760	}
	761
	762	/*
	763	* Request that the target thread preempt the current thread. Preemption
	764	* only works under a specific set of conditions:
	765	*
	766	* - We are not preempting ourselves
	767	* - The target thread is owned by the current cpu
	768	* - We are not currently being preempted
	769	* - The target is not currently being preempted
	770	* - We are able to satisfy the target's MP lock requirements (if any).
	771	*
	772	* THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
	773	* this is called via lwkt_schedule() through the td_preemptable callback.
	774	* critpri is the managed critical priority that we should ignore in order
	775	* to determine whether preemption is possible (aka usually just the crit
	776	* priority of lwkt_schedule() itself).
	777	*
	778	* XXX at the moment we run the target thread in a critical section during
	779	* the preemption in order to prevent the target from taking interrupts
	780	* that WE can't. Preemption is strictly limited to interrupt threads
	781	* and interrupt-like threads, outside of a critical section, and the
	782	* preempted source thread will be resumed the instant the target blocks
	783	* whether or not the source is scheduled (i.e. preemption is supposed to
	784	* be as transparent as possible).
	785	*
	786	* The target thread inherits our MP count (added to its own) for the
	787	* duration of the preemption in order to preserve the atomicity of the
	788	* MP lock during the preemption. Therefore, any preempting targets must be
	789	* careful in regards to MP assertions. Note that the MP count may be
	790	* out of sync with the physical mp_lock, but we do not have to preserve
	791	* the original ownership of the lock if it was out of synch (that is, we
	792	* can leave it synchronized on return).
	793	*/
	794	void
	795	lwkt_preempt(thread_t ntd, int critpri)
	796	{
	797	struct globaldata *gd = mycpu;
	798	thread_t td;
	799	#ifdef SMP
	800	int mpheld;
	801	int savecnt;
	802	#endif
	803
	804	/*
	805	* The caller has put us in a critical section. We can only preempt
	806	* if the caller of the caller was not in a critical section (basically
	807	* a local interrupt), as determined by the 'critpri' parameter. We
	808	* also can't preempt if the caller is holding any spinlocks (even if
	809	* he isn't in a critical section). This also handles the tokens test.
	810	*
	811	* YYY The target thread must be in a critical section (else it must
	812	* inherit our critical section? I dunno yet).
	813	*
	814	* Set need_lwkt_resched() unconditionally for now YYY.
	815	*/
	816	KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri));
	817
	818	td = gd->gd_curthread;
		/* Only a target of strictly higher priority may preempt; no reschedule is requested otherwise. */
	819	if ((ntd->td_pri & TDPRI_MASK) <= (td->td_pri & TDPRI_MASK)) {
	820	++preempt_miss;
	821	return;
	822	}
		/* The current thread's critical-section part of td_pri exceeds what critpri accounts for. */
	823	if ((td->td_pri & ~TDPRI_MASK) > critpri) {
	824	++preempt_miss;
	825	need_lwkt_resched();
	826	return;
	827	}
	828	#ifdef SMP
		/* The target belongs to another cpu. */
	829	if (ntd->td_gd != gd) {
	830	++preempt_miss;
	831	need_lwkt_resched();
	832	return;
	833	}
	834	#endif
	835	/*
	836	* Take the easy way out and do not preempt if the target is holding
	837	* any spinlocks. We could test whether the thread(s) being
	838	* preempted interlock against the target thread's tokens and whether
	839	* we can get all the target thread's tokens, but this situation
	840	* should not occur very often so it's easier to simply not preempt.
	841	* Also, plain spinlocks are impossible to figure out at this point so
	842	* just don't preempt.
	843	*/
	844	if (gd->gd_spinlocks_rd + gd->gd_spinlocks_wr != 0) {
	845	++preempt_miss;
	846	need_lwkt_resched();
	847	return;
	848	}
	849	if (td == ntd \|\| ((td->td_flags \| ntd->td_flags) & TDF_PREEMPT_LOCK)) {
	850	++preempt_weird;
	851	need_lwkt_resched();
	852	return;
	853	}
		/*
		 * NOTE(review): this path returns without preempting yet bumps
		 * preempt_hit rather than preempt_miss -- confirm that is intended.
		 */
	854	if (ntd->td_preempted) {
	855	++preempt_hit;
	856	need_lwkt_resched();
	857	return;
	858	}
	859	#ifdef SMP
	860	/*
	861	* note: an interrupt might have occurred just as we were transitioning
	862	* to or from the MP lock. In this case td_mpcount will be pre-disposed
	863	* (non-zero) but not actually synchronized with the actual state of the
	864	* lock. We can use it to imply an MP lock requirement for the
	865	* preemption but we cannot use it to test whether we hold the MP lock
	866	* or not.
	867	*/
	868	savecnt = td->td_mpcount;
	869	mpheld = MP_LOCK_HELD();
		/* Lend our MP count to the target; it is backed out again if the lock cannot be obtained. */
	870	ntd->td_mpcount += td->td_mpcount;
	871	if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) {
	872	ntd->td_mpcount -= td->td_mpcount;
	873	++preempt_miss;
	874	need_lwkt_resched();
	875	return;
	876	}
	877	#endif
	878
	879	/*
	880	* Since we are able to preempt the current thread, there is no need to
	881	* call need_lwkt_resched().
	882	*/
	883	++preempt_hit;
		/*
		 * Link the target back to us, mark ourselves as preempted and
		 * switch.  Execution continues below once we are resumed;
		 * lwkt_switch() sets TDF_PREEMPT_DONE on the thread it resumes.
		 */
	884	ntd->td_preempted = td;
	885	td->td_flags \|= TDF_PREEMPT_LOCK;
	886	td->td_switch(ntd);
	887	KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
	888	#ifdef SMP
		/* Our own MP count must be unchanged; resynchronize the physical lock with it. */
	889	KKASSERT(savecnt == td->td_mpcount);
	890	mpheld = MP_LOCK_HELD();
	891	if (mpheld && td->td_mpcount == 0)
	892	cpu_rel_mplock();
	893	else if (mpheld == 0 && td->td_mpcount)
	894	panic("lwkt_preempt(): MP lock was not held through");
	895	#endif
	896	ntd->td_preempted = NULL;
	897	td->td_flags &= ~(TDF_PREEMPT_LOCK\|TDF_PREEMPT_DONE);
	898	}
	899
	900	/*
	901	* Yield our thread while higher priority threads are pending. This is
	902	* typically called when we leave a critical section but it can be safely
	903	* called while we are in a critical section.
	904	*
	905	* This function will not generally yield to equal priority threads but it
	906	* can occur as a side effect. Note that lwkt_switch() is called from
	907	* inside the critical section to prevent its own crit_exit() from reentering
	908	* lwkt_yield_quick().
	909	*
	910	* gd_reqflags indicates that something changed, e.g. an interrupt or softint
	911	* came along but was blocked and made pending.
	912	*
	913	* (self contained on a per cpu basis)
	914	*/
	915	void
	916	lwkt_yield_quick(void)
	917	{
	918	globaldata_t gd = mycpu;
	919	thread_t td = gd->gd_curthread;
	920
	921	/*
	922	* gd_reqflags is cleared in splz if the cpl is 0. If we were to clear
	923	* it with a non-zero cpl then we might not wind up calling splz after
	924	* a task switch when the critical section is exited even though the
	925	* new task could accept the interrupt.
	926	*
	927	* XXX from crit_exit() only called after last crit section is released.
	928	* If called directly will run splz() even if in a critical section.
	929	*
	930	* td_nest_count prevent deep nesting via splz() or doreti(). Note that
	931	* except for this special case, we MUST call splz() here to handle any
	932	* pending ints, particularly after we switch, or we might accidently
	933	* halt the cpu with interrupts pending.
	934	*/
	935	if (gd->gd_reqflags && td->td_nest_count < 2)
	936	splz();
	937
	938	/*
	939	* YYY enabling will cause wakeup() to task-switch, which really
	940	* confused the old 4.x code. This is a good way to simulate
	941	* preemption and MP without actually doing preemption or MP, because a
	942	* lot of code assumes that wakeup() does not block.
	943	*/
	944	if (untimely_switch && td->td_nest_count == 0 &&
	945	gd->gd_intr_nesting_level == 0
	946	) {
	947	crit_enter_quick(td);
	948	/*
	949	* YYY temporary hacks until we disassociate the userland scheduler
	950	* from the LWKT scheduler.
	951	*/
	952	if (td->td_flags & TDF_RUNQ) {
	953	lwkt_switch(); /* will not reenter yield function */
	954	} else {
	955	lwkt_schedule_self(td); /* make sure we are scheduled */
	956	lwkt_switch(); /* will not reenter yield function */
	957	lwkt_deschedule_self(td); /* make sure we are descheduled */
	958	}
	959	crit_exit_noyield(td);
	960	}
	961	}
	962
	963	/*
	964	* This implements a normal yield which, unlike _quick, will yield to equal
	965	* priority threads as well. Note that gd_reqflags tests will be handled by
	966	* the crit_exit() call in lwkt_switch().
	967	*
	968	* (self contained on a per cpu basis)
	969	*/
	970	void
	971	lwkt_yield(void)
	972	{
	973	lwkt_schedule_self(curthread);
	974	lwkt_switch();
	975	}
	976
	977	/*
	978	* Generic schedule. Possibly schedule threads belonging to other cpus and
	979	* deal with threads that might be blocked on a wait queue.
	980	*
	981	* We have a little helper inline function which does additional work after
	982	* the thread has been enqueued, including dealing with preemption and
	983	* setting need_lwkt_resched() (which prevents the kernel from returning
	984	* to userland until it has processed higher priority threads).
	985	*
	986	* It is possible for this routine to be called after a failed _enqueue
	987	* (due to the target thread migrating, sleeping, or otherwise blocked).
	988	* We have to check that the thread is actually on the run queue!
	989	*/
	990	static __inline
	991	void
	992	_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri)
	993	{
	994	if (ntd->td_flags & TDF_RUNQ) {
	995	if (ntd->td_preemptable) {
	996	ntd->td_preemptable(ntd, cpri); /* YYY +token */
	997	} else if ((ntd->td_flags & TDF_NORESCHED) == 0 &&
	998	(ntd->td_pri & TDPRI_MASK) > (gd->gd_curthread->td_pri & TDPRI_MASK)
	999	) {
	1000	need_lwkt_resched();
	1001	}
	1002	}
	1003	}
	1004
	1005	void
	1006	lwkt_schedule(thread_t td)
	1007	{
	1008	globaldata_t mygd = mycpu;
	1009
	1010	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
	1011	crit_enter_gd(mygd);
	1012	KKASSERT(td->td_proc == NULL \|\| (td->td_proc->p_flag & P_ONRUNQ) == 0);
	1013	if (td == mygd->gd_curthread) {
	1014	_lwkt_enqueue(td);
	1015	} else {
	1016	lwkt_wait_t w;
	1017
	1018	/*
	1019	* If the thread is on a wait list we have to send our scheduling
	1020	* request to the owner of the wait structure. Otherwise we send
	1021	* the scheduling request to the cpu owning the thread. Races
	1022	* are ok, the target will forward the message as necessary (the
	1023	* message may chase the thread around before it finally gets
	1024	* acted upon).
	1025	*
	1026	* (remember, wait structures use stable storage)
	1027	*
	1028	* NOTE: we have to account for the number of critical sections
	1029	* under our control when calling _lwkt_schedule_post() so it
	1030	* can figure out whether preemption is allowed.
	1031	*
	1032	* NOTE: The wait structure algorithms are a mess and need to be
	1033	* rewritten.
	1034	*
	1035	* NOTE: We cannot safely acquire or release a token, even
	1036	* non-blocking, because this routine may be called in the context
	1037	* of a thread already holding the token and thus not provide any
	1038	* interlock protection. We cannot safely manipulate the td_toks
	1039	* list for the same reason. Instead we depend on our critical
	1040	* section if the token is owned by our cpu.
	1041	*/
	1042	if ((w = td->td_wait) != NULL) {
	1043	spin_lock_wr(&w->wa_spinlock);
	1044	TAILQ_REMOVE(&w->wa_waitq, td, td_threadq);
	1045	--w->wa_count;
	1046	td->td_wait = NULL;
	1047	spin_unlock_wr(&w->wa_spinlock);
	1048	#ifdef SMP
	1049	if (td->td_gd == mygd) {
	1050	_lwkt_enqueue(td);
	1051	_lwkt_schedule_post(mygd, td, TDPRI_CRIT);
	1052	} else {
	1053	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_schedule, td);
	1054	}
	1055	#else
	1056	_lwkt_enqueue(td);
	1057	_lwkt_schedule_post(mygd, td, TDPRI_CRIT);
	1058	#endif
	1059	} else {
	1060	/*
	1061	* If the wait structure is NULL and we own the thread, there
	1062	* is no race (since we are in a critical section). If we
	1063	* do not own the thread there might be a race but the
	1064	* target cpu will deal with it.
	1065	*/
	1066	#ifdef SMP
	1067	if (td->td_gd == mygd) {
	1068	_lwkt_enqueue(td);
	1069	_lwkt_schedule_post(mygd, td, TDPRI_CRIT);
	1070	} else {
	1071	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_schedule, td);
	1072	}
	1073	#else
	1074	_lwkt_enqueue(td);
	1075	_lwkt_schedule_post(mygd, td, TDPRI_CRIT);
	1076	#endif
	1077	}
	1078	}
	1079	crit_exit_gd(mygd);
	1080	}
	1081
	1082	#ifdef SMP
	1083
	1084	/*
	1085	* Thread migration using a 'Pull' method. The thread may or may not be
	1086	* the current thread. It MUST be descheduled and in a stable state.
	1087	* lwkt_giveaway() must be called on the cpu owning the thread.
	1088	*
	1089	* At any point after lwkt_giveaway() is called, the target cpu may
	1090	* 'pull' the thread by calling lwkt_acquire().
	1091	*
	1092	* MPSAFE - must be called under very specific conditions.
	1093	*/
	1094	void
	1095	lwkt_giveaway(thread_t td)
	1096	{
	1097	globaldata_t gd = mycpu;
	1098
	1099	crit_enter_gd(gd);
	1100	KKASSERT(td->td_gd == gd);
	1101	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	1102	td->td_flags \|= TDF_MIGRATING;
	1103	crit_exit_gd(gd);
	1104	}
	1105
	1106	void
	1107	lwkt_acquire(thread_t td)
	1108	{
	1109	globaldata_t gd;
	1110	globaldata_t mygd;
	1111
	1112	KKASSERT(td->td_flags & TDF_MIGRATING);
	1113	gd = td->td_gd;
	1114	mygd = mycpu;
	1115	if (gd != mycpu) {
	1116	cpu_lfence();
	1117	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1118	crit_enter_gd(mygd);
	1119	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK))
	1120	cpu_lfence();
	1121	td->td_gd = mygd;
	1122	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1123	td->td_flags &= ~TDF_MIGRATING;
	1124	crit_exit_gd(mygd);
	1125	} else {
	1126	crit_enter_gd(mygd);
	1127	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1128	td->td_flags &= ~TDF_MIGRATING;
	1129	crit_exit_gd(mygd);
	1130	}
	1131	}
	1132
	1133	#endif
	1134
	1135	/*
	1136	* Generic deschedule. Descheduling threads other then your own should be
	1137	* done only in carefully controlled circumstances. Descheduling is
	1138	* asynchronous.
	1139	*
	1140	* This function may block if the cpu has run out of messages.
	1141	*/
	1142	void
	1143	lwkt_deschedule(thread_t td)
	1144	{
	1145	crit_enter();
	1146	#ifdef SMP
	1147	if (td == curthread) {
	1148	_lwkt_dequeue(td);
	1149	} else {
	1150	if (td->td_gd == mycpu) {
	1151	_lwkt_dequeue(td);
	1152	} else {
	1153	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
	1154	}
	1155	}
	1156	#else
	1157	_lwkt_dequeue(td);
	1158	#endif
	1159	crit_exit();
	1160	}
	1161
	1162	/*
	1163	* Set the target thread's priority. This routine does not automatically
	1164	* switch to a higher priority thread, LWKT threads are not designed for
	1165	* continuous priority changes. Yield if you want to switch.
	1166	*
	1167	* We have to retain the critical section count which uses the high bits
	1168	* of the td_pri field. The specified priority may also indicate zero or
	1169	* more critical sections by adding TDPRI_CRIT*N.
	1170	*
	1171	* Note that we requeue the thread whether it winds up on a different runq
	1172	* or not. uio_yield() depends on this and the routine is not normally
	1173	* called with the same priority otherwise.
	1174	*/
	1175	void
	1176	lwkt_setpri(thread_t td, int pri)
	1177	{
	1178	KKASSERT(pri >= 0);
	1179	KKASSERT(td->td_gd == mycpu);
	1180	crit_enter();
	1181	if (td->td_flags & TDF_RUNQ) {
	1182	_lwkt_dequeue(td);
	1183	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1184	_lwkt_enqueue(td);
	1185	} else {
	1186	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1187	}
	1188	crit_exit();
	1189	}
	1190
	1191	void
	1192	lwkt_setpri_self(int pri)
	1193	{
	1194	thread_t td = curthread;
	1195
	1196	KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
	1197	crit_enter();
	1198	if (td->td_flags & TDF_RUNQ) {
	1199	_lwkt_dequeue(td);
	1200	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1201	_lwkt_enqueue(td);
	1202	} else {
	1203	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1204	}
	1205	crit_exit();
	1206	}
	1207
	1208	/*
	1209	* Determine if there is a runnable thread at a higher priority then
	1210	* the current thread. lwkt_setpri() does not check this automatically.
	1211	* Return 1 if there is, 0 if there isn't.
	1212	*
	1213	* Example: if bit 31 of runqmask is set and the current thread is priority
	1214	* 30, then we wind up checking the mask: 0x80000000 against 0x7fffffff.
	1215	*
	1216	* If nq reaches 31 the shift operation will overflow to 0 and we will wind
	1217	* up comparing against 0xffffffff, a comparison that will always be false.
	1218	*/
	1219	int
	1220	lwkt_checkpri_self(void)
	1221	{
	1222	globaldata_t gd = mycpu;
	1223	thread_t td = gd->gd_curthread;
	1224	int nq = td->td_pri & TDPRI_MASK;
	1225
	1226	while (gd->gd_runqmask > (__uint32_t)(2 << nq) - 1) {
	1227	if (TAILQ_FIRST(&gd->gd_tdrunq[nq + 1]))
	1228	return(1);
	1229	++nq;
	1230	}
	1231	return(0);
	1232	}
	1233
	1234	/*
	1235	* Migrate the current thread to the specified cpu.
	1236	*
	1237	* This is accomplished by descheduling ourselves from the current cpu,
	1238	* moving our thread to the tdallq of the target cpu, IPI messaging the
	1239	* target cpu, and switching out. TDF_MIGRATING prevents scheduling
	1240	* races while the thread is being migrated.
	1241	*/
	1242	#ifdef SMP
	1243	static void lwkt_setcpu_remote(void *arg);
	1244	#endif
	1245
	1246	void
	1247	lwkt_setcpu_self(globaldata_t rgd)
	1248	{
	1249	#ifdef SMP
	1250	thread_t td = curthread;
	1251
	1252	if (td->td_gd != rgd) {
	1253	crit_enter_quick(td);
	1254	td->td_flags \|= TDF_MIGRATING;
	1255	lwkt_deschedule_self(td);
	1256	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1257	lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td);
	1258	lwkt_switch();
	1259	/* we are now on the target cpu */
	1260	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
	1261	crit_exit_quick(td);
	1262	}
	1263	#endif
	1264	}
	1265
	1266	void
	1267	lwkt_migratecpu(int cpuid)
	1268	{
	1269	#ifdef SMP
	1270	globaldata_t rgd;
	1271
	1272	rgd = globaldata_find(cpuid);
	1273	lwkt_setcpu_self(rgd);
	1274	#endif
	1275	}
	1276
	1277	/*
	1278	* Remote IPI for cpu migration (called while in a critical section so we
	1279	* do not have to enter another one). The thread has already been moved to
	1280	* our cpu's allq, but we must wait for the thread to be completely switched
	1281	* out on the originating cpu before we schedule it on ours or the stack
	1282	* state may be corrupt. We clear TDF_MIGRATING after flushing the GD
	1283	* change to main memory.
	1284	*
	1285	* XXX The use of TDF_MIGRATING might not be sufficient to avoid races
	1286	* against wakeups. It is best if this interface is used only when there
	1287	* are no pending events that might try to schedule the thread.
	1288	*/
	1289	#ifdef SMP
	1290	static void
	1291	lwkt_setcpu_remote(void *arg)
	1292	{
	1293	thread_t td = arg;
	1294	globaldata_t gd = mycpu;
	1295
	1296	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK))
	1297	cpu_lfence();
	1298	td->td_gd = gd;
	1299	cpu_sfence();
	1300	td->td_flags &= ~TDF_MIGRATING;
	1301	KKASSERT(td->td_proc == NULL \|\| (td->td_proc->p_flag & P_ONRUNQ) == 0);
	1302	_lwkt_enqueue(td);
	1303	}
	1304	#endif
	1305
	1306	struct lwp *
	1307	lwkt_preempted_proc(void)
	1308	{
	1309	thread_t td = curthread;
	1310	while (td->td_preempted)
	1311	td = td->td_preempted;
	1312	return(td->td_lwp);
	1313	}
	1314
	1315	/*
	1316	* Block on the specified wait queue until signaled. A generation number
	1317	* must be supplied to interlock the wait queue. The function will
	1318	* return immediately if the generation number does not match the wait
	1319	* structure's generation number.
	1320	*/
	1321	void
	1322	lwkt_block(lwkt_wait_t w, const char wmesg, int gen)
	1323	{
	1324	thread_t td = curthread;
	1325
	1326	spin_lock_wr(&w->wa_spinlock);
	1327	if (w->wa_gen == *gen) {
	1328	_lwkt_dequeue(td);
	1329	td->td_flags \|= TDF_BLOCKQ;
	1330	TAILQ_INSERT_TAIL(&w->wa_waitq, td, td_threadq);
	1331	++w->wa_count;
	1332	td->td_wait = w;
	1333	td->td_wmesg = wmesg;
	1334	spin_unlock_wr(&w->wa_spinlock);
	1335	lwkt_switch();
	1336	KKASSERT((td->td_flags & TDF_BLOCKQ) == 0);
	1337	td->td_wmesg = NULL;
	1338	*gen = w->wa_gen;
	1339	} else {
	1340	*gen = w->wa_gen;
	1341	spin_unlock_wr(&w->wa_spinlock);
	1342	}
	1343	}
	1344
	1345	/*
	1346	* Signal a wait queue. We gain ownership of the wait queue in order to
	1347	* signal it. Once a thread is removed from the wait queue we have to
	1348	* deal with the cpu owning the thread.
	1349	*
	1350	* Note: alternatively we could message the target cpu owning the wait
	1351	* queue. YYY implement as sysctl.
	1352	*/
	1353	void
	1354	lwkt_signal(lwkt_wait_t w, int count)
	1355	{
	1356	thread_t td;
	1357
	1358	spin_lock_wr(&w->wa_spinlock);
	1359	++w->wa_gen;
	1360	if (count < 0)
	1361	count = w->wa_count;
	1362	while ((td = TAILQ_FIRST(&w->wa_waitq)) != NULL && count) {
	1363	--count;
	1364	--w->wa_count;
	1365	KKASSERT(td->td_flags & TDF_BLOCKQ);
	1366	TAILQ_REMOVE(&w->wa_waitq, td, td_threadq);
	1367	td->td_flags &= ~TDF_BLOCKQ;
	1368	td->td_wait = NULL;
	1369	spin_unlock_wr(&w->wa_spinlock);
	1370	KKASSERT(td->td_proc == NULL \|\| (td->td_proc->p_flag & P_ONRUNQ) == 0);
	1371	#ifdef SMP
	1372	if (td->td_gd == mycpu) {
	1373	_lwkt_enqueue(td);
	1374	} else {
	1375	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_schedule, td);
	1376	}
	1377	#else
	1378	_lwkt_enqueue(td);
	1379	#endif
	1380	spin_lock_wr(&w->wa_spinlock);
	1381	}
	1382	spin_unlock_wr(&w->wa_spinlock);
	1383	}
	1384
	1385	/*
	1386	* Create a kernel process/thread/whatever. It shares it's address space
	1387	* with proc0 - ie: kernel only.
	1388	*
	1389	* NOTE! By default new threads are created with the MP lock held. A
	1390	* thread which does not require the MP lock should release it by calling
	1391	* rel_mplock() at the start of the new thread.
	1392	*/
	1393	int
	1394	lwkt_create(void (func)(void ), void *arg,
	1395	struct thread **tdp, thread_t template, int tdflags, int cpu,
	1396	const char *fmt, ...)
	1397	{
	1398	thread_t td;
	1399	__va_list ap;
	1400
	1401	td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
	1402	tdflags \| TDF_VERBOSE);
	1403	if (tdp)
	1404	*tdp = td;
	1405	cpu_set_thread_handler(td, lwkt_exit, func, arg);
	1406
	1407	/*
	1408	* Set up arg0 for 'ps' etc
	1409	*/
	1410	__va_start(ap, fmt);
	1411	vsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
	1412	__va_end(ap);
	1413
	1414	/*
	1415	* Schedule the thread to run
	1416	*/
	1417	if ((td->td_flags & TDF_STOPREQ) == 0)
	1418	lwkt_schedule(td);
	1419	else
	1420	td->td_flags &= ~TDF_STOPREQ;
	1421	return 0;
	1422	}
	1423
	1424	/*
	1425	* kthread_* is specific to the kernel and is not needed by userland.
	1426	*/
	1427	#ifdef _KERNEL
	1428
	1429	/*
	1430	* Destroy an LWKT thread. Warning! This function is not called when
	1431	* a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
	1432	* uses a different reaping mechanism.
	1433	*/
	1434	void
	1435	lwkt_exit(void)
	1436	{
	1437	thread_t td = curthread;
	1438	globaldata_t gd;
	1439
	1440	if (td->td_flags & TDF_VERBOSE)
	1441	printf("kthread %p %s has exited\n", td, td->td_comm);
	1442	caps_exit(td);
	1443	crit_enter_quick(td);
	1444	lwkt_deschedule_self(td);
	1445	gd = mycpu;
	1446	lwkt_remove_tdallq(td);
	1447	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	1448	++gd->gd_tdfreecount;
	1449	TAILQ_INSERT_TAIL(&gd->gd_tdfreeq, td, td_threadq);
	1450	}
	1451	cpu_thread_exit();
	1452	}
	1453
	1454	void
	1455	lwkt_remove_tdallq(thread_t td)
	1456	{
	1457	KKASSERT(td->td_gd == mycpu);
	1458	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1459	}
	1460
	1461	#endif /* _KERNEL */
	1462
	1463	void
	1464	crit_panic(void)
	1465	{
	1466	thread_t td = curthread;
	1467	int lpri = td->td_pri;
	1468
	1469	td->td_pri = 0;
	1470	panic("td_pri is/would-go negative! %p %d", td, lpri);
	1471	}
	1472
	1473	#ifdef SMP
	1474
	1475	/*
	1476	* Called from debugger/panic on cpus which have been stopped. We must still
	1477	* process the IPIQ while stopped, even if we were stopped while in a critical
	1478	* section (XXX).
	1479	*
	1480	* If we are dumping also try to process any pending interrupts. This may
	1481	* or may not work depending on the state of the cpu at the point it was
	1482	* stopped.
	1483	*/
	1484	void
	1485	lwkt_smp_stopped(void)
	1486	{
	1487	globaldata_t gd = mycpu;
	1488
	1489	crit_enter_gd(gd);
	1490	if (dumping) {
	1491	lwkt_process_ipiq();
	1492	splz();
	1493	} else {
	1494	lwkt_process_ipiq();
	1495	}
	1496	crit_exit_gd(gd);
	1497	}
	1498
	1499	/*
	1500	* get_mplock() calls this routine if it is unable to obtain the MP lock.
	1501	* get_mplock() has already incremented td_mpcount. We must block and
	1502	* not return until giant is held.
	1503	*
	1504	* All we have to do is lwkt_switch() away. The LWKT scheduler will not
	1505	* reschedule the thread until it can obtain the giant lock for it.
	1506	*/
	1507	void
	1508	lwkt_mp_lock_contested(void)
	1509	{
	1510	#ifdef _KERNEL
	1511	loggiant(beg);
	1512	#endif
	1513	lwkt_switch();
	1514	#ifdef _KERNEL
	1515	loggiant(end);
	1516	#endif
	1517	}
	1518
	1519	#endif