gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/kern/lwkt_thread.c,v 1.90 2005/12/10 18:50:36 dillon Exp $
	35	*/
	36
	37	/*
	38	* Each cpu in a system has its own self-contained light weight kernel
	39	* thread scheduler, which means that generally speaking we only need
	40	* to use a critical section to avoid problems. Foreign thread
	41	* scheduling is queued via (async) IPIs.
	42	*/
	43
	44	#ifdef _KERNEL
	45
	46	#include <sys/param.h>
	47	#include <sys/systm.h>
	48	#include <sys/kernel.h>
	49	#include <sys/proc.h>
	50	#include <sys/rtprio.h>
	51	#include <sys/queue.h>
	52	#include <sys/thread2.h>
	53	#include <sys/sysctl.h>
	54	#include <sys/kthread.h>
	55	#include <machine/cpu.h>
	56	#include <sys/lock.h>
	57	#include <sys/caps.h>
	58
	59	#include <vm/vm.h>
	60	#include <vm/vm_param.h>
	61	#include <vm/vm_kern.h>
	62	#include <vm/vm_object.h>
	63	#include <vm/vm_page.h>
	64	#include <vm/vm_map.h>
	65	#include <vm/vm_pager.h>
	66	#include <vm/vm_extern.h>
	67	#include <vm/vm_zone.h>
	68
	69	#include <machine/stdarg.h>
	70	#include <machine/ipl.h>
	71	#include <machine/smp.h>
	72
	73	#else
	74
	75	#include <sys/stdint.h>
	76	#include <libcaps/thread.h>
	77	#include <sys/thread.h>
	78	#include <sys/msgport.h>
	79	#include <sys/errno.h>
	80	#include <libcaps/globaldata.h>
	81	#include <machine/cpufunc.h>
	82	#include <sys/thread2.h>
	83	#include <sys/msgport2.h>
	84	#include <stdio.h>
	85	#include <stdlib.h>
	86	#include <string.h>
	87	#include <machine/lock.h>
	88	#include <machine/atomic.h>
	89	#include <machine/cpu.h>
	90
	91	#endif
	92
		/*
		 * Tunables and statistics counters private to this file.  In kernel
		 * builds they are exported through the SYSCTL entries below.
		 */
		/* When non-zero, lwkt_yield_quick() is allowed to force a thread switch. */
	93	static int untimely_switch = 0;
	94	#ifdef INVARIANTS
		/* When non-zero, lwkt_switch() panics if td_cscount is set at switch time. */
	95	static int panic_on_cscount = 0;
	96	#endif
		/* Incremented by lwkt_switch() whenever it switches to a different thread. */
	97	static __int64_t switch_count = 0;
		/* Outcome counters maintained by lwkt_preempt(). */
	98	static __int64_t preempt_hit = 0;
	99	static __int64_t preempt_miss = 0;
	100	static __int64_t preempt_weird = 0;
		/*
		 * Incremented in lwkt_switch()'s thread selection loop (INVARIANTS
		 * builds only) when a candidate is skipped due to token or MP lock
		 * contention.
		 */
	101	static __int64_t token_contention_count = 0;
	102	static __int64_t mplock_contention_count = 0;
	103
	104	#ifdef _KERNEL
	105
		/*
		 * Export the variables above under the _lwkt sysctl node, all of them
		 * read/write (CTLFLAG_RW).  The contention counters are only exported
		 * when INVARIANTS is defined.
		 */
	106	SYSCTL_INT(_lwkt, OID_AUTO, untimely_switch, CTLFLAG_RW, &untimely_switch, 0, "");
	107	#ifdef INVARIANTS
	108	SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, "");
	109	#endif
	110	SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, "");
	111	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, "");
	112	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, "");
	113	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, "");
	114	#ifdef INVARIANTS
	115	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
	116	&token_contention_count, 0, "spinning due to token contention");
	117	SYSCTL_QUAD(_lwkt, OID_AUTO, mplock_contention_count, CTLFLAG_RW,
	118	&mplock_contention_count, 0, "spinning due to MPLOCK contention");
	119	#endif
	120	#endif
	121
	122	/*
	123	* These helper procedures handle the runq, they can only be called from
	124	* within a critical section.
	125	*
	126	* WARNING! Prior to SMP being brought up it is possible to enqueue and
	127	* dequeue threads belonging to other cpus, so be sure to use td->td_gd
	128	* instead of 'mycpu' when referencing the globaldata structure. Once
	129	* SMP live enqueuing and dequeueing only occurs on the current cpu.
	130	*/
	131	static __inline
	132	void
	133	_lwkt_dequeue(thread_t td)
	134	{
	135	if (td->td_flags & TDF_RUNQ) {
	136	int nq = td->td_pri & TDPRI_MASK;
	137	struct globaldata *gd = td->td_gd;
	138
	139	td->td_flags &= ~TDF_RUNQ;
	140	TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq);
	141	/* runqmask is passively cleaned up by the switcher */
	142	}
	143	}
	144
	145	static __inline
	146	void
	147	_lwkt_enqueue(thread_t td)
	148	{
	149	if ((td->td_flags & (TDF_RUNQ\|TDF_MIGRATING\|TDF_TSLEEPQ\|TDF_BLOCKQ)) == 0) {
	150	int nq = td->td_pri & TDPRI_MASK;
	151	struct globaldata *gd = td->td_gd;
	152
	153	td->td_flags \|= TDF_RUNQ;
	154	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq);
	155	gd->gd_runqmask \|= 1 << nq;
	156	}
	157	}
	158
	159	/*
	160	* Schedule a thread to run. As the current thread we can always safely
	161	* schedule ourselves, and a shortcut procedure is provided for that
	162	* function.
	163	*
	164	* (non-blocking, self contained on a per cpu basis)
	165	*/
	166	void
	167	lwkt_schedule_self(thread_t td)
	168	{
	169	crit_enter_quick(td);
	170	KASSERT(td->td_wait == NULL, ("lwkt_schedule_self(): td_wait not NULL!"));
	171	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
	172	KKASSERT(td->td_proc == NULL \|\| (td->td_proc->p_flag & P_ONRUNQ) == 0);
	173	_lwkt_enqueue(td);
	174	crit_exit_quick(td);
	175	}
	176
	177	/*
	178	* Deschedule a thread.
	179	*
	180	* (non-blocking, self contained on a per cpu basis)
	181	*/
	182	void
	183	lwkt_deschedule_self(thread_t td)
	184	{
	185	crit_enter_quick(td);
	186	KASSERT(td->td_wait == NULL, ("lwkt_schedule_self(): td_wait not NULL!"));
	187	_lwkt_dequeue(td);
	188	crit_exit_quick(td);
	189	}
	190
	191	#ifdef _KERNEL
	192
	193	/*
	194	* LWKTs operate on a per-cpu basis
	195	*
	196	* WARNING! Called from early boot, 'mycpu' may not work yet.
	197	*/
	198	void
	199	lwkt_gdinit(struct globaldata *gd)
	200	{
	201	int i;
	202
	203	for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i)
	204	TAILQ_INIT(&gd->gd_tdrunq[i]);
	205	gd->gd_runqmask = 0;
	206	TAILQ_INIT(&gd->gd_tdallq);
	207	}
	208
	209	#endif /* _KERNEL */
	210
	211	/*
	212	* Initialize a thread wait structure prior to first use.
	213	*
	214	* NOTE! called from low level boot code, we cannot do anything fancy!
	215	*/
	216	void
	217	lwkt_wait_init(lwkt_wait_t w)
	218	{
	219	lwkt_token_init(&w->wa_token);
	220	TAILQ_INIT(&w->wa_waitq);
	221	w->wa_gen = 0;
	222	w->wa_count = 0;
	223	}
	224
	225	/*
	226	* Create a new thread. The thread must be associated with a process context
	227	* or LWKT start address before it can be scheduled. If the target cpu is
	228	* -1 the thread will be created on the current cpu.
	229	*
	230	* If you intend to create a thread without a process context this function
	231	* does everything except load the startup and switcher function.
	232	*/
	233	thread_t
	234	lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
	235	{
	236	void *stack;
	237	globaldata_t gd = mycpu;
	238
	239	if (td == NULL) {
	240	crit_enter_gd(gd);
	241	if (gd->gd_tdfreecount > 0) {
	242	--gd->gd_tdfreecount;
	243	td = TAILQ_FIRST(&gd->gd_tdfreeq);
	244	KASSERT(td != NULL && (td->td_flags & TDF_RUNNING) == 0,
	245	("lwkt_alloc_thread: unexpected NULL or corrupted td"));
	246	TAILQ_REMOVE(&gd->gd_tdfreeq, td, td_threadq);
	247	crit_exit_gd(gd);
	248	flags \|= td->td_flags & (TDF_ALLOCATED_STACK\|TDF_ALLOCATED_THREAD);
	249	} else {
	250	crit_exit_gd(gd);
	251	#ifdef _KERNEL
	252	td = zalloc(thread_zone);
	253	#else
	254	td = malloc(sizeof(struct thread));
	255	#endif
	256	td->td_kstack = NULL;
	257	td->td_kstack_size = 0;
	258	flags \|= TDF_ALLOCATED_THREAD;
	259	}
	260	}
	261	if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
	262	if (flags & TDF_ALLOCATED_STACK) {
	263	#ifdef _KERNEL
	264	kmem_free(kernel_map, (vm_offset_t)stack, td->td_kstack_size);
	265	#else
	266	libcaps_free_stack(stack, td->td_kstack_size);
	267	#endif
	268	stack = NULL;
	269	}
	270	}
	271	if (stack == NULL) {
	272	#ifdef _KERNEL
	273	stack = (void *)kmem_alloc(kernel_map, stksize);
	274	#else
	275	stack = libcaps_alloc_stack(stksize);
	276	#endif
	277	flags \|= TDF_ALLOCATED_STACK;
	278	}
	279	if (cpu < 0)
	280	lwkt_init_thread(td, stack, stksize, flags, mycpu);
	281	else
	282	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
	283	return(td);
	284	}
	285
	286	#ifdef _KERNEL
	287
	288	/*
	289	* Initialize a preexisting thread structure. This function is used by
	290	* lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
	291	*
	292	* All threads start out in a critical section at a priority of
	293	* TDPRI_KERN_DAEMON. Higher level code will modify the priority as
	294	* appropriate. This function may send an IPI message when the
	295	* requested cpu is not the current cpu and consequently gd_tdallq may
	296	* not be initialized synchronously from the point of view of the originating
	297	* cpu.
	298	*
	299	* NOTE! we have to be careful in regards to creating threads for other cpus
	300	* if SMP has not yet been activated.
	301	*/
	302	#ifdef SMP
	303
	304	static void
	305	lwkt_init_thread_remote(void *arg)
	306	{
	307	thread_t td = arg;
	308
	309	TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
	310	}
	311
	312	#endif
	313
	314	void
	315	lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
	316	struct globaldata *gd)
	317	{
	318	globaldata_t mygd = mycpu;
	319
	320	bzero(td, sizeof(struct thread));
	321	td->td_kstack = stack;
	322	td->td_kstack_size = stksize;
	323	td->td_flags = flags;
	324	td->td_gd = gd;
	325	td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT;
	326	#ifdef SMP
	327	if ((flags & TDF_MPSAFE) == 0)
	328	td->td_mpcount = 1;
	329	#endif
	330	lwkt_initport(&td->td_msgport, td);
	331	pmap_init_thread(td);
	332	#ifdef SMP
	333	/*
	334	* Normally initializing a thread for a remote cpu requires sending an
	335	* IPI. However, the idlethread is setup before the other cpus are
	336	* activated so we have to treat it as a special case. XXX manipulation
	337	* of gd_tdallq requires the BGL.
	338	*/
	339	if (gd == mygd \|\| td == &gd->gd_idlethread) {
	340	crit_enter_gd(mygd);
	341	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	342	crit_exit_gd(mygd);
	343	} else {
	344	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
	345	}
	346	#else
	347	crit_enter_gd(mygd);
	348	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	349	crit_exit_gd(mygd);
	350	#endif
	351	}
	352
	353	#endif /* _KERNEL */
	354
	355	void
	356	lwkt_set_comm(thread_t td, const char *ctl, ...)
	357	{
	358	__va_list va;
	359
	360	__va_start(va, ctl);
	361	vsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
	362	__va_end(va);
	363	}
	364
	365	void
	366	lwkt_hold(thread_t td)
	367	{
	368	++td->td_refs;
	369	}
	370
	371	void
	372	lwkt_rele(thread_t td)
	373	{
	374	KKASSERT(td->td_refs > 0);
	375	--td->td_refs;
	376	}
	377
	378	#ifdef _KERNEL
	379
	380	void
	381	lwkt_wait_free(thread_t td)
	382	{
	383	while (td->td_refs)
	384	tsleep(td, 0, "tdreap", hz);
	385	}
	386
	387	#endif
	388
	389	void
	390	lwkt_free_thread(thread_t td)
	391	{
	392	struct globaldata *gd = mycpu;
	393
	394	KASSERT((td->td_flags & TDF_RUNNING) == 0,
	395	("lwkt_free_thread: did not exit! %p", td));
	396
	397	crit_enter_gd(gd);
	398	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	399	if (gd->gd_tdfreecount < CACHE_NTHREADS &&
	400	(td->td_flags & TDF_ALLOCATED_THREAD)
	401	) {
	402	++gd->gd_tdfreecount;
	403	TAILQ_INSERT_HEAD(&gd->gd_tdfreeq, td, td_threadq);
	404	crit_exit_gd(gd);
	405	} else {
	406	crit_exit_gd(gd);
	407	if (td->td_kstack && (td->td_flags & TDF_ALLOCATED_STACK)) {
	408	#ifdef _KERNEL
	409	kmem_free(kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	410	#else
	411	libcaps_free_stack(td->td_kstack, td->td_kstack_size);
	412	#endif
	413	/* gd invalid */
	414	td->td_kstack = NULL;
	415	td->td_kstack_size = 0;
	416	}
	417	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	418	#ifdef _KERNEL
	419	zfree(thread_zone, td);
	420	#else
	421	free(td);
	422	#endif
	423	}
	424	}
	425	}
	426
	427
	428	/*
	429	* Switch to the next runnable lwkt. If no LWKTs are runnable then
	430	* switch to the idlethread. Switching must occur within a critical
	431	* section to avoid races with the scheduling queue.
	432	*
	433	* We always have full control over our cpu's run queue. Other cpus
	434	* that wish to manipulate our queue must use the cpu_*msg() calls to
	435	* talk to our cpu, so a critical section is all that is needed and
	436	* the result is very, very fast thread switching.
	437	*
	438	* The LWKT scheduler uses a fixed priority model and round-robins at
	439	* each priority level. User process scheduling is a totally
	440	* different beast and LWKT priorities should not be confused with
	441	* user process priorities.
	442	*
	443	* The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch()
	444	* cleans it up. Note that the td_switch() function cannot do anything that
	445	* requires the MP lock since the MP lock will have already been setup for
	446	* the target thread (not the current thread). It's nice to have a scheduler
	447	* that does not need the MP lock to work because it allows us to do some
	448	* really cool high-performance MP lock optimizations.
	449	*
	450	* PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
	451	* is not called by the current thread in the preemption case, only when
	452	* the preempting thread blocks (in order to return to the original thread).
	453	*/
	454	void
	455	lwkt_switch(void)
	456	{
	457	globaldata_t gd = mycpu;
	458	thread_t td = gd->gd_curthread;
		/* ntd is the thread selected to run next */
	459	thread_t ntd;
	460	#ifdef SMP
	461	int mpheld;
	462	#endif
	463
	464	/*
	465	* We had better not be holding any spin locks.
	466	*/
	467	KKASSERT(td->td_spinlocks == 0);
	468
	469	/*
	470	* Switching from within a 'fast' (non thread switched) interrupt or IPI
	471	* is illegal. However, we may have to do it anyway if we hit a fatal
	472	* kernel trap or we have panicked.
	473	*
	474	* If this case occurs save and restore the interrupt nesting level.
	475	*/
	476	if (gd->gd_intr_nesting_level) {
	477	int savegdnest;
	478	int savegdtrap;
	479
	480	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) {
	481	panic("lwkt_switch: cannot switch from within "
	482	"a fast interrupt, yet, td %p\n", td);
	483	} else {
	484	savegdnest = gd->gd_intr_nesting_level;
	485	savegdtrap = gd->gd_trap_nesting_level;
	486	gd->gd_intr_nesting_level = 0;
	487	gd->gd_trap_nesting_level = 0;
		/* warn only once per thread; TDF_PANICWARN records that we did */
	488	if ((td->td_flags & TDF_PANICWARN) == 0) {
	489	td->td_flags \|= TDF_PANICWARN;
	490	printf("Warning: thread switch from interrupt or IPI, "
	491	"thread %p (%s)\n", td, td->td_comm);
	492	#ifdef DDB
	493	db_print_backtrace();
	494	#endif
	495	}
		/* recurse with both nesting levels zeroed, then restore them */
	496	lwkt_switch();
	497	gd->gd_intr_nesting_level = savegdnest;
	498	gd->gd_trap_nesting_level = savegdtrap;
	499	return;
	500	}
	501	}
	502
	503	/*
	504	* Passive release (used to transition from user to kernel mode
	505	* when we block or switch rather than when we enter the kernel).
	506	* This function is NOT called if we are switching into a preemption
	507	* or returning from a preemption. Typically this causes us to lose
	508	* our current process designation (if we have one) and become a true
	509	* LWKT thread, and may also hand the current process designation to
	510	* another process and schedule thread.
	511	*/
	512	if (td->td_release)
	513	td->td_release(td);
	514
	515	crit_enter_gd(gd);
	516
	517	#ifdef SMP
	518	/*
	519	* td_mpcount cannot be used to determine if we currently hold the
	520	* MP lock because get_mplock() will increment it prior to attempting
	521	* to get the lock, and switch out if it can't. Our ownership of
	522	* the actual lock will remain stable while we are in a critical section
	523	* (but, of course, another cpu may own or release the lock so the
	524	* actual value of mp_lock is not stable).
	525	*/
	526	mpheld = MP_LOCK_HELD();
	527	#ifdef INVARIANTS
	528	if (td->td_cscount) {
	529	printf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
	530	td);
	531	if (panic_on_cscount)
	532	panic("switching while mastering cpusync");
	533	}
	534	#endif
	535	#endif
	536	if ((ntd = td->td_preempted) != NULL) {
	537	/*
	538	* We had preempted another thread on this cpu, resume the preempted
	539	* thread. This occurs transparently, whether the preempted thread
	540	* was scheduled or not (it may have been preempted after descheduling
	541	* itself).
	542	*
	543	* We have to setup the MP lock for the original thread after backing
	544	* out the adjustment that was made to curthread when the original
	545	* was preempted.
	546	*/
	547	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
	548	#ifdef SMP
	549	if (ntd->td_mpcount && mpheld == 0) {
	550	panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d",
	551	td, ntd, td->td_mpcount, ntd->td_mpcount);
	552	}
	553	if (ntd->td_mpcount) {
	554	td->td_mpcount -= ntd->td_mpcount;
	555	KKASSERT(td->td_mpcount >= 0);
	556	}
	557	#endif
	558	ntd->td_flags \|= TDF_PREEMPT_DONE;
	559
	560	/*
	561	* XXX. The interrupt may have woken a thread up, we need to properly
	562	* set the reschedule flag if the originally interrupted thread is at
	563	* a lower priority.
	564	*/
		/*
		 * (2 << pri) - 1 has every bit up to and including ntd's priority
		 * set, so a larger gd_runqmask means a higher priority queue bit
		 * is set.
		 */
	565	if (gd->gd_runqmask > (2 << (ntd->td_pri & TDPRI_MASK)) - 1)
	566	need_lwkt_resched();
	567	/* YYY release mp lock on switchback if original doesn't need it */
	568	} else {
	569	/*
	570	* Priority queue / round-robin at each priority. Note that user
	571	* processes run at a fixed, low priority and the user process
	572	* scheduler deals with interactions between user processes
	573	* by scheduling and descheduling them from the LWKT queue as
	574	* necessary.
	575	*
	576	* We have to adjust the MP lock for the target thread. If we
	577	* need the MP lock and cannot obtain it we try to locate a
	578	* thread that does not need the MP lock. If we cannot, we spin
	579	* instead of HLT.
	580	*
	581	* A similar issue exists for the tokens held by the target thread.
	582	* If we cannot obtain ownership of the tokens we cannot immediately
	583	* schedule the thread.
	584	*/
	585
	586	/*
	587	* We are switching threads. If there are any pending requests for
	588	* tokens we can satisfy all of them here.
	589	*/
	590	#ifdef SMP
	591	if (gd->gd_tokreqbase)
	592	lwkt_drain_token_requests();
	593	#endif
	594
	595	/*
	596	* If an LWKT reschedule was requested, well that is what we are
	597	* doing now so clear it.
	598	*/
	599	clear_lwkt_resched();
	600	again:
	601	if (gd->gd_runqmask) {
		/*
		 * Presumably bsrl() yields the index of the highest set bit,
		 * i.e. the highest priority queue marked non-empty -- confirm.
		 * A set bit whose queue is empty is stale; clear it and retry.
		 */
	602	int nq = bsrl(gd->gd_runqmask);
	603	if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) {
	604	gd->gd_runqmask &= ~(1 << nq);
	605	goto again;
	606	}
	607	#ifdef SMP
	608	/*
	609	* THREAD SELECTION FOR AN SMP MACHINE BUILD
	610	*
	611	* If the target needs the MP lock and we couldn't get it,
	612	* or if the target is holding tokens and we could not
	613	* gain ownership of the tokens, continue looking for a
	614	* thread to schedule and spin instead of HLT if we can't.
	615	*
	616	* NOTE: the mpheld variable is invalid after this conditional, it
	617	* can change due to both cpu_try_mplock() returning success
	618	* AND interactions in lwkt_chktokens() due to the fact that
	619	* we are trying to check the mpcount of a thread other than
	620	* the current thread. Because of this, if the current thread
	621	* is not holding td_mpcount, an IPI indirectly run via
	622	* lwkt_chktokens() can obtain and release the MP lock and
	623	* cause the core MP lock to be released.
	624	*/
	625	if ((ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) \|\|
	626	(ntd->td_toks && lwkt_chktokens(ntd) == 0)
	627	) {
		/* work on a private copy of the mask so gd_runqmask is untouched */
	628	u_int32_t rqmask = gd->gd_runqmask;
	629
	630	mpheld = MP_LOCK_HELD();
	631	ntd = NULL;
	632	while (rqmask) {
	633	TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) {
	634	if (ntd->td_mpcount && !mpheld && !cpu_try_mplock()) {
	635	/* spinning due to MP lock being held */
	636	#ifdef INVARIANTS
	637	++mplock_contention_count;
	638	#endif
	639	/* mplock still not held, 'mpheld' still valid */
	640	continue;
	641	}
	642
	643	/*
	644	* mpheld state invalid after chktokens call returns
	645	* failure, but the variable is only needed for
	646	* the loop.
	647	*/
	648	if (ntd->td_toks && !lwkt_chktokens(ntd)) {
	649	/* spinning due to token contention */
	650	#ifdef INVARIANTS
	651	++token_contention_count;
	652	#endif
	653	mpheld = MP_LOCK_HELD();
	654	continue;
	655	}
	656	break;
	657	}
	658	if (ntd)
	659	break;
		/*
		 * NOTE(review): when rqmask becomes 0 here bsrl() is called
		 * with 0.  The result is not used because the loop then ends,
		 * but confirm bsrl(0) is harmless.
		 */
	660	rqmask &= ~(1 << nq);
	661	nq = bsrl(rqmask);
	662	}
	663	if (ntd == NULL) {
	664	ntd = &gd->gd_idlethread;
	665	ntd->td_flags \|= TDF_IDLE_NOHLT;
	666	goto using_idle_thread;
	667	} else {
		/* move the chosen thread to the tail of its queue (round-robin) */
	668	++gd->gd_cnt.v_swtch;
	669	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	670	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	671	}
	672	} else {
		/* move the chosen thread to the tail of its queue (round-robin) */
	673	++gd->gd_cnt.v_swtch;
	674	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	675	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	676	}
	677	#else
	678	/*
	679	* THREAD SELECTION FOR A UP MACHINE BUILD. We don't have to
	680	* worry about tokens or the BGL.
	681	*/
	682	++gd->gd_cnt.v_swtch;
	683	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	684	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	685	#endif
	686	} else {
	687	/*
	688	* We have nothing to run but only let the idle loop halt
	689	* the cpu if there are no pending interrupts.
	690	*/
	691	ntd = &gd->gd_idlethread;
	692	if (gd->gd_reqflags & RQF_IDLECHECK_MASK)
	693	ntd->td_flags \|= TDF_IDLE_NOHLT;
	694	#ifdef SMP
	695	using_idle_thread:
	696	/*
	697	* The idle thread should not be holding the MP lock unless we
	698	* are trapping in the kernel or in a panic. Since we select the
	699	* idle thread unconditionally when no other thread is available,
	700	* if the MP lock is desired during a panic or kernel trap, we
	701	* have to loop in the scheduler until we get it.
	702	*/
	703	if (ntd->td_mpcount) {
	704	mpheld = MP_LOCK_HELD();
	705	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	706	panic("Idle thread %p was holding the BGL!", ntd);
	707	else if (mpheld == 0)
	708	goto again;
	709	}
	710	#endif
	711	}
	712	}
	713	KASSERT(ntd->td_pri >= TDPRI_CRIT,
	714	("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
	715
	716	/*
	717	* Do the actual switch. If the new target does not need the MP lock
	718	* and we are holding it, release the MP lock. If the new target requires
	719	* the MP lock we have already acquired it for the target.
	720	*/
	721	#ifdef SMP
	722	if (ntd->td_mpcount == 0 ) {
	723	if (MP_LOCK_HELD())
	724	cpu_rel_mplock();
	725	} else {
	726	ASSERT_MP_LOCK_HELD(ntd);
	727	}
	728	#endif
		/* switch_count only counts switches to a different thread */
	729	if (td != ntd) {
	730	++switch_count;
	731	td->td_switch(ntd);
	732	}
	733	/* NOTE: current cpu may have changed after switch */
	734	crit_exit_quick(td);
	735	}
	736
	737	/*
	738	* Request that the target thread preempt the current thread. Preemption
	739	* only works under a specific set of conditions:
	740	*
	741	* - We are not preempting ourselves
	742	* - The target thread is owned by the current cpu
	743	* - We are not currently being preempted
	744	* - The target is not currently being preempted
	745	* - We are able to satisfy the target's MP lock requirements (if any).
	746	*
	747	* THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
	748	* this is called via lwkt_schedule() through the td_preemptable callback.
	749	* critpri is the managed critical priority that we should ignore in order
	750	* to determine whether preemption is possible (aka usually just the crit
	751	* priority of lwkt_schedule() itself).
	752	*
	753	* XXX at the moment we run the target thread in a critical section during
	754	* the preemption in order to prevent the target from taking interrupts
	755	* that WE can't. Preemption is strictly limited to interrupt threads
	756	* and interrupt-like threads, outside of a critical section, and the
	757	* preempted source thread will be resumed the instant the target blocks
	758	* whether or not the source is scheduled (i.e. preemption is supposed to
	759	* be as transparent as possible).
	760	*
	761	* The target thread inherits our MP count (added to its own) for the
	762	* duration of the preemption in order to preserve the atomicity of the
	763	* MP lock during the preemption. Therefore, any preempting targets must be
	764	* careful in regards to MP assertions. Note that the MP count may be
	765	* out of sync with the physical mp_lock, but we do not have to preserve
	766	* the original ownership of the lock if it was out of sync (that is, we
	767	* can leave it synchronized on return).
	768	*/
	769	void
	770	lwkt_preempt(thread_t ntd, int critpri)
	771	{
	772	struct globaldata *gd = mycpu;
	773	thread_t td;
	774	#ifdef SMP
	775	int mpheld;
	776	int savecnt;
	777	#endif
	778
	779	/*
	780	* The caller has put us in a critical section. We can only preempt
	781	* if the caller of the caller was not in a critical section (basically
	782	* a local interrupt), as determined by the 'critpri' parameter.
	783	*
	784	* YYY The target thread must be in a critical section (else it must
	785	* inherit our critical section? I dunno yet).
	786	*
	787	* Any tokens held by the target may not be held by thread(s) being
	788	* preempted. We take the easy way out and do not preempt if
	789	* the target is holding tokens.
	790	*
	791	* Set need_lwkt_resched() unconditionally for now YYY.
	792	*/
	793	KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri));
	794
	795	td = gd->gd_curthread;
		/*
		 * The target is not at a higher priority than the current thread:
		 * count a miss and return without requesting a reschedule.
		 */
	796	if ((ntd->td_pri & TDPRI_MASK) <= (td->td_pri & TDPRI_MASK)) {
	797	++preempt_miss;
	798	return;
	799	}
		/*
		 * The bits of the current thread's td_pri above TDPRI_MASK exceed
		 * critpri: we cannot preempt now, so request a reschedule instead.
		 */
	800	if ((td->td_pri & ~TDPRI_MASK) > critpri) {
	801	++preempt_miss;
	802	need_lwkt_resched();
	803	return;
	804	}
	805	#ifdef SMP
		/* the target must be owned by the current cpu */
	806	if (ntd->td_gd != gd) {
	807	++preempt_miss;
	808	need_lwkt_resched();
	809	return;
	810	}
	811	#endif
	812	/*
	813	* Take the easy way out and do not preempt if the target is holding
	814	* one or more tokens. We could test whether the thread(s) being
	815	* preempted interlock against the target thread's tokens and whether
	816	* we can get all the target thread's tokens, but this situation
	817	* should not occur very often so its easier to simply not preempt.
	818	*/
	819	if (ntd->td_toks != NULL) {
	820	++preempt_miss;
	821	need_lwkt_resched();
	822	return;
	823	}
		/* never preempt ourselves or a thread already involved in a preemption */
	824	if (td == ntd \|\| ((td->td_flags \| ntd->td_flags) & TDF_PREEMPT_LOCK)) {
	825	++preempt_weird;
	826	need_lwkt_resched();
	827	return;
	828	}
		/*
		 * NOTE(review): this early return bumps preempt_hit even though no
		 * preemption takes place -- confirm that is intended.
		 */
	829	if (ntd->td_preempted) {
	830	++preempt_hit;
	831	need_lwkt_resched();
	832	return;
	833	}
	834	#ifdef SMP
	835	/*
	836	* note: an interrupt might have occurred just as we were transitioning
	837	* to or from the MP lock. In this case td_mpcount will be pre-disposed
	838	* (non-zero) but not actually synchronized with the actual state of the
	839	* lock. We can use it to imply an MP lock requirement for the
	840	* preemption but we cannot use it to test whether we hold the MP lock
	841	* or not.
	842	*/
	843	savecnt = td->td_mpcount;
	844	mpheld = MP_LOCK_HELD();
	845	ntd->td_mpcount += td->td_mpcount;
		/* could not get the MP lock for the target: undo the count and give up */
	846	if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) {
	847	ntd->td_mpcount -= td->td_mpcount;
	848	++preempt_miss;
	849	need_lwkt_resched();
	850	return;
	851	}
	852	#endif
	853
	854	/*
	855	* Since we are able to preempt the current thread, there is no need to
	856	* call need_lwkt_resched().
	857	*/
	858	++preempt_hit;
	859	ntd->td_preempted = td;
	860	td->td_flags \|= TDF_PREEMPT_LOCK;
		/*
		 * Switch to the target.  Execution continues below once we are
		 * switched back to; lwkt_switch() sets TDF_PREEMPT_DONE on the
		 * preempted thread, which the assertion checks.
		 */
	861	td->td_switch(ntd);
	862	KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
	863	#ifdef SMP
	864	KKASSERT(savecnt == td->td_mpcount);
	865	mpheld = MP_LOCK_HELD();
	866	if (mpheld && td->td_mpcount == 0)
	867	cpu_rel_mplock();
	868	else if (mpheld == 0 && td->td_mpcount)
	869	panic("lwkt_preempt(): MP lock was not held through");
	870	#endif
		/* preemption is over: clear the linkage and both preempt flags */
	871	ntd->td_preempted = NULL;
	872	td->td_flags &= ~(TDF_PREEMPT_LOCK\|TDF_PREEMPT_DONE);
	873	}
	874
	875	/*
	876	* Yield our thread while higher priority threads are pending. This is
	877	* typically called when we leave a critical section but it can be safely
	878	* called while we are in a critical section.
	879	*
	880	* This function will not generally yield to equal priority threads but it
	881	* can occur as a side effect. Note that lwkt_switch() is called from
	882	* inside the critical section to prevent its own crit_exit() from reentering
	883	* lwkt_yield_quick().
	884	*
	885	* gd_reqflags indicates that something changed, e.g. an interrupt or softint
	886	* came along but was blocked and made pending.
	887	*
	888	* (self contained on a per cpu basis)
	889	*/
	890	void
	891	lwkt_yield_quick(void)
	892	{
	893	globaldata_t gd = mycpu;
	894	thread_t td = gd->gd_curthread;
	895
	896	/*
	897	* gd_reqflags is cleared in splz if the cpl is 0. If we were to clear
	898	* it with a non-zero cpl then we might not wind up calling splz after
	899	* a task switch when the critical section is exited even though the
	900	* new task could accept the interrupt.
	901	*
	902	* XXX from crit_exit() only called after last crit section is released.
	903	* If called directly will run splz() even if in a critical section.
	904	*
	905	* td_nest_count prevent deep nesting via splz() or doreti(). Note that
	906	* except for this special case, we MUST call splz() here to handle any
	907	* pending ints, particularly after we switch, or we might accidently
	908	* halt the cpu with interrupts pending.
	909	*/
	910	if (gd->gd_reqflags && td->td_nest_count < 2)
	911	splz();
	912
	913	/*
	914	* YYY enabling will cause wakeup() to task-switch, which really
	915	* confused the old 4.x code. This is a good way to simulate
	916	* preemption and MP without actually doing preemption or MP, because a
	917	* lot of code assumes that wakeup() does not block.
	918	*/
	919	if (untimely_switch && td->td_nest_count == 0 &&
	920	gd->gd_intr_nesting_level == 0
	921	) {
	922	crit_enter_quick(td);
	923	/*
	924	* YYY temporary hacks until we disassociate the userland scheduler
	925	* from the LWKT scheduler.
	926	*/
	927	if (td->td_flags & TDF_RUNQ) {
	928	lwkt_switch(); /* will not reenter yield function */
	929	} else {
	930	lwkt_schedule_self(td); /* make sure we are scheduled */
	931	lwkt_switch(); /* will not reenter yield function */
	932	lwkt_deschedule_self(td); /* make sure we are descheduled */
	933	}
	934	crit_exit_noyield(td);
	935	}
	936	}
	937
	938	/*
	939	* This implements a normal yield which, unlike _quick, will yield to equal
	940	* priority threads as well. Note that gd_reqflags tests will be handled by
	941	* the crit_exit() call in lwkt_switch().
	942	*
	943	* (self contained on a per cpu basis)
	944	*/
	945	void
	946	lwkt_yield(void)
	947	{
	948	lwkt_schedule_self(curthread);
	949	lwkt_switch();
	950	}
	951
	952	/*
	953	* Generic schedule. Possibly schedule threads belonging to other cpus and
	954	* deal with threads that might be blocked on a wait queue.
	955	*
	956	* We have a little helper inline function which does additional work after
	957	* the thread has been enqueued, including dealing with preemption and
	958	* setting need_lwkt_resched() (which prevents the kernel from returning
	959	* to userland until it has processed higher priority threads).
	960	*
	961	* It is possible for this routine to be called after a failed _enqueue
	962	* (due to the target thread migrating, sleeping, or otherwise blocked).
	963	* We have to check that the thread is actually on the run queue!
	964	*/
	965	static __inline
	966	void
	967	_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri)
	968	{
	969	if (ntd->td_flags & TDF_RUNQ) {
	970	if (ntd->td_preemptable) {
	971	ntd->td_preemptable(ntd, cpri); /* YYY +token */
	972	} else if ((ntd->td_flags & TDF_NORESCHED) == 0 &&
	973	(ntd->td_pri & TDPRI_MASK) > (gd->gd_curthread->td_pri & TDPRI_MASK)
	974	) {
	975	need_lwkt_resched();
	976	}
	977	}
	978	}
	979
	980	void
	981	lwkt_schedule(thread_t td)
	982	{
	983	globaldata_t mygd = mycpu;
	984
	985	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
	986	crit_enter_gd(mygd);
	987	KKASSERT(td->td_proc == NULL \|\| (td->td_proc->p_flag & P_ONRUNQ) == 0);
	988	if (td == mygd->gd_curthread) {
	989	_lwkt_enqueue(td);
	990	} else {
	991	lwkt_wait_t w;
	992
	993	/*
	994	* If the thread is on a wait list we have to send our scheduling
	995	* request to the owner of the wait structure. Otherwise we send
	996	* the scheduling request to the cpu owning the thread. Races
	997	* are ok, the target will forward the message as necessary (the
	998	* message may chase the thread around before it finally gets
	999	* acted upon).
	1000	*
	1001	* (remember, wait structures use stable storage)
	1002	*
	1003	* NOTE: we have to account for the number of critical sections
	1004	* under our control when calling _lwkt_schedule_post() so it
	1005	* can figure out whether preemption is allowed.
	1006	*
	1007	* NOTE: The wait structure algorithms are a mess and need to be
	1008	* rewritten.
	1009	*
	1010	* NOTE: We cannot safely acquire or release a token, even
	1011	* non-blocking, because this routine may be called in the context
	1012	* of a thread already holding the token and thus not provide any
	1013	* interlock protection. We cannot safely manipulate the td_toks
	1014	* list for the same reason. Instead we depend on our critical
	1015	* section if the token is owned by our cpu.
	1016	*/
	1017	if ((w = td->td_wait) != NULL) {
	1018	if (w->wa_token.t_cpu == mygd) {
	1019	TAILQ_REMOVE(&w->wa_waitq, td, td_threadq);
	1020	--w->wa_count;
	1021	td->td_wait = NULL;
	1022	#ifdef SMP
	1023	if (td->td_gd == mygd) {
	1024	_lwkt_enqueue(td);
	1025	_lwkt_schedule_post(mygd, td, TDPRI_CRIT);
	1026	} else {
	1027	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_schedule, td);
	1028	}
	1029	#else
	1030	_lwkt_enqueue(td);
	1031	_lwkt_schedule_post(mygd, td, TDPRI_CRIT);
	1032	#endif
	1033	} else {
	1034	#ifdef SMP
	1035	lwkt_send_ipiq(w->wa_token.t_cpu, (ipifunc1_t)lwkt_schedule, td);
	1036	#else
	1037	panic("bad token %p", &w->wa_token);
	1038	#endif
	1039	}
	1040	} else {
	1041	/*
	1042	* If the wait structure is NULL and we own the thread, there
	1043	* is no race (since we are in a critical section). If we
	1044	* do not own the thread there might be a race but the
	1045	* target cpu will deal with it.
	1046	*/
	1047	#ifdef SMP
	1048	if (td->td_gd == mygd) {
	1049	_lwkt_enqueue(td);
	1050	_lwkt_schedule_post(mygd, td, TDPRI_CRIT);
	1051	} else {
	1052	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_schedule, td);
	1053	}
	1054	#else
	1055	_lwkt_enqueue(td);
	1056	_lwkt_schedule_post(mygd, td, TDPRI_CRIT);
	1057	#endif
	1058	}
	1059	}
	1060	crit_exit_gd(mygd);
	1061	}
	1062
	1063	/*
	1064	* Managed acquisition. This code assumes that the MP lock is held for
	1065	* the tdallq operation and that the thread has been descheduled from its
	1066	* original cpu. We also have to wait for the thread to be entirely switched
	1067	* out on its original cpu (this is usually fast enough that we never loop)
	1068	* since the LWKT system does not have to hold the MP lock while switching
	1069	* and the target may have released it before switching.
	1070	*/
	1071	void
	1072	lwkt_acquire(thread_t td)
	1073	{
	1074	globaldata_t gd;
	1075	globaldata_t mygd;
	1076
	1077	gd = td->td_gd;
	1078	mygd = mycpu;
	1079	cpu_lfence();
	1080	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1081	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) /* XXX spin */
	1082	cpu_lfence();
	1083	if (gd != mygd) {
	1084	crit_enter_gd(mygd);
	1085	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq); /* protected by BGL */
	1086	td->td_gd = mygd;
	1087	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq); /* protected by BGL */
	1088	crit_exit_gd(mygd);
	1089	}
	1090	}
	1091
	1092	/*
	1093	* Generic deschedule. Descheduling threads other than your own should be
	1094	* done only in carefully controlled circumstances. Descheduling is
	1095	* asynchronous.
	1096	*
	1097	* This function may block if the cpu has run out of messages.
	1098	*/
	1099	void
	1100	lwkt_deschedule(thread_t td)
	1101	{
	1102	crit_enter();
	1103	#ifdef SMP
	1104	if (td == curthread) {
	1105	_lwkt_dequeue(td);
	1106	} else {
	1107	if (td->td_gd == mycpu) {
	1108	_lwkt_dequeue(td);
	1109	} else {
	1110	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
	1111	}
	1112	}
	1113	#else
	1114	_lwkt_dequeue(td);
	1115	#endif
	1116	crit_exit();
	1117	}
	1118
	1119	/*
	1120	* Set the target thread's priority. This routine does not automatically
	1121	* switch to a higher priority thread, LWKT threads are not designed for
	1122	* continuous priority changes. Yield if you want to switch.
	1123	*
	1124	* We have to retain the critical section count which uses the high bits
	1125	* of the td_pri field. The specified priority may also indicate zero or
	1126	* more critical sections by adding TDPRI_CRIT*N.
	1127	*
	1128	* Note that we requeue the thread whether it winds up on a different runq
	1129	* or not. uio_yield() depends on this and the routine is not normally
	1130	* called with the same priority otherwise.
	1131	*/
	1132	void
	1133	lwkt_setpri(thread_t td, int pri)
	1134	{
	1135	KKASSERT(pri >= 0);
	1136	KKASSERT(td->td_gd == mycpu);
	1137	crit_enter();
	1138	if (td->td_flags & TDF_RUNQ) {
	1139	_lwkt_dequeue(td);
	1140	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1141	_lwkt_enqueue(td);
	1142	} else {
	1143	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1144	}
	1145	crit_exit();
	1146	}
	1147
	1148	void
	1149	lwkt_setpri_self(int pri)
	1150	{
	1151	thread_t td = curthread;
	1152
	1153	KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
	1154	crit_enter();
	1155	if (td->td_flags & TDF_RUNQ) {
	1156	_lwkt_dequeue(td);
	1157	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1158	_lwkt_enqueue(td);
	1159	} else {
	1160	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1161	}
	1162	crit_exit();
	1163	}
	1164
	1165	/*
	1166	* Determine if there is a runnable thread at a higher priority than
	1167	* the current thread. lwkt_setpri() does not check this automatically.
	1168	* Return 1 if there is, 0 if there isn't.
	1169	*
	1170	* Example: if bit 31 of runqmask is set and the current thread is priority
	1171	* 30, then we wind up checking the mask: 0x80000000 against 0x7fffffff.
	1172	*
	1173	* If nq reaches 31 the shift operation will overflow to 0 and we will wind
	1174	* up comparing against 0xffffffff, a comparison that will always be false.
	1175	*/
	1176	int
	1177	lwkt_checkpri_self(void)
	1178	{
	1179	globaldata_t gd = mycpu;
	1180	thread_t td = gd->gd_curthread;
	1181	int nq = td->td_pri & TDPRI_MASK;
	1182
	1183	while (gd->gd_runqmask > (__uint32_t)(2 << nq) - 1) {
	1184	if (TAILQ_FIRST(&gd->gd_tdrunq[nq + 1]))
	1185	return(1);
	1186	++nq;
	1187	}
	1188	return(0);
	1189	}
	1190
	1191	/*
	1192	* Migrate the current thread to the specified cpu. The BGL must be held
	1193	* (for the gd_tdallq manipulation XXX). This is accomplished by
	1194	* descheduling ourselves from the current cpu, moving our thread to the
	1195	* tdallq of the target cpu, IPI messaging the target cpu, and switching out.
	1196	* TDF_MIGRATING prevents scheduling races while the thread is being migrated.
	1197	*/
	1198	#ifdef SMP
	1199	static void lwkt_setcpu_remote(void *arg);
	1200	#endif
	1201
	1202	void
	1203	lwkt_setcpu_self(globaldata_t rgd)
	1204	{
	1205	#ifdef SMP
	1206	thread_t td = curthread;
	1207
	1208	if (td->td_gd != rgd) {
	1209	crit_enter_quick(td);
	1210	td->td_flags \|= TDF_MIGRATING;
	1211	lwkt_deschedule_self(td);
	1212	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq); /* protected by BGL */
	1213	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq); /* protected by BGL */
	1214	lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td);
	1215	lwkt_switch();
	1216	/* we are now on the target cpu */
	1217	crit_exit_quick(td);
	1218	}
	1219	#endif
	1220	}
	1221
	1222	/*
	1223	* Remote IPI for cpu migration (called while in a critical section so we
	1224	* do not have to enter another one). The thread has already been moved to
	1225	* our cpu's allq, but we must wait for the thread to be completely switched
	1226	* out on the originating cpu before we schedule it on ours or the stack
	1227	* state may be corrupt. We clear TDF_MIGRATING after flushing the GD
	1228	* change to main memory.
	1229	*
	1230	* XXX The use of TDF_MIGRATING might not be sufficient to avoid races
	1231	* against wakeups. It is best if this interface is used only when there
	1232	* are no pending events that might try to schedule the thread.
	1233	*/
	1234	#ifdef SMP
	1235	static void
	1236	lwkt_setcpu_remote(void *arg)
	1237	{
	1238	thread_t td = arg;
	1239	globaldata_t gd = mycpu;
	1240
	1241	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK))
	1242	cpu_lfence();
	1243	td->td_gd = gd;
	1244	cpu_sfence();
	1245	td->td_flags &= ~TDF_MIGRATING;
	1246	KKASSERT(td->td_proc == NULL \|\| (td->td_proc->p_flag & P_ONRUNQ) == 0);
	1247	_lwkt_enqueue(td);
	1248	}
	1249	#endif
	1250
	1251	struct lwp *
	1252	lwkt_preempted_proc(void)
	1253	{
	1254	thread_t td = curthread;
	1255	while (td->td_preempted)
	1256	td = td->td_preempted;
	1257	return(td->td_lwp);
	1258	}
	1259
	1260	/*
	1261	* Block on the specified wait queue until signaled. A generation number
	1262	* must be supplied to interlock the wait queue. The function will
	1263	* return immediately if the generation number does not match the wait
	1264	* structure's generation number.
	1265	*/
	1266	void
	1267	lwkt_block(lwkt_wait_t w, const char wmesg, int gen)
	1268	{
	1269	thread_t td = curthread;
	1270	lwkt_tokref ilock;
	1271
	1272	lwkt_gettoken(&ilock, &w->wa_token);
	1273	crit_enter();
	1274	if (w->wa_gen == *gen) {
	1275	_lwkt_dequeue(td);
	1276	td->td_flags \|= TDF_BLOCKQ;
	1277	TAILQ_INSERT_TAIL(&w->wa_waitq, td, td_threadq);
	1278	++w->wa_count;
	1279	td->td_wait = w;
	1280	td->td_wmesg = wmesg;
	1281	lwkt_switch();
	1282	KKASSERT((td->td_flags & TDF_BLOCKQ) == 0);
	1283	td->td_wmesg = NULL;
	1284	}
	1285	crit_exit();
	1286	*gen = w->wa_gen;
	1287	lwkt_reltoken(&ilock);
	1288	}
	1289
	1290	/*
	1291	* Signal a wait queue. We gain ownership of the wait queue in order to
	1292	* signal it. Once a thread is removed from the wait queue we have to
	1293	* deal with the cpu owning the thread.
	1294	*
	1295	* Note: alternatively we could message the target cpu owning the wait
	1296	* queue. YYY implement as sysctl.
	1297	*/
	1298	void
	1299	lwkt_signal(lwkt_wait_t w, int count)
	1300	{
	1301	thread_t td;
	1302	lwkt_tokref ilock;
	1303
	1304	lwkt_gettoken(&ilock, &w->wa_token);
	1305	++w->wa_gen;
	1306	crit_enter();
	1307	if (count < 0)
	1308	count = w->wa_count;
	1309	while ((td = TAILQ_FIRST(&w->wa_waitq)) != NULL && count) {
	1310	--count;
	1311	--w->wa_count;
	1312	KKASSERT(td->td_flags & TDF_BLOCKQ);
	1313	TAILQ_REMOVE(&w->wa_waitq, td, td_threadq);
	1314	td->td_flags &= ~TDF_BLOCKQ;
	1315	td->td_wait = NULL;
	1316	KKASSERT(td->td_proc == NULL \|\| (td->td_proc->p_flag & P_ONRUNQ) == 0);
	1317	#ifdef SMP
	1318	if (td->td_gd == mycpu) {
	1319	_lwkt_enqueue(td);
	1320	} else {
	1321	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_schedule, td);
	1322	}
	1323	#else
	1324	_lwkt_enqueue(td);
	1325	#endif
	1326	}
	1327	crit_exit();
	1328	lwkt_reltoken(&ilock);
	1329	}
	1330
	1331	/*
	1332	* Create a kernel process/thread/whatever. It shares its address space
	1333	* with proc0 - ie: kernel only.
	1334	*
	1335	* NOTE! By default new threads are created with the MP lock held. A
	1336	* thread which does not require the MP lock should release it by calling
	1337	* rel_mplock() at the start of the new thread.
	1338	*/
	1339	int
	1340	lwkt_create(void (func)(void ), void *arg,
	1341	struct thread **tdp, thread_t template, int tdflags, int cpu,
	1342	const char *fmt, ...)
	1343	{
	1344	thread_t td;
	1345	__va_list ap;
	1346
	1347	td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
	1348	tdflags \| TDF_VERBOSE);
	1349	if (tdp)
	1350	*tdp = td;
	1351	cpu_set_thread_handler(td, lwkt_exit, func, arg);
	1352
	1353	/*
	1354	* Set up arg0 for 'ps' etc
	1355	*/
	1356	__va_start(ap, fmt);
	1357	vsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
	1358	__va_end(ap);
	1359
	1360	/*
	1361	* Schedule the thread to run
	1362	*/
	1363	if ((td->td_flags & TDF_STOPREQ) == 0)
	1364	lwkt_schedule(td);
	1365	else
	1366	td->td_flags &= ~TDF_STOPREQ;
	1367	return 0;
	1368	}
	1369
	1370	/*
	1371	* kthread_* is specific to the kernel and is not needed by userland.
	1372	*/
	1373	#ifdef _KERNEL
	1374
	1375	/*
	1376	* Destroy an LWKT thread. Warning! This function is not called when
	1377	* a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
	1378	* uses a different reaping mechanism.
	1379	*/
	1380	void
	1381	lwkt_exit(void)
	1382	{
	1383	thread_t td = curthread;
	1384	globaldata_t gd;
	1385
	1386	if (td->td_flags & TDF_VERBOSE)
	1387	printf("kthread %p %s has exited\n", td, td->td_comm);
	1388	caps_exit(td);
	1389	crit_enter_quick(td);
	1390	lwkt_deschedule_self(td);
	1391	gd = mycpu;
	1392	KKASSERT(gd == td->td_gd);
	1393	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	1394	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	1395	++gd->gd_tdfreecount;
	1396	TAILQ_INSERT_TAIL(&gd->gd_tdfreeq, td, td_threadq);
	1397	}
	1398	cpu_thread_exit();
	1399	}
	1400
	1401	#endif /* _KERNEL */
	1402
	1403	void
	1404	crit_panic(void)
	1405	{
	1406	thread_t td = curthread;
	1407	int lpri = td->td_pri;
	1408
	1409	td->td_pri = 0;
	1410	panic("td_pri is/would-go negative! %p %d", td, lpri);
	1411	}
	1412
	1413	#ifdef SMP
	1414
	1415	/*
	1416	* Called from debugger/panic on cpus which have been stopped. We must still
	1417	* process the IPIQ while stopped, even if we were stopped while in a critical
	1418	* section (XXX).
	1419	*
	1420	* If we are dumping also try to process any pending interrupts. This may
	1421	* or may not work depending on the state of the cpu at the point it was
	1422	* stopped.
	1423	*/
	1424	void
	1425	lwkt_smp_stopped(void)
	1426	{
	1427	globaldata_t gd = mycpu;
	1428
	1429	crit_enter_gd(gd);
	1430	if (dumping) {
	1431	lwkt_process_ipiq();
	1432	splz();
	1433	} else {
	1434	lwkt_process_ipiq();
	1435	}
	1436	crit_exit_gd(gd);
	1437	}
	1438
	1439	#endif