gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/kern/lwkt_thread.c,v 1.104 2006/12/23 00:35:04 swildner Exp $
	35	*/
	36
	37	/*
	38	* Each cpu in a system has its own self-contained light weight kernel
	39	* thread scheduler, which means that generally speaking we only need
	40	* to use a critical section to avoid problems. Foreign thread
	41	* scheduling is queued via (async) IPIs.
	42	*/
	43
	44	#ifdef _KERNEL
	45
	46	#include <sys/param.h>
	47	#include <sys/systm.h>
	48	#include <sys/kernel.h>
	49	#include <sys/proc.h>
	50	#include <sys/rtprio.h>
	51	#include <sys/queue.h>
	52	#include <sys/sysctl.h>
	53	#include <sys/kthread.h>
	54	#include <machine/cpu.h>
	55	#include <sys/lock.h>
	56	#include <sys/caps.h>
	57	#include <sys/spinlock.h>
	58	#include <sys/ktr.h>
	59
	60	#include <sys/thread2.h>
	61	#include <sys/spinlock2.h>
	62
	63	#include <vm/vm.h>
	64	#include <vm/vm_param.h>
	65	#include <vm/vm_kern.h>
	66	#include <vm/vm_object.h>
	67	#include <vm/vm_page.h>
	68	#include <vm/vm_map.h>
	69	#include <vm/vm_pager.h>
	70	#include <vm/vm_extern.h>
	71	#include <vm/vm_zone.h>
	72
	73	#include <machine/stdarg.h>
	74	#include <machine/smp.h>
	75
	76	#else
	77
	78	#include <sys/stdint.h>
	79	#include <libcaps/thread.h>
	80	#include <sys/thread.h>
	81	#include <sys/msgport.h>
	82	#include <sys/errno.h>
	83	#include <libcaps/globaldata.h>
	84	#include <machine/cpufunc.h>
	85	#include <sys/thread2.h>
	86	#include <sys/msgport2.h>
	87	#include <stdio.h>
	88	#include <stdlib.h>
	89	#include <string.h>
	90	#include <machine/lock.h>
	91	#include <machine/atomic.h>
	92	#include <machine/cpu.h>
	93
	94	#endif
	95
	96	static int untimely_switch = 0;
	97	#ifdef INVARIANTS
	98	static int panic_on_cscount = 0;
	99	#endif
	100	static __int64_t switch_count = 0;
	101	static __int64_t preempt_hit = 0;
	102	static __int64_t preempt_miss = 0;
	103	static __int64_t preempt_weird = 0;
	104	static __int64_t token_contention_count = 0;
	105	static __int64_t mplock_contention_count = 0;
	106
	107	#ifdef _KERNEL
	108
	109	SYSCTL_INT(_lwkt, OID_AUTO, untimely_switch, CTLFLAG_RW, &untimely_switch, 0, "");
	110	#ifdef INVARIANTS
	111	SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, "");
	112	#endif
	113	SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, "");
	114	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, "");
	115	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, "");
	116	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, "");
	117	#ifdef INVARIANTS
	118	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
	119	&token_contention_count, 0, "spinning due to token contention");
	120	SYSCTL_QUAD(_lwkt, OID_AUTO, mplock_contention_count, CTLFLAG_RW,
	121	&mplock_contention_count, 0, "spinning due to MPLOCK contention");
	122	#endif
	123	#endif
	124
	/*
	 * Kernel Trace
	 *
	 * Declares the "giant" KTR master plus a begin (id 0) and an end (id 1)
	 * event, each logging a single thread pointer.  loggiant(beg) and
	 * loggiant(end) log the corresponding event for curthread.
	 */
	#if defined(_KERNEL)

	/* Default the trace mask to KTR_ALL unless the build overrides it. */
	#ifndef KTR_GIANT_CONTENTION
	#define KTR_GIANT_CONTENTION	KTR_ALL
	#endif

	KTR_INFO_MASTER(giant);
	KTR_INFO(KTR_GIANT_CONTENTION, giant, beg, 0, "thread=%p", sizeof(void *));
	KTR_INFO(KTR_GIANT_CONTENTION, giant, end, 1, "thread=%p", sizeof(void *));

	#define loggiant(name)	KTR_LOG(giant_ ## name, curthread)

	#endif /* _KERNEL */
	141
	142	/*
	143	* These helper procedures handle the runq, they can only be called from
	144	* within a critical section.
	145	*
	146	* WARNING! Prior to SMP being brought up it is possible to enqueue and
	147	* dequeue threads belonging to other cpus, so be sure to use td->td_gd
	148	* instead of 'mycpu' when referencing the globaldata structure. Once
	149	* SMP live enqueuing and dequeueing only occurs on the current cpu.
	150	*/
	151	static __inline
	152	void
	153	_lwkt_dequeue(thread_t td)
	154	{
	155	if (td->td_flags & TDF_RUNQ) {
	156	int nq = td->td_pri & TDPRI_MASK;
	157	struct globaldata *gd = td->td_gd;
	158
	159	td->td_flags &= ~TDF_RUNQ;
	160	TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq);
	161	/* runqmask is passively cleaned up by the switcher */
	162	}
	163	}
	164
	165	static __inline
	166	void
	167	_lwkt_enqueue(thread_t td)
	168	{
	169	if ((td->td_flags & (TDF_RUNQ\|TDF_MIGRATING\|TDF_TSLEEPQ\|TDF_BLOCKQ)) == 0) {
	170	int nq = td->td_pri & TDPRI_MASK;
	171	struct globaldata *gd = td->td_gd;
	172
	173	td->td_flags \|= TDF_RUNQ;
	174	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq);
	175	gd->gd_runqmask \|= 1 << nq;
	176	}
	177	}
	178
	179	/*
	180	* Schedule a thread to run. As the current thread we can always safely
	181	* schedule ourselves, and a shortcut procedure is provided for that
	182	* function.
	183	*
	184	* (non-blocking, self contained on a per cpu basis)
	185	*/
	186	void
	187	lwkt_schedule_self(thread_t td)
	188	{
	189	crit_enter_quick(td);
	190	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
	191	KKASSERT(td->td_proc == NULL \|\| (td->td_proc->p_flag & P_ONRUNQ) == 0);
	192	_lwkt_enqueue(td);
	193	crit_exit_quick(td);
	194	}
	195
	196	/*
	197	* Deschedule a thread.
	198	*
	199	* (non-blocking, self contained on a per cpu basis)
	200	*/
	201	void
	202	lwkt_deschedule_self(thread_t td)
	203	{
	204	crit_enter_quick(td);
	205	_lwkt_dequeue(td);
	206	crit_exit_quick(td);
	207	}
	208
	#ifdef _KERNEL

	/*
	 * Initialize the per-cpu LWKT state in gd: every priority run queue is
	 * emptied, the run queue mask is cleared and the all-threads list is
	 * initialized.
	 *
	 * WARNING! Called from early boot, 'mycpu' may not work yet.
	 */
	void
	lwkt_gdinit(struct globaldata *gd)
	{
	    int nqueues = (int)(sizeof(gd->gd_tdrunq) / sizeof(gd->gd_tdrunq[0]));
	    int q = 0;

	    while (q < nqueues) {
	        TAILQ_INIT(&gd->gd_tdrunq[q]);
	        ++q;
	    }
	    gd->gd_runqmask = 0;
	    TAILQ_INIT(&gd->gd_tdallq);
	}

	#endif /* _KERNEL */
	228
	229	/*
	230	* Create a new thread. The thread must be associated with a process context
	231	* or LWKT start address before it can be scheduled. If the target cpu is
	232	* -1 the thread will be created on the current cpu.
	233	*
	234	* If you intend to create a thread without a process context this function
	235	* does everything except load the startup and switcher function.
	236	*/
	237	thread_t
	238	lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
	239	{
	240	void *stack;
	241	globaldata_t gd = mycpu;
	242
	243	if (td == NULL) {
	244	crit_enter_gd(gd);
	245	if (gd->gd_tdfreecount > 0) {
	246	--gd->gd_tdfreecount;
	247	td = TAILQ_FIRST(&gd->gd_tdfreeq);
	248	KASSERT(td != NULL && (td->td_flags & TDF_RUNNING) == 0,
	249	("lwkt_alloc_thread: unexpected NULL or corrupted td"));
	250	TAILQ_REMOVE(&gd->gd_tdfreeq, td, td_threadq);
	251	crit_exit_gd(gd);
	252	flags \|= td->td_flags & (TDF_ALLOCATED_STACK\|TDF_ALLOCATED_THREAD);
	253	} else {
	254	crit_exit_gd(gd);
	255	#ifdef _KERNEL
	256	td = zalloc(thread_zone);
	257	#else
	258	td = malloc(sizeof(struct thread));
	259	#endif
	260	td->td_kstack = NULL;
	261	td->td_kstack_size = 0;
	262	flags \|= TDF_ALLOCATED_THREAD;
	263	}
	264	}
	265	if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
	266	if (flags & TDF_ALLOCATED_STACK) {
	267	#ifdef _KERNEL
	268	kmem_free(kernel_map, (vm_offset_t)stack, td->td_kstack_size);
	269	#else
	270	libcaps_free_stack(stack, td->td_kstack_size);
	271	#endif
	272	stack = NULL;
	273	}
	274	}
	275	if (stack == NULL) {
	276	#ifdef _KERNEL
	277	stack = (void *)kmem_alloc(kernel_map, stksize);
	278	#else
	279	stack = libcaps_alloc_stack(stksize);
	280	#endif
	281	flags \|= TDF_ALLOCATED_STACK;
	282	}
	283	if (cpu < 0)
	284	lwkt_init_thread(td, stack, stksize, flags, mycpu);
	285	else
	286	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
	287	return(td);
	288	}
	289
	290	#ifdef _KERNEL
	291
	292	/*
	293	* Initialize a preexisting thread structure. This function is used by
	294	* lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
	295	*
	296	* All threads start out in a critical section at a priority of
	297	* TDPRI_KERN_DAEMON. Higher level code will modify the priority as
	298	* appropriate. This function may send an IPI message when the
	299	* requested cpu is not the current cpu and consequently gd_tdallq may
	300	* not be initialized synchronously from the point of view of the originating
	301	* cpu.
	302	*
	303	* NOTE! we have to be careful in regards to creating threads for other cpus
	304	* if SMP has not yet been activated.
	305	*/
	#ifdef SMP

	/*
	 * IPI callback used by lwkt_init_thread(): runs on the cpu that owns the
	 * new thread and links it into that cpu's list of all threads.
	 */
	static void
	lwkt_init_thread_remote(void *arg)
	{
	    thread_t newtd = arg;

	    /* Protected by critical section held by IPI dispatch */
	    TAILQ_INSERT_TAIL(&newtd->td_gd->gd_tdallq, newtd, td_allq);
	}

	#endif /* SMP */
	320
	321	void
	322	lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
	323	struct globaldata *gd)
	324	{
	325	globaldata_t mygd = mycpu;
	326
	327	bzero(td, sizeof(struct thread));
	328	td->td_kstack = stack;
	329	td->td_kstack_size = stksize;
	330	td->td_flags = flags;
	331	td->td_gd = gd;
	332	td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT;
	333	#ifdef SMP
	334	if ((flags & TDF_MPSAFE) == 0)
	335	td->td_mpcount = 1;
	336	#endif
	337	lwkt_initport(&td->td_msgport, td);
	338	pmap_init_thread(td);
	339	#ifdef SMP
	340	/*
	341	* Normally initializing a thread for a remote cpu requires sending an
	342	* IPI. However, the idlethread is setup before the other cpus are
	343	* activated so we have to treat it as a special case. XXX manipulation
	344	* of gd_tdallq requires the BGL.
	345	*/
	346	if (gd == mygd \|\| td == &gd->gd_idlethread) {
	347	crit_enter_gd(mygd);
	348	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	349	crit_exit_gd(mygd);
	350	} else {
	351	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
	352	}
	353	#else
	354	crit_enter_gd(mygd);
	355	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	356	crit_exit_gd(mygd);
	357	#endif
	358	}
	359
	360	#endif /* _KERNEL */
	361
	362	void
	363	lwkt_set_comm(thread_t td, const char *ctl, ...)
	364	{
	365	__va_list va;
	366
	367	__va_start(va, ctl);
	368	kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
	369	__va_end(va);
	370	}
	371
	372	void
	373	lwkt_hold(thread_t td)
	374	{
	375	++td->td_refs;
	376	}
	377
	378	void
	379	lwkt_rele(thread_t td)
	380	{
	381	KKASSERT(td->td_refs > 0);
	382	--td->td_refs;
	383	}
	384
	#ifdef _KERNEL

	/*
	 * Block until nobody holds a reference on td any more.  The reference
	 * count is polled: we sleep on td with a timeout of 'hz' and re-test
	 * td_refs each time we wake up.
	 */
	void
	lwkt_wait_free(thread_t td)
	{
	    while (td->td_refs != 0) {
	        tsleep(td, 0, "tdreap", hz);
	    }
	}

	#endif /* _KERNEL */
	395
	396	void
	397	lwkt_free_thread(thread_t td)
	398	{
	399	struct globaldata *gd = mycpu;
	400
	401	KASSERT((td->td_flags & TDF_RUNNING) == 0,
	402	("lwkt_free_thread: did not exit! %p", td));
	403
	404	crit_enter_gd(gd);
	405	if (gd->gd_tdfreecount < CACHE_NTHREADS &&
	406	(td->td_flags & TDF_ALLOCATED_THREAD)
	407	) {
	408	++gd->gd_tdfreecount;
	409	TAILQ_INSERT_HEAD(&gd->gd_tdfreeq, td, td_threadq);
	410	crit_exit_gd(gd);
	411	} else {
	412	crit_exit_gd(gd);
	413	if (td->td_kstack && (td->td_flags & TDF_ALLOCATED_STACK)) {
	414	#ifdef _KERNEL
	415	kmem_free(kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	416	#else
	417	libcaps_free_stack(td->td_kstack, td->td_kstack_size);
	418	#endif
	419	/* gd invalid */
	420	td->td_kstack = NULL;
	421	td->td_kstack_size = 0;
	422	}
	423	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	424	#ifdef _KERNEL
	425	zfree(thread_zone, td);
	426	#else
	427	free(td);
	428	#endif
	429	}
	430	}
	431	}
	432
	433
	434	/*
	435	* Switch to the next runnable lwkt. If no LWKTs are runnable then
	436	* switch to the idlethread. Switching must occur within a critical
	437	* section to avoid races with the scheduling queue.
	438	*
	439	* We always have full control over our cpu's run queue. Other cpus
	440	* that wish to manipulate our queue must use the cpu_*msg() calls to
	441	* talk to our cpu, so a critical section is all that is needed and
	442	* the result is very, very fast thread switching.
	443	*
	444	* The LWKT scheduler uses a fixed priority model and round-robins at
	445	* each priority level. User process scheduling is a totally
	446	* different beast and LWKT priorities should not be confused with
	447	* user process priorities.
	448	*
	449	* The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch()
	450	* cleans it up. Note that the td_switch() function cannot do anything that
	451	* requires the MP lock since the MP lock will have already been setup for
	452	* the target thread (not the current thread). It's nice to have a scheduler
	453	* that does not need the MP lock to work because it allows us to do some
	454	* really cool high-performance MP lock optimizations.
	455	*
	456	* PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
	457	* is not called by the current thread in the preemption case, only when
	458	* the preempting thread blocks (in order to return to the original thread).
	459	*/
	460	void
	461	lwkt_switch(void)
	462	{
	463	globaldata_t gd = mycpu;
	464	thread_t td = gd->gd_curthread;
	465	thread_t ntd;
	466	#ifdef SMP
	467	int mpheld;
	468	#endif
	469
	470	/*
	471	* Switching from within a 'fast' (non thread switched) interrupt or IPI
	472	* is illegal. However, we may have to do it anyway if we hit a fatal
	473	* kernel trap or we have paniced.
	474	*
	475	* If this case occurs save and restore the interrupt nesting level.
	476	*/
	477	if (gd->gd_intr_nesting_level) {
	478	int savegdnest;
	479	int savegdtrap;
	480
	481	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) {
	482	panic("lwkt_switch: cannot switch from within "
	483	"a fast interrupt, yet, td %p\n", td);
	484	} else {
	485	savegdnest = gd->gd_intr_nesting_level;
	486	savegdtrap = gd->gd_trap_nesting_level;
	487	gd->gd_intr_nesting_level = 0;
	488	gd->gd_trap_nesting_level = 0;
	489	if ((td->td_flags & TDF_PANICWARN) == 0) {
	490	td->td_flags \|= TDF_PANICWARN;
	491	kprintf("Warning: thread switch from interrupt or IPI, "
	492	"thread %p (%s)\n", td, td->td_comm);
	493	#ifdef DDB
	494	db_print_backtrace();
	495	#endif
	496	}
	497	lwkt_switch();
	498	gd->gd_intr_nesting_level = savegdnest;
	499	gd->gd_trap_nesting_level = savegdtrap;
	500	return;
	501	}
	502	}
	503
	504	/*
	505	* Passive release (used to transition from user to kernel mode
	506	* when we block or switch rather then when we enter the kernel).
	507	* This function is NOT called if we are switching into a preemption
	508	* or returning from a preemption. Typically this causes us to lose
	509	* our current process designation (if we have one) and become a true
	510	* LWKT thread, and may also hand the current process designation to
	511	* another process and schedule thread.
	512	*/
	513	if (td->td_release)
	514	td->td_release(td);
	515
	516	crit_enter_gd(gd);
	517	#ifdef SMP
	518	if (td->td_toks)
	519	lwkt_relalltokens(td);
	520	#endif
	521
	522	/*
	523	* We had better not be holding any spin locks, but don't get into an
	524	* endless panic loop.
	525	*/
	526	KASSERT(gd->gd_spinlock_rd == NULL \|\| panicstr != NULL,
	527	("lwkt_switch: still holding a shared spinlock %p!",
	528	gd->gd_spinlock_rd));
	529	KASSERT(gd->gd_spinlocks_wr == 0 \|\| panicstr != NULL,
	530	("lwkt_switch: still holding %d exclusive spinlocks!",
	531	gd->gd_spinlocks_wr));
	532
	533
	534	#ifdef SMP
	535	/*
	536	* td_mpcount cannot be used to determine if we currently hold the
	537	* MP lock because get_mplock() will increment it prior to attempting
	538	* to get the lock, and switch out if it can't. Our ownership of
	539	* the actual lock will remain stable while we are in a critical section
	540	* (but, of course, another cpu may own or release the lock so the
	541	* actual value of mp_lock is not stable).
	542	*/
	543	mpheld = MP_LOCK_HELD();
	544	#ifdef INVARIANTS
	545	if (td->td_cscount) {
	546	kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
	547	td);
	548	if (panic_on_cscount)
	549	panic("switching while mastering cpusync");
	550	}
	551	#endif
	552	#endif
	553	if ((ntd = td->td_preempted) != NULL) {
	554	/*
	555	* We had preempted another thread on this cpu, resume the preempted
	556	* thread. This occurs transparently, whether the preempted thread
	557	* was scheduled or not (it may have been preempted after descheduling
	558	* itself).
	559	*
	560	* We have to setup the MP lock for the original thread after backing
	561	* out the adjustment that was made to curthread when the original
	562	* was preempted.
	563	*/
	564	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
	565	#ifdef SMP
	566	if (ntd->td_mpcount && mpheld == 0) {
	567	panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d",
	568	td, ntd, td->td_mpcount, ntd->td_mpcount);
	569	}
	570	if (ntd->td_mpcount) {
	571	td->td_mpcount -= ntd->td_mpcount;
	572	KKASSERT(td->td_mpcount >= 0);
	573	}
	574	#endif
	575	ntd->td_flags \|= TDF_PREEMPT_DONE;
	576
	577	/*
	578	* XXX. The interrupt may have woken a thread up, we need to properly
	579	* set the reschedule flag if the originally interrupted thread is at
	580	* a lower priority.
	581	*/
	582	if (gd->gd_runqmask > (2 << (ntd->td_pri & TDPRI_MASK)) - 1)
	583	need_lwkt_resched();
	584	/* YYY release mp lock on switchback if original doesn't need it */
	585	} else {
	586	/*
	587	* Priority queue / round-robin at each priority. Note that user
	588	* processes run at a fixed, low priority and the user process
	589	* scheduler deals with interactions between user processes
	590	* by scheduling and descheduling them from the LWKT queue as
	591	* necessary.
	592	*
	593	* We have to adjust the MP lock for the target thread. If we
	594	* need the MP lock and cannot obtain it we try to locate a
	595	* thread that does not need the MP lock. If we cannot, we spin
	596	* instead of HLT.
	597	*
	598	* A similar issue exists for the tokens held by the target thread.
	599	* If we cannot obtain ownership of the tokens we cannot immediately
	600	* schedule the thread.
	601	*/
	602
	603	/*
	604	* If an LWKT reschedule was requested, well that is what we are
	605	* doing now so clear it.
	606	*/
	607	clear_lwkt_resched();
	608	again:
	609	if (gd->gd_runqmask) {
	610	int nq = bsrl(gd->gd_runqmask);
	611	if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) {
	612	gd->gd_runqmask &= ~(1 << nq);
	613	goto again;
	614	}
	615	#ifdef SMP
	616	/*
	617	* THREAD SELECTION FOR AN SMP MACHINE BUILD
	618	*
	619	* If the target needs the MP lock and we couldn't get it,
	620	* or if the target is holding tokens and we could not
	621	* gain ownership of the tokens, continue looking for a
	622	* thread to schedule and spin instead of HLT if we can't.
	623	*
	624	* NOTE: the mpheld variable invalid after this conditional, it
	625	* can change due to both cpu_try_mplock() returning success
	626	* AND interactions in lwkt_getalltokens() due to the fact that
	627	* we are trying to check the mpcount of a thread other then
	628	* the current thread. Because of this, if the current thread
	629	* is not holding td_mpcount, an IPI indirectly run via
	630	* lwkt_getalltokens() can obtain and release the MP lock and
	631	* cause the core MP lock to be released.
	632	*/
	633	if ((ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) \|\|
	634	(ntd->td_toks && lwkt_getalltokens(ntd) == 0)
	635	) {
	636	u_int32_t rqmask = gd->gd_runqmask;
	637
	638	mpheld = MP_LOCK_HELD();
	639	ntd = NULL;
	640	while (rqmask) {
	641	TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) {
	642	if (ntd->td_mpcount && !mpheld && !cpu_try_mplock()) {
	643	/* spinning due to MP lock being held */
	644	#ifdef INVARIANTS
	645	++mplock_contention_count;
	646	#endif
	647	/* mplock still not held, 'mpheld' still valid */
	648	continue;
	649	}
	650
	651	/*
	652	* mpheld state invalid after getalltokens call returns
	653	* failure, but the variable is only needed for
	654	* the loop.
	655	*/
	656	if (ntd->td_toks && !lwkt_getalltokens(ntd)) {
	657	/* spinning due to token contention */
	658	#ifdef INVARIANTS
	659	++token_contention_count;
	660	#endif
	661	mpheld = MP_LOCK_HELD();
	662	continue;
	663	}
	664	break;
	665	}
	666	if (ntd)
	667	break;
	668	rqmask &= ~(1 << nq);
	669	nq = bsrl(rqmask);
	670	}
	671	if (ntd == NULL) {
	672	ntd = &gd->gd_idlethread;
	673	ntd->td_flags \|= TDF_IDLE_NOHLT;
	674	goto using_idle_thread;
	675	} else {
	676	++gd->gd_cnt.v_swtch;
	677	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	678	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	679	}
	680	} else {
	681	++gd->gd_cnt.v_swtch;
	682	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	683	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	684	}
	685	#else
	686	/*
	687	* THREAD SELECTION FOR A UP MACHINE BUILD. We don't have to
	688	* worry about tokens or the BGL.
	689	*/
	690	++gd->gd_cnt.v_swtch;
	691	TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
	692	TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
	693	#endif
	694	} else {
	695	/*
	696	* We have nothing to run but only let the idle loop halt
	697	* the cpu if there are no pending interrupts.
	698	*/
	699	ntd = &gd->gd_idlethread;
	700	if (gd->gd_reqflags & RQF_IDLECHECK_MASK)
	701	ntd->td_flags \|= TDF_IDLE_NOHLT;
	702	#ifdef SMP
	703	using_idle_thread:
	704	/*
	705	* The idle thread should not be holding the MP lock unless we
	706	* are trapping in the kernel or in a panic. Since we select the
	707	* idle thread unconditionally when no other thread is available,
	708	* if the MP lock is desired during a panic or kernel trap, we
	709	* have to loop in the scheduler until we get it.
	710	*/
	711	if (ntd->td_mpcount) {
	712	mpheld = MP_LOCK_HELD();
	713	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	714	panic("Idle thread %p was holding the BGL!", ntd);
	715	else if (mpheld == 0)
	716	goto again;
	717	}
	718	#endif
	719	}
	720	}
	721	KASSERT(ntd->td_pri >= TDPRI_CRIT,
	722	("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
	723
	724	/*
	725	* Do the actual switch. If the new target does not need the MP lock
	726	* and we are holding it, release the MP lock. If the new target requires
	727	* the MP lock we have already acquired it for the target.
	728	*/
	729	#ifdef SMP
	730	if (ntd->td_mpcount == 0 ) {
	731	if (MP_LOCK_HELD())
	732	cpu_rel_mplock();
	733	} else {
	734	ASSERT_MP_LOCK_HELD(ntd);
	735	}
	736	#endif
	737	if (td != ntd) {
	738	++switch_count;
	739	td->td_switch(ntd);
	740	}
	741	/* NOTE: current cpu may have changed after switch */
	742	crit_exit_quick(td);
	743	}
	744
	745	/*
	746	* Request that the target thread preempt the current thread. Preemption
	747	* only works under a specific set of conditions:
	748	*
	749	* - We are not preempting ourselves
	750	* - The target thread is owned by the current cpu
	751	* - We are not currently being preempted
	752	* - The target is not currently being preempted
	753	* - We are able to satisfy the target's MP lock requirements (if any).
	754	*
	755	* THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
	756	* this is called via lwkt_schedule() through the td_preemptable callback.
	757	* critpri is the managed critical priority that we should ignore in order
	758	* to determine whether preemption is possible (aka usually just the crit
	759	* priority of lwkt_schedule() itself).
	760	*
	761	* XXX at the moment we run the target thread in a critical section during
	762	* the preemption in order to prevent the target from taking interrupts
	763	* that WE can't. Preemption is strictly limited to interrupt threads
	764	* and interrupt-like threads, outside of a critical section, and the
	765	* preempted source thread will be resumed the instant the target blocks
	766	* whether or not the source is scheduled (i.e. preemption is supposed to
	767	* be as transparent as possible).
	768	*
	769	* The target thread inherits our MP count (added to its own) for the
	770	* duration of the preemption in order to preserve the atomicy of the
	771	* MP lock during the preemption. Therefore, any preempting targets must be
	772	* careful in regards to MP assertions. Note that the MP count may be
	773	* out of sync with the physical mp_lock, but we do not have to preserve
	774	* the original ownership of the lock if it was out of synch (that is, we
	775	* can leave it synchronized on return).
	776	*/
	777	void
	778	lwkt_preempt(thread_t ntd, int critpri)
	779	{
	780	struct globaldata *gd = mycpu;
	781	thread_t td;
	782	#ifdef SMP
	783	int mpheld;
	784	int savecnt;
	785	#endif
	786
	787	/*
	788	* The caller has put us in a critical section. We can only preempt
	789	* if the caller of the caller was not in a critical section (basically
	790	* a local interrupt), as determined by the 'critpri' parameter. We
	791	* also acn't preempt if the caller is holding any spinlocks (even if
	792	* he isn't in a critical section). This also handles the tokens test.
	793	*
	794	* YYY The target thread must be in a critical section (else it must
	795	* inherit our critical section? I dunno yet).
	796	*
	797	* Set need_lwkt_resched() unconditionally for now YYY.
	798	*/
	799	KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri));
	800
	801	td = gd->gd_curthread;
	802	if ((ntd->td_pri & TDPRI_MASK) <= (td->td_pri & TDPRI_MASK)) {
	803	++preempt_miss;
	804	return;
	805	}
	806	if ((td->td_pri & ~TDPRI_MASK) > critpri) {
	807	++preempt_miss;
	808	need_lwkt_resched();
	809	return;
	810	}
	811	#ifdef SMP
	812	if (ntd->td_gd != gd) {
	813	++preempt_miss;
	814	need_lwkt_resched();
	815	return;
	816	}
	817	#endif
	818	/*
	819	* Take the easy way out and do not preempt if the target is holding
	820	* any spinlocks. We could test whether the thread(s) being
	821	* preempted interlock against the target thread's tokens and whether
	822	* we can get all the target thread's tokens, but this situation
	823	* should not occur very often so its easier to simply not preempt.
	824	* Also, plain spinlocks are impossible to figure out at this point so
	825	* just don't preempt.
	826	*/
	827	if (gd->gd_spinlock_rd \|\| gd->gd_spinlocks_wr) {
	828	++preempt_miss;
	829	need_lwkt_resched();
	830	return;
	831	}
	832	if (td == ntd \|\| ((td->td_flags \| ntd->td_flags) & TDF_PREEMPT_LOCK)) {
	833	++preempt_weird;
	834	need_lwkt_resched();
	835	return;
	836	}
	837	if (ntd->td_preempted) {
	838	++preempt_hit;
	839	need_lwkt_resched();
	840	return;
	841	}
	842	#ifdef SMP
	843	/*
	844	* note: an interrupt might have occured just as we were transitioning
	845	* to or from the MP lock. In this case td_mpcount will be pre-disposed
	846	* (non-zero) but not actually synchronized with the actual state of the
	847	* lock. We can use it to imply an MP lock requirement for the
	848	* preemption but we cannot use it to test whether we hold the MP lock
	849	* or not.
	850	*/
	851	savecnt = td->td_mpcount;
	852	mpheld = MP_LOCK_HELD();
	853	ntd->td_mpcount += td->td_mpcount;
	854	if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) {
	855	ntd->td_mpcount -= td->td_mpcount;
	856	++preempt_miss;
	857	need_lwkt_resched();
	858	return;
	859	}
	860	#endif
	861
	862	/*
	863	* Since we are able to preempt the current thread, there is no need to
	864	* call need_lwkt_resched().
	865	*/
	866	++preempt_hit;
	867	ntd->td_preempted = td;
	868	td->td_flags \|= TDF_PREEMPT_LOCK;
	869	td->td_switch(ntd);
	870	KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
	871	#ifdef SMP
	872	KKASSERT(savecnt == td->td_mpcount);
	873	mpheld = MP_LOCK_HELD();
	874	if (mpheld && td->td_mpcount == 0)
	875	cpu_rel_mplock();
	876	else if (mpheld == 0 && td->td_mpcount)
	877	panic("lwkt_preempt(): MP lock was not held through");
	878	#endif
	879	ntd->td_preempted = NULL;
	880	td->td_flags &= ~(TDF_PREEMPT_LOCK\|TDF_PREEMPT_DONE);
	881	}
	882
	883	/*
	884	* Yield our thread while higher priority threads are pending. This is
	885	* typically called when we leave a critical section but it can be safely
	886	* called while we are in a critical section.
	887	*
	888	* This function will not generally yield to equal priority threads but it
	889	* can occur as a side effect. Note that lwkt_switch() is called from
	890	* inside the critical section to prevent its own crit_exit() from reentering
	891	* lwkt_yield_quick().
	892	*
	893	* gd_reqflags indicates that something changed, e.g. an interrupt or softint
	894	* came along but was blocked and made pending.
	895	*
	896	* (self contained on a per cpu basis)
	897	*/
	898	void
	899	lwkt_yield_quick(void)
	900	{
	901	globaldata_t gd = mycpu;
	902	thread_t td = gd->gd_curthread;
	903
	904	/*
	905	* gd_reqflags is cleared in splz if the cpl is 0. If we were to clear
	906	* it with a non-zero cpl then we might not wind up calling splz after
	907	* a task switch when the critical section is exited even though the
	908	* new task could accept the interrupt.
	909	*
	910	* XXX from crit_exit() only called after last crit section is released.
	911	* If called directly will run splz() even if in a critical section.
	912	*
	913	* td_nest_count prevent deep nesting via splz() or doreti(). Note that
	914	* except for this special case, we MUST call splz() here to handle any
	915	* pending ints, particularly after we switch, or we might accidently
	916	* halt the cpu with interrupts pending.
	917	*/
	918	if (gd->gd_reqflags && td->td_nest_count < 2)
	919	splz();
	920
	921	/*
	922	* YYY enabling will cause wakeup() to task-switch, which really
	923	* confused the old 4.x code. This is a good way to simulate
	924	* preemption and MP without actually doing preemption or MP, because a
	925	* lot of code assumes that wakeup() does not block.
	926	*/
	927	if (untimely_switch && td->td_nest_count == 0 &&
	928	gd->gd_intr_nesting_level == 0
	929	) {
	930	crit_enter_quick(td);
	931	/*
	932	* YYY temporary hacks until we disassociate the userland scheduler
	933	* from the LWKT scheduler.
	934	*/
	935	if (td->td_flags & TDF_RUNQ) {
	936	lwkt_switch(); /* will not reenter yield function */
	937	} else {
	938	lwkt_schedule_self(td); /* make sure we are scheduled */
	939	lwkt_switch(); /* will not reenter yield function */
	940	lwkt_deschedule_self(td); /* make sure we are descheduled */
	941	}
	942	crit_exit_noyield(td);
	943	}
	944	}
	945
	946	/*
	947	* This implements a normal yield which, unlike _quick, will yield to equal
	948	* priority threads as well. Note that gd_reqflags tests will be handled by
	949	* the crit_exit() call in lwkt_switch().
	950	*
	951	* (self contained on a per cpu basis)
	952	*/
	953	void
	954	lwkt_yield(void)
	955	{
	956	lwkt_schedule_self(curthread);
	957	lwkt_switch();
	958	}
	959
	960	/*
	961	* Generic schedule. Possibly schedule threads belonging to other cpus and
	962	* deal with threads that might be blocked on a wait queue.
	963	*
	964	* We have a little helper inline function which does additional work after
	965	* the thread has been enqueued, including dealing with preemption and
	966	* setting need_lwkt_resched() (which prevents the kernel from returning
	967	* to userland until it has processed higher priority threads).
	968	*
	969	* It is possible for this routine to be called after a failed _enqueue
	970	* (due to the target thread migrating, sleeping, or otherwise blocked).
	971	* We have to check that the thread is actually on the run queue!
	972	*/
	973	static __inline
	974	void
	975	_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int cpri)
	976	{
	977	if (ntd->td_flags & TDF_RUNQ) {
	978	if (ntd->td_preemptable) {
	979	ntd->td_preemptable(ntd, cpri); /* YYY +token */
	980	} else if ((ntd->td_flags & TDF_NORESCHED) == 0 &&
	981	(ntd->td_pri & TDPRI_MASK) > (gd->gd_curthread->td_pri & TDPRI_MASK)
	982	) {
	983	need_lwkt_resched();
	984	}
	985	}
	986	}
	987
	988	void
	989	lwkt_schedule(thread_t td)
	990	{
	991	globaldata_t mygd = mycpu;
	992
	993	KASSERT(td != &td->td_gd->gd_idlethread, ("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
	994	crit_enter_gd(mygd);
	995	KKASSERT(td->td_proc == NULL \|\| (td->td_proc->p_flag & P_ONRUNQ) == 0);
	996	if (td == mygd->gd_curthread) {
	997	_lwkt_enqueue(td);
	998	} else {
	999	/*
	1000	* If we own the thread, there is no race (since we are in a
	1001	* critical section). If we do not own the thread there might
	1002	* be a race but the target cpu will deal with it.
	1003	*/
	1004	#ifdef SMP
	1005	if (td->td_gd == mygd) {
	1006	_lwkt_enqueue(td);
	1007	_lwkt_schedule_post(mygd, td, TDPRI_CRIT);
	1008	} else {
	1009	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_schedule, td);
	1010	}
	1011	#else
	1012	_lwkt_enqueue(td);
	1013	_lwkt_schedule_post(mygd, td, TDPRI_CRIT);
	1014	#endif
	1015	}
	1016	crit_exit_gd(mygd);
	1017	}
	1018
	1019	#ifdef SMP
	1020
	1021	/*
	1022	* Thread migration using a 'Pull' method. The thread may or may not be
	1023	* the current thread. It MUST be descheduled and in a stable state.
	1024	* lwkt_giveaway() must be called on the cpu owning the thread.
	1025	*
	1026	* At any point after lwkt_giveaway() is called, the target cpu may
	1027	* 'pull' the thread by calling lwkt_acquire().
	1028	*
	1029	* MPSAFE - must be called under very specific conditions.
	1030	*/
	1031	void
	1032	lwkt_giveaway(thread_t td)
	1033	{
	1034	globaldata_t gd = mycpu;
	1035
	1036	crit_enter_gd(gd);
	1037	KKASSERT(td->td_gd == gd);
	1038	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	1039	td->td_flags \|= TDF_MIGRATING;
	1040	crit_exit_gd(gd);
	1041	}
	1042
	1043	void
	1044	lwkt_acquire(thread_t td)
	1045	{
	1046	globaldata_t gd;
	1047	globaldata_t mygd;
	1048
	1049	KKASSERT(td->td_flags & TDF_MIGRATING);
	1050	gd = td->td_gd;
	1051	mygd = mycpu;
	1052	if (gd != mycpu) {
	1053	cpu_lfence();
	1054	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1055	crit_enter_gd(mygd);
	1056	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK))
	1057	cpu_lfence();
	1058	td->td_gd = mygd;
	1059	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1060	td->td_flags &= ~TDF_MIGRATING;
	1061	crit_exit_gd(mygd);
	1062	} else {
	1063	crit_enter_gd(mygd);
	1064	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1065	td->td_flags &= ~TDF_MIGRATING;
	1066	crit_exit_gd(mygd);
	1067	}
	1068	}
	1069
	1070	#endif
	1071
	1072	/*
	1073	* Generic deschedule. Descheduling threads other than your own should be
	1074	* done only in carefully controlled circumstances. Descheduling is
	1075	* asynchronous.
	1076	*
	1077	* This function may block if the cpu has run out of messages.
	1078	*/
	1079	void
	1080	lwkt_deschedule(thread_t td)
	1081	{
	1082	crit_enter();
	1083	#ifdef SMP
	1084	if (td == curthread) {
	1085	_lwkt_dequeue(td);
	1086	} else {
	1087	if (td->td_gd == mycpu) {
	1088	_lwkt_dequeue(td);
	1089	} else {
	1090	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
	1091	}
	1092	}
	1093	#else
	1094	_lwkt_dequeue(td);
	1095	#endif
	1096	crit_exit();
	1097	}
	1098
	1099	/*
	1100	* Set the target thread's priority. This routine does not automatically
	1101	* switch to a higher priority thread, LWKT threads are not designed for
	1102	* continuous priority changes. Yield if you want to switch.
	1103	*
	1104	* We have to retain the critical section count which uses the high bits
	1105	* of the td_pri field. The specified priority may also indicate zero or
	1106	* more critical sections by adding TDPRI_CRIT*N.
	1107	*
	1108	* Note that we requeue the thread whether it winds up on a different runq
	1109	* or not. uio_yield() depends on this and the routine is not normally
	1110	* called with the same priority otherwise.
	1111	*/
	1112	void
	1113	lwkt_setpri(thread_t td, int pri)
	1114	{
	1115	KKASSERT(pri >= 0);
	1116	KKASSERT(td->td_gd == mycpu);
	1117	crit_enter();
	1118	if (td->td_flags & TDF_RUNQ) {
	1119	_lwkt_dequeue(td);
	1120	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1121	_lwkt_enqueue(td);
	1122	} else {
	1123	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1124	}
	1125	crit_exit();
	1126	}
	1127
	1128	void
	1129	lwkt_setpri_self(int pri)
	1130	{
	1131	thread_t td = curthread;
	1132
	1133	KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
	1134	crit_enter();
	1135	if (td->td_flags & TDF_RUNQ) {
	1136	_lwkt_dequeue(td);
	1137	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1138	_lwkt_enqueue(td);
	1139	} else {
	1140	td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
	1141	}
	1142	crit_exit();
	1143	}
	1144
	1145	/*
	1146	* Determine if there is a runnable thread at a higher priority than
	1147	* the current thread. lwkt_setpri() does not check this automatically.
	1148	* Return 1 if there is, 0 if there isn't.
	1149	*
	1150	* Example: if bit 31 of runqmask is set and the current thread is priority
	1151	* 30, then we wind up checking the mask: 0x80000000 against 0x7fffffff.
	1152	*
	1153	* If nq reaches 31 the shift operation will overflow to 0 and we will wind
	1154	* up comparing against 0xffffffff, a comparison that will always be false.
	1155	*/
	1156	int
	1157	lwkt_checkpri_self(void)
	1158	{
	1159	globaldata_t gd = mycpu;
	1160	thread_t td = gd->gd_curthread;
	1161	int nq = td->td_pri & TDPRI_MASK;
	1162
	1163	while (gd->gd_runqmask > (__uint32_t)(2 << nq) - 1) {
	1164	if (TAILQ_FIRST(&gd->gd_tdrunq[nq + 1]))
	1165	return(1);
	1166	++nq;
	1167	}
	1168	return(0);
	1169	}
	1170
	1171	/*
	1172	* Migrate the current thread to the specified cpu.
	1173	*
	1174	* This is accomplished by descheduling ourselves from the current cpu,
	1175	* moving our thread to the tdallq of the target cpu, IPI messaging the
	1176	* target cpu, and switching out. TDF_MIGRATING prevents scheduling
	1177	* races while the thread is being migrated.
	1178	*/
	1179	#ifdef SMP
	1180	static void lwkt_setcpu_remote(void *arg);
	1181	#endif
	1182
	1183	void
	1184	lwkt_setcpu_self(globaldata_t rgd)
	1185	{
	1186	#ifdef SMP
	1187	thread_t td = curthread;
	1188
	1189	if (td->td_gd != rgd) {
	1190	crit_enter_quick(td);
	1191	td->td_flags \|= TDF_MIGRATING;
	1192	lwkt_deschedule_self(td);
	1193	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1194	lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td);
	1195	lwkt_switch();
	1196	/* we are now on the target cpu */
	1197	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
	1198	crit_exit_quick(td);
	1199	}
	1200	#endif
	1201	}
	1202
	/*
	 * Migrate the current thread to the cpu identified by cpuid.  The
	 * globaldata for cpuid is looked up and passed to lwkt_setcpu_self().
	 * The body is compiled only for SMP kernels.
	 */
	void
	lwkt_migratecpu(int cpuid)
	{
	#ifdef SMP
	    lwkt_setcpu_self(globaldata_find(cpuid));
	#endif
	}
	1213
	1214	/*
	1215	* Remote IPI for cpu migration (called while in a critical section so we
	1216	* do not have to enter another one). The thread has already been moved to
	1217	* our cpu's allq, but we must wait for the thread to be completely switched
	1218	* out on the originating cpu before we schedule it on ours or the stack
	1219	* state may be corrupt. We clear TDF_MIGRATING after flushing the GD
	1220	* change to main memory.
	1221	*
	1222	* XXX The use of TDF_MIGRATING might not be sufficient to avoid races
	1223	* against wakeups. It is best if this interface is used only when there
	1224	* are no pending events that might try to schedule the thread.
	1225	*/
	1226	#ifdef SMP
	1227	static void
	1228	lwkt_setcpu_remote(void *arg)
	1229	{
	1230	thread_t td = arg;
	1231	globaldata_t gd = mycpu;
	1232
	1233	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK))
	1234	cpu_lfence();
	1235	td->td_gd = gd;
	1236	cpu_sfence();
	1237	td->td_flags &= ~TDF_MIGRATING;
	1238	KKASSERT(td->td_proc == NULL \|\| (td->td_proc->p_flag & P_ONRUNQ) == 0);
	1239	_lwkt_enqueue(td);
	1240	}
	1241	#endif
	1242
	1243	struct lwp *
	1244	lwkt_preempted_proc(void)
	1245	{
	1246	thread_t td = curthread;
	1247	while (td->td_preempted)
	1248	td = td->td_preempted;
	1249	return(td->td_lwp);
	1250	}
	1251
	1252	/*
	1253	* Create a kernel process/thread/whatever. It shares its address space
	1254	* with proc0 - ie: kernel only.
	1255	*
	1256	* NOTE! By default new threads are created with the MP lock held. A
	1257	* thread which does not require the MP lock should release it by calling
	1258	* rel_mplock() at the start of the new thread.
	1259	*/
	1260	int
	1261	lwkt_create(void (func)(void ), void *arg,
	1262	struct thread **tdp, thread_t template, int tdflags, int cpu,
	1263	const char *fmt, ...)
	1264	{
	1265	thread_t td;
	1266	__va_list ap;
	1267
	1268	td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
	1269	tdflags \| TDF_VERBOSE);
	1270	if (tdp)
	1271	*tdp = td;
	1272	cpu_set_thread_handler(td, lwkt_exit, func, arg);
	1273
	1274	/*
	1275	* Set up arg0 for 'ps' etc
	1276	*/
	1277	__va_start(ap, fmt);
	1278	kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
	1279	__va_end(ap);
	1280
	1281	/*
	1282	* Schedule the thread to run
	1283	*/
	1284	if ((td->td_flags & TDF_STOPREQ) == 0)
	1285	lwkt_schedule(td);
	1286	else
	1287	td->td_flags &= ~TDF_STOPREQ;
	1288	return 0;
	1289	}
	1290
	1291	/*
	1292	* kthread_* is specific to the kernel and is not needed by userland.
	1293	*/
	1294	#ifdef _KERNEL
	1295
	1296	/*
	1297	* Destroy an LWKT thread. Warning! This function is not called when
	1298	* a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
	1299	* uses a different reaping mechanism.
	1300	*/
	1301	void
	1302	lwkt_exit(void)
	1303	{
	1304	thread_t td = curthread;
	1305	globaldata_t gd;
	1306
	1307	if (td->td_flags & TDF_VERBOSE)
	1308	kprintf("kthread %p %s has exited\n", td, td->td_comm);
	1309	caps_exit(td);
	1310	crit_enter_quick(td);
	1311	lwkt_deschedule_self(td);
	1312	gd = mycpu;
	1313	lwkt_remove_tdallq(td);
	1314	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	1315	++gd->gd_tdfreecount;
	1316	TAILQ_INSERT_TAIL(&gd->gd_tdfreeq, td, td_threadq);
	1317	}
	1318	cpu_thread_exit();
	1319	}
	1320
	1321	void
	1322	lwkt_remove_tdallq(thread_t td)
	1323	{
	1324	KKASSERT(td->td_gd == mycpu);
	1325	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1326	}
	1327
	1328	#endif /* _KERNEL */
	1329
	1330	void
	1331	crit_panic(void)
	1332	{
	1333	thread_t td = curthread;
	1334	int lpri = td->td_pri;
	1335
	1336	td->td_pri = 0;
	1337	panic("td_pri is/would-go negative! %p %d", td, lpri);
	1338	}
	1339
	1340	#ifdef SMP
	1341
	1342	/*
	1343	* Called from debugger/panic on cpus which have been stopped. We must still
	1344	* process the IPIQ while stopped, even if we were stopped while in a critical
	1345	* section (XXX).
	1346	*
	1347	* If we are dumping also try to process any pending interrupts. This may
	1348	* or may not work depending on the state of the cpu at the point it was
	1349	* stopped.
	1350	*/
	1351	void
	1352	lwkt_smp_stopped(void)
	1353	{
	1354	globaldata_t gd = mycpu;
	1355
	1356	crit_enter_gd(gd);
	1357	if (dumping) {
	1358	lwkt_process_ipiq();
	1359	splz();
	1360	} else {
	1361	lwkt_process_ipiq();
	1362	}
	1363	crit_exit_gd(gd);
	1364	}
	1365
	1366	/*
	1367	* get_mplock() calls this routine if it is unable to obtain the MP lock.
	1368	* get_mplock() has already incremented td_mpcount. We must block and
	1369	* not return until giant is held.
	1370	*
	1371	* All we have to do is lwkt_switch() away. The LWKT scheduler will not
	1372	* reschedule the thread until it can obtain the giant lock for it.
	1373	*/
	void
	lwkt_mp_lock_contested(void)
	{
	    /* The loggiant() calls are compiled only for kernel builds. */
	#ifdef _KERNEL
	    loggiant(beg);
	#endif
	    /* Switch away; see the comment above for when we are resumed. */
	    lwkt_switch();
	#ifdef _KERNEL
	    loggiant(end);
	#endif
	}
	1385
	1386	#endif