gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003-2011 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*/
	34
	35	/*
	36	* Each cpu in a system has its own self-contained light weight kernel
	37	* thread scheduler, which means that generally speaking we only need
	38	* to use a critical section to avoid problems. Foreign thread
	39	* scheduling is queued via (async) IPIs.
	40	*/
	41
	42	#include <sys/param.h>
	43	#include <sys/systm.h>
	44	#include <sys/kernel.h>
	45	#include <sys/proc.h>
	46	#include <sys/rtprio.h>
	47	#include <sys/kinfo.h>
	48	#include <sys/queue.h>
	49	#include <sys/sysctl.h>
	50	#include <sys/kthread.h>
	51	#include <machine/cpu.h>
	52	#include <sys/lock.h>
	53	#include <sys/caps.h>
	54	#include <sys/spinlock.h>
	55	#include <sys/ktr.h>
	56
	57	#include <sys/thread2.h>
	58	#include <sys/spinlock2.h>
	59	#include <sys/mplock2.h>
	60
	61	#include <sys/dsched.h>
	62
	63	#include <vm/vm.h>
	64	#include <vm/vm_param.h>
	65	#include <vm/vm_kern.h>
	66	#include <vm/vm_object.h>
	67	#include <vm/vm_page.h>
	68	#include <vm/vm_map.h>
	69	#include <vm/vm_pager.h>
	70	#include <vm/vm_extern.h>
	71
	72	#include <machine/stdarg.h>
	73	#include <machine/smp.h>
	74
	75	#if !defined(KTR_CTXSW)
	76	#define KTR_CTXSW KTR_ALL
	77	#endif
	78	KTR_INFO_MASTER(ctxsw);
	79	KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p", int cpu, struct thread *td);
	80	KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p", int cpu, struct thread *td);
	81	KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s", struct thread td, char comm);
	82	KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>", struct thread *td);
	83
	84	static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");
	85
	86	#ifdef INVARIANTS
	87	static int panic_on_cscount = 0;
	88	#endif
	89	static __int64_t switch_count = 0;
	90	static __int64_t preempt_hit = 0;
	91	static __int64_t preempt_miss = 0;
	92	static __int64_t preempt_weird = 0;
	93	static __int64_t token_contention_count[TDPRI_MAX+1] __debugvar;
	94	static int lwkt_use_spin_port;
	95	static struct objcache *thread_cache;
	96
	97	#ifdef SMP
	98	static void lwkt_schedule_remote(void arg, int arg2, struct intrframe frame);
	99	static void lwkt_setcpu_remote(void *arg);
	100	#endif
	101
	102	extern void cpu_heavy_restore(void);
	103	extern void cpu_lwkt_restore(void);
	104	extern void cpu_kthread_restore(void);
	105	extern void cpu_idle_restore(void);
	106
	107	/*
	108	* We can make all thread ports use the spin backend instead of the thread
	109	* backend. This should only be set to debug the spin backend.
	110	*/
	111	TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);
	112
	113	#ifdef INVARIANTS
	114	SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0,
	115	"Panic if attempting to switch lwkt's while mastering cpusync");
	116	#endif
	117	SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0,
	118	"Number of switched threads");
	119	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0,
	120	"Successful preemption events");
	121	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0,
	122	"Failed preemption events");
	123	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0,
	124	"Number of preempted threads.");
	125	#ifdef INVARIANTS
	126	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_00, CTLFLAG_RW,
	127	&token_contention_count[0], 0, "spinning due to token contention");
	128	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_01, CTLFLAG_RW,
	129	&token_contention_count[1], 0, "spinning due to token contention");
	130	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_02, CTLFLAG_RW,
	131	&token_contention_count[2], 0, "spinning due to token contention");
	132	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_03, CTLFLAG_RW,
	133	&token_contention_count[3], 0, "spinning due to token contention");
	134	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_04, CTLFLAG_RW,
	135	&token_contention_count[4], 0, "spinning due to token contention");
	136	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_05, CTLFLAG_RW,
	137	&token_contention_count[5], 0, "spinning due to token contention");
	138	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_06, CTLFLAG_RW,
	139	&token_contention_count[6], 0, "spinning due to token contention");
	140	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_07, CTLFLAG_RW,
	141	&token_contention_count[7], 0, "spinning due to token contention");
	142	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_08, CTLFLAG_RW,
	143	&token_contention_count[8], 0, "spinning due to token contention");
	144	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_09, CTLFLAG_RW,
	145	&token_contention_count[9], 0, "spinning due to token contention");
	146	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_10, CTLFLAG_RW,
	147	&token_contention_count[10], 0, "spinning due to token contention");
	148	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_11, CTLFLAG_RW,
	149	&token_contention_count[11], 0, "spinning due to token contention");
	150	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_12, CTLFLAG_RW,
	151	&token_contention_count[12], 0, "spinning due to token contention");
	152	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_13, CTLFLAG_RW,
	153	&token_contention_count[13], 0, "spinning due to token contention");
	154	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_14, CTLFLAG_RW,
	155	&token_contention_count[14], 0, "spinning due to token contention");
	156	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_15, CTLFLAG_RW,
	157	&token_contention_count[15], 0, "spinning due to token contention");
	158	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_16, CTLFLAG_RW,
	159	&token_contention_count[16], 0, "spinning due to token contention");
	160	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_17, CTLFLAG_RW,
	161	&token_contention_count[17], 0, "spinning due to token contention");
	162	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_18, CTLFLAG_RW,
	163	&token_contention_count[18], 0, "spinning due to token contention");
	164	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_19, CTLFLAG_RW,
	165	&token_contention_count[19], 0, "spinning due to token contention");
	166	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_20, CTLFLAG_RW,
	167	&token_contention_count[20], 0, "spinning due to token contention");
	168	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_21, CTLFLAG_RW,
	169	&token_contention_count[21], 0, "spinning due to token contention");
	170	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_22, CTLFLAG_RW,
	171	&token_contention_count[22], 0, "spinning due to token contention");
	172	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_23, CTLFLAG_RW,
	173	&token_contention_count[23], 0, "spinning due to token contention");
	174	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_24, CTLFLAG_RW,
	175	&token_contention_count[24], 0, "spinning due to token contention");
	176	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_25, CTLFLAG_RW,
	177	&token_contention_count[25], 0, "spinning due to token contention");
	178	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_26, CTLFLAG_RW,
	179	&token_contention_count[26], 0, "spinning due to token contention");
	180	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_27, CTLFLAG_RW,
	181	&token_contention_count[27], 0, "spinning due to token contention");
	182	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_28, CTLFLAG_RW,
	183	&token_contention_count[28], 0, "spinning due to token contention");
	184	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_29, CTLFLAG_RW,
	185	&token_contention_count[29], 0, "spinning due to token contention");
	186	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_30, CTLFLAG_RW,
	187	&token_contention_count[30], 0, "spinning due to token contention");
	188	SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count_31, CTLFLAG_RW,
	189	&token_contention_count[31], 0, "spinning due to token contention");
	190	#endif
	191	static int fairq_enable = 0;
	192	SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW,
	193	&fairq_enable, 0, "Turn on fairq priority accumulators");
	194	static int fairq_bypass = -1;
	195	SYSCTL_INT(_lwkt, OID_AUTO, fairq_bypass, CTLFLAG_RW,
	196	&fairq_bypass, 0, "Allow fairq to bypass td on token failure");
	197	extern int lwkt_sched_debug;
	198	int lwkt_sched_debug = 0;
	199	SYSCTL_INT(_lwkt, OID_AUTO, sched_debug, CTLFLAG_RW,
	200	&lwkt_sched_debug, 0, "Scheduler debug");
	201	static int lwkt_spin_loops = 10;
	202	SYSCTL_INT(_lwkt, OID_AUTO, spin_loops, CTLFLAG_RW,
	203	&lwkt_spin_loops, 0, "Scheduler spin loops until sorted decon");
	204	static int lwkt_spin_reseq = 0;
	205	SYSCTL_INT(_lwkt, OID_AUTO, spin_reseq, CTLFLAG_RW,
	206	&lwkt_spin_reseq, 0, "Scheduler resequencer enable");
	207	static int lwkt_spin_monitor = 0;
	208	SYSCTL_INT(_lwkt, OID_AUTO, spin_monitor, CTLFLAG_RW,
	209	&lwkt_spin_monitor, 0, "Scheduler uses monitor/mwait");
	210	static int lwkt_spin_fatal = 0; /* disabled */
	211	SYSCTL_INT(_lwkt, OID_AUTO, spin_fatal, CTLFLAG_RW,
	212	&lwkt_spin_fatal, 0, "LWKT scheduler spin loops till fatal panic");
	213	static int preempt_enable = 1;
	214	SYSCTL_INT(_lwkt, OID_AUTO, preempt_enable, CTLFLAG_RW,
	215	&preempt_enable, 0, "Enable preemption");
	216	static int lwkt_cache_threads = 0;
	217	SYSCTL_INT(_lwkt, OID_AUTO, cache_threads, CTLFLAG_RD,
	218	&lwkt_cache_threads, 0, "thread+kstack cache");
	219
	220	static __cachealign int lwkt_cseq_rindex;
	221	static __cachealign int lwkt_cseq_windex;
	222
	223	/*
	224	* These helper procedures handle the runq, they can only be called from
	225	* within a critical section.
	226	*
	227	* WARNING! Prior to SMP being brought up it is possible to enqueue and
	228	* dequeue threads belonging to other cpus, so be sure to use td->td_gd
	229	* instead of 'mycpu' when referencing the globaldata structure. Once
	230	* SMP live enqueuing and dequeueing only occurs on the current cpu.
	231	*/
	232	static __inline
	233	void
	234	_lwkt_dequeue(thread_t td)
	235	{
	236	if (td->td_flags & TDF_RUNQ) {
	237	struct globaldata *gd = td->td_gd;
	238
	239	td->td_flags &= ~TDF_RUNQ;
	240	TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
	241	if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL)
	242	atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING);
	243	}
	244	}
	245
	246	/*
	247	* Priority enqueue.
	248	*
	249	* NOTE: There are a limited number of lwkt threads runnable since user
	250	* processes only schedule one at a time per cpu.
	251	*/
	252	static __inline
	253	void
	254	_lwkt_enqueue(thread_t td)
	255	{
	256	thread_t xtd;
	257
	258	if ((td->td_flags & (TDF_RUNQ\|TDF_MIGRATING\|TDF_BLOCKQ)) == 0) {
	259	struct globaldata *gd = td->td_gd;
	260
	261	td->td_flags \|= TDF_RUNQ;
	262	xtd = TAILQ_FIRST(&gd->gd_tdrunq);
	263	if (xtd == NULL) {
	264	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	265	atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
	266	} else {
	267	while (xtd && xtd->td_pri >= td->td_pri)
	268	xtd = TAILQ_NEXT(xtd, td_threadq);
	269	if (xtd)
	270	TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
	271	else
	272	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	273	}
	274
	275	/*
	276	* Request a LWKT reschedule if we are now at the head of the queue.
	277	*/
	278	if (TAILQ_FIRST(&gd->gd_tdrunq) == td)
	279	need_lwkt_resched();
	280	}
	281	}
	282
	283	static __boolean_t
	284	_lwkt_thread_ctor(void obj, void privdata, int ocflags)
	285	{
	286	struct thread td = (struct thread )obj;
	287
	288	td->td_kstack = NULL;
	289	td->td_kstack_size = 0;
	290	td->td_flags = TDF_ALLOCATED_THREAD;
	291	td->td_mpflags = 0;
	292	return (1);
	293	}
	294
	295	static void
	296	_lwkt_thread_dtor(void obj, void privdata)
	297	{
	298	struct thread td = (struct thread )obj;
	299
	300	KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
	301	("_lwkt_thread_dtor: not allocated from objcache"));
	302	KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
	303	td->td_kstack_size > 0,
	304	("_lwkt_thread_dtor: corrupted stack"));
	305	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	306	td->td_kstack = NULL;
	307	td->td_flags = 0;
	308	}
	309
	310	/*
	311	* Initialize the lwkt s/system.
	312	*
	313	* Nominally cache up to 32 thread + kstack structures. Cache more on
	314	* systems with a lot of cpu cores.
	315	*/
	316	void
	317	lwkt_init(void)
	318	{
	319	TUNABLE_INT("lwkt.cache_threads", &lwkt_cache_threads);
	320	if (lwkt_cache_threads == 0) {
	321	lwkt_cache_threads = ncpus * 4;
	322	if (lwkt_cache_threads < 32)
	323	lwkt_cache_threads = 32;
	324	}
	325	thread_cache = objcache_create_mbacked(
	326	M_THREAD, sizeof(struct thread),
	327	NULL, lwkt_cache_threads,
	328	_lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
	329	}
	330
	331	/*
	332	* Schedule a thread to run. As the current thread we can always safely
	333	* schedule ourselves, and a shortcut procedure is provided for that
	334	* function.
	335	*
	336	* (non-blocking, self contained on a per cpu basis)
	337	*/
	338	void
	339	lwkt_schedule_self(thread_t td)
	340	{
	341	KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
	342	crit_enter_quick(td);
	343	KASSERT(td != &td->td_gd->gd_idlethread,
	344	("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
	345	KKASSERT(td->td_lwp == NULL \|\|
	346	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	347	_lwkt_enqueue(td);
	348	crit_exit_quick(td);
	349	}
	350
	351	/*
	352	* Deschedule a thread.
	353	*
	354	* (non-blocking, self contained on a per cpu basis)
	355	*/
	356	void
	357	lwkt_deschedule_self(thread_t td)
	358	{
	359	crit_enter_quick(td);
	360	_lwkt_dequeue(td);
	361	crit_exit_quick(td);
	362	}
	363
	364	/*
	365	* LWKTs operate on a per-cpu basis
	366	*
	367	* WARNING! Called from early boot, 'mycpu' may not work yet.
	368	*/
	369	void
	370	lwkt_gdinit(struct globaldata *gd)
	371	{
	372	TAILQ_INIT(&gd->gd_tdrunq);
	373	TAILQ_INIT(&gd->gd_tdallq);
	374	}
	375
	376	/*
	377	* Create a new thread. The thread must be associated with a process context
	378	* or LWKT start address before it can be scheduled. If the target cpu is
	379	* -1 the thread will be created on the current cpu.
	380	*
	381	* If you intend to create a thread without a process context this function
	382	* does everything except load the startup and switcher function.
	383	*/
	384	thread_t
	385	lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
	386	{
	387	static int cpu_rotator;
	388	globaldata_t gd = mycpu;
	389	void *stack;
	390
	391	/*
	392	* If static thread storage is not supplied allocate a thread. Reuse
	393	* a cached free thread if possible. gd_freetd is used to keep an exiting
	394	* thread intact through the exit.
	395	*/
	396	if (td == NULL) {
	397	crit_enter_gd(gd);
	398	if ((td = gd->gd_freetd) != NULL) {
	399	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK\|
	400	TDF_RUNQ)) == 0);
	401	gd->gd_freetd = NULL;
	402	} else {
	403	td = objcache_get(thread_cache, M_WAITOK);
	404	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK\|
	405	TDF_RUNQ)) == 0);
	406	}
	407	crit_exit_gd(gd);
	408	KASSERT((td->td_flags &
	409	(TDF_ALLOCATED_THREAD\|TDF_RUNNING\|TDF_PREEMPT_LOCK)) ==
	410	TDF_ALLOCATED_THREAD,
	411	("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
	412	flags \|= td->td_flags & (TDF_ALLOCATED_THREAD\|TDF_ALLOCATED_STACK);
	413	}
	414
	415	/*
	416	* Try to reuse cached stack.
	417	*/
	418	if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
	419	if (flags & TDF_ALLOCATED_STACK) {
	420	kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size);
	421	stack = NULL;
	422	}
	423	}
	424	if (stack == NULL) {
	425	stack = (void *)kmem_alloc_stack(&kernel_map, stksize);
	426	flags \|= TDF_ALLOCATED_STACK;
	427	}
	428	if (cpu < 0) {
	429	cpu = ++cpu_rotator;
	430	cpu_ccfence();
	431	cpu %= ncpus;
	432	}
	433	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
	434	return(td);
	435	}
	436
	437	/*
	438	* Initialize a preexisting thread structure. This function is used by
	439	* lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
	440	*
	441	* All threads start out in a critical section at a priority of
	442	* TDPRI_KERN_DAEMON. Higher level code will modify the priority as
	443	* appropriate. This function may send an IPI message when the
	444	* requested cpu is not the current cpu and consequently gd_tdallq may
	445	* not be initialized synchronously from the point of view of the originating
	446	* cpu.
	447	*
	448	* NOTE! we have to be careful in regards to creating threads for other cpus
	449	* if SMP has not yet been activated.
	450	*/
	451	#ifdef SMP
	452
	453	static void
	454	lwkt_init_thread_remote(void *arg)
	455	{
	456	thread_t td = arg;
	457
	458	/*
	459	* Protected by critical section held by IPI dispatch
	460	*/
	461	TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
	462	}
	463
	464	#endif
	465
	466	/*
	467	* lwkt core thread structural initialization.
	468	*
	469	* NOTE: All threads are initialized as mpsafe threads.
	470	*/
	471	void
	472	lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
	473	struct globaldata *gd)
	474	{
	475	globaldata_t mygd = mycpu;
	476
	477	bzero(td, sizeof(struct thread));
	478	td->td_kstack = stack;
	479	td->td_kstack_size = stksize;
	480	td->td_flags = flags;
	481	td->td_mpflags = 0;
	482	td->td_gd = gd;
	483	td->td_pri = TDPRI_KERN_DAEMON;
	484	td->td_critcount = 1;
	485	td->td_toks_have = NULL;
	486	td->td_toks_stop = &td->td_toks_base;
	487	if (lwkt_use_spin_port \|\| (flags & TDF_FORCE_SPINPORT))
	488	lwkt_initport_spin(&td->td_msgport, td);
	489	else
	490	lwkt_initport_thread(&td->td_msgport, td);
	491	pmap_init_thread(td);
	492	#ifdef SMP
	493	/*
	494	* Normally initializing a thread for a remote cpu requires sending an
	495	* IPI. However, the idlethread is setup before the other cpus are
	496	* activated so we have to treat it as a special case. XXX manipulation
	497	* of gd_tdallq requires the BGL.
	498	*/
	499	if (gd == mygd \|\| td == &gd->gd_idlethread) {
	500	crit_enter_gd(mygd);
	501	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	502	crit_exit_gd(mygd);
	503	} else {
	504	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
	505	}
	506	#else
	507	crit_enter_gd(mygd);
	508	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	509	crit_exit_gd(mygd);
	510	#endif
	511
	512	dsched_new_thread(td);
	513	}
	514
	515	void
	516	lwkt_set_comm(thread_t td, const char *ctl, ...)
	517	{
	518	__va_list va;
	519
	520	__va_start(va, ctl);
	521	kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
	522	__va_end(va);
	523	KTR_LOG(ctxsw_newtd, td, td->td_comm);
	524	}
	525
	526	/*
	527	* Prevent the thread from getting destroyed. Note that unlike PHOLD/PRELE
	528	* this does not prevent the thread from migrating to another cpu so the
	529	* gd_tdallq state is not protected by this.
	530	*/
	531	void
	532	lwkt_hold(thread_t td)
	533	{
	534	atomic_add_int(&td->td_refs, 1);
	535	}
	536
	537	void
	538	lwkt_rele(thread_t td)
	539	{
	540	KKASSERT(td->td_refs > 0);
	541	atomic_add_int(&td->td_refs, -1);
	542	}
	543
	544	void
	545	lwkt_free_thread(thread_t td)
	546	{
	547	KKASSERT(td->td_refs == 0);
	548	KKASSERT((td->td_flags & (TDF_RUNNING \| TDF_PREEMPT_LOCK \|
	549	TDF_RUNQ \| TDF_TSLEEPQ)) == 0);
	550	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	551	objcache_put(thread_cache, td);
	552	} else if (td->td_flags & TDF_ALLOCATED_STACK) {
	553	/* client-allocated struct with internally allocated stack */
	554	KASSERT(td->td_kstack && td->td_kstack_size > 0,
	555	("lwkt_free_thread: corrupted stack"));
	556	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	557	td->td_kstack = NULL;
	558	td->td_kstack_size = 0;
	559	}
	560	KTR_LOG(ctxsw_deadtd, td);
	561	}
	562
	563
	564	/*
	565	* Switch to the next runnable lwkt. If no LWKTs are runnable then
	566	* switch to the idlethread. Switching must occur within a critical
	567	* section to avoid races with the scheduling queue.
	568	*
	569	* We always have full control over our cpu's run queue. Other cpus
	570	* that wish to manipulate our queue must use the cpu_*msg() calls to
	571	* talk to our cpu, so a critical section is all that is needed and
	572	* the result is very, very fast thread switching.
	573	*
	574	* The LWKT scheduler uses a fixed priority model and round-robins at
	575	* each priority level. User process scheduling is a totally
	576	* different beast and LWKT priorities should not be confused with
	577	* user process priorities.
	578	*
	579	* PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
	580	* is not called by the current thread in the preemption case, only when
	581	* the preempting thread blocks (in order to return to the original thread).
	582	*
	583	* SPECIAL NOTE ON SWITCH ATOMICY: Certain operations such as thread
	584	* migration and tsleep deschedule the current lwkt thread and call
	585	* lwkt_switch(). In particular, the target cpu of the migration fully
	586	* expects the thread to become non-runnable and can deadlock against
	587	* cpusync operations if we run any IPIs prior to switching the thread out.
	588	*
	589	* WE MUST BE VERY CAREFUL NOT TO RUN SPLZ DIRECTLY OR INDIRECTLY IF
	590	* THE CURRENT THREAD HAS BEEN DESCHEDULED!
	591	*/
	592	void
	593	lwkt_switch(void)
	594	{
	595	globaldata_t gd = mycpu;
	596	thread_t td = gd->gd_curthread;
	597	thread_t ntd;
	598	int spinning = 0;
	599
	600	KKASSERT(gd->gd_processing_ipiq == 0);
	601	KKASSERT(td->td_flags & TDF_RUNNING);
	602
	603	/*
	604	* Switching from within a 'fast' (non thread switched) interrupt or IPI
	605	* is illegal. However, we may have to do it anyway if we hit a fatal
	606	* kernel trap or we have paniced.
	607	*
	608	* If this case occurs save and restore the interrupt nesting level.
	609	*/
	610	if (gd->gd_intr_nesting_level) {
	611	int savegdnest;
	612	int savegdtrap;
	613
	614	if (gd->gd_trap_nesting_level == 0 && panic_cpu_gd != mycpu) {
	615	panic("lwkt_switch: Attempt to switch from a "
	616	"fast interrupt, ipi, or hard code section, "
	617	"td %p\n",
	618	td);
	619	} else {
	620	savegdnest = gd->gd_intr_nesting_level;
	621	savegdtrap = gd->gd_trap_nesting_level;
	622	gd->gd_intr_nesting_level = 0;
	623	gd->gd_trap_nesting_level = 0;
	624	if ((td->td_flags & TDF_PANICWARN) == 0) {
	625	td->td_flags \|= TDF_PANICWARN;
	626	kprintf("Warning: thread switch from interrupt, IPI, "
	627	"or hard code section.\n"
	628	"thread %p (%s)\n", td, td->td_comm);
	629	print_backtrace(-1);
	630	}
	631	lwkt_switch();
	632	gd->gd_intr_nesting_level = savegdnest;
	633	gd->gd_trap_nesting_level = savegdtrap;
	634	return;
	635	}
	636	}
	637
	638	/*
	639	* Release our current user process designation if we are blocking
	640	* or if a user reschedule was requested.
	641	*
	642	* NOTE: This function is NOT called if we are switching into or
	643	* returning from a preemption.
	644	*
	645	* NOTE: Releasing our current user process designation may cause
	646	* it to be assigned to another thread, which in turn will
	647	* cause us to block in the usched acquire code when we attempt
	648	* to return to userland.
	649	*
	650	* NOTE: On SMP systems this can be very nasty when heavy token
	651	* contention is present so we want to be careful not to
	652	* release the designation gratuitously.
	653	*/
	654	if (td->td_release &&
	655	(user_resched_wanted() \|\| (td->td_flags & TDF_RUNQ) == 0)) {
	656	td->td_release(td);
	657	}
	658
	659	/*
	660	* Release all tokens
	661	*/
	662	crit_enter_gd(gd);
	663	if (TD_TOKS_HELD(td))
	664	lwkt_relalltokens(td);
	665
	666	/*
	667	* We had better not be holding any spin locks, but don't get into an
	668	* endless panic loop.
	669	*/
	670	KASSERT(gd->gd_spinlocks_wr == 0 \|\| panicstr != NULL,
	671	("lwkt_switch: still holding %d exclusive spinlocks!",
	672	gd->gd_spinlocks_wr));
	673
	674
	675	#ifdef SMP
	676	#ifdef INVARIANTS
	677	if (td->td_cscount) {
	678	kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
	679	td);
	680	if (panic_on_cscount)
	681	panic("switching while mastering cpusync");
	682	}
	683	#endif
	684	#endif
	685
	686	/*
	687	* If we had preempted another thread on this cpu, resume the preempted
	688	* thread. This occurs transparently, whether the preempted thread
	689	* was scheduled or not (it may have been preempted after descheduling
	690	* itself).
	691	*
	692	* We have to setup the MP lock for the original thread after backing
	693	* out the adjustment that was made to curthread when the original
	694	* was preempted.
	695	*/
	696	if ((ntd = td->td_preempted) != NULL) {
	697	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
	698	ntd->td_flags \|= TDF_PREEMPT_DONE;
	699
	700	/*
	701	* The interrupt may have woken a thread up, we need to properly
	702	* set the reschedule flag if the originally interrupted thread is
	703	* at a lower priority.
	704	*
	705	* The interrupt may not have descheduled.
	706	*/
	707	if (TAILQ_FIRST(&gd->gd_tdrunq) != ntd)
	708	need_lwkt_resched();
	709	goto havethread_preempted;
	710	}
	711
	712	/*
	713	* If we cannot obtain ownership of the tokens we cannot immediately
	714	* schedule the target thread.
	715	*
	716	* Reminder: Again, we cannot afford to run any IPIs in this path if
	717	* the current thread has been descheduled.
	718	*/
	719	for (;;) {
	720	clear_lwkt_resched();
	721
	722	/*
	723	* Hotpath - pull the head of the run queue and attempt to schedule
	724	* it.
	725	*/
	726	ntd = TAILQ_FIRST(&gd->gd_tdrunq);
	727
	728	if (ntd == NULL) {
	729	/*
	730	* Runq is empty, switch to idle to allow it to halt.
	731	*/
	732	ntd = &gd->gd_idlethread;
	733	#ifdef SMP
	734	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	735	ASSERT_NO_TOKENS_HELD(ntd);
	736	#endif
	737	cpu_time.cp_msg[0] = 0;
	738	cpu_time.cp_stallpc = 0;
	739	goto haveidle;
	740	}
	741
	742	/*
	743	* Hotpath - schedule ntd.
	744	*
	745	* NOTE: For UP there is no mplock and lwkt_getalltokens()
	746	* always succeeds.
	747	*/
	748	if (TD_TOKS_NOT_HELD(ntd) \|\|
	749	lwkt_getalltokens(ntd, (spinning >= lwkt_spin_loops)))
	750	{
	751	goto havethread;
	752	}
	753
	754	/*
	755	* Coldpath (SMP only since tokens always succeed on UP)
	756	*
	757	* We had some contention on the thread we wanted to schedule.
	758	* What we do now is try to find a thread that we can schedule
	759	* in its stead.
	760	*
	761	* The coldpath scan does NOT rearrange threads in the run list.
	762	* The lwkt_schedulerclock() will assert need_lwkt_resched() on
	763	* the next tick whenever the current head is not the current thread.
	764	*/
	765	#ifdef INVARIANTS
	766	++token_contention_count[ntd->td_pri];
	767	++ntd->td_contended;
	768	#endif
	769
	770	if (fairq_bypass > 0)
	771	goto skip;
	772
	773	while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
	774	/*
	775	* Never schedule threads returning to userland or the
	776	* user thread scheduler helper thread when higher priority
	777	* threads are present. The runq is sorted by priority
	778	* so we can give up traversing it when we find the first
	779	* low priority thread.
	780	*/
	781	if (ntd->td_pri < TDPRI_KERN_LPSCHED) {
	782	ntd = NULL;
	783	break;
	784	}
	785
	786	/*
	787	* Try this one.
	788	*/
	789	if (TD_TOKS_NOT_HELD(ntd) \|\|
	790	lwkt_getalltokens(ntd, (spinning >= lwkt_spin_loops))) {
	791	goto havethread;
	792	}
	793	#ifdef INVARIANTS
	794	++token_contention_count[ntd->td_pri];
	795	++ntd->td_contended;
	796	#endif
	797	}
	798
	799	skip:
	800	/*
	801	* We exhausted the run list, meaning that all runnable threads
	802	* are contested.
	803	*/
	804	cpu_pause();
	805	ntd = &gd->gd_idlethread;
	806	#ifdef SMP
	807	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	808	ASSERT_NO_TOKENS_HELD(ntd);
	809	/* contention case, do not clear contention mask */
	810	#endif
	811
	812	/*
	813	* We are going to have to retry but if the current thread is not
	814	* on the runq we instead switch through the idle thread to get away
	815	* from the current thread. We have to flag for lwkt reschedule
	816	* to prevent the idle thread from halting.
	817	*
	818	* NOTE: A non-zero spinning is passed to lwkt_getalltokens() to
	819	* instruct it to deal with the potential for deadlocks by
	820	* ordering the tokens by address.
	821	*/
	822	if ((td->td_flags & TDF_RUNQ) == 0) {
	823	need_lwkt_resched(); /* prevent hlt */
	824	goto haveidle;
	825	}
	826	#if defined(INVARIANTS) && defined(__amd64__)
	827	if ((read_rflags() & PSL_I) == 0) {
	828	cpu_enable_intr();
	829	panic("lwkt_switch() called with interrupts disabled");
	830	}
	831	#endif
	832
	833	/*
	834	* Number iterations so far. After a certain point we switch to
	835	* a sorted-address/monitor/mwait version of lwkt_getalltokens()
	836	*/
	837	if (spinning < 0x7FFFFFFF)
	838	++spinning;
	839
	840	#ifdef SMP
	841	/*
	842	* lwkt_getalltokens() failed in sorted token mode, we can use
	843	* monitor/mwait in this case.
	844	*/
	845	if (spinning >= lwkt_spin_loops &&
	846	(cpu_mi_feature & CPU_MI_MONITOR) &&
	847	lwkt_spin_monitor)
	848	{
	849	cpu_mmw_pause_int(&gd->gd_reqflags,
	850	(gd->gd_reqflags \| RQF_SPINNING) &
	851	~RQF_IDLECHECK_WK_MASK);
	852	}
	853	#endif
	854
	855	/*
	856	* We already checked that td is still scheduled so this should be
	857	* safe.
	858	*/
	859	splz_check();
	860
	861	/*
	862	* This experimental resequencer is used as a fall-back to reduce
	863	* hw cache line contention by placing each core's scheduler into a
	864	* time-domain-multplexed slot.
	865	*
	866	* The resequencer is disabled by default. It's functionality has
	867	* largely been superceeded by the token algorithm which limits races
	868	* to a subset of cores.
	869	*
	870	* The resequencer algorithm tends to break down when more than
	871	* 20 cores are contending. What appears to happen is that new
	872	* tokens can be obtained out of address-sorted order by new cores
	873	* while existing cores languish in long delays between retries and
	874	* wind up being starved-out of the token acquisition.
	875	*/
	876	if (lwkt_spin_reseq && spinning >= lwkt_spin_reseq) {
	877	int cseq = atomic_fetchadd_int(&lwkt_cseq_windex, 1);
	878	int oseq;
	879
	880	while ((oseq = lwkt_cseq_rindex) != cseq) {
	881	cpu_ccfence();
	882	#if 1
	883	if (cpu_mi_feature & CPU_MI_MONITOR) {
	884	cpu_mmw_pause_int(&lwkt_cseq_rindex, oseq);
	885	} else {
	886	#endif
	887	cpu_pause();
	888	cpu_lfence();
	889	#if 1
	890	}
	891	#endif
	892	}
	893	DELAY(1);
	894	atomic_add_int(&lwkt_cseq_rindex, 1);
	895	}
	896	/* highest level for(;;) loop */
	897	}
	898
	899	havethread:
	900	/*
	901	* Clear gd_idle_repeat when doing a normal switch to a non-idle
	902	* thread.
	903	*/
	904	ntd->td_wmesg = NULL;
	905	++gd->gd_cnt.v_swtch;
	906	gd->gd_idle_repeat = 0;
	907
	908	havethread_preempted:
	909	/*
	910	* If the new target does not need the MP lock and we are holding it,
	911	* release the MP lock. If the new target requires the MP lock we have
	912	* already acquired it for the target.
	913	*/
	914	;
	915	haveidle:
	916	KASSERT(ntd->td_critcount,
	917	("priority problem in lwkt_switch %d %d",
	918	td->td_critcount, ntd->td_critcount));
	919
	920	if (td != ntd) {
	921	/*
	922	* Execute the actual thread switch operation. This function
	923	* returns to the current thread and returns the previous thread
	924	* (which may be different from the thread we switched to).
	925	*
	926	* We are responsible for marking ntd as TDF_RUNNING.
	927	*/
	928	KKASSERT((ntd->td_flags & TDF_RUNNING) == 0);
	929	++switch_count;
	930	KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd);
	931	ntd->td_flags \|= TDF_RUNNING;
	932	lwkt_switch_return(td->td_switch(ntd));
	933	/* ntd invalid, td_switch() can return a different thread_t */
	934	}
	935
	936	/*
	937	* catch-all. XXX is this strictly needed?
	938	*/
	939	splz_check();
	940
	941	/* NOTE: current cpu may have changed after switch */
	942	crit_exit_quick(td);
	943	}
	944
	945	/*
	946	* Called by assembly in the td_switch (thread restore path) for thread
	947	* bootstrap cases which do not 'return' to lwkt_switch().
	948	*/
	949	void
	950	lwkt_switch_return(thread_t otd)
	951	{
	952	#ifdef SMP
	953	globaldata_t rgd;
	954
	955	/*
	956	* Check if otd was migrating. Now that we are on ntd we can finish
	957	* up the migration. This is a bit messy but it is the only place
	958	* where td is known to be fully descheduled.
	959	*
	960	* We can only activate the migration if otd was migrating but not
	961	* held on the cpu due to a preemption chain. We still have to
	962	* clear TDF_RUNNING on the old thread either way.
	963	*
	964	* We are responsible for clearing the previously running thread's
	965	* TDF_RUNNING.
	966	*/
	967	if ((rgd = otd->td_migrate_gd) != NULL &&
	968	(otd->td_flags & TDF_PREEMPT_LOCK) == 0) {
	969	KKASSERT((otd->td_flags & (TDF_MIGRATING \| TDF_RUNNING)) ==
	970	(TDF_MIGRATING \| TDF_RUNNING));
	971	otd->td_migrate_gd = NULL;
	972	otd->td_flags &= ~TDF_RUNNING;
	973	lwkt_send_ipiq(rgd, lwkt_setcpu_remote, otd);
	974	} else {
	975	otd->td_flags &= ~TDF_RUNNING;
	976	}
	977	#else
	978	otd->td_flags &= ~TDF_RUNNING;
	979	#endif
	980
	981	/*
	982	* Final exit validations (see lwp_wait()). Note that otd becomes
	983	* invalid the instant we set TDF_MP_EXITSIG.
	984	*/
	985	while (otd->td_flags & TDF_EXITING) {
	986	u_int mpflags;
	987
	988	mpflags = otd->td_mpflags;
	989	cpu_ccfence();
	990
	991	if (mpflags & TDF_MP_EXITWAIT) {
	992	if (atomic_cmpset_int(&otd->td_mpflags, mpflags,
	993	mpflags \| TDF_MP_EXITSIG)) {
	994	wakeup(otd);
	995	break;
	996	}
	997	} else {
	998	if (atomic_cmpset_int(&otd->td_mpflags, mpflags,
	999	mpflags \| TDF_MP_EXITSIG)) {
	1000	wakeup(otd);
	1001	break;
	1002	}
	1003	}
	1004	}
	1005	}
	1006
	1007	/*
	1008	* Request that the target thread preempt the current thread. Preemption
	1009	* can only occur if our only critical section is the one that we were called
	1010	* with, the relative priority of the target thread is higher, and the target
	1011	* thread holds no tokens. This also only works if we are not holding any
	1012	* spinlocks (obviously).
	1013	*
	1014	* THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
	1015	* this is called via lwkt_schedule() through the td_preemptable callback.
	1016	* critcount is the managed critical priority that we should ignore in order
	1017	* to determine whether preemption is possible (aka usually just the crit
	1018	* priority of lwkt_schedule() itself).
	1019	*
	1020	* Preemption is typically limited to interrupt threads.
	1021	*
	1022	* Operation works in a fairly straight-forward manner. The normal
	1023	* scheduling code is bypassed and we switch directly to the target
	1024	* thread. When the target thread attempts to block or switch away
	1025	* code at the base of lwkt_switch() will switch directly back to our
	1026	* thread. Our thread is able to retain whatever tokens it holds and
	1027	* if the target needs one of them the target will switch back to us
	1028	* and reschedule itself normally.
	1029	*/
	1030	void
	1031	lwkt_preempt(thread_t ntd, int critcount)
	1032	{
	1033	struct globaldata *gd = mycpu;
	1034	thread_t xtd;
	1035	thread_t td;
	1036	int save_gd_intr_nesting_level;
	1037
	1038	/*
	1039	* The caller has put us in a critical section. We can only preempt
	1040	* if the caller of the caller was not in a critical section (basically
	1041	* a local interrupt), as determined by the 'critcount' parameter. We
	1042	* also can't preempt if the caller is holding any spinlocks (even if
	1043	* he isn't in a critical section). This also handles the tokens test.
	1044	*
	1045	* YYY The target thread must be in a critical section (else it must
	1046	* inherit our critical section? I dunno yet).
	1047	*/
	1048	KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri));
	1049
	1050	td = gd->gd_curthread;
	1051	if (preempt_enable == 0) {
	1052	++preempt_miss;
	1053	return;
	1054	}
	1055	if (ntd->td_pri <= td->td_pri) {
	1056	++preempt_miss;
	1057	return;
	1058	}
	1059	if (td->td_critcount > critcount) {
	1060	++preempt_miss;
	1061	return;
	1062	}
	1063	#ifdef SMP
	1064	if (td->td_cscount) {
	1065	++preempt_miss;
	1066	return;
	1067	}
	1068	if (ntd->td_gd != gd) {
	1069	++preempt_miss;
	1070	return;
	1071	}
	1072	#endif
	1073	/*
	1074	* We don't have to check spinlocks here as they will also bump
	1075	* td_critcount.
	1076	*
	1077	* Do not try to preempt if the target thread is holding any tokens.
	1078	* We could try to acquire the tokens but this case is so rare there
	1079	* is no need to support it.
	1080	*/
	1081	KKASSERT(gd->gd_spinlocks_wr == 0);
	1082
	1083	if (TD_TOKS_HELD(ntd)) {
	1084	++preempt_miss;
	1085	return;
	1086	}
	1087	if (td == ntd \|\| ((td->td_flags \| ntd->td_flags) & TDF_PREEMPT_LOCK)) {
	1088	++preempt_weird;
	1089	return;
	1090	}
	1091	if (ntd->td_preempted) {
	1092	++preempt_hit;
	1093	return;
	1094	}
	1095	KKASSERT(gd->gd_processing_ipiq == 0);
	1096
	1097	/*
	1098	* Since we are able to preempt the current thread, there is no need to
	1099	* call need_lwkt_resched().
	1100	*
	1101	* We must temporarily clear gd_intr_nesting_level around the switch
	1102	* since switchouts from the target thread are allowed (they will just
	1103	* return to our thread), and since the target thread has its own stack.
	1104	*
	1105	* A preemption must switch back to the original thread, assert the
	1106	* case.
	1107	*/
	1108	++preempt_hit;
	1109	ntd->td_preempted = td;
	1110	td->td_flags \|= TDF_PREEMPT_LOCK;
	1111	KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd);
	1112	save_gd_intr_nesting_level = gd->gd_intr_nesting_level;
	1113	gd->gd_intr_nesting_level = 0;
	1114
	1115	KKASSERT((ntd->td_flags & TDF_RUNNING) == 0);
	1116	ntd->td_flags \|= TDF_RUNNING;
	1117	xtd = td->td_switch(ntd);
	1118	KKASSERT(xtd == ntd);
	1119	lwkt_switch_return(xtd);
	1120	gd->gd_intr_nesting_level = save_gd_intr_nesting_level;
	1121
	1122	KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
	1123	ntd->td_preempted = NULL;
	1124	td->td_flags &= ~(TDF_PREEMPT_LOCK\|TDF_PREEMPT_DONE);
	1125	}
	1126
	1127	/*
	1128	* Conditionally call splz() if gd_reqflags indicates work is pending.
	1129	* This will work inside a critical section but not inside a hard code
	1130	* section.
	1131	*
	1132	* (self contained on a per cpu basis)
	1133	*/
	1134	void
	1135	splz_check(void)
	1136	{
	1137	globaldata_t gd = mycpu;
	1138	thread_t td = gd->gd_curthread;
	1139
	1140	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) &&
	1141	gd->gd_intr_nesting_level == 0 &&
	1142	td->td_nest_count < 2)
	1143	{
	1144	splz();
	1145	}
	1146	}
	1147
	1148	/*
	1149	* This version is integrated into crit_exit, reqflags has already
	1150	* been tested but td_critcount has not.
	1151	*
	1152	* We only want to execute the splz() on the 1->0 transition of
	1153	* critcount and not in a hard code section or if too deeply nested.
	1154	*
	1155	* NOTE: gd->gd_spinlocks_wr is implied to be 0 when td_critcount is 0.
	1156	*/
	1157	void
	1158	lwkt_maybe_splz(thread_t td)
	1159	{
	1160	globaldata_t gd = td->td_gd;
	1161
	1162	if (td->td_critcount == 0 &&
	1163	gd->gd_intr_nesting_level == 0 &&
	1164	td->td_nest_count < 2)
	1165	{
	1166	splz();
	1167	}
	1168	}
	1169
	1170	/*
	1171	* Drivers which set up processing co-threads can call this function to
	1172	* run the co-thread at a higher priority and to allow it to preempt
	1173	* normal threads.
	1174	*/
	1175	void
	1176	lwkt_set_interrupt_support_thread(void)
	1177	{
	1178	thread_t td = curthread;
	1179
	1180	lwkt_setpri_self(TDPRI_INT_SUPPORT);
	1181	td->td_flags \|= TDF_INTTHREAD;
	1182	td->td_preemptable = lwkt_preempt;
	1183	}
	1184
	1185
	1186	/*
	1187	* This function is used to negotiate a passive release of the current
	1188	* process/lwp designation with the user scheduler, allowing the user
	1189	* scheduler to schedule another user thread. The related kernel thread
	1190	* (curthread) continues running in the released state.
	1191	*/
	1192	void
	1193	lwkt_passive_release(struct thread *td)
	1194	{
	1195	struct lwp *lp = td->td_lwp;
	1196
	1197	td->td_release = NULL;
	1198	lwkt_setpri_self(TDPRI_KERN_USER);
	1199	lp->lwp_proc->p_usched->release_curproc(lp);
	1200	}
	1201
	1202
	1203	/*
	1204	* This implements a LWKT yield, allowing a kernel thread to yield to other
	1205	* kernel threads at the same or higher priority. This function can be
	1206	* called in a tight loop and will typically only yield once per tick.
	1207	*
	1208	* Most kernel threads run at the same priority in order to allow equal
	1209	* sharing.
	1210	*
	1211	* (self contained on a per cpu basis)
	1212	*/
	1213	void
	1214	lwkt_yield(void)
	1215	{
	1216	globaldata_t gd = mycpu;
	1217	thread_t td = gd->gd_curthread;
	1218
	1219	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1220	splz();
	1221	if (lwkt_resched_wanted()) {
	1222	lwkt_schedule_self(curthread);
	1223	lwkt_switch();
	1224	}
	1225	}
	1226
	1227	/*
	1228	* The quick version processes pending interrupts and higher-priority
	1229	* LWKT threads but will not round-robin same-priority LWKT threads.
	1230	*/
	1231	void
	1232	lwkt_yield_quick(void)
	1233	{
	1234	globaldata_t gd = mycpu;
	1235	thread_t td = gd->gd_curthread;
	1236
	1237	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1238	splz();
	1239	if (lwkt_resched_wanted()) {
	1240	if (TAILQ_FIRST(&gd->gd_tdrunq) == td) {
	1241	clear_lwkt_resched();
	1242	} else {
	1243	lwkt_schedule_self(curthread);
	1244	lwkt_switch();
	1245	}
	1246	}
	1247	}
	1248
	1249	/*
	1250	* This yield is designed for kernel threads with a user context.
	1251	*
	1252	* The kernel acting on behalf of the user is potentially cpu-bound,
	1253	* this function will efficiently allow other threads to run and also
	1254	* switch to other processes by releasing.
	1255	*
	1256	* The lwkt_user_yield() function is designed to have very low overhead
	1257	* if no yield is determined to be needed.
	1258	*/
	1259	void
	1260	lwkt_user_yield(void)
	1261	{
	1262	globaldata_t gd = mycpu;
	1263	thread_t td = gd->gd_curthread;
	1264
	1265	/*
	1266	* Always run any pending interrupts in case we are in a critical
	1267	* section.
	1268	*/
	1269	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1270	splz();
	1271
	1272	/*
	1273	* Switch (which forces a release) if another kernel thread needs
	1274	* the cpu, if userland wants us to resched, or if our kernel
	1275	* quantum has run out.
	1276	*/
	1277	if (lwkt_resched_wanted() \|\|
	1278	user_resched_wanted())
	1279	{
	1280	lwkt_switch();
	1281	}
	1282
	1283	#if 0
	1284	/*
	1285	* Reacquire the current process if we are released.
	1286	*
	1287	* XXX not implemented atm. The kernel may be holding locks and such,
	1288	* so we want the thread to continue to receive cpu.
	1289	*/
	1290	if (td->td_release == NULL && lp) {
	1291	lp->lwp_proc->p_usched->acquire_curproc(lp);
	1292	td->td_release = lwkt_passive_release;
	1293	lwkt_setpri_self(TDPRI_USER_NORM);
	1294	}
	1295	#endif
	1296	}
	1297
	1298	/*
	1299	* Generic schedule. Possibly schedule threads belonging to other cpus and
	1300	* deal with threads that might be blocked on a wait queue.
	1301	*
	1302	* We have a little helper inline function which does additional work after
	1303	* the thread has been enqueued, including dealing with preemption and
	1304	* setting need_lwkt_resched() (which prevents the kernel from returning
	1305	* to userland until it has processed higher priority threads).
	1306	*
	1307	* It is possible for this routine to be called after a failed _enqueue
	1308	* (due to the target thread migrating, sleeping, or otherwise blocked).
	1309	* We have to check that the thread is actually on the run queue!
	1310	*/
	1311	static __inline
	1312	void
	1313	_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount)
	1314	{
	1315	if (ntd->td_flags & TDF_RUNQ) {
	1316	if (ntd->td_preemptable) {
	1317	ntd->td_preemptable(ntd, ccount); /* YYY +token */
	1318	}
	1319	}
	1320	}
	1321
	1322	static __inline
	1323	void
	1324	_lwkt_schedule(thread_t td)
	1325	{
	1326	globaldata_t mygd = mycpu;
	1327
	1328	KASSERT(td != &td->td_gd->gd_idlethread,
	1329	("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
	1330	KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
	1331	crit_enter_gd(mygd);
	1332	KKASSERT(td->td_lwp == NULL \|\|
	1333	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	1334
	1335	if (td == mygd->gd_curthread) {
	1336	_lwkt_enqueue(td);
	1337	} else {
	1338	/*
	1339	* If we own the thread, there is no race (since we are in a
	1340	* critical section). If we do not own the thread there might
	1341	* be a race but the target cpu will deal with it.
	1342	*/
	1343	#ifdef SMP
	1344	if (td->td_gd == mygd) {
	1345	_lwkt_enqueue(td);
	1346	_lwkt_schedule_post(mygd, td, 1);
	1347	} else {
	1348	lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
	1349	}
	1350	#else
	1351	_lwkt_enqueue(td);
	1352	_lwkt_schedule_post(mygd, td, 1);
	1353	#endif
	1354	}
	1355	crit_exit_gd(mygd);
	1356	}
	1357
	1358	void
	1359	lwkt_schedule(thread_t td)
	1360	{
	1361	_lwkt_schedule(td);
	1362	}
	1363
	1364	void
	1365	lwkt_schedule_noresched(thread_t td) /* XXX not impl */
	1366	{
	1367	_lwkt_schedule(td);
	1368	}
	1369
	1370	#ifdef SMP
	1371
	1372	/*
	1373	* When scheduled remotely if frame != NULL the IPIQ is being
	1374	* run via doreti or an interrupt then preemption can be allowed.
	1375	*
	1376	* To allow preemption we have to drop the critical section so only
	1377	* one is present in _lwkt_schedule_post.
	1378	*/
	1379	static void
	1380	lwkt_schedule_remote(void arg, int arg2, struct intrframe frame)
	1381	{
	1382	thread_t td = curthread;
	1383	thread_t ntd = arg;
	1384
	1385	if (frame && ntd->td_preemptable) {
	1386	crit_exit_noyield(td);
	1387	_lwkt_schedule(ntd);
	1388	crit_enter_quick(td);
	1389	} else {
	1390	_lwkt_schedule(ntd);
	1391	}
	1392	}
	1393
	1394	/*
	1395	* Thread migration using a 'Pull' method. The thread may or may not be
	1396	* the current thread. It MUST be descheduled and in a stable state.
	1397	* lwkt_giveaway() must be called on the cpu owning the thread.
	1398	*
	1399	* At any point after lwkt_giveaway() is called, the target cpu may
	1400	* 'pull' the thread by calling lwkt_acquire().
	1401	*
	1402	* We have to make sure the thread is not sitting on a per-cpu tsleep
	1403	* queue or it will blow up when it moves to another cpu.
	1404	*
	1405	* MPSAFE - must be called under very specific conditions.
	1406	*/
	1407	void
	1408	lwkt_giveaway(thread_t td)
	1409	{
	1410	globaldata_t gd = mycpu;
	1411
	1412	crit_enter_gd(gd);
	1413	if (td->td_flags & TDF_TSLEEPQ)
	1414	tsleep_remove(td);
	1415	KKASSERT(td->td_gd == gd);
	1416	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	1417	td->td_flags \|= TDF_MIGRATING;
	1418	crit_exit_gd(gd);
	1419	}
	1420
	1421	void
	1422	lwkt_acquire(thread_t td)
	1423	{
	1424	globaldata_t gd;
	1425	globaldata_t mygd;
	1426	int retry = 10000000;
	1427
	1428	KKASSERT(td->td_flags & TDF_MIGRATING);
	1429	gd = td->td_gd;
	1430	mygd = mycpu;
	1431	if (gd != mycpu) {
	1432	cpu_lfence();
	1433	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1434	crit_enter_gd(mygd);
	1435	DEBUG_PUSH_INFO("lwkt_acquire");
	1436	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) {
	1437	#ifdef SMP
	1438	lwkt_process_ipiq();
	1439	#endif
	1440	cpu_lfence();
	1441	if (--retry == 0) {
	1442	kprintf("lwkt_acquire: stuck: td %p td->td_flags %08x\n",
	1443	td, td->td_flags);
	1444	retry = 10000000;
	1445	}
	1446	}
	1447	DEBUG_POP_INFO();
	1448	cpu_mfence();
	1449	td->td_gd = mygd;
	1450	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1451	td->td_flags &= ~TDF_MIGRATING;
	1452	crit_exit_gd(mygd);
	1453	} else {
	1454	crit_enter_gd(mygd);
	1455	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1456	td->td_flags &= ~TDF_MIGRATING;
	1457	crit_exit_gd(mygd);
	1458	}
	1459	}
	1460
	1461	#endif
	1462
	1463	/*
	1464	* Generic deschedule. Descheduling threads other then your own should be
	1465	* done only in carefully controlled circumstances. Descheduling is
	1466	* asynchronous.
	1467	*
	1468	* This function may block if the cpu has run out of messages.
	1469	*/
	1470	void
	1471	lwkt_deschedule(thread_t td)
	1472	{
	1473	crit_enter();
	1474	#ifdef SMP
	1475	if (td == curthread) {
	1476	_lwkt_dequeue(td);
	1477	} else {
	1478	if (td->td_gd == mycpu) {
	1479	_lwkt_dequeue(td);
	1480	} else {
	1481	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
	1482	}
	1483	}
	1484	#else
	1485	_lwkt_dequeue(td);
	1486	#endif
	1487	crit_exit();
	1488	}
	1489
	1490	/*
	1491	* Set the target thread's priority. This routine does not automatically
	1492	* switch to a higher priority thread, LWKT threads are not designed for
	1493	* continuous priority changes. Yield if you want to switch.
	1494	*/
	1495	void
	1496	lwkt_setpri(thread_t td, int pri)
	1497	{
	1498	if (td->td_pri != pri) {
	1499	KKASSERT(pri >= 0);
	1500	crit_enter();
	1501	if (td->td_flags & TDF_RUNQ) {
	1502	KKASSERT(td->td_gd == mycpu);
	1503	_lwkt_dequeue(td);
	1504	td->td_pri = pri;
	1505	_lwkt_enqueue(td);
	1506	} else {
	1507	td->td_pri = pri;
	1508	}
	1509	crit_exit();
	1510	}
	1511	}
	1512
	1513	/*
	1514	* Set the initial priority for a thread prior to it being scheduled for
	1515	* the first time. The thread MUST NOT be scheduled before or during
	1516	* this call. The thread may be assigned to a cpu other then the current
	1517	* cpu.
	1518	*
	1519	* Typically used after a thread has been created with TDF_STOPPREQ,
	1520	* and before the thread is initially scheduled.
	1521	*/
	1522	void
	1523	lwkt_setpri_initial(thread_t td, int pri)
	1524	{
	1525	KKASSERT(pri >= 0);
	1526	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1527	td->td_pri = pri;
	1528	}
	1529
	1530	void
	1531	lwkt_setpri_self(int pri)
	1532	{
	1533	thread_t td = curthread;
	1534
	1535	KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
	1536	crit_enter();
	1537	if (td->td_flags & TDF_RUNQ) {
	1538	_lwkt_dequeue(td);
	1539	td->td_pri = pri;
	1540	_lwkt_enqueue(td);
	1541	} else {
	1542	td->td_pri = pri;
	1543	}
	1544	crit_exit();
	1545	}
	1546
	1547	/*
	1548	* hz tick scheduler clock for LWKT threads
	1549	*/
	1550	void
	1551	lwkt_schedulerclock(thread_t td)
	1552	{
	1553	globaldata_t gd = td->td_gd;
	1554	thread_t xtd;
	1555
	1556	if (TAILQ_FIRST(&gd->gd_tdrunq) == td) {
	1557	/*
	1558	* If the current thread is at the head of the runq shift it to the
	1559	* end of any equal-priority threads and request a LWKT reschedule
	1560	* if it moved.
	1561	*/
	1562	xtd = TAILQ_NEXT(td, td_threadq);
	1563	if (xtd && xtd->td_pri == td->td_pri) {
	1564	TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
	1565	while (xtd && xtd->td_pri == td->td_pri)
	1566	xtd = TAILQ_NEXT(xtd, td_threadq);
	1567	if (xtd)
	1568	TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
	1569	else
	1570	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	1571	need_lwkt_resched();
	1572	}
	1573	} else {
	1574	/*
	1575	* If we scheduled a thread other than the one at the head of the
	1576	* queue always request a reschedule every tick.
	1577	*/
	1578	need_lwkt_resched();
	1579	}
	1580	}
	1581
	1582	/*
	1583	* Migrate the current thread to the specified cpu.
	1584	*
	1585	* This is accomplished by descheduling ourselves from the current cpu
	1586	* and setting td_migrate_gd. The lwkt_switch() code will detect that the
	1587	* 'old' thread wants to migrate after it has been completely switched out
	1588	* and will complete the migration.
	1589	*
	1590	* TDF_MIGRATING prevents scheduling races while the thread is being migrated.
	1591	*
	1592	* We must be sure to release our current process designation (if a user
	1593	* process) before clearing out any tsleepq we are on because the release
	1594	* code may re-add us.
	1595	*
	1596	* We must be sure to remove ourselves from the current cpu's tsleepq
	1597	* before potentially moving to another queue. The thread can be on
	1598	* a tsleepq due to a left-over tsleep_interlock().
	1599	*/
	1600
	1601	void
	1602	lwkt_setcpu_self(globaldata_t rgd)
	1603	{
	1604	#ifdef SMP
	1605	thread_t td = curthread;
	1606
	1607	if (td->td_gd != rgd) {
	1608	crit_enter_quick(td);
	1609
	1610	if (td->td_release)
	1611	td->td_release(td);
	1612	if (td->td_flags & TDF_TSLEEPQ)
	1613	tsleep_remove(td);
	1614
	1615	/*
	1616	* Set TDF_MIGRATING to prevent a spurious reschedule while we are
	1617	* trying to deschedule ourselves and switch away, then deschedule
	1618	* ourself, remove us from tdallq, and set td_migrate_gd. Finally,
	1619	* call lwkt_switch() to complete the operation.
	1620	*/
	1621	td->td_flags \|= TDF_MIGRATING;
	1622	lwkt_deschedule_self(td);
	1623	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1624	td->td_migrate_gd = rgd;
	1625	lwkt_switch();
	1626
	1627	/*
	1628	* We are now on the target cpu
	1629	*/
	1630	KKASSERT(rgd == mycpu);
	1631	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
	1632	crit_exit_quick(td);
	1633	}
	1634	#endif
	1635	}
	1636
	/*
	 * Migrate the current thread to the cpu identified by cpuid.  The
	 * globaldata for that cpu is looked up with globaldata_find() and
	 * passed to lwkt_setcpu_self().  Does nothing when built without SMP.
	 */
	void
	lwkt_migratecpu(int cpuid)
	{
	#ifdef SMP
		lwkt_setcpu_self(globaldata_find(cpuid));
	#endif
	}
	1647
	1648	#ifdef SMP
	1649	/*
	1650	* Remote IPI for cpu migration (called while in a critical section so we
	1651	* do not have to enter another one).
	1652	*
	1653	* The thread (td) has already been completely descheduled from the
	1654	* originating cpu and we can simply assert the case. The thread is
	1655	* assigned to the new cpu and enqueued.
	1656	*
	1657	* The thread will re-add itself to tdallq when it resumes execution.
	1658	*/
	1659	static void
	1660	lwkt_setcpu_remote(void *arg)
	1661	{
	1662	thread_t td = arg;
	1663	globaldata_t gd = mycpu;
	1664
	1665	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) == 0);
	1666	td->td_gd = gd;
	1667	cpu_mfence();
	1668	td->td_flags &= ~TDF_MIGRATING;
	1669	KKASSERT(td->td_migrate_gd == NULL);
	1670	KKASSERT(td->td_lwp == NULL \|\|
	1671	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	1672	_lwkt_enqueue(td);
	1673	}
	1674	#endif
	1675
	1676	struct lwp *
	1677	lwkt_preempted_proc(void)
	1678	{
	1679	thread_t td = curthread;
	1680	while (td->td_preempted)
	1681	td = td->td_preempted;
	1682	return(td->td_lwp);
	1683	}
	1684
	1685	/*
	1686	* Create a kernel process/thread/whatever. It shares it's address space
	1687	* with proc0 - ie: kernel only.
	1688	*
	1689	* If the cpu is not specified one will be selected. In the future
	1690	* specifying a cpu of -1 will enable kernel thread migration between
	1691	* cpus.
	1692	*/
	1693	int
	1694	lwkt_create(void (func)(void ), void arg, struct thread *tdp,
	1695	thread_t template, int tdflags, int cpu, const char *fmt, ...)
	1696	{
	1697	thread_t td;
	1698	__va_list ap;
	1699
	1700	td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
	1701	tdflags);
	1702	if (tdp)
	1703	*tdp = td;
	1704	cpu_set_thread_handler(td, lwkt_exit, func, arg);
	1705
	1706	/*
	1707	* Set up arg0 for 'ps' etc
	1708	*/
	1709	__va_start(ap, fmt);
	1710	kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
	1711	__va_end(ap);
	1712
	1713	/*
	1714	* Schedule the thread to run
	1715	*/
	1716	if (td->td_flags & TDF_NOSTART)
	1717	td->td_flags &= ~TDF_NOSTART;
	1718	else
	1719	lwkt_schedule(td);
	1720	return 0;
	1721	}
	1722
	1723	/*
	1724	* Destroy an LWKT thread. Warning! This function is not called when
	1725	* a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
	1726	* uses a different reaping mechanism.
	1727	*/
	1728	void
	1729	lwkt_exit(void)
	1730	{
	1731	thread_t td = curthread;
	1732	thread_t std;
	1733	globaldata_t gd;
	1734
	1735	/*
	1736	* Do any cleanup that might block here
	1737	*/
	1738	if (td->td_flags & TDF_VERBOSE)
	1739	kprintf("kthread %p %s has exited\n", td, td->td_comm);
	1740	caps_exit(td);
	1741	biosched_done(td);
	1742	dsched_exit_thread(td);
	1743
	1744	/*
	1745	* Get us into a critical section to interlock gd_freetd and loop
	1746	* until we can get it freed.
	1747	*
	1748	* We have to cache the current td in gd_freetd because objcache_put()ing
	1749	* it would rip it out from under us while our thread is still active.
	1750	*
	1751	* We are the current thread so of course our own TDF_RUNNING bit will
	1752	* be set, so unlike the lwp reap code we don't wait for it to clear.
	1753	*/
	1754	gd = mycpu;
	1755	crit_enter_quick(td);
	1756	for (;;) {
	1757	if (td->td_refs) {
	1758	tsleep(td, 0, "tdreap", 1);
	1759	continue;
	1760	}
	1761	if ((std = gd->gd_freetd) != NULL) {
	1762	KKASSERT((std->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) == 0);
	1763	gd->gd_freetd = NULL;
	1764	objcache_put(thread_cache, std);
	1765	continue;
	1766	}
	1767	break;
	1768	}
	1769
	1770	/*
	1771	* Remove thread resources from kernel lists and deschedule us for
	1772	* the last time. We cannot block after this point or we may end
	1773	* up with a stale td on the tsleepq.
	1774	*
	1775	* None of this may block, the critical section is the only thing
	1776	* protecting tdallq and the only thing preventing new lwkt_hold()
	1777	* thread refs now.
	1778	*/
	1779	if (td->td_flags & TDF_TSLEEPQ)
	1780	tsleep_remove(td);
	1781	lwkt_deschedule_self(td);
	1782	lwkt_remove_tdallq(td);
	1783	KKASSERT(td->td_refs == 0);
	1784
	1785	/*
	1786	* Final cleanup
	1787	*/
	1788	KKASSERT(gd->gd_freetd == NULL);
	1789	if (td->td_flags & TDF_ALLOCATED_THREAD)
	1790	gd->gd_freetd = td;
	1791	cpu_thread_exit();
	1792	}
	1793
	1794	void
	1795	lwkt_remove_tdallq(thread_t td)
	1796	{
	1797	KKASSERT(td->td_gd == mycpu);
	1798	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1799	}
	1800
	1801	/*
	1802	* Code reduction and branch prediction improvements. Call/return
	1803	* overhead on modern cpus often degenerates into 0 cycles due to
	1804	* the cpu's branch prediction hardware and return pc cache. We
	1805	* can take advantage of this by not inlining medium-complexity
	1806	* functions and we can also reduce the branch prediction impact
	1807	* by collapsing perfectly predictable branches into a single
	1808	* procedure instead of duplicating it.
	1809	*
	1810	* Is any of this noticeable? Probably not, so I'll take the
	1811	* smaller code size.
	1812	*/
	1813	void
	1814	crit_exit_wrapper(__DEBUG_CRIT_ARG__)
	1815	{
	1816	_crit_exit(mycpu __DEBUG_CRIT_PASS_ARG__);
	1817	}
	1818
	1819	void
	1820	crit_panic(void)
	1821	{
	1822	thread_t td = curthread;
	1823	int lcrit = td->td_critcount;
	1824
	1825	td->td_critcount = 0;
	1826	panic("td_critcount is/would-go negative! %p %d", td, lcrit);
	1827	/* NOT REACHED */
	1828	}
	1829
	1830	#ifdef SMP
	1831
	1832	/*
	1833	* Called from debugger/panic on cpus which have been stopped. We must still
	1834	* process the IPIQ while stopped, even if we were stopped while in a critical
	1835	* section (XXX).
	1836	*
	1837	* If we are dumping also try to process any pending interrupts. This may
	1838	* or may not work depending on the state of the cpu at the point it was
	1839	* stopped.
	1840	*/
	1841	void
	1842	lwkt_smp_stopped(void)
	1843	{
	1844	globaldata_t gd = mycpu;
	1845
	1846	crit_enter_gd(gd);
	1847	if (dumping) {
	1848	lwkt_process_ipiq();
	1849	splz();
	1850	} else {
	1851	lwkt_process_ipiq();
	1852	}
	1853	crit_exit_gd(gd);
	1854	}
	1855
	1856	#endif