gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003-2011 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*/
	34
	35	/*
	36	* Each cpu in a system has its own self-contained light weight kernel
	37	* thread scheduler, which means that generally speaking we only need
	38	* to use a critical section to avoid problems. Foreign thread
	39	* scheduling is queued via (async) IPIs.
	40	*/
	41
	42	#include <sys/param.h>
	43	#include <sys/systm.h>
	44	#include <sys/kernel.h>
	45	#include <sys/proc.h>
	46	#include <sys/rtprio.h>
	47	#include <sys/kinfo.h>
	48	#include <sys/queue.h>
	49	#include <sys/sysctl.h>
	50	#include <sys/kthread.h>
	51	#include <machine/cpu.h>
	52	#include <sys/lock.h>
	53	#include <sys/spinlock.h>
	54	#include <sys/ktr.h>
	55
	56	#include <sys/thread2.h>
	57	#include <sys/spinlock2.h>
	58	#include <sys/mplock2.h>
	59
	60	#include <sys/dsched.h>
	61
	62	#include <vm/vm.h>
	63	#include <vm/vm_param.h>
	64	#include <vm/vm_kern.h>
	65	#include <vm/vm_object.h>
	66	#include <vm/vm_page.h>
	67	#include <vm/vm_map.h>
	68	#include <vm/vm_pager.h>
	69	#include <vm/vm_extern.h>
	70
	71	#include <machine/stdarg.h>
	72	#include <machine/smp.h>
	73
	74	#ifdef _KERNEL_VIRTUAL
	75	#include <pthread.h>
	76	#endif
	77
	78	#if !defined(KTR_CTXSW)
	79	#define KTR_CTXSW KTR_ALL
	80	#endif
	81	KTR_INFO_MASTER(ctxsw);
	82	KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p", int cpu, struct thread *td);
	83	KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p", int cpu, struct thread *td);
	84	KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s", struct thread td, char comm);
	85	KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>", struct thread *td);
	86
	87	static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");
	88
	89	#ifdef INVARIANTS
	90	static int panic_on_cscount = 0;
	91	#endif
	92	static __int64_t switch_count = 0;
	93	static __int64_t preempt_hit = 0;
	94	static __int64_t preempt_miss = 0;
	95	static __int64_t preempt_weird = 0;
	96	static int lwkt_use_spin_port;
	97	static struct objcache *thread_cache;
	98	int cpu_mwait_spin = 0;
	99
	100	static void lwkt_schedule_remote(void arg, int arg2, struct intrframe frame);
	101	static void lwkt_setcpu_remote(void *arg);
	102
	103	extern void cpu_heavy_restore(void);
	104	extern void cpu_lwkt_restore(void);
	105	extern void cpu_kthread_restore(void);
	106	extern void cpu_idle_restore(void);
	107
	108	/*
	109	* We can make all thread ports use the spin backend instead of the thread
	110	* backend. This should only be set to debug the spin backend.
	111	*/
	112	TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);
	113
	114	#ifdef INVARIANTS
	115	SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0,
	116	"Panic if attempting to switch lwkt's while mastering cpusync");
	117	#endif
		/* Last argument handed to cpu_mmw_pause_int() by lwkt_switch() */
	118	SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, CTLFLAG_RW, &cpu_mwait_spin, 0,
	119	"monitor/mwait target state");
		/* Read/write 64-bit statistics counters under the lwkt tree */
	120	SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0,
	121	"Number of switched threads");
	122	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0,
	123	"Successful preemption events");
	124	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0,
	125	"Failed preemption events");
	126	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0,
	127	"Number of preempted threads.");
		/* NOTE(review): fairq_enable is not referenced by the code visible in this part of the file */
	128	static int fairq_enable = 0;
	129	SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW,
	130	&fairq_enable, 0, "Turn on fairq priority accumulators");
		/*
		* When > 0, lwkt_switch() skips its scan for an alternative thread
		* after token contention on the head of the run queue.
		*/
	131	static int fairq_bypass = -1;
	132	SYSCTL_INT(_lwkt, OID_AUTO, fairq_bypass, CTLFLAG_RW,
	133	&fairq_bypass, 0, "Allow fairq to bypass td on token failure");
	134	extern int lwkt_sched_debug;
	135	int lwkt_sched_debug = 0;
	136	SYSCTL_INT(_lwkt, OID_AUTO, sched_debug, CTLFLAG_RW,
	137	&lwkt_sched_debug, 0, "Scheduler debug");
		/*
		* Once lwkt_switch() has retried this many times it passes a non-zero
		* second argument to lwkt_getalltokens().
		*/
	138	static int lwkt_spin_loops = 10;
	139	SYSCTL_INT(_lwkt, OID_AUTO, spin_loops, CTLFLAG_RW,
	140	&lwkt_spin_loops, 0, "Scheduler spin loops until sorted decon");
		/* Non-zero value is the retry count at which lwkt_switch() runs its resequencer */
	141	static int lwkt_spin_reseq = 0;
	142	SYSCTL_INT(_lwkt, OID_AUTO, spin_reseq, CTLFLAG_RW,
	143	&lwkt_spin_reseq, 0, "Scheduler resequencer enable");
		/* Gates the cpu_mmw_pause_int() call in the lwkt_switch() retry loop */
	144	static int lwkt_spin_monitor = 0;
	145	SYSCTL_INT(_lwkt, OID_AUTO, spin_monitor, CTLFLAG_RW,
	146	&lwkt_spin_monitor, 0, "Scheduler uses monitor/mwait");
	147	static int lwkt_spin_fatal = 0; /* disabled */
	148	SYSCTL_INT(_lwkt, OID_AUTO, spin_fatal, CTLFLAG_RW,
	149	&lwkt_spin_fatal, 0, "LWKT scheduler spin loops till fatal panic");
		/* When 0, lwkt_preempt() counts a preempt_miss and returns immediately */
	150	static int preempt_enable = 1;
	151	SYSCTL_INT(_lwkt, OID_AUTO, preempt_enable, CTLFLAG_RW,
	152	&preempt_enable, 0, "Enable preemption");
		/* 0 means "pick a default"; lwkt_init() then uses ncpus * 4 with a floor of 32 */
	153	static int lwkt_cache_threads = 0;
	154	SYSCTL_INT(_lwkt, OID_AUTO, cache_threads, CTLFLAG_RD,
	155	&lwkt_cache_threads, 0, "thread+kstack cache");
	156
		/* Ticket (windex) and turn (rindex) counters used by the lwkt_switch() resequencer */
	157	#ifndef _KERNEL_VIRTUAL
	158	static __cachealign int lwkt_cseq_rindex;
	159	static __cachealign int lwkt_cseq_windex;
	160	#endif
	161
	162	/*
	163	* These helper procedures handle the runq, they can only be called from
	164	* within a critical section.
	165	*
	166	* WARNING! Prior to SMP being brought up it is possible to enqueue and
	167	* dequeue threads belonging to other cpus, so be sure to use td->td_gd
	168	* instead of 'mycpu' when referencing the globaldata structure. Once
	169	* SMP is live, enqueueing and dequeueing only occur on the current cpu.
	170	*/
	171	static __inline
	172	void
	173	_lwkt_dequeue(thread_t td)
	174	{
	175	if (td->td_flags & TDF_RUNQ) {
	176	struct globaldata *gd = td->td_gd;
	177
	178	td->td_flags &= ~TDF_RUNQ;
	179	TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
	180	--gd->gd_tdrunqcount;
	181	if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL)
	182	atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING);
	183	}
	184	}
	185
	186	/*
	187	* Priority enqueue.
	188	*
	189	* There are a limited number of lwkt threads runnable since user
	190	* processes only schedule one at a time per cpu. However, there can
	191	* be many user processes in kernel mode exiting from a tsleep() which
	192	* become runnable.
	193	*
	194	* NOTE: lwkt_schedulerclock() will force a round-robin based on td_pri and
	195	* will ignore user priority. This is to ensure that user threads in
	196	* kernel mode get cpu at some point regardless of what the user
	197	* scheduler thinks.
	198	*/
	199	static __inline
	200	void
	201	_lwkt_enqueue(thread_t td)
	202	{
	203	thread_t xtd;
	204
	205	if ((td->td_flags & (TDF_RUNQ\|TDF_MIGRATING\|TDF_BLOCKQ)) == 0) {
	206	struct globaldata *gd = td->td_gd;
	207
	208	td->td_flags \|= TDF_RUNQ;
	209	xtd = TAILQ_FIRST(&gd->gd_tdrunq);
	210	if (xtd == NULL) {
	211	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	212	atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
	213	} else {
	214	/*
	215	* NOTE: td_upri - higher numbers more desireable, same sense
	216	* as td_pri (typically reversed from lwp_upri).
	217	*
	218	* In the equal priority case we want the best selection
	219	* at the beginning so the less desireable selections know
	220	* that they have to setrunqueue/go-to-another-cpu, even
	221	* though it means switching back to the 'best' selection.
	222	* This also avoids degenerate situations when many threads
	223	* are runnable or waking up at the same time.
	224	*
	225	* If upri matches exactly place at end/round-robin.
	226	*/
	227	while (xtd &&
	228	(xtd->td_pri >= td->td_pri \|\|
	229	(xtd->td_pri == td->td_pri &&
	230	xtd->td_upri >= td->td_upri))) {
	231	xtd = TAILQ_NEXT(xtd, td_threadq);
	232	}
	233	if (xtd)
	234	TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
	235	else
	236	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	237	}
	238	++gd->gd_tdrunqcount;
	239
	240	/*
	241	* Request a LWKT reschedule if we are now at the head of the queue.
	242	*/
	243	if (TAILQ_FIRST(&gd->gd_tdrunq) == td)
	244	need_lwkt_resched();
	245	}
	246	}
	247
	248	static __boolean_t
	249	_lwkt_thread_ctor(void obj, void privdata, int ocflags)
	250	{
	251	struct thread td = (struct thread )obj;
	252
	253	td->td_kstack = NULL;
	254	td->td_kstack_size = 0;
	255	td->td_flags = TDF_ALLOCATED_THREAD;
	256	td->td_mpflags = 0;
	257	return (1);
	258	}
	259
	260	static void
	261	_lwkt_thread_dtor(void obj, void privdata)
	262	{
	263	struct thread td = (struct thread )obj;
	264
	265	KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
	266	("_lwkt_thread_dtor: not allocated from objcache"));
	267	KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
	268	td->td_kstack_size > 0,
	269	("_lwkt_thread_dtor: corrupted stack"));
	270	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	271	td->td_kstack = NULL;
	272	td->td_flags = 0;
	273	}
	274
	275	/*
	276	* Initialize the lwkt subsystem.
	277	*
	278	* Nominally cache up to 32 thread + kstack structures. Cache more on
	279	* systems with a lot of cpu cores.
	280	*/
	281	void
	282	lwkt_init(void)
	283	{
	284	TUNABLE_INT("lwkt.cache_threads", &lwkt_cache_threads);
	285	if (lwkt_cache_threads == 0) {
	286	lwkt_cache_threads = ncpus * 4;
	287	if (lwkt_cache_threads < 32)
	288	lwkt_cache_threads = 32;
	289	}
	290	thread_cache = objcache_create_mbacked(
	291	M_THREAD, sizeof(struct thread),
	292	0, lwkt_cache_threads,
	293	_lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
	294	}
	295
	296	/*
	297	* Schedule a thread to run. As the current thread we can always safely
	298	* schedule ourselves, and a shortcut procedure is provided for that
	299	* function.
	300	*
	301	* (non-blocking, self contained on a per cpu basis)
	302	*/
	303	void
	304	lwkt_schedule_self(thread_t td)
	305	{
	306	KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
	307	crit_enter_quick(td);
	308	KASSERT(td != &td->td_gd->gd_idlethread,
	309	("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
	310	KKASSERT(td->td_lwp == NULL \|\|
	311	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	312	_lwkt_enqueue(td);
	313	crit_exit_quick(td);
	314	}
	315
	316	/*
	317	* Deschedule a thread.
	318	*
	319	* (non-blocking, self contained on a per cpu basis)
	320	*/
	321	void
	322	lwkt_deschedule_self(thread_t td)
	323	{
	324	crit_enter_quick(td);
	325	_lwkt_dequeue(td);
	326	crit_exit_quick(td);
	327	}
	328
	329	/*
	330	* LWKTs operate on a per-cpu basis
	331	*
	332	* WARNING! Called from early boot, 'mycpu' may not work yet.
	333	*/
	334	void
	335	lwkt_gdinit(struct globaldata *gd)
	336	{
	337	TAILQ_INIT(&gd->gd_tdrunq);
	338	TAILQ_INIT(&gd->gd_tdallq);
	339	}
	340
	341	/*
	342	* Create a new thread. The thread must be associated with a process context
	343	* or LWKT start address before it can be scheduled. If the target cpu is
	344	* negative the thread is assigned to a cpu picked by a round-robin rotator.
	345	*
	346	* If you intend to create a thread without a process context this function
	347	* does everything except load the startup and switcher function.
	348	*/
	349	thread_t
	350	lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
	351	{
	352	static int cpu_rotator;
	353	globaldata_t gd = mycpu;
	354	void *stack;
	355
	356	/*
	357	* If static thread storage is not supplied allocate a thread. Reuse
	358	* a cached free thread if possible. gd_freetd is used to keep an exiting
	359	* thread intact through the exit.
	360	*/
	361	if (td == NULL) {
	362	crit_enter_gd(gd);
	363	if ((td = gd->gd_freetd) != NULL) {
	364	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK\|
	365	TDF_RUNQ)) == 0);
	366	gd->gd_freetd = NULL;
	367	} else {
	368	td = objcache_get(thread_cache, M_WAITOK);
	369	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK\|
	370	TDF_RUNQ)) == 0);
	371	}
	372	crit_exit_gd(gd);
	373	KASSERT((td->td_flags &
	374	(TDF_ALLOCATED_THREAD\|TDF_RUNNING\|TDF_PREEMPT_LOCK)) ==
	375	TDF_ALLOCATED_THREAD,
	376	("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
	377	flags \|= td->td_flags & (TDF_ALLOCATED_THREAD\|TDF_ALLOCATED_STACK);
	378	}
	379
	380	/*
	381	* Try to reuse cached stack.
	382	*/
	383	if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
	384	if (flags & TDF_ALLOCATED_STACK) {
	385	kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size);
	386	stack = NULL;
	387	}
	388	}
	389	if (stack == NULL) {
	390	stack = (void *)kmem_alloc_stack(&kernel_map, stksize);
	391	flags \|= TDF_ALLOCATED_STACK;
	392	}
	393	if (cpu < 0) {
	394	cpu = ++cpu_rotator;
	395	cpu_ccfence();
	396	cpu %= ncpus;
	397	}
	398	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
	399	return(td);
	400	}
	401
	402	/*
	403	* Initialize a preexisting thread structure. This function is used by
	404	* lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
	405	*
	406	* All threads start out in a critical section at a priority of
	407	* TDPRI_KERN_DAEMON. Higher level code will modify the priority as
	408	* appropriate. This function may send an IPI message when the
	409	* requested cpu is not the current cpu and consequently gd_tdallq may
	410	* not be initialized synchronously from the point of view of the originating
	411	* cpu.
	412	*
	413	* NOTE! we have to be careful in regards to creating threads for other cpus
	414	* if SMP has not yet been activated.
	415	*/
	416	static void
	417	lwkt_init_thread_remote(void *arg)
	418	{
	419	thread_t td = arg;
	420
	421	/*
	422	* Protected by critical section held by IPI dispatch
	423	*/
	424	TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
	425	}
	426
	427	/*
	428	* lwkt core thread structural initialization.
	429	*
	430	* NOTE: All threads are initialized as mpsafe threads.
	431	*/
	432	void
	433	lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
	434	struct globaldata *gd)
	435	{
	436	globaldata_t mygd = mycpu;
	437
	438	bzero(td, sizeof(struct thread));
	439	td->td_kstack = stack;
	440	td->td_kstack_size = stksize;
	441	td->td_flags = flags;
	442	td->td_mpflags = 0;
	443	td->td_type = TD_TYPE_GENERIC;
	444	td->td_gd = gd;
	445	td->td_pri = TDPRI_KERN_DAEMON;
	446	td->td_critcount = 1;
	447	td->td_toks_have = NULL;
	448	td->td_toks_stop = &td->td_toks_base;
	449	if (lwkt_use_spin_port \|\| (flags & TDF_FORCE_SPINPORT))
	450	lwkt_initport_spin(&td->td_msgport, td);
	451	else
	452	lwkt_initport_thread(&td->td_msgport, td);
	453	pmap_init_thread(td);
	454	/*
	455	* Normally initializing a thread for a remote cpu requires sending an
	456	* IPI. However, the idlethread is setup before the other cpus are
	457	* activated so we have to treat it as a special case. XXX manipulation
	458	* of gd_tdallq requires the BGL.
	459	*/
	460	if (gd == mygd \|\| td == &gd->gd_idlethread) {
	461	crit_enter_gd(mygd);
	462	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
	463	crit_exit_gd(mygd);
	464	} else {
	465	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
	466	}
	467	dsched_new_thread(td);
	468	}
	469
	470	void
	471	lwkt_set_comm(thread_t td, const char *ctl, ...)
	472	{
	473	__va_list va;
	474
	475	__va_start(va, ctl);
	476	kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
	477	__va_end(va);
	478	KTR_LOG(ctxsw_newtd, td, td->td_comm);
	479	}
	480
	481	/*
	482	* Prevent the thread from getting destroyed. Note that unlike PHOLD/PRELE
	483	* this does not prevent the thread from migrating to another cpu so the
	484	* gd_tdallq state is not protected by this.
	485	*/
	486	void
	487	lwkt_hold(thread_t td)
	488	{
	489	atomic_add_int(&td->td_refs, 1);
	490	}
	491
	492	void
	493	lwkt_rele(thread_t td)
	494	{
	495	KKASSERT(td->td_refs > 0);
	496	atomic_add_int(&td->td_refs, -1);
	497	}
	498
	499	void
	500	lwkt_free_thread(thread_t td)
	501	{
	502	KKASSERT(td->td_refs == 0);
	503	KKASSERT((td->td_flags & (TDF_RUNNING \| TDF_PREEMPT_LOCK \|
	504	TDF_RUNQ \| TDF_TSLEEPQ)) == 0);
	505	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	506	objcache_put(thread_cache, td);
	507	} else if (td->td_flags & TDF_ALLOCATED_STACK) {
	508	/* client-allocated struct with internally allocated stack */
	509	KASSERT(td->td_kstack && td->td_kstack_size > 0,
	510	("lwkt_free_thread: corrupted stack"));
	511	kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	512	td->td_kstack = NULL;
	513	td->td_kstack_size = 0;
	514	}
	515
	516	KTR_LOG(ctxsw_deadtd, td);
	517	}
	518
	519
	520	/*
	521	* Switch to the next runnable lwkt. If no LWKTs are runnable then
	522	* switch to the idlethread. Switching must occur within a critical
	523	* section to avoid races with the scheduling queue.
	524	*
	525	* We always have full control over our cpu's run queue. Other cpus
	526	* that wish to manipulate our queue must use the cpu_*msg() calls to
	527	* talk to our cpu, so a critical section is all that is needed and
	528	* the result is very, very fast thread switching.
	529	*
	530	* The LWKT scheduler uses a fixed priority model and round-robins at
	531	* each priority level. User process scheduling is a totally
	532	* different beast and LWKT priorities should not be confused with
	533	* user process priorities.
	534	*
	535	* PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
	536	* is not called by the current thread in the preemption case, only when
	537	* the preempting thread blocks (in order to return to the original thread).
	538	*
	539	* SPECIAL NOTE ON SWITCH ATOMICITY: Certain operations such as thread
	540	* migration and tsleep deschedule the current lwkt thread and call
	541	* lwkt_switch(). In particular, the target cpu of the migration fully
	542	* expects the thread to become non-runnable and can deadlock against
	543	* cpusync operations if we run any IPIs prior to switching the thread out.
	544	*
	545	* WE MUST BE VERY CAREFUL NOT TO RUN SPLZ DIRECTLY OR INDIRECTLY IF
	546	* THE CURRENT THREAD HAS BEEN DESCHEDULED!
	547	*/
	548	void
	549	lwkt_switch(void)
	550	{
	551	globaldata_t gd = mycpu;
	552	thread_t td = gd->gd_curthread;
	553	thread_t ntd;
		/* Counts passes through the retry loop below in which every candidate was contended */
	554	int spinning = 0;
	555
	556	KKASSERT(gd->gd_processing_ipiq == 0);
	557	KKASSERT(td->td_flags & TDF_RUNNING);
	558
	559	/*
	560	* Switching from within a 'fast' (non thread switched) interrupt or IPI
	561	* is illegal. However, we may have to do it anyway if we hit a fatal
	562	* kernel trap or we have panicked.
	563	*
	564	* If this case occurs save and restore the interrupt nesting level.
	565	*/
	566	if (gd->gd_intr_nesting_level) {
	567	int savegdnest;
	568	int savegdtrap;
	569
	570	if (gd->gd_trap_nesting_level == 0 && panic_cpu_gd != mycpu) {
	571	panic("lwkt_switch: Attempt to switch from a "
	572	"fast interrupt, ipi, or hard code section, "
	573	"td %p\n",
	574	td);
	575	} else {
	576	savegdnest = gd->gd_intr_nesting_level;
	577	savegdtrap = gd->gd_trap_nesting_level;
	578	gd->gd_intr_nesting_level = 0;
	579	gd->gd_trap_nesting_level = 0;
	580	if ((td->td_flags & TDF_PANICWARN) == 0) {
	581	td->td_flags \|= TDF_PANICWARN;
	582	kprintf("Warning: thread switch from interrupt, IPI, "
	583	"or hard code section.\n"
	584	"thread %p (%s)\n", td, td->td_comm);
	585	print_backtrace(-1);
	586	}
		/* Both nesting levels were zeroed above, so the recursive call skips this branch */
	587	lwkt_switch();
	588	gd->gd_intr_nesting_level = savegdnest;
	589	gd->gd_trap_nesting_level = savegdtrap;
	590	return;
	591	}
	592	}
	593
	594	/*
	595	* Release our current user process designation if we are blocking
	596	* or if a user reschedule was requested.
	597	*
	598	* NOTE: This function is NOT called if we are switching into or
	599	* returning from a preemption.
	600	*
	601	* NOTE: Releasing our current user process designation may cause
	602	* it to be assigned to another thread, which in turn will
	603	* cause us to block in the usched acquire code when we attempt
	604	* to return to userland.
	605	*
	606	* NOTE: On SMP systems this can be very nasty when heavy token
	607	* contention is present so we want to be careful not to
	608	* release the designation gratuitously.
	609	*/
	610	if (td->td_release &&
	611	(user_resched_wanted() \|\| (td->td_flags & TDF_RUNQ) == 0)) {
	612	td->td_release(td);
	613	}
	614
	615	/*
	616	* Release all tokens
	617	*/
		/* This critical section is exited by crit_exit_quick() at the end of the function */
	618	crit_enter_gd(gd);
	619	if (TD_TOKS_HELD(td))
	620	lwkt_relalltokens(td);
	621
	622	/*
	623	* We had better not be holding any spin locks, but don't get into an
	624	* endless panic loop.
	625	*/
	626	KASSERT(gd->gd_spinlocks == 0 \|\| panicstr != NULL,
	627	("lwkt_switch: still holding %d exclusive spinlocks!",
	628	gd->gd_spinlocks));
	629
	630
	631	#ifdef INVARIANTS
	632	if (td->td_cscount) {
	633	kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
	634	td);
	635	if (panic_on_cscount)
	636	panic("switching while mastering cpusync");
	637	}
	638	#endif
	639
	640	/*
	641	* If we had preempted another thread on this cpu, resume the preempted
	642	* thread. This occurs transparently, whether the preempted thread
	643	* was scheduled or not (it may have been preempted after descheduling
	644	* itself).
	645	*
	646	* We have to setup the MP lock for the original thread after backing
	647	* out the adjustment that was made to curthread when the original
	648	* was preempted.
	649	*/
	650	if ((ntd = td->td_preempted) != NULL) {
	651	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
	652	ntd->td_flags \|= TDF_PREEMPT_DONE;
	653
	654	/*
	655	* The interrupt may have woken a thread up, we need to properly
	656	* set the reschedule flag if the originally interrupted thread is
	657	* at a lower priority.
	658	*
	659	* The interrupt may not have descheduled.
	660	*/
	661	if (TAILQ_FIRST(&gd->gd_tdrunq) != ntd)
	662	need_lwkt_resched();
	663	goto havethread_preempted;
	664	}
	665
	666	/*
	667	* If we cannot obtain ownership of the tokens we cannot immediately
	668	* schedule the target thread.
	669	*
	670	* Reminder: Again, we cannot afford to run any IPIs in this path if
	671	* the current thread has been descheduled.
	672	*/
	673	for (;;) {
	674	clear_lwkt_resched();
	675
	676	/*
	677	* Hotpath - pull the head of the run queue and attempt to schedule
	678	* it.
	679	*/
	680	ntd = TAILQ_FIRST(&gd->gd_tdrunq);
	681
	682	if (ntd == NULL) {
	683	/*
	684	* Runq is empty, switch to idle to allow it to halt.
	685	*/
	686	ntd = &gd->gd_idlethread;
	687	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	688	ASSERT_NO_TOKENS_HELD(ntd);
	689	cpu_time.cp_msg[0] = 0;
	690	cpu_time.cp_stallpc = 0;
	691	goto haveidle;
	692	}
	693
	694	/*
	695	* Hotpath - schedule ntd.
	696	*
	697	* NOTE: For UP there is no mplock and lwkt_getalltokens()
	698	* always succeeds.
	699	*/
	700	if (TD_TOKS_NOT_HELD(ntd) \|\|
	701	lwkt_getalltokens(ntd, (spinning >= lwkt_spin_loops)))
	702	{
	703	goto havethread;
	704	}
	705
	706	/*
	707	* Coldpath (SMP only since tokens always succeed on UP)
	708	*
	709	* We had some contention on the thread we wanted to schedule.
	710	* What we do now is try to find a thread that we can schedule
	711	* in its stead.
	712	*
	713	* The coldpath scan does NOT rearrange threads in the run list.
	714	* The lwkt_schedulerclock() will assert need_lwkt_resched() on
	715	* the next tick whenever the current head is not the current thread.
	716	*/
	717	#ifdef INVARIANTS
	718	++ntd->td_contended;
	719	#endif
	720	++gd->gd_cnt.v_token_colls;
	721
	722	if (fairq_bypass > 0)
	723	goto skip;
	724
	725	while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
	726	#ifndef NO_LWKT_SPLIT_USERPRI
	727	/*
	728	* Never schedule threads returning to userland or the
	729	* user thread scheduler helper thread when higher priority
	730	* threads are present. The runq is sorted by priority
	731	* so we can give up traversing it when we find the first
	732	* low priority thread.
	733	*/
	734	if (ntd->td_pri < TDPRI_KERN_LPSCHED) {
	735	ntd = NULL;
	736	break;
	737	}
	738	#endif
	739
	740	/*
	741	* Try this one.
	742	*/
	743	if (TD_TOKS_NOT_HELD(ntd) \|\|
	744	lwkt_getalltokens(ntd, (spinning >= lwkt_spin_loops))) {
	745	goto havethread;
	746	}
	747	#ifdef INVARIANTS
	748	++ntd->td_contended;
	749	#endif
	750	++gd->gd_cnt.v_token_colls;
	751	}
	752
	753	skip:
	754	/*
	755	* We exhausted the run list, meaning that all runnable threads
	756	* are contested.
	757	*/
	758	cpu_pause();
	759	#ifdef _KERNEL_VIRTUAL
	760	pthread_yield();
	761	#endif
	762	ntd = &gd->gd_idlethread;
	763	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	764	ASSERT_NO_TOKENS_HELD(ntd);
	765	/* contention case, do not clear contention mask */
	766
	767	/*
	768	* We are going to have to retry but if the current thread is not
	769	* on the runq we instead switch through the idle thread to get away
	770	* from the current thread. We have to flag for lwkt reschedule
	771	* to prevent the idle thread from halting.
	772	*
	773	* NOTE: A non-zero spinning is passed to lwkt_getalltokens() to
	774	* instruct it to deal with the potential for deadlocks by
	775	* ordering the tokens by address.
	776	*/
	777	if ((td->td_flags & TDF_RUNQ) == 0) {
	778	need_lwkt_resched(); /* prevent hlt */
	779	goto haveidle;
	780	}
	781	#if defined(INVARIANTS) && defined(__x86_64__)
	782	if ((read_rflags() & PSL_I) == 0) {
	783	cpu_enable_intr();
	784	panic("lwkt_switch() called with interrupts disabled");
	785	}
	786	#endif
	787
	788	/*
	789	* Number iterations so far. After a certain point we switch to
	790	* a sorted-address/monitor/mwait version of lwkt_getalltokens()
	791	*/
		/* Saturate at 0x7FFFFFFF so the counter never overflows */
	792	if (spinning < 0x7FFFFFFF)
	793	++spinning;
	794
	795	#ifndef _KERNEL_VIRTUAL
	796	/*
	797	* lwkt_getalltokens() failed in sorted token mode, we can use
	798	* monitor/mwait in this case.
	799	*/
	800	if (spinning >= lwkt_spin_loops &&
	801	(cpu_mi_feature & CPU_MI_MONITOR) &&
	802	lwkt_spin_monitor)
	803	{
	804	cpu_mmw_pause_int(&gd->gd_reqflags,
	805	(gd->gd_reqflags \| RQF_SPINNING) &
	806	~RQF_IDLECHECK_WK_MASK,
	807	cpu_mwait_spin);
	808	}
	809	#endif
	810
	811	/*
	812	* We already checked that td is still scheduled so this should be
	813	* safe.
	814	*/
	815	splz_check();
	816
	817	#ifndef _KERNEL_VIRTUAL
	818	/*
	819	* This experimental resequencer is used as a fall-back to reduce
	820	* hw cache line contention by placing each core's scheduler into a
	821	* time-domain-multiplexed slot.
	822	*
	823	* The resequencer is disabled by default. Its functionality has
	824	* largely been superseded by the token algorithm which limits races
	825	* to a subset of cores.
	826	*
	827	* The resequencer algorithm tends to break down when more than
	828	* 20 cores are contending. What appears to happen is that new
	829	* tokens can be obtained out of address-sorted order by new cores
	830	* while existing cores languish in long delays between retries and
	831	* wind up being starved-out of the token acquisition.
	832	*/
	833	if (lwkt_spin_reseq && spinning >= lwkt_spin_reseq) {
		/* Take a ticket from windex, then wait until rindex reaches it */
	834	int cseq = atomic_fetchadd_int(&lwkt_cseq_windex, 1);
	835	int oseq;
	836
	837	while ((oseq = lwkt_cseq_rindex) != cseq) {
	838	cpu_ccfence();
	839	#if 1
	840	if (cpu_mi_feature & CPU_MI_MONITOR) {
	841	cpu_mmw_pause_int(&lwkt_cseq_rindex, oseq, cpu_mwait_spin);
	842	} else {
	843	#endif
	844	cpu_pause();
	845	cpu_lfence();
	846	#if 1
	847	}
	848	#endif
	849	}
	850	DELAY(1);
		/* Advance rindex so the holder of the next ticket can proceed */
	851	atomic_add_int(&lwkt_cseq_rindex, 1);
	852	}
	853	#endif
	854	/* highest level for(;;) loop */
	855	}
	856
	857	havethread:
	858	/*
	859	* Clear gd_idle_repeat when doing a normal switch to a non-idle
	860	* thread.
	861	*/
	862	ntd->td_wmesg = NULL;
	863	++gd->gd_cnt.v_swtch;
	864	gd->gd_idle_repeat = 0;
	865
	866	havethread_preempted:
	867	/*
	868	* If the new target does not need the MP lock and we are holding it,
	869	* release the MP lock. If the new target requires the MP lock we have
	870	* already acquired it for the target.
	871	*/
		/* NOTE(review): only an empty statement follows this label; no MP lock handling is visible in this function */
	872	;
	873	haveidle:
	874	KASSERT(ntd->td_critcount,
	875	("priority problem in lwkt_switch %d %d",
	876	td->td_critcount, ntd->td_critcount));
	877
	878	if (td != ntd) {
	879	/*
	880	* Execute the actual thread switch operation. This function
	881	* returns to the current thread and returns the previous thread
	882	* (which may be different from the thread we switched to).
	883	*
	884	* We are responsible for marking ntd as TDF_RUNNING.
	885	*/
	886	KKASSERT((ntd->td_flags & TDF_RUNNING) == 0);
	887	++switch_count;
	888	KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd);
	889	ntd->td_flags \|= TDF_RUNNING;
	890	lwkt_switch_return(td->td_switch(ntd));
	891	/* ntd invalid, td_switch() can return a different thread_t */
	892	}
	893
	894	/*
	895	* catch-all. XXX is this strictly needed?
	896	*/
	897	splz_check();
	898
	899	/* NOTE: current cpu may have changed after switch */
	900	crit_exit_quick(td);
	901	}
	902
	903	/*
	904	* Called by assembly in the td_switch (thread restore path) for thread
	905	* bootstrap cases which do not 'return' to lwkt_switch().
	906	*/
	907	void
	908	lwkt_switch_return(thread_t otd)
	909	{
	910	globaldata_t rgd;
	911
	912	/*
	913	* Check if otd was migrating. Now that we are on ntd we can finish
	914	* up the migration. This is a bit messy but it is the only place
	915	* where td is known to be fully descheduled.
	916	*
	917	* We can only activate the migration if otd was migrating but not
	918	* held on the cpu due to a preemption chain. We still have to
	919	* clear TDF_RUNNING on the old thread either way.
	920	*
	921	* We are responsible for clearing the previously running thread's
	922	* TDF_RUNNING.
	923	*/
	924	if ((rgd = otd->td_migrate_gd) != NULL &&
	925	(otd->td_flags & TDF_PREEMPT_LOCK) == 0) {
	926	KKASSERT((otd->td_flags & (TDF_MIGRATING \| TDF_RUNNING)) ==
	927	(TDF_MIGRATING \| TDF_RUNNING));
	928	otd->td_migrate_gd = NULL;
	929	otd->td_flags &= ~TDF_RUNNING;
	930	lwkt_send_ipiq(rgd, lwkt_setcpu_remote, otd);
	931	} else {
	932	otd->td_flags &= ~TDF_RUNNING;
	933	}
	934
	935	/*
	936	* Final exit validations (see lwp_wait()). Note that otd becomes
	937	* invalid the instant we set TDF_MP_EXITSIG.
	938	*/
	939	while (otd->td_flags & TDF_EXITING) {
	940	u_int mpflags;
	941
	942	mpflags = otd->td_mpflags;
	943	cpu_ccfence();
	944
	945	if (mpflags & TDF_MP_EXITWAIT) {
	946	if (atomic_cmpset_int(&otd->td_mpflags, mpflags,
	947	mpflags \| TDF_MP_EXITSIG)) {
	948	wakeup(otd);
	949	break;
	950	}
	951	} else {
	952	if (atomic_cmpset_int(&otd->td_mpflags, mpflags,
	953	mpflags \| TDF_MP_EXITSIG)) {
	954	wakeup(otd);
	955	break;
	956	}
	957	}
	958	}
	959	}
	960
	961	/*
	962	* Request that the target thread preempt the current thread. Preemption
	963	* can only occur if our only critical section is the one that we were called
	964	* with, the relative priority of the target thread is higher, and the target
	965	* thread holds no tokens. This also only works if we are not holding any
	966	* spinlocks (obviously).
	967	*
	968	* THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
	969	* this is called via lwkt_schedule() through the td_preemptable callback.
	970	* critcount is the managed critical priority that we should ignore in order
	971	* to determine whether preemption is possible (aka usually just the crit
	972	* priority of lwkt_schedule() itself).
	973	*
	974	* Preemption is typically limited to interrupt threads.
	975	*
	976	* Operation works in a fairly straight-forward manner. The normal
	977	* scheduling code is bypassed and we switch directly to the target
	978	* thread. When the target thread attempts to block or switch away
	979	* code at the base of lwkt_switch() will switch directly back to our
	980	* thread. Our thread is able to retain whatever tokens it holds and
	981	* if the target needs one of them the target will switch back to us
	982	* and reschedule itself normally.
	983	*/
	984	void
	985	lwkt_preempt(thread_t ntd, int critcount)
	986	{
	987	struct globaldata *gd = mycpu;
	988	thread_t xtd;
	989	thread_t td;
	990	int save_gd_intr_nesting_level;
	991
	992	/*
	993	* The caller has put us in a critical section. We can only preempt
	994	* if the caller of the caller was not in a critical section (basically
	995	* a local interrupt), as determined by the 'critcount' parameter. We
	996	* also can't preempt if the caller is holding any spinlocks (even if
	997	* he isn't in a critical section). This also handles the tokens test.
	998	*
	999	* YYY The target thread must be in a critical section (else it must
	1000	* inherit our critical section? I dunno yet).
	1001	*/
	1002	KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri));
	1003
	1004	td = gd->gd_curthread;
	1005	if (preempt_enable == 0) {
	1006	++preempt_miss;
	1007	return;
	1008	}
	1009	if (ntd->td_pri <= td->td_pri) {
	1010	++preempt_miss;
	1011	return;
	1012	}
	1013	if (td->td_critcount > critcount) {
	1014	++preempt_miss;
	1015	return;
	1016	}
	1017	if (td->td_cscount) {
	1018	++preempt_miss;
	1019	return;
	1020	}
	1021	if (ntd->td_gd != gd) {
	1022	++preempt_miss;
	1023	return;
	1024	}
	1025	/*
	1026	* We don't have to check spinlocks here as they will also bump
	1027	* td_critcount.
	1028	*
	1029	* Do not try to preempt if the target thread is holding any tokens.
	1030	* We could try to acquire the tokens but this case is so rare there
	1031	* is no need to support it.
	1032	*/
	1033	KKASSERT(gd->gd_spinlocks == 0);
	1034
	1035	if (TD_TOKS_HELD(ntd)) {
	1036	++preempt_miss;
	1037	return;
	1038	}
	1039	if (td == ntd \|\| ((td->td_flags \| ntd->td_flags) & TDF_PREEMPT_LOCK)) {
	1040	++preempt_weird;
	1041	return;
	1042	}
	1043	if (ntd->td_preempted) {
	1044	++preempt_hit;
	1045	return;
	1046	}
	1047	KKASSERT(gd->gd_processing_ipiq == 0);
	1048
	1049	/*
	1050	* Since we are able to preempt the current thread, there is no need to
	1051	* call need_lwkt_resched().
	1052	*
	1053	* We must temporarily clear gd_intr_nesting_level around the switch
	1054	* since switchouts from the target thread are allowed (they will just
	1055	* return to our thread), and since the target thread has its own stack.
	1056	*
	1057	* A preemption must switch back to the original thread, assert the
	1058	* case.
	1059	*/
	1060	++preempt_hit;
	1061	ntd->td_preempted = td;
	1062	td->td_flags \|= TDF_PREEMPT_LOCK;
	1063	KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd);
	1064	save_gd_intr_nesting_level = gd->gd_intr_nesting_level;
	1065	gd->gd_intr_nesting_level = 0;
	1066
	1067	KKASSERT((ntd->td_flags & TDF_RUNNING) == 0);
	1068	ntd->td_flags \|= TDF_RUNNING;
	1069	xtd = td->td_switch(ntd);
	1070	KKASSERT(xtd == ntd);
	1071	lwkt_switch_return(xtd);
	1072	gd->gd_intr_nesting_level = save_gd_intr_nesting_level;
	1073
	1074	KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
	1075	ntd->td_preempted = NULL;
	1076	td->td_flags &= ~(TDF_PREEMPT_LOCK\|TDF_PREEMPT_DONE);
	1077	}
	1078
	1079	/*
	1080	* Conditionally call splz() if gd_reqflags indicates work is pending.
	1081	* This will work inside a critical section but not inside a hard code
	1082	* section.
	1083	*
	1084	* (self contained on a per cpu basis)
	1085	*/
	1086	void
	1087	splz_check(void)
	1088	{
	1089	globaldata_t gd = mycpu;
	1090	thread_t td = gd->gd_curthread;
	1091
	1092	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) &&
	1093	gd->gd_intr_nesting_level == 0 &&
	1094	td->td_nest_count < 2)
	1095	{
	1096	splz();
	1097	}
	1098	}
	1099
	1100	/*
	1101	* This version is integrated into crit_exit, reqflags has already
	1102	* been tested but td_critcount has not.
	1103	*
	1104	* We only want to execute the splz() on the 1->0 transition of
	1105	* critcount and not in a hard code section or if too deeply nested.
	1106	*
	1107	* NOTE: gd->gd_spinlocks is implied to be 0 when td_critcount is 0.
	1108	*/
	1109	void
	1110	lwkt_maybe_splz(thread_t td)
	1111	{
	1112	globaldata_t gd = td->td_gd;
	1113
	1114	if (td->td_critcount == 0 &&
	1115	gd->gd_intr_nesting_level == 0 &&
	1116	td->td_nest_count < 2)
	1117	{
	1118	splz();
	1119	}
	1120	}
	1121
	1122	/*
	1123	* Drivers which set up processing co-threads can call this function to
	1124	* run the co-thread at a higher priority and to allow it to preempt
	1125	* normal threads.
	1126	*/
	1127	void
	1128	lwkt_set_interrupt_support_thread(void)
	1129	{
	1130	thread_t td = curthread;
	1131
	1132	lwkt_setpri_self(TDPRI_INT_SUPPORT);
	1133	td->td_flags \|= TDF_INTTHREAD;
	1134	td->td_preemptable = lwkt_preempt;
	1135	}
	1136
	1137
	1138	/*
	1139	* This function is used to negotiate a passive release of the current
	1140	* process/lwp designation with the user scheduler, allowing the user
	1141	* scheduler to schedule another user thread. The related kernel thread
	1142	* (curthread) continues running in the released state.
	1143	*/
	1144	void
	1145	lwkt_passive_release(struct thread *td)
	1146	{
	1147	struct lwp *lp = td->td_lwp;
	1148
	1149	#ifndef NO_LWKT_SPLIT_USERPRI
	1150	td->td_release = NULL;
	1151	lwkt_setpri_self(TDPRI_KERN_USER);
	1152	#endif
	1153
	1154	lp->lwp_proc->p_usched->release_curproc(lp);
	1155	}
	1156
	1157
	1158	/*
	1159	* This implements a LWKT yield, allowing a kernel thread to yield to other
	1160	* kernel threads at the same or higher priority. This function can be
	1161	* called in a tight loop and will typically only yield once per tick.
	1162	*
	1163	* Most kernel threads run at the same priority in order to allow equal
	1164	* sharing.
	1165	*
	1166	* (self contained on a per cpu basis)
	1167	*/
	1168	void
	1169	lwkt_yield(void)
	1170	{
	1171	globaldata_t gd = mycpu;
	1172	thread_t td = gd->gd_curthread;
	1173
	1174	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1175	splz();
	1176	if (lwkt_resched_wanted()) {
	1177	lwkt_schedule_self(curthread);
	1178	lwkt_switch();
	1179	}
	1180	}
	1181
	1182	/*
	1183	* The quick version processes pending interrupts and higher-priority
	1184	* LWKT threads but will not round-robin same-priority LWKT threads.
	1185	*
	1186	* When called while attempting to return to userland the only same-pri
	1187	* threads are the ones which have already tried to become the current
	1188	* user process.
	1189	*/
	1190	void
	1191	lwkt_yield_quick(void)
	1192	{
	1193	globaldata_t gd = mycpu;
	1194	thread_t td = gd->gd_curthread;
	1195
	1196	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1197	splz();
	1198	if (lwkt_resched_wanted()) {
	1199	crit_enter();
	1200	if (TAILQ_FIRST(&gd->gd_tdrunq) == td) {
	1201	clear_lwkt_resched();
	1202	} else {
	1203	lwkt_schedule_self(curthread);
	1204	lwkt_switch();
	1205	}
	1206	crit_exit();
	1207	}
	1208	}
	1209
	1210	/*
	1211	* This yield is designed for kernel threads with a user context.
	1212	*
	1213	* The kernel acting on behalf of the user is potentially cpu-bound,
	1214	* this function will efficiently allow other threads to run and also
	1215	* switch to other processes by releasing.
	1216	*
	1217	* The lwkt_user_yield() function is designed to have very low overhead
	1218	* if no yield is determined to be needed.
	1219	*/
	1220	void
	1221	lwkt_user_yield(void)
	1222	{
	1223	globaldata_t gd = mycpu;
	1224	thread_t td = gd->gd_curthread;
	1225
	1226	/*
	1227	* Always run any pending interrupts in case we are in a critical
	1228	* section.
	1229	*/
	1230	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1231	splz();
	1232
	1233	/*
	1234	* Switch (which forces a release) if another kernel thread needs
	1235	* the cpu, if userland wants us to resched, or if our kernel
	1236	* quantum has run out.
	1237	*/
	1238	if (lwkt_resched_wanted() \|\|
	1239	user_resched_wanted())
	1240	{
	1241	lwkt_switch();
	1242	}
	1243
	1244	#if 0
	1245	/*
	1246	* Reacquire the current process if we are released.
	1247	*
	1248	* XXX not implemented atm. The kernel may be holding locks and such,
	1249	* so we want the thread to continue to receive cpu.
	1250	*/
	1251	if (td->td_release == NULL && lp) {
	1252	lp->lwp_proc->p_usched->acquire_curproc(lp);
	1253	td->td_release = lwkt_passive_release;
	1254	lwkt_setpri_self(TDPRI_USER_NORM);
	1255	}
	1256	#endif
	1257	}
	1258
	1259	/*
	1260	* Generic schedule. Possibly schedule threads belonging to other cpus and
	1261	* deal with threads that might be blocked on a wait queue.
	1262	*
	1263	* We have a little helper inline function which does additional work after
	1264	* the thread has been enqueued, including dealing with preemption and
	1265	* setting need_lwkt_resched() (which prevents the kernel from returning
	1266	* to userland until it has processed higher priority threads).
	1267	*
	1268	* It is possible for this routine to be called after a failed _enqueue
	1269	* (due to the target thread migrating, sleeping, or otherwise blocked).
	1270	* We have to check that the thread is actually on the run queue!
	1271	*/
	1272	static __inline
	1273	void
	1274	_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount)
	1275	{
	1276	if (ntd->td_flags & TDF_RUNQ) {
	1277	if (ntd->td_preemptable) {
	1278	ntd->td_preemptable(ntd, ccount); /* YYY +token */
	1279	}
	1280	}
	1281	}
	1282
	1283	static __inline
	1284	void
	1285	_lwkt_schedule(thread_t td)
	1286	{
	1287	globaldata_t mygd = mycpu;
	1288
	1289	KASSERT(td != &td->td_gd->gd_idlethread,
	1290	("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
	1291	KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
	1292	crit_enter_gd(mygd);
	1293	KKASSERT(td->td_lwp == NULL \|\|
	1294	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	1295
	1296	if (td == mygd->gd_curthread) {
	1297	_lwkt_enqueue(td);
	1298	} else {
	1299	/*
	1300	* If we own the thread, there is no race (since we are in a
	1301	* critical section). If we do not own the thread there might
	1302	* be a race but the target cpu will deal with it.
	1303	*/
	1304	if (td->td_gd == mygd) {
	1305	_lwkt_enqueue(td);
	1306	_lwkt_schedule_post(mygd, td, 1);
	1307	} else {
	1308	lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
	1309	}
	1310	}
	1311	crit_exit_gd(mygd);
	1312	}
	1313
	1314	void
	1315	lwkt_schedule(thread_t td)
	1316	{
	1317	_lwkt_schedule(td);
	1318	}
	1319
	1320	void
	1321	lwkt_schedule_noresched(thread_t td) /* XXX not impl */
	1322	{
	1323	_lwkt_schedule(td);
	1324	}
	1325
	1326	/*
	1327	* When scheduled remotely if frame != NULL the IPIQ is being
	1328	* run via doreti or an interrupt then preemption can be allowed.
	1329	*
	1330	* To allow preemption we have to drop the critical section so only
	1331	* one is present in _lwkt_schedule_post.
	1332	*/
	1333	static void
	1334	lwkt_schedule_remote(void arg, int arg2, struct intrframe frame)
	1335	{
	1336	thread_t td = curthread;
	1337	thread_t ntd = arg;
	1338
	1339	if (frame && ntd->td_preemptable) {
	1340	crit_exit_noyield(td);
	1341	_lwkt_schedule(ntd);
	1342	crit_enter_quick(td);
	1343	} else {
	1344	_lwkt_schedule(ntd);
	1345	}
	1346	}
	1347
	1348	/*
	1349	* Thread migration using a 'Pull' method. The thread may or may not be
	1350	* the current thread. It MUST be descheduled and in a stable state.
	1351	* lwkt_giveaway() must be called on the cpu owning the thread.
	1352	*
	1353	* At any point after lwkt_giveaway() is called, the target cpu may
	1354	* 'pull' the thread by calling lwkt_acquire().
	1355	*
	1356	* We have to make sure the thread is not sitting on a per-cpu tsleep
	1357	* queue or it will blow up when it moves to another cpu.
	1358	*
	1359	* MPSAFE - must be called under very specific conditions.
	1360	*/
	1361	void
	1362	lwkt_giveaway(thread_t td)
	1363	{
	1364	globaldata_t gd = mycpu;
	1365
	1366	crit_enter_gd(gd);
	1367	if (td->td_flags & TDF_TSLEEPQ)
	1368	tsleep_remove(td);
	1369	KKASSERT(td->td_gd == gd);
	1370	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	1371	td->td_flags \|= TDF_MIGRATING;
	1372	crit_exit_gd(gd);
	1373	}
	1374
	1375	void
	1376	lwkt_acquire(thread_t td)
	1377	{
	1378	globaldata_t gd;
	1379	globaldata_t mygd;
	1380	int retry = 10000000;
	1381
	1382	KKASSERT(td->td_flags & TDF_MIGRATING);
	1383	gd = td->td_gd;
	1384	mygd = mycpu;
	1385	if (gd != mycpu) {
	1386	cpu_lfence();
	1387	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1388	crit_enter_gd(mygd);
	1389	DEBUG_PUSH_INFO("lwkt_acquire");
	1390	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) {
	1391	lwkt_process_ipiq();
	1392	cpu_lfence();
	1393	if (--retry == 0) {
	1394	kprintf("lwkt_acquire: stuck: td %p td->td_flags %08x\n",
	1395	td, td->td_flags);
	1396	retry = 10000000;
	1397	}
	1398	#ifdef _KERNEL_VIRTUAL
	1399	pthread_yield();
	1400	#endif
	1401	}
	1402	DEBUG_POP_INFO();
	1403	cpu_mfence();
	1404	td->td_gd = mygd;
	1405	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1406	td->td_flags &= ~TDF_MIGRATING;
	1407	crit_exit_gd(mygd);
	1408	} else {
	1409	crit_enter_gd(mygd);
	1410	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1411	td->td_flags &= ~TDF_MIGRATING;
	1412	crit_exit_gd(mygd);
	1413	}
	1414	}
	1415
	1416	/*
	1417	* Generic deschedule. Descheduling threads other than your own should be
	1418	* done only in carefully controlled circumstances. Descheduling is
	1419	* asynchronous.
	1420	*
	1421	* This function may block if the cpu has run out of messages.
	1422	*/
	1423	void
	1424	lwkt_deschedule(thread_t td)
	1425	{
	1426	crit_enter();
	1427	if (td == curthread) {
	1428	_lwkt_dequeue(td);
	1429	} else {
	1430	if (td->td_gd == mycpu) {
	1431	_lwkt_dequeue(td);
	1432	} else {
	1433	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
	1434	}
	1435	}
	1436	crit_exit();
	1437	}
	1438
	1439	/*
	1440	* Set the target thread's priority. This routine does not automatically
	1441	* switch to a higher priority thread, LWKT threads are not designed for
	1442	* continuous priority changes. Yield if you want to switch.
	1443	*/
	1444	void
	1445	lwkt_setpri(thread_t td, int pri)
	1446	{
	1447	if (td->td_pri != pri) {
	1448	KKASSERT(pri >= 0);
	1449	crit_enter();
	1450	if (td->td_flags & TDF_RUNQ) {
	1451	KKASSERT(td->td_gd == mycpu);
	1452	_lwkt_dequeue(td);
	1453	td->td_pri = pri;
	1454	_lwkt_enqueue(td);
	1455	} else {
	1456	td->td_pri = pri;
	1457	}
	1458	crit_exit();
	1459	}
	1460	}
	1461
	1462	/*
	1463	* Set the initial priority for a thread prior to it being scheduled for
	1464	* the first time. The thread MUST NOT be scheduled before or during
	1465	* this call. The thread may be assigned to a cpu other than the current
	1466	* cpu.
	1467	*
	1468	* Typically used after a thread has been created with TDF_NOSTART,
	1469	* and before the thread is initially scheduled.
	1470	*/
	1471	void
	1472	lwkt_setpri_initial(thread_t td, int pri)
	1473	{
	1474	KKASSERT(pri >= 0);
	1475	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
	1476	td->td_pri = pri;
	1477	}
	1478
	1479	void
	1480	lwkt_setpri_self(int pri)
	1481	{
	1482	thread_t td = curthread;
	1483
	1484	KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
	1485	crit_enter();
	1486	if (td->td_flags & TDF_RUNQ) {
	1487	_lwkt_dequeue(td);
	1488	td->td_pri = pri;
	1489	_lwkt_enqueue(td);
	1490	} else {
	1491	td->td_pri = pri;
	1492	}
	1493	crit_exit();
	1494	}
	1495
	1496	/*
	1497	* hz tick scheduler clock for LWKT threads
	1498	*/
	1499	void
	1500	lwkt_schedulerclock(thread_t td)
	1501	{
	1502	globaldata_t gd = td->td_gd;
	1503	thread_t xtd;
	1504
	1505	if (TAILQ_FIRST(&gd->gd_tdrunq) == td) {
	1506	/*
	1507	* If the current thread is at the head of the runq shift it to the
	1508	* end of any equal-priority threads and request a LWKT reschedule
	1509	* if it moved.
	1510	*
	1511	* Ignore upri in this situation. There will only be one user thread
	1512	* in user mode, all others will be user threads running in kernel
	1513	* mode and we have to make sure they get some cpu.
	1514	*/
	1515	xtd = TAILQ_NEXT(td, td_threadq);
	1516	if (xtd && xtd->td_pri == td->td_pri) {
	1517	TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
	1518	while (xtd && xtd->td_pri == td->td_pri)
	1519	xtd = TAILQ_NEXT(xtd, td_threadq);
	1520	if (xtd)
	1521	TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
	1522	else
	1523	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	1524	need_lwkt_resched();
	1525	}
	1526	} else {
	1527	/*
	1528	* If we scheduled a thread other than the one at the head of the
	1529	* queue always request a reschedule every tick.
	1530	*/
	1531	need_lwkt_resched();
	1532	}
	1533	}
	1534
	1535	/*
	1536	* Migrate the current thread to the specified cpu.
	1537	*
	1538	* This is accomplished by descheduling ourselves from the current cpu
	1539	* and setting td_migrate_gd. The lwkt_switch() code will detect that the
	1540	* 'old' thread wants to migrate after it has been completely switched out
	1541	* and will complete the migration.
	1542	*
	1543	* TDF_MIGRATING prevents scheduling races while the thread is being migrated.
	1544	*
	1545	* We must be sure to release our current process designation (if a user
	1546	* process) before clearing out any tsleepq we are on because the release
	1547	* code may re-add us.
	1548	*
	1549	* We must be sure to remove ourselves from the current cpu's tsleepq
	1550	* before potentially moving to another queue. The thread can be on
	1551	* a tsleepq due to a left-over tsleep_interlock().
	1552	*/
	1553
	1554	void
	1555	lwkt_setcpu_self(globaldata_t rgd)
	1556	{
	1557	thread_t td = curthread;
	1558
	1559	if (td->td_gd != rgd) {
	1560	crit_enter_quick(td);
	1561
	1562	if (td->td_release)
	1563	td->td_release(td);
	1564	if (td->td_flags & TDF_TSLEEPQ)
	1565	tsleep_remove(td);
	1566
	1567	/*
	1568	* Set TDF_MIGRATING to prevent a spurious reschedule while we are
	1569	* trying to deschedule ourselves and switch away, then deschedule
	1570	* ourself, remove us from tdallq, and set td_migrate_gd. Finally,
	1571	* call lwkt_switch() to complete the operation.
	1572	*/
	1573	td->td_flags \|= TDF_MIGRATING;
	1574	lwkt_deschedule_self(td);
	1575	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1576	td->td_migrate_gd = rgd;
	1577	lwkt_switch();
	1578
	1579	/*
	1580	* We are now on the target cpu
	1581	*/
	1582	KKASSERT(rgd == mycpu);
	1583	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
	1584	crit_exit_quick(td);
	1585	}
	1586	}
	1587
	1588	void
	1589	lwkt_migratecpu(int cpuid)
	1590	{
	1591	globaldata_t rgd;
	1592
	1593	rgd = globaldata_find(cpuid);
	1594	lwkt_setcpu_self(rgd);
	1595	}
	1596
	1597	/*
	1598	* Remote IPI for cpu migration (called while in a critical section so we
	1599	* do not have to enter another one).
	1600	*
	1601	* The thread (td) has already been completely descheduled from the
	1602	* originating cpu and we can simply assert the case. The thread is
	1603	* assigned to the new cpu and enqueued.
	1604	*
	1605	* The thread will re-add itself to tdallq when it resumes execution.
	1606	*/
	1607	static void
	1608	lwkt_setcpu_remote(void *arg)
	1609	{
	1610	thread_t td = arg;
	1611	globaldata_t gd = mycpu;
	1612
	1613	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) == 0);
	1614	td->td_gd = gd;
	1615	cpu_mfence();
	1616	td->td_flags &= ~TDF_MIGRATING;
	1617	KKASSERT(td->td_migrate_gd == NULL);
	1618	KKASSERT(td->td_lwp == NULL \|\|
	1619	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	1620	_lwkt_enqueue(td);
	1621	}
	1622
	1623	struct lwp *
	1624	lwkt_preempted_proc(void)
	1625	{
	1626	thread_t td = curthread;
	1627	while (td->td_preempted)
	1628	td = td->td_preempted;
	1629	return(td->td_lwp);
	1630	}
	1631
	1632	/*
	1633	* Create a kernel process/thread/whatever. It shares its address space
	1634	* with proc0 - ie: kernel only.
	1635	*
	1636	* If the cpu is not specified one will be selected. In the future
	1637	* specifying a cpu of -1 will enable kernel thread migration between
	1638	* cpus.
	1639	*/
	1640	int
	1641	lwkt_create(void (func)(void ), void arg, struct thread *tdp,
	1642	thread_t template, int tdflags, int cpu, const char *fmt, ...)
	1643	{
	1644	thread_t td;
	1645	__va_list ap;
	1646
	1647	td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
	1648	tdflags);
	1649	if (tdp)
	1650	*tdp = td;
	1651	cpu_set_thread_handler(td, lwkt_exit, func, arg);
	1652
	1653	/*
	1654	* Set up arg0 for 'ps' etc
	1655	*/
	1656	__va_start(ap, fmt);
	1657	kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
	1658	__va_end(ap);
	1659
	1660	/*
	1661	* Schedule the thread to run
	1662	*/
	1663	if (td->td_flags & TDF_NOSTART)
	1664	td->td_flags &= ~TDF_NOSTART;
	1665	else
	1666	lwkt_schedule(td);
	1667	return 0;
	1668	}
	1669
	1670	/*
	1671	* Destroy an LWKT thread. Warning! This function is not called when
	1672	* a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
	1673	* uses a different reaping mechanism.
	1674	*/
	1675	void
	1676	lwkt_exit(void)
	1677	{
	1678	thread_t td = curthread;
	1679	thread_t std;
	1680	globaldata_t gd;
	1681
	1682	/*
	1683	* Do any cleanup that might block here
	1684	*/
	1685	if (td->td_flags & TDF_VERBOSE)
	1686	kprintf("kthread %p %s has exited\n", td, td->td_comm);
	1687	biosched_done(td);
	1688	dsched_exit_thread(td);
	1689
	1690	/*
	1691	* Get us into a critical section to interlock gd_freetd and loop
	1692	* until we can get it freed.
	1693	*
	1694	* We have to cache the current td in gd_freetd because objcache_put()ing
	1695	* it would rip it out from under us while our thread is still active.
	1696	*
	1697	* We are the current thread so of course our own TDF_RUNNING bit will
	1698	* be set, so unlike the lwp reap code we don't wait for it to clear.
	1699	*/
	1700	gd = mycpu;
	1701	crit_enter_quick(td);
	1702	for (;;) {
	1703	if (td->td_refs) {
	1704	tsleep(td, 0, "tdreap", 1);
	1705	continue;
	1706	}
	1707	if ((std = gd->gd_freetd) != NULL) {
	1708	KKASSERT((std->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) == 0);
	1709	gd->gd_freetd = NULL;
	1710	objcache_put(thread_cache, std);
	1711	continue;
	1712	}
	1713	break;
	1714	}
	1715
	1716	/*
	1717	* Remove thread resources from kernel lists and deschedule us for
	1718	* the last time. We cannot block after this point or we may end
	1719	* up with a stale td on the tsleepq.
	1720	*
	1721	* None of this may block, the critical section is the only thing
	1722	* protecting tdallq and the only thing preventing new lwkt_hold()
	1723	* thread refs now.
	1724	*/
	1725	if (td->td_flags & TDF_TSLEEPQ)
	1726	tsleep_remove(td);
	1727	lwkt_deschedule_self(td);
	1728	lwkt_remove_tdallq(td);
	1729	KKASSERT(td->td_refs == 0);
	1730
	1731	/*
	1732	* Final cleanup
	1733	*/
	1734	KKASSERT(gd->gd_freetd == NULL);
	1735	if (td->td_flags & TDF_ALLOCATED_THREAD)
	1736	gd->gd_freetd = td;
	1737	cpu_thread_exit();
	1738	}
	1739
	1740	void
	1741	lwkt_remove_tdallq(thread_t td)
	1742	{
	1743	KKASSERT(td->td_gd == mycpu);
	1744	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1745	}
	1746
	1747	/*
	1748	* Code reduction and branch prediction improvements. Call/return
	1749	* overhead on modern cpus often degenerates into 0 cycles due to
	1750	* the cpu's branch prediction hardware and return pc cache. We
	1751	* can take advantage of this by not inlining medium-complexity
	1752	* functions and we can also reduce the branch prediction impact
	1753	* by collapsing perfectly predictable branches into a single
	1754	* procedure instead of duplicating it.
	1755	*
	1756	* Is any of this noticeable? Probably not, so I'll take the
	1757	* smaller code size.
	1758	*/
	1759	void
	1760	crit_exit_wrapper(__DEBUG_CRIT_ARG__)
	1761	{
	1762	_crit_exit(mycpu __DEBUG_CRIT_PASS_ARG__);
	1763	}
	1764
	1765	void
	1766	crit_panic(void)
	1767	{
	1768	thread_t td = curthread;
	1769	int lcrit = td->td_critcount;
	1770
	1771	td->td_critcount = 0;
	1772	panic("td_critcount is/would-go negative! %p %d", td, lcrit);
	1773	/* NOT REACHED */
	1774	}
	1775
	1776	/*
	1777	* Called from debugger/panic on cpus which have been stopped. We must still
	1778	* process the IPIQ while stopped, even if we were stopped while in a critical
	1779	* section (XXX).
	1780	*
	1781	* If we are dumping also try to process any pending interrupts. This may
	1782	* or may not work depending on the state of the cpu at the point it was
	1783	* stopped.
	1784	*/
	1785	void
	1786	lwkt_smp_stopped(void)
	1787	{
	1788	globaldata_t gd = mycpu;
	1789
	1790	crit_enter_gd(gd);
	1791	if (dumping) {
	1792	lwkt_process_ipiq();
	1793	splz();
	1794	} else {
	1795	lwkt_process_ipiq();
	1796	}
	1797	crit_exit_gd(gd);
	1798	}