gitweb.dragonflybsd.org Git - dragonfly.git/blame

Commit	Line	Data
8ad65e08	1	/*
b12defdc	2	* Copyright (c) 2003-2011 The DragonFly Project. All rights reserved.
60f60350	3	*
8c10bfcf MD	4	* This code is derived from software contributed to The DragonFly Project
8c10bfcf MD	5	* by Matthew Dillon <dillon@backplane.com>
60f60350	6	*
8ad65e08 MD	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
60f60350	10	*
8ad65e08 MD	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
8c10bfcf MD	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
60f60350	20	*
8c10bfcf MD	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
8ad65e08	32	* SUCH DAMAGE.
75cdbe6c MD	33	*/
	34
	35	/*
	36	* Each cpu in a system has its own self-contained light weight kernel
	37	* thread scheduler, which means that generally speaking we only need
dae65060	38	* to use a critical section to avoid problems. Foreign thread
75cdbe6c	39	* scheduling is queued via (async) IPIs.
8ad65e08 MD	40	*/
	41
	42	#include <sys/param.h>
	43	#include <sys/systm.h>
	44	#include <sys/kernel.h>
	45	#include <sys/proc.h>
	46	#include <sys/rtprio.h>
b37f18d6	47	#include <sys/kinfo.h>
dae65060	48	#include <sys/malloc.h>
8ad65e08	49	#include <sys/queue.h>
7d0bac62	50	#include <sys/sysctl.h>
99df837e	51	#include <sys/kthread.h>
f1d1c3fa	52	#include <machine/cpu.h>
99df837e	53	#include <sys/lock.h>
9d265729	54	#include <sys/spinlock.h>
57aa743c	55	#include <sys/ktr.h>
5b49787b	56	#include <sys/indefinite.h>
9d265729 MD	57
	58	#include <sys/thread2.h>
	59	#include <sys/spinlock2.h>
5b49787b	60	#include <sys/indefinite2.h>
f1d1c3fa	61
8c72e3d5 AH	62	#include <sys/dsched.h>
8c72e3d5 AH	63
7d0bac62 MD	64	#include <vm/vm.h>
	65	#include <vm/vm_param.h>
	66	#include <vm/vm_kern.h>
	67	#include <vm/vm_object.h>
	68	#include <vm/vm_page.h>
	69	#include <vm/vm_map.h>
	70	#include <vm/vm_pager.h>
	71	#include <vm/vm_extern.h>
7d0bac62	72
99df837e	73	#include <machine/stdarg.h>
96728c05	74	#include <machine/smp.h>
3a06728e	75	#include <machine/clock.h>
99df837e	76
3a06728e MD	77	#define LOOPMASK
3a06728e MD	78
d850923c AE	79	#if !defined(KTR_CTXSW)
	80	#define KTR_CTXSW KTR_ALL
	81	#endif
	82	KTR_INFO_MASTER(ctxsw);
5bf48697 AE	83	KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p", int cpu, struct thread *td);
	84	KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p", int cpu, struct thread *td);
	85	KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s", struct thread td, char comm);
	86	KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>", struct thread *td);
1541028a	87
40aaf5fc	88	static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");
1be00ff1	89	MALLOC_DEFINE(M_FPUCTX, "fpuctx", "kernel FPU contexts");
40aaf5fc	90
0f7a3396 MD	91	#ifdef INVARIANTS
	92	static int panic_on_cscount = 0;
	93	#endif
4b2467b7	94	#ifdef DEBUG_LWKT_THREAD
e28c8ef4 SW	95	static int64_t switch_count = 0;
	96	static int64_t preempt_hit = 0;
	97	static int64_t preempt_miss = 0;
	98	static int64_t preempt_weird = 0;
4b2467b7	99	#endif
fb0f29c4	100	static int lwkt_use_spin_port;
4b2467b7	101	__read_mostly static struct objcache *thread_cache;
a46b4a23	102	int cpu_mwait_spin = 0;
05220613	103
e381e77c	104	static void lwkt_schedule_remote(void arg, int arg2, struct intrframe frame);
cc9b6223	105	static void lwkt_setcpu_remote(void *arg);
e381e77c	106
fb0f29c4 MD	107	/*
	108	* We can make all thread ports use the spin backend instead of the thread
	109	* backend. This should only be set to debug the spin backend.
	110	*/
	111	TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);
	112
0f7a3396	113	#ifdef INVARIANTS
0c52fa62 SG	114	SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0,
0c52fa62 SG	115	"Panic if attempting to switch lwkt's while mastering cpusync");
0f7a3396	116	#endif
4b2467b7	117	#ifdef DEBUG_LWKT_THREAD
0c52fa62 SG	118	SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0,
0c52fa62 SG	119	"Number of switched threads");
dae65060	120	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0,
0c52fa62	121	"Successful preemption events");
dae65060	122	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0,
0c52fa62 SG	123	"Failed preemption events");
	124	SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0,
	125	"Number of preempted threads.");
4b2467b7	126	#endif
b12defdc MD	127	extern int lwkt_sched_debug;
	128	int lwkt_sched_debug = 0;
	129	SYSCTL_INT(_lwkt, OID_AUTO, sched_debug, CTLFLAG_RW,
	130	&lwkt_sched_debug, 0, "Scheduler debug");
4b2467b7	131	__read_mostly static u_int lwkt_spin_loops = 10;
7fb451cb	132	SYSCTL_UINT(_lwkt, OID_AUTO, spin_loops, CTLFLAG_RW,
b12defdc	133	&lwkt_spin_loops, 0, "Scheduler spin loops until sorted decon");
4b2467b7	134	__read_mostly static int preempt_enable = 1;
2a418930 MD	135	SYSCTL_INT(_lwkt, OID_AUTO, preempt_enable, CTLFLAG_RW,
2a418930 MD	136	&preempt_enable, 0, "Enable preemption");
7b234d8c	137	static int lwkt_cache_threads = 0;
765b1ae0 MD	138	SYSCTL_INT(_lwkt, OID_AUTO, cache_threads, CTLFLAG_RD,
765b1ae0 MD	139	&lwkt_cache_threads, 0, "thread+kstack cache");
fbc024e4	140
4b5f931b MD	141	/*
	142	* These helper procedures handle the runq, they can only be called from
	143	* within a critical section.
75cdbe6c MD	144	*
	145	* WARNING! Prior to SMP being brought up it is possible to enqueue and
	146	* dequeue threads belonging to other cpus, so be sure to use td->td_gd
	147	* instead of 'mycpu' when referencing the globaldata structure. Once
	148	* SMP is live, enqueueing and dequeueing only occur on the current cpu.
4b5f931b	149	*/
f1d1c3fa MD	150	static __inline
	151	void
	152	_lwkt_dequeue(thread_t td)
	153	{
	154	if (td->td_flags & TDF_RUNQ) {
75cdbe6c	155	struct globaldata *gd = td->td_gd;
4b5f931b	156
f1d1c3fa	157	td->td_flags &= ~TDF_RUNQ;
f9235b6d	158	TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
de4d4cb0	159	--gd->gd_tdrunqcount;
f9235b6d	160	if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL)
2a418930	161	atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING);
f1d1c3fa MD	162	}
	163	}
	164
f9235b6d MD	165	/*
	166	* Priority enqueue.
	167	*
d992c377 MD	168	* There are a limited number of lwkt threads runnable since user
	169	* processes only schedule one at a time per cpu. However, there can
	170	* be many user processes in kernel mode exiting from a tsleep() which
e3e6be1f	171	* become runnable.
d992c377	172	*
e6b81333 MD	173	* We scan the queue in both directions to help deal with degenerate
	174	* situations when hundreds or thousands (or more) threads are runnable.
	175	*
d992c377 MD	176	* NOTE: lwkt_schedulerclock() will force a round-robin based on td_pri and
	177	* will ignore user priority. This is to ensure that user threads in
	178	* kernel mode get cpu at some point regardless of what the user
	179	* scheduler thinks.
f9235b6d	180	*/
f1d1c3fa MD	181	static __inline
	182	void
	183	_lwkt_enqueue(thread_t td)
	184	{
e6b81333 MD	185	thread_t xtd; /* forward scan */
e6b81333 MD	186	thread_t rtd; /* reverse scan */
f9235b6d	187
7f5d7ed7	188	if ((td->td_flags & (TDF_RUNQ\|TDF_MIGRATING\|TDF_BLOCKQ)) == 0) {
75cdbe6c	189	struct globaldata *gd = td->td_gd;
4b5f931b	190
f1d1c3fa	191	td->td_flags \|= TDF_RUNQ;
f9235b6d MD	192	xtd = TAILQ_FIRST(&gd->gd_tdrunq);
f9235b6d MD	193	if (xtd == NULL) {
85946b6c MD	194	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
85946b6c MD	195	atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
f9235b6d	196	} else {
e3e6be1f MD	197	/*
	198	* NOTE: td_upri - higher numbers more desireable, same sense
	199	* as td_pri (typically reversed from lwp_upri).
	200	*
	201	* In the equal priority case we want the best selection
	202	* at the beginning so the less desireable selections know
	203	* that they have to setrunqueue/go-to-another-cpu, even
	204	* though it means switching back to the 'best' selection.
	205	* This also avoids degenerate situations when many threads
	206	* are runnable or waking up at the same time.
	207	*
	208	* If upri matches exactly place at end/round-robin.
	209	*/
e6b81333 MD	210	rtd = TAILQ_LAST(&gd->gd_tdrunq, lwkt_queue);
e6b81333 MD	211
d992c377	212	while (xtd &&
e6b81333	213	(xtd->td_pri > td->td_pri \|\|
d992c377	214	(xtd->td_pri == td->td_pri &&
e3e6be1f	215	xtd->td_upri >= td->td_upri))) {
85946b6c	216	xtd = TAILQ_NEXT(xtd, td_threadq);
e6b81333 MD	217
	218	/*
	219	* Doing a reverse scan at the same time is an optimization
	220	* for the insert-closer-to-tail case that avoids having to
	221	* scan the entire list. This situation can occur when
	222	* thousands of threads are woken up at the same time.
	223	*/
	224	if (rtd->td_pri > td->td_pri \|\|
	225	(rtd->td_pri == td->td_pri &&
	226	rtd->td_upri >= td->td_upri)) {
	227	TAILQ_INSERT_AFTER(&gd->gd_tdrunq, rtd, td, td_threadq);
	228	goto skip;
	229	}
	230	rtd = TAILQ_PREV(rtd, lwkt_queue, td_threadq);
d992c377	231	}
85946b6c MD	232	if (xtd)
	233	TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
	234	else
	235	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
f9235b6d	236	}
e6b81333	237	skip:
de4d4cb0	238	++gd->gd_tdrunqcount;
b12defdc MD	239
b12defdc MD	240	/*
85946b6c	241	* Request a LWKT reschedule if we are now at the head of the queue.
b12defdc	242	*/
85946b6c MD	243	if (TAILQ_FIRST(&gd->gd_tdrunq) == td)
85946b6c MD	244	need_lwkt_resched();
f1d1c3fa MD	245	}
f1d1c3fa MD	246	}
8ad65e08	247
e28c8ef4	248	static boolean_t
40aaf5fc NT	249	_lwkt_thread_ctor(void obj, void privdata, int ocflags)
40aaf5fc NT	250	{
6d3dff5f	251	struct thread td = (struct thread )obj;
40aaf5fc	252
6d3dff5f MD	253	td->td_kstack = NULL;
	254	td->td_kstack_size = 0;
	255	td->td_flags = TDF_ALLOCATED_THREAD;
	256	td->td_mpflags = 0;
	257	return (1);
40aaf5fc NT	258	}
	259
	260	static void
	261	_lwkt_thread_dtor(void obj, void privdata)
	262	{
6d3dff5f MD	263	struct thread td = (struct thread )obj;
	264
	265	KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
	266	("_lwkt_thread_dtor: not allocated from objcache"));
	267	KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
	268	td->td_kstack_size > 0,
	269	("_lwkt_thread_dtor: corrupted stack"));
	270	kmem_free(kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
	271	td->td_kstack = NULL;
	272	td->td_flags = 0;
40aaf5fc NT	273	}
	274
	275	/*
	276	* Initialize the lwkt s/system.
765b1ae0	277	*
7b234d8c MD	278	* Nominally cache up to 32 thread + kstack structures. Cache more on
7b234d8c MD	279	* systems with a lot of cpu cores.
40aaf5fc	280	*/
ced589cb	281	static void
40aaf5fc NT	282	lwkt_init(void)
40aaf5fc NT	283	{
765b1ae0	284	TUNABLE_INT("lwkt.cache_threads", &lwkt_cache_threads);
7b234d8c MD	285	if (lwkt_cache_threads == 0) {
	286	lwkt_cache_threads = ncpus * 4;
	287	if (lwkt_cache_threads < 32)
	288	lwkt_cache_threads = 32;
	289	}
765b1ae0 MD	290	thread_cache = objcache_create_mbacked(
765b1ae0 MD	291	M_THREAD, sizeof(struct thread),
2fce2579	292	0, lwkt_cache_threads,
765b1ae0	293	_lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
40aaf5fc	294	}
ced589cb	295	SYSINIT(lwkt_init, SI_BOOT2_LWKT_INIT, SI_ORDER_FIRST, lwkt_init, NULL);
40aaf5fc	296
37af14fe MD	297	/*
	298	* Schedule a thread to run. As the current thread we can always safely
	299	* schedule ourselves, and a shortcut procedure is provided for that
	300	* function.
	301	*
	302	* (non-blocking, self contained on a per cpu basis)
	303	*/
	304	void
	305	lwkt_schedule_self(thread_t td)
	306	{
cfaeae2a	307	KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
37af14fe	308	crit_enter_quick(td);
f9235b6d MD	309	KASSERT(td != &td->td_gd->gd_idlethread,
f9235b6d MD	310	("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
4643740a MD	311	KKASSERT(td->td_lwp == NULL \|\|
4643740a MD	312	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
37af14fe	313	_lwkt_enqueue(td);
37af14fe MD	314	crit_exit_quick(td);
	315	}
	316
	317	/*
	318	* Deschedule a thread.
	319	*
	320	* (non-blocking, self contained on a per cpu basis)
	321	*/
	322	void
	323	lwkt_deschedule_self(thread_t td)
	324	{
	325	crit_enter_quick(td);
37af14fe MD	326	_lwkt_dequeue(td);
	327	crit_exit_quick(td);
	328	}
	329
8ad65e08 MD	330	/*
	331	* LWKTs operate on a per-cpu basis
	332	*
73e4f7b9	333	* WARNING! Called from early boot, 'mycpu' may not work yet.
8ad65e08 MD	334	*/
	335	void
	336	lwkt_gdinit(struct globaldata *gd)
	337	{
f9235b6d	338	TAILQ_INIT(&gd->gd_tdrunq);
73e4f7b9	339	TAILQ_INIT(&gd->gd_tdallq);
e8e93b5f	340	lockinit(&gd->gd_sysctllock, "sysctl", 0, LK_CANRECURSE);
8ad65e08 MD	341	}
8ad65e08 MD	342
7d0bac62 MD	343	/*
7d0bac62 MD	344	* Create a new thread. The thread must be associated with a process context
75cdbe6c MD	345	* or LWKT start address before it can be scheduled. If the target cpu is
75cdbe6c MD	346	* -1 the thread will be created on the current cpu.
0cfcada1 MD	347	*
	348	* If you intend to create a thread without a process context this function
	349	* does everything except load the startup and switcher function.
7d0bac62 MD	350	*/
7d0bac62 MD	351	thread_t
d3d32139	352	lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
7d0bac62	353	{
d2d8515b	354	static int cpu_rotator;
c070746a	355	globaldata_t gd = mycpu;
99df837e	356	void *stack;
7d0bac62	357
c070746a MD	358	/*
	359	* If static thread storage is not supplied allocate a thread. Reuse
	360	* a cached free thread if possible. gd_freetd is used to keep an exiting
	361	* thread intact through the exit.
	362	*/
ef0fdad1	363	if (td == NULL) {
cf709dd2 MD	364	crit_enter_gd(gd);
	365	if ((td = gd->gd_freetd) != NULL) {
	366	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK\|
	367	TDF_RUNQ)) == 0);
c070746a	368	gd->gd_freetd = NULL;
cf709dd2	369	} else {
c070746a	370	td = objcache_get(thread_cache, M_WAITOK);
cf709dd2 MD	371	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK\|
	372	TDF_RUNQ)) == 0);
	373	}
	374	crit_exit_gd(gd);
40aaf5fc	375	KASSERT((td->td_flags &
2af9d75d MD	376	(TDF_ALLOCATED_THREAD\|TDF_RUNNING\|TDF_PREEMPT_LOCK)) ==
2af9d75d MD	377	TDF_ALLOCATED_THREAD,
40aaf5fc NT	378	("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
40aaf5fc NT	379	flags \|= td->td_flags & (TDF_ALLOCATED_THREAD\|TDF_ALLOCATED_STACK);
ef0fdad1	380	}
c070746a MD	381
	382	/*
	383	* Try to reuse cached stack.
	384	*/
f470d0c8 MD	385	if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
f470d0c8 MD	386	if (flags & TDF_ALLOCATED_STACK) {
1eeaf6b2	387	kmem_free(kernel_map, (vm_offset_t)stack, td->td_kstack_size);
f470d0c8 MD	388	stack = NULL;
	389	}
	390	}
	391	if (stack == NULL) {
1eeaf6b2 AL	392	if (cpu < 0) {
	393	stack = (void *)kmem_alloc_stack(kernel_map, stksize, 0);
	394	} else {
	395	stack = (void *)kmem_alloc_stack(kernel_map, stksize,
070a58b3	396	KM_CPU(cpu));
1eeaf6b2	397	}
ef0fdad1	398	flags \|= TDF_ALLOCATED_STACK;
99df837e	399	}
d2d8515b MD	400	if (cpu < 0) {
	401	cpu = ++cpu_rotator;
	402	cpu_ccfence();
5dd85b08	403	cpu = (uint32_t)cpu % (uint32_t)ncpus;
d2d8515b MD	404	}
d2d8515b MD	405	lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
99df837e	406	return(td);
7d0bac62 MD	407	}
	408
	409	/*
	410	* Initialize a preexisting thread structure. This function is used by
	411	* lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
	412	*
f8c3996b MD	413	* All threads start out in a critical section at a priority of
f8c3996b MD	414	* TDPRI_KERN_DAEMON. Higher level code will modify the priority as
dae65060	415	* appropriate. This function may send an IPI message when the
75cdbe6c MD	416	* requested cpu is not the current cpu and consequently gd_tdallq may
	417	* not be initialized synchronously from the point of view of the originating
	418	* cpu.
	419	*
	420	* NOTE! we have to be careful in regards to creating threads for other cpus
	421	* if SMP has not yet been activated.
7d0bac62	422	*/
75cdbe6c MD	423	static void
	424	lwkt_init_thread_remote(void *arg)
	425	{
	426	thread_t td = arg;
	427
52eedfb5 MD	428	/*
	429	* Protected by critical section held by IPI dispatch
	430	*/
75cdbe6c MD	431	TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
	432	}
	433
fdce8919 MD	434	/*
	435	* lwkt core thread structural initialization.
	436	*
	437	* NOTE: All threads are initialized as mpsafe threads.
	438	*/
7d0bac62	439	void
f470d0c8 MD	440	lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
f470d0c8 MD	441	struct globaldata *gd)
7d0bac62	442	{
37af14fe MD	443	globaldata_t mygd = mycpu;
37af14fe MD	444
99df837e MD	445	bzero(td, sizeof(struct thread));
99df837e MD	446	td->td_kstack = stack;
f470d0c8	447	td->td_kstack_size = stksize;
d3d32139	448	td->td_flags = flags;
4643740a	449	td->td_mpflags = 0;
f256b6c0	450	td->td_type = TD_TYPE_GENERIC;
26a0694b	451	td->td_gd = gd;
f9235b6d MD	452	td->td_pri = TDPRI_KERN_DAEMON;
f9235b6d MD	453	td->td_critcount = 1;
54341a3b	454	td->td_toks_have = NULL;
3b998fa9	455	td->td_toks_stop = &td->td_toks_base;
c068fb59 SZ	456	if (lwkt_use_spin_port \|\| (flags & TDF_FORCE_SPINPORT)) {
	457	lwkt_initport_spin(&td->td_msgport, td,
	458	(flags & TDF_FIXEDCPU) ? TRUE : FALSE);
	459	} else {
fb0f29c4	460	lwkt_initport_thread(&td->td_msgport, td);
c068fb59	461	}
99df837e	462	pmap_init_thread(td);
6d3dff5f	463
5d21b981 MD	464	/*
	465	* Normally initializing a thread for a remote cpu requires sending an
	466	* IPI. However, the idlethread is setup before the other cpus are
	467	* activated so we have to treat it as a special case. XXX manipulation
	468	* of gd_tdallq requires the BGL.
	469	*/
	470	if (gd == mygd \|\| td == &gd->gd_idlethread) {
37af14fe	471	crit_enter_gd(mygd);
75cdbe6c	472	TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
37af14fe	473	crit_exit_gd(mygd);
75cdbe6c	474	} else {
2db3b277	475	lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
75cdbe6c	476	}
3573cf7b	477	dsched_enter_thread(td);
73e4f7b9 MD	478	}
	479
	480	void
	481	lwkt_set_comm(thread_t td, const char *ctl, ...)
	482	{
e2565a42	483	__va_list va;
73e4f7b9	484
e2565a42	485	__va_start(va, ctl);
379210cb	486	kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
e2565a42	487	__va_end(va);
5bf48697	488	KTR_LOG(ctxsw_newtd, td, td->td_comm);
7d0bac62 MD	489	}
7d0bac62 MD	490
eb2adbf5 MD	491	/*
	492	* Prevent the thread from getting destroyed. Note that unlike PHOLD/PRELE
	493	* this does not prevent the thread from migrating to another cpu so the
	494	* gd_tdallq state is not protected by this.
	495	*/
99df837e	496	void
73e4f7b9	497	lwkt_hold(thread_t td)
99df837e	498	{
74c9628e	499	atomic_add_int(&td->td_refs, 1);
73e4f7b9 MD	500	}
	501
	502	void
	503	lwkt_rele(thread_t td)
	504	{
	505	KKASSERT(td->td_refs > 0);
74c9628e	506	atomic_add_int(&td->td_refs, -1);
73e4f7b9 MD	507	}
73e4f7b9 MD	508
73e4f7b9 MD	509	void
	510	lwkt_free_thread(thread_t td)
	511	{
74c9628e	512	KKASSERT(td->td_refs == 0);
c17a6852	513	KKASSERT((td->td_flags & (TDF_RUNNING \| TDF_PREEMPT_LOCK \|
1be00ff1 MD	514	TDF_RUNQ \| TDF_TSLEEPQ \| TDF_KERNELFP)) == 0);
	515
	516	if (td->td_kfpuctx) {
	517	kfree(td->td_kfpuctx, M_FPUCTX);
	518	td->td_kfpuctx = NULL;
	519	}
	520
40aaf5fc NT	521	if (td->td_flags & TDF_ALLOCATED_THREAD) {
	522	objcache_put(thread_cache, td);
	523	} else if (td->td_flags & TDF_ALLOCATED_STACK) {
	524	/* client-allocated struct with internally allocated stack */
	525	KASSERT(td->td_kstack && td->td_kstack_size > 0,
	526	("lwkt_free_thread: corrupted stack"));
1eeaf6b2	527	kmem_free(kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
40aaf5fc NT	528	td->td_kstack = NULL;
40aaf5fc NT	529	td->td_kstack_size = 0;
99df837e	530	}
a86ce0cd	531
e7c0dbba	532	KTR_LOG(ctxsw_deadtd, td);
99df837e MD	533	}
	534
	535
8ad65e08	536	/*
dae65060	537	* Switch to the next runnable lwkt. If no LWKTs are runnable then
f1d1c3fa MD	538	* switch to the idlethread. Switching must occur within a critical
	539	* section to avoid races with the scheduling queue.
	540	*
	541	* We always have full control over our cpu's run queue. Other cpus
	542	* that wish to manipulate our queue must use the cpu_*msg() calls to
	543	* talk to our cpu, so a critical section is all that is needed and
	544	* the result is very, very fast thread switching.
	545	*
96728c05 MD	546	* The LWKT scheduler uses a fixed priority model and round-robins at
	547	* each priority level. User process scheduling is a totally
	548	* different beast and LWKT priorities should not be confused with
	549	* user process priorities.
f1d1c3fa	550	*
69d78e99 MD	551	* PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch()
	552	* is not called by the current thread in the preemption case, only when
	553	* the preempting thread blocks (in order to return to the original thread).
cfaeae2a MD	554	*
	555	* SPECIAL NOTE ON SWITCH ATOMICY: Certain operations such as thread
	556	* migration and tsleep deschedule the current lwkt thread and call
	557	* lwkt_switch(). In particular, the target cpu of the migration fully
	558	* expects the thread to become non-runnable and can deadlock against
	559	* cpusync operations if we run any IPIs prior to switching the thread out.
	560	*
	561	* WE MUST BE VERY CAREFUL NOT TO RUN SPLZ DIRECTLY OR INDIRECTLY IF
95858b91	562	* THE CURRENT THREAD HAS BEEN DESCHEDULED!
8ad65e08 MD	563	*/
	564	void
	565	lwkt_switch(void)
	566	{
37af14fe MD	567	globaldata_t gd = mycpu;
37af14fe MD	568	thread_t td = gd->gd_curthread;
8ad65e08	569	thread_t ntd;
c6a766f4	570	thread_t xtd;
5411d8f1	571	int upri;
3a06728e MD	572	#ifdef LOOPMASK
	573	uint64_t tsc_base = rdtsc();
	574	#endif
8ad65e08	575
da0b0e8b	576	KKASSERT(gd->gd_processing_ipiq == 0);
121f93bc	577	KKASSERT(td->td_flags & TDF_RUNNING);
da0b0e8b	578
46a3f46d	579	/*
27e88a6e MD	580	* Switching from within a 'fast' (non thread switched) interrupt or IPI
	581	* is illegal. However, we may have to do it anyway if we hit a fatal
	582	* kernel trap or we have paniced.
	583	*
	584	* If this case occurs save and restore the interrupt nesting level.
46a3f46d	585	*/
27e88a6e MD	586	if (gd->gd_intr_nesting_level) {
	587	int savegdnest;
	588	int savegdtrap;
	589
5fddbda2	590	if (gd->gd_trap_nesting_level == 0 && panic_cpu_gd != mycpu) {
4a28fe22	591	panic("lwkt_switch: Attempt to switch from a "
5a8df152	592	"fast interrupt, ipi, or hard code section, "
4a28fe22 MD	593	"td %p\n",
4a28fe22 MD	594	td);
27e88a6e MD	595	} else {
	596	savegdnest = gd->gd_intr_nesting_level;
	597	savegdtrap = gd->gd_trap_nesting_level;
	598	gd->gd_intr_nesting_level = 0;
	599	gd->gd_trap_nesting_level = 0;
a7422615 MD	600	if ((td->td_flags & TDF_PANICWARN) == 0) {
a7422615 MD	601	td->td_flags \|= TDF_PANICWARN;
4a28fe22 MD	602	kprintf("Warning: thread switch from interrupt, IPI, "
4a28fe22 MD	603	"or hard code section.\n"
a7422615	604	"thread %p (%s)\n", td, td->td_comm);
7ce2998e	605	print_backtrace(-1);
a7422615	606	}
27e88a6e MD	607	lwkt_switch();
	608	gd->gd_intr_nesting_level = savegdnest;
	609	gd->gd_trap_nesting_level = savegdtrap;
	610	return;
	611	}
96728c05	612	}
ef0fdad1	613
cb973d15	614	/*
85946b6c MD	615	* Release our current user process designation if we are blocking
	616	* or if a user reschedule was requested.
	617	*
	618	* NOTE: This function is NOT called if we are switching into or
	619	* returning from a preemption.
	620	*
	621	* NOTE: Releasing our current user process designation may cause
	622	* it to be assigned to another thread, which in turn will
	623	* cause us to block in the usched acquire code when we attempt
	624	* to return to userland.
	625	*
	626	* NOTE: On SMP systems this can be very nasty when heavy token
	627	* contention is present so we want to be careful not to
	628	* release the designation gratuitously.
cb973d15	629	*/
85946b6c MD	630	if (td->td_release &&
85946b6c MD	631	(user_resched_wanted() \|\| (td->td_flags & TDF_RUNQ) == 0)) {
cb973d15	632	td->td_release(td);
85946b6c	633	}
cb973d15	634
85946b6c	635	/*
5411d8f1 MD	636	* Release all tokens. Once we do this we must remain in the critical
	637	* section and cannot run IPIs or other interrupts until we switch away
	638	* because they may implode if they try to get a token using our thread
	639	* context.
85946b6c	640	*/
37af14fe	641	crit_enter_gd(gd);
3b998fa9	642	if (TD_TOKS_HELD(td))
9d265729 MD	643	lwkt_relalltokens(td);
	644
	645	/*
b02926de MD	646	* We had better not be holding any spin locks, but don't get into an
b02926de MD	647	* endless panic loop.
9d265729	648	*/
0846e4ce	649	KASSERT(gd->gd_spinlocks == 0 \|\| panicstr != NULL,
d666840a	650	("lwkt_switch: still holding %d exclusive spinlocks!",
0846e4ce	651	gd->gd_spinlocks));
9d265729	652
0f7a3396 MD	653	#ifdef INVARIANTS
0f7a3396 MD	654	if (td->td_cscount) {
6ea70f76	655	kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
0f7a3396 MD	656	td);
	657	if (panic_on_cscount)
	658	panic("switching while mastering cpusync");
	659	}
8a8d5d85	660	#endif
f9235b6d MD	661
	662	/*
	663	* If we had preempted another thread on this cpu, resume the preempted
	664	* thread. This occurs transparently, whether the preempted thread
	665	* was scheduled or not (it may have been preempted after descheduling
	666	* itself).
	667	*
	668	* We have to setup the MP lock for the original thread after backing
	669	* out the adjustment that was made to curthread when the original
	670	* was preempted.
	671	*/
99df837e	672	if ((ntd = td->td_preempted) != NULL) {
26a0694b MD	673	KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
26a0694b MD	674	ntd->td_flags \|= TDF_PREEMPT_DONE;
7fb451cb	675	ntd->td_contended = 0; /* reset contended */
8ec60c3f MD	676
8ec60c3f MD	677	/*
b9eb1c19 MD	678	* The interrupt may have woken a thread up, we need to properly
	679	* set the reschedule flag if the originally interrupted thread is
	680	* at a lower priority.
85946b6c	681	*
c6a766f4 MD	682	* NOTE: The interrupt may not have descheduled ntd.
	683	*
	684	* NOTE: We do not reschedule if there are no threads on the runq.
	685	* (ntd could be the idlethread).
8ec60c3f	686	*/
c6a766f4 MD	687	xtd = TAILQ_FIRST(&gd->gd_tdrunq);
c6a766f4 MD	688	if (xtd && xtd != ntd)
8ec60c3f	689	need_lwkt_resched();
f9235b6d MD	690	goto havethread_preempted;
	691	}
	692
b12defdc	693	/*
5411d8f1 MD	694	* Figure out switch target. If we cannot switch to our desired target
5411d8f1 MD	695	* look for a thread that we can switch to.
cfaeae2a	696	*
5411d8f1 MD	697	* NOTE! The limited spin loop and related parameters are extremely
	698	* important for system performance, particularly for pipes and
	699	* concurrent conflicting VM faults.
f9235b6d	700	*/
5411d8f1 MD	701	clear_lwkt_resched();
	702	ntd = TAILQ_FIRST(&gd->gd_tdrunq);
	703
	704	if (ntd) {
	705	do {
	706	if (TD_TOKS_NOT_HELD(ntd) \|\|
	707	lwkt_getalltokens(ntd, (ntd->td_contended > lwkt_spin_loops)))
	708	{
	709	goto havethread;
	710	}
7fb451cb	711	++ntd->td_contended; /* overflow ok */
b1793cc6	712	if (gd->gd_indefinite.type == 0)
6d0742ae	713	indefinite_init(&gd->gd_indefinite, NULL, NULL, 0, 't');
3a06728e MD	714	#ifdef LOOPMASK
3a06728e MD	715	if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
ab4aa0bb MD	716	kprintf("lwkt_switch: WARNING, excessive token contention "
	717	"cpu %d, %d sec, "
	718	"td %p (%s)\n",
	719	gd->gd_cpuid,
	720	ntd->td_contended,
	721	ntd,
	722	ntd->td_comm);
3a06728e MD	723	tsc_base = rdtsc();
	724	}
	725	#endif
5411d8f1 MD	726	} while (ntd->td_contended < (lwkt_spin_loops >> 1));
5411d8f1 MD	727	upri = ntd->td_upri;
f9235b6d	728
f9235b6d	729	/*
5411d8f1 MD	730	* Bleh, the thread we wanted to switch to has a contended token.
5411d8f1 MD	731	* See if we can switch to another thread.
2a418930	732	*
5411d8f1	733	* We generally don't want to do this because it represents a
5042f74d MD	734	* priority inversion, but contending tokens on the same cpu can
	735	* cause real problems if we don't now that we have an exclusive
	736	* priority mechanism over shared for tokens.
	737	*
	738	* The solution is to allow threads with pending tokens to compete
	739	* for them (a lower priority thread will get less cpu once it
	740	* returns from the kernel anyway). If a thread does not have
	741	* any contending tokens, we go by td_pri and upri.
f9235b6d	742	*/
b12defdc	743	while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
5042f74d MD	744	if (TD_TOKS_NOT_HELD(ntd) &&
	745	ntd->td_pri < TDPRI_KERN_LPSCHED && upri > ntd->td_upri) {
	746	continue;
	747	}
	748	if (upri < ntd->td_upri)
	749	upri = ntd->td_upri;
b12defdc	750
5411d8f1 MD	751	/*
	752	* Try this one.
	753	*/
	754	if (TD_TOKS_NOT_HELD(ntd) \|\|
	755	lwkt_getalltokens(ntd, (ntd->td_contended > lwkt_spin_loops))) {
	756	goto havethread;
	757	}
7fb451cb	758	++ntd->td_contended; /* overflow ok */
b12defdc	759	}
b12defdc MD	760
b12defdc MD	761	/*
5411d8f1 MD	762	* Fall through, switch to idle thread to get us out of the current
	763	* context. Since we were contended, prevent HLT by flagging a
	764	* LWKT reschedule.
b12defdc	765	*/
5411d8f1	766	need_lwkt_resched();
f1d1c3fa	767	}
8a8d5d85	768
5411d8f1 MD	769	/*
	770	* We either contended on ntd or the runq is empty. We must switch
	771	* through the idle thread to get out of the current context.
	772	*/
	773	ntd = &gd->gd_idlethread;
	774	if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
	775	ASSERT_NO_TOKENS_HELD(ntd);
	776	cpu_time.cp_msg[0] = 0;
5411d8f1 MD	777	goto haveidle;
5411d8f1 MD	778
2a418930	779	havethread:
b12defdc	780	/*
be71787b MD	781	* Clear gd_idle_repeat when doing a normal switch to a non-idle
be71787b MD	782	* thread.
f9235b6d	783	*/
9ac1ee6e	784	ntd->td_wmesg = NULL;
7fb451cb	785	ntd->td_contended = 0; /* reset once scheduled */
b12defdc	786	++gd->gd_cnt.v_swtch;
be71787b	787	gd->gd_idle_repeat = 0;
2a418930	788
5b49787b MD	789	/*
	790	* If we were busy waiting record final disposition
	791	*/
b1793cc6 MD	792	if (gd->gd_indefinite.type)
b1793cc6 MD	793	indefinite_done(&gd->gd_indefinite);
5b49787b	794
f9235b6d	795	havethread_preempted:
f9235b6d MD	796	/*
	797	* If the new target does not need the MP lock and we are holding it,
	798	* release the MP lock. If the new target requires the MP lock we have
	799	* already acquired it for the target.
8a8d5d85	800	*/
2a418930	801	;
f9235b6d MD	802	haveidle:
f9235b6d MD	803	KASSERT(ntd->td_critcount,
b5d16701 MD	804	("priority problem in lwkt_switch %d %d",
	805	td->td_critcount, ntd->td_critcount));
	806
94f6d86e	807	if (td != ntd) {
cc9b6223 MD	808	/*
	809	* Execute the actual thread switch operation. This function
	810	* returns to the current thread and returns the previous thread
	811	* (which may be different from the thread we switched to).
	812	*
	813	* We are responsible for marking ntd as TDF_RUNNING.
	814	*/
121f93bc	815	KKASSERT((ntd->td_flags & TDF_RUNNING) == 0);
4b2467b7	816	#ifdef DEBUG_LWKT_THREAD
94f6d86e	817	++switch_count;
4b2467b7	818	#endif
a1f0fb66	819	KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd);
cc9b6223 MD	820	ntd->td_flags \|= TDF_RUNNING;
	821	lwkt_switch_return(td->td_switch(ntd));
	822	/* ntd invalid, td_switch() can return a different thread_t */
94f6d86e	823	}
b12defdc	824
b12defdc	825	/*
54341a3b	826	* catch-all. XXX is this strictly needed?
b12defdc MD	827	*/
b12defdc MD	828	splz_check();
54341a3b	829
37af14fe MD	830	/* NOTE: current cpu may have changed after switch */
37af14fe MD	831	crit_exit_quick(td);
8ad65e08 MD	832	}
8ad65e08 MD	833
cc9b6223 MD	834	/*
	835	* Called by assembly in the td_switch (thread restore path) for thread
	836	* bootstrap cases which do not 'return' to lwkt_switch().
	837	*/
	838	void
	839	lwkt_switch_return(thread_t otd)
	840	{
cc9b6223	841	globaldata_t rgd;
3a06728e MD	842	#ifdef LOOPMASK
	843	uint64_t tsc_base = rdtsc();
	844	#endif
	845	int exiting;
	846
	847	exiting = otd->td_flags & TDF_EXITING;
	848	cpu_ccfence();
cc9b6223 MD	849
	850	/*
	851	* Check if otd was migrating. Now that we are on ntd we can finish
	852	* up the migration. This is a bit messy but it is the only place
	853	* where td is known to be fully descheduled.
	854	*
	855	* We can only activate the migration if otd was migrating but not
	856	* held on the cpu due to a preemption chain. We still have to
	857	* clear TDF_RUNNING on the old thread either way.
	858	*
	859	* We are responsible for clearing the previously running thread's
	860	* TDF_RUNNING.
	861	*/
	862	if ((rgd = otd->td_migrate_gd) != NULL &&
	863	(otd->td_flags & TDF_PREEMPT_LOCK) == 0) {
	864	KKASSERT((otd->td_flags & (TDF_MIGRATING \| TDF_RUNNING)) ==
	865	(TDF_MIGRATING \| TDF_RUNNING));
	866	otd->td_migrate_gd = NULL;
	867	otd->td_flags &= ~TDF_RUNNING;
	868	lwkt_send_ipiq(rgd, lwkt_setcpu_remote, otd);
	869	} else {
	870	otd->td_flags &= ~TDF_RUNNING;
	871	}
2b07d9aa MD	872
	873	/*
	874	* Final exit validations (see lwp_wait()). Note that otd becomes
	875	* invalid the instant we set TDF_MP_EXITSIG.
3a06728e MD	876	*
	877	* Use the EXITING status loaded from before we clear TDF_RUNNING,
	878	* because if it is not set otd becomes invalid the instant we clear
	879	* TDF_RUNNING on it (otherwise, if the system is fast enough, we
	880	* might 'steal' TDF_EXITING from another switch-return!).
2b07d9aa	881	*/
3a06728e	882	while (exiting) {
2b07d9aa MD	883	u_int mpflags;
	884
	885	mpflags = otd->td_mpflags;
	886	cpu_ccfence();
	887
	888	if (mpflags & TDF_MP_EXITWAIT) {
	889	if (atomic_cmpset_int(&otd->td_mpflags, mpflags,
	890	mpflags \| TDF_MP_EXITSIG)) {
	891	wakeup(otd);
	892	break;
	893	}
	894	} else {
	895	if (atomic_cmpset_int(&otd->td_mpflags, mpflags,
	896	mpflags \| TDF_MP_EXITSIG)) {
	897	wakeup(otd);
	898	break;
	899	}
	900	}
3a06728e MD	901
	902	#ifdef LOOPMASK
	903	if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
	904	kprintf("lwkt_switch_return: excessive TDF_EXITING "
	905	"thread %p\n", otd);
	906	tsc_base = rdtsc();
	907	}
	908	#endif
2b07d9aa	909	}
cc9b6223 MD	910	}
cc9b6223 MD	911
b68b7282	912	/*
96728c05	913	* Request that the target thread preempt the current thread. Preemption
203592a0 MD	914	* can only occur only:
	915	*
	916	* - If our critical section is the one that we were called with
	917	* - The relative priority of the target thread is higher
	918	* - The target is not excessively interrupt-nested via td_nest_count
	919	* - The target thread holds no tokens.
	920	* - The target thread is not already scheduled and belongs to the
	921	* current cpu.
	922	* - The current thread is not holding any spin-locks.
96728c05 MD	923	*
	924	* THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
	925	* this is called via lwkt_schedule() through the td_preemptable callback.
f9235b6d	926	* critcount is the managed critical priority that we should ignore in order
96728c05 MD	927	* to determine whether preemption is possible (aka usually just the crit
96728c05 MD	928	* priority of lwkt_schedule() itself).
b68b7282	929	*
54341a3b MD	930	* Preemption is typically limited to interrupt threads.
	931	*
	932	* Operation works in a fairly straight-forward manner. The normal
	933	* scheduling code is bypassed and we switch directly to the target
	934	* thread. When the target thread attempts to block or switch away
	935	* code at the base of lwkt_switch() will switch directly back to our
	936	* thread. Our thread is able to retain whatever tokens it holds and
	937	* if the target needs one of them the target will switch back to us
	938	* and reschedule itself normally.
b68b7282 MD	939	*/
b68b7282 MD	940	void
f9235b6d	941	lwkt_preempt(thread_t ntd, int critcount)
b68b7282	942	{
46a3f46d	943	struct globaldata *gd = mycpu;
cc9b6223	944	thread_t xtd;
0a3f9b47	945	thread_t td;
2d910aaf	946	int save_gd_intr_nesting_level;
b68b7282	947
26a0694b	948	/*
96728c05 MD	949	* The caller has put us in a critical section. We can only preempt
96728c05 MD	950	* if the caller of the caller was not in a critical section (basically
f9235b6d	951	* a local interrupt), as determined by the 'critcount' parameter. We
47737962	952	* also can't preempt if the caller is holding any spinlocks (even if
d666840a	953	* he isn't in a critical section). This also handles the tokens test.
96728c05 MD	954	*
	955	* YYY The target thread must be in a critical section (else it must
	956	* inherit our critical section? I dunno yet).
26a0694b	957	*/
f9235b6d	958	KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri));
26a0694b	959
b12defdc	960	td = gd->gd_curthread;
fbc024e4	961	if (preempt_enable == 0) {
4b2467b7	962	#ifdef DEBUG_LWKT_THREAD
fbc024e4	963	++preempt_miss;
4b2467b7	964	#endif
fbc024e4 MD	965	return;
fbc024e4 MD	966	}
f9235b6d	967	if (ntd->td_pri <= td->td_pri) {
4b2467b7	968	#ifdef DEBUG_LWKT_THREAD
57c254db	969	++preempt_miss;
4b2467b7	970	#endif
57c254db MD	971	return;
57c254db MD	972	}
f9235b6d	973	if (td->td_critcount > critcount) {
4b2467b7	974	#ifdef DEBUG_LWKT_THREAD
96728c05	975	++preempt_miss;
4b2467b7	976	#endif
96728c05 MD	977	return;
96728c05 MD	978	}
203592a0	979	if (td->td_nest_count >= 2) {
4b2467b7	980	#ifdef DEBUG_LWKT_THREAD
203592a0	981	++preempt_miss;
4b2467b7	982	#endif
203592a0 MD	983	return;
203592a0 MD	984	}
121f93bc	985	if (td->td_cscount) {
4b2467b7	986	#ifdef DEBUG_LWKT_THREAD
121f93bc	987	++preempt_miss;
4b2467b7	988	#endif
121f93bc MD	989	return;
121f93bc MD	990	}
46a3f46d	991	if (ntd->td_gd != gd) {
4b2467b7	992	#ifdef DEBUG_LWKT_THREAD
96728c05	993	++preempt_miss;
4b2467b7	994	#endif
96728c05 MD	995	return;
96728c05 MD	996	}
ee89e80b	997
41a01a4d	998	/*
77912481 MD	999	* We don't have to check spinlocks here as they will also bump
77912481 MD	1000	* td_critcount.
d3d1cbc8 MD	1001	*
	1002	* Do not try to preempt if the target thread is holding any tokens.
	1003	* We could try to acquire the tokens but this case is so rare there
	1004	* is no need to support it.
41a01a4d	1005	*/
0846e4ce	1006	KKASSERT(gd->gd_spinlocks == 0);
77912481	1007
3b998fa9	1008	if (TD_TOKS_HELD(ntd)) {
4b2467b7	1009	#ifdef DEBUG_LWKT_THREAD
d3d1cbc8	1010	++preempt_miss;
4b2467b7	1011	#endif
d3d1cbc8 MD	1012	return;
d3d1cbc8 MD	1013	}
26a0694b	1014	if (td == ntd \|\| ((td->td_flags \| ntd->td_flags) & TDF_PREEMPT_LOCK)) {
4b2467b7	1015	#ifdef DEBUG_LWKT_THREAD
26a0694b	1016	++preempt_weird;
4b2467b7	1017	#endif
26a0694b MD	1018	return;
	1019	}
	1020	if (ntd->td_preempted) {
4b2467b7	1021	#ifdef DEBUG_LWKT_THREAD
4b5f931b	1022	++preempt_hit;
4b2467b7	1023	#endif
26a0694b	1024	return;
b68b7282	1025	}
da0b0e8b	1026	KKASSERT(gd->gd_processing_ipiq == 0);
26a0694b	1027
8ec60c3f MD	1028	/*
	1029	* Since we are able to preempt the current thread, there is no need to
	1030	* call need_lwkt_resched().
2d910aaf MD	1031	*
	1032	* We must temporarily clear gd_intr_nesting_level around the switch
	1033	* since switchouts from the target thread are allowed (they will just
	1034	* return to our thread), and since the target thread has its own stack.
cc9b6223 MD	1035	*
	1036	* A preemption must switch back to the original thread, assert the
	1037	* case.
8ec60c3f	1038	*/
4b2467b7	1039	#ifdef DEBUG_LWKT_THREAD
26a0694b	1040	++preempt_hit;
4b2467b7	1041	#endif
26a0694b MD	1042	ntd->td_preempted = td;
26a0694b MD	1043	td->td_flags \|= TDF_PREEMPT_LOCK;
a1f0fb66	1044	KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd);
2d910aaf MD	1045	save_gd_intr_nesting_level = gd->gd_intr_nesting_level;
2d910aaf MD	1046	gd->gd_intr_nesting_level = 0;
121f93bc MD	1047
121f93bc MD	1048	KKASSERT((ntd->td_flags & TDF_RUNNING) == 0);
cc9b6223 MD	1049	ntd->td_flags \|= TDF_RUNNING;
	1050	xtd = td->td_switch(ntd);
	1051	KKASSERT(xtd == ntd);
	1052	lwkt_switch_return(xtd);
2d910aaf	1053	gd->gd_intr_nesting_level = save_gd_intr_nesting_level;
b9eb1c19	1054
26a0694b MD	1055	KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
	1056	ntd->td_preempted = NULL;
	1057	td->td_flags &= ~(TDF_PREEMPT_LOCK\|TDF_PREEMPT_DONE);
b68b7282 MD	1058	}
b68b7282 MD	1059
f1d1c3fa	1060	/*
faaeffac	1061	* Conditionally call splz() if gd_reqflags indicates work is pending.
4a28fe22 MD	1062	* This will work inside a critical section but not inside a hard code
4a28fe22 MD	1063	* section.
ef0fdad1	1064	*
f1d1c3fa MD	1065	* (self contained on a per cpu basis)
	1066	*/
	1067	void
faaeffac	1068	splz_check(void)
f1d1c3fa	1069	{
7966cb69 MD	1070	globaldata_t gd = mycpu;
7966cb69 MD	1071	thread_t td = gd->gd_curthread;
ef0fdad1	1072
4a28fe22 MD	1073	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) &&
	1074	gd->gd_intr_nesting_level == 0 &&
	1075	td->td_nest_count < 2)
	1076	{
f1d1c3fa	1077	splz();
4a28fe22 MD	1078	}
	1079	}
	1080
	1081	/*
	1082	* This version is integrated into crit_exit, reqflags has already
	1083	* been tested but td_critcount has not.
	1084	*
	1085	* We only want to execute the splz() on the 1->0 transition of
	1086	* critcount and not in a hard code section or if too deeply nested.
925040f2	1087	*
0846e4ce	1088	* NOTE: gd->gd_spinlocks is implied to be 0 when td_critcount is 0.
4a28fe22 MD	1089	*/
	1090	void
	1091	lwkt_maybe_splz(thread_t td)
	1092	{
	1093	globaldata_t gd = td->td_gd;
	1094
	1095	if (td->td_critcount == 0 &&
	1096	gd->gd_intr_nesting_level == 0 &&
	1097	td->td_nest_count < 2)
	1098	{
	1099	splz();
	1100	}
f1d1c3fa MD	1101	}
f1d1c3fa MD	1102
e6546af9 MD	1103	/*
	1104	* Drivers which set up processing co-threads can call this function to
	1105	* run the co-thread at a higher priority and to allow it to preempt
	1106	* normal threads.
	1107	*/
	1108	void
	1109	lwkt_set_interrupt_support_thread(void)
	1110	{
	1111	thread_t td = curthread;
	1112
	1113	lwkt_setpri_self(TDPRI_INT_SUPPORT);
	1114	td->td_flags \|= TDF_INTTHREAD;
	1115	td->td_preemptable = lwkt_preempt;
	1116	}
	1117
	1118
8ad65e08	1119	/*
f9235b6d MD	1120	* This function is used to negotiate a passive release of the current
	1121	* process/lwp designation with the user scheduler, allowing the user
	1122	* scheduler to schedule another user thread. The related kernel thread
	1123	* (curthread) continues running in the released state.
8ad65e08 MD	1124	*/
8ad65e08 MD	1125	void
f9235b6d	1126	lwkt_passive_release(struct thread *td)
8ad65e08	1127	{
f9235b6d MD	1128	struct lwp *lp = td->td_lwp;
	1129
	1130	td->td_release = NULL;
	1131	lwkt_setpri_self(TDPRI_KERN_USER);
d992c377	1132
f9235b6d	1133	lp->lwp_proc->p_usched->release_curproc(lp);
f1d1c3fa MD	1134	}
f1d1c3fa MD	1135
f9235b6d	1136
3824f392	1137	/*
d2d8515b MD	1138	* This implements a LWKT yield, allowing a kernel thread to yield to other
	1139	* kernel threads at the same or higher priority. This function can be
	1140	* called in a tight loop and will typically only yield once per tick.
f9235b6d	1141	*
d2d8515b MD	1142	* Most kernel threads run at the same priority in order to allow equal
d2d8515b MD	1143	* sharing.
f9235b6d MD	1144	*
f9235b6d MD	1145	* (self contained on a per cpu basis)
3824f392 MD	1146	*/
3824f392 MD	1147	void
f9235b6d	1148	lwkt_yield(void)
3824f392	1149	{
f9235b6d MD	1150	globaldata_t gd = mycpu;
f9235b6d MD	1151	thread_t td = gd->gd_curthread;
3824f392	1152
fbe96076 MD	1153	/*
	1154	* Should never be called with spinlocks held but there is a path
	1155	* via ACPI where it might happen.
	1156	*/
	1157	if (gd->gd_spinlocks)
	1158	return;
	1159
	1160	/*
	1161	* Safe to call splz if we are not too-heavily nested.
	1162	*/
f9235b6d MD	1163	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
f9235b6d MD	1164	splz();
fbe96076 MD	1165
	1166	/*
	1167	* Caller allows switching
	1168	*/
85946b6c	1169	if (lwkt_resched_wanted()) {
dd8be70a MD	1170	atomic_set_int(&td->td_mpflags, TDF_MP_DIDYIELD);
dd8be70a MD	1171	lwkt_schedule_self(td);
f9235b6d	1172	lwkt_switch();
f9235b6d	1173	}
3824f392 MD	1174	}
3824f392 MD	1175
40504122 MD	1176	/*
	1177	* The quick version processes pending interrupts and higher-priority
	1178	* LWKT threads but will not round-robin same-priority LWKT threads.
de4d4cb0 MD	1179	*
	1180	* When called while attempting to return to userland the only same-pri
	1181	* threads are the ones which have already tried to become the current
	1182	* user process.
40504122 MD	1183	*/
	1184	void
	1185	lwkt_yield_quick(void)
	1186	{
	1187	globaldata_t gd = mycpu;
	1188	thread_t td = gd->gd_curthread;
	1189
	1190	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1191	splz();
	1192	if (lwkt_resched_wanted()) {
9c99cb33	1193	crit_enter();
40504122 MD	1194	if (TAILQ_FIRST(&gd->gd_tdrunq) == td) {
	1195	clear_lwkt_resched();
	1196	} else {
dd8be70a	1197	atomic_set_int(&td->td_mpflags, TDF_MP_DIDYIELD);
40504122 MD	1198	lwkt_schedule_self(curthread);
	1199	lwkt_switch();
	1200	}
9c99cb33	1201	crit_exit();
40504122 MD	1202	}
	1203	}
	1204
3824f392	1205	/*
f9235b6d MD	1206	* This yield is designed for kernel threads with a user context.
	1207	*
	1208	* The kernel acting on behalf of the user is potentially cpu-bound,
	1209	* this function will efficiently allow other threads to run and also
	1210	* switch to other processes by releasing.
3824f392 MD	1211	*
	1212	* The lwkt_user_yield() function is designed to have very low overhead
	1213	* if no yield is determined to be needed.
	1214	*/
	1215	void
	1216	lwkt_user_yield(void)
	1217	{
f9235b6d MD	1218	globaldata_t gd = mycpu;
	1219	thread_t td = gd->gd_curthread;
	1220
fbe96076 MD	1221	/*
	1222	* Should never be called with spinlocks held but there is a path
	1223	* via ACPI where it might happen.
	1224	*/
	1225	if (gd->gd_spinlocks)
	1226	return;
	1227
f9235b6d MD	1228	/*
	1229	* Always run any pending interrupts in case we are in a critical
	1230	* section.
	1231	*/
	1232	if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
	1233	splz();
3824f392	1234
3824f392	1235	/*
f9235b6d MD	1236	* Switch (which forces a release) if another kernel thread needs
	1237	* the cpu, if userland wants us to resched, or if our kernel
	1238	* quantum has run out.
3824f392	1239	*/
f9235b6d	1240	if (lwkt_resched_wanted() \|\|
85946b6c	1241	user_resched_wanted())
f9235b6d	1242	{
3824f392	1243	lwkt_switch();
3824f392 MD	1244	}
3824f392 MD	1245
f9235b6d	1246	#if 0
3824f392	1247	/*
f9235b6d MD	1248	* Reacquire the current process if we are released.
	1249	*
	1250	* XXX not implemented atm. The kernel may be holding locks and such,
	1251	* so we want the thread to continue to receive cpu.
3824f392	1252	*/
f9235b6d MD	1253	if (td->td_release == NULL && lp) {
	1254	lp->lwp_proc->p_usched->acquire_curproc(lp);
	1255	td->td_release = lwkt_passive_release;
	1256	lwkt_setpri_self(TDPRI_USER_NORM);
3824f392	1257	}
f9235b6d	1258	#endif
b9eb1c19 MD	1259	}
b9eb1c19 MD	1260
8ad65e08	1261	/*
f1d1c3fa MD	1262	* Generic schedule. Possibly schedule threads belonging to other cpus and
	1263	* deal with threads that might be blocked on a wait queue.
	1264	*
0a3f9b47 MD	1265	* We have a little helper inline function which does additional work after
	1266	* the thread has been enqueued, including dealing with preemption and
	1267	* setting need_lwkt_resched() (which prevents the kernel from returning
	1268	* to userland until it has processed higher priority threads).
6330a558 MD	1269	*
	1270	* It is possible for this routine to be called after a failed _enqueue
	1271	* (due to the target thread migrating, sleeping, or otherwise blocked).
	1272	* We have to check that the thread is actually on the run queue!
8ad65e08	1273	*/
0a3f9b47 MD	1274	static __inline
0a3f9b47 MD	1275	void
85946b6c	1276	_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount)
0a3f9b47	1277	{
6330a558	1278	if (ntd->td_flags & TDF_RUNQ) {
85946b6c	1279	if (ntd->td_preemptable) {
f9235b6d	1280	ntd->td_preemptable(ntd, ccount); /* YYY +token */
6330a558	1281	}
0a3f9b47 MD	1282	}
	1283	}
	1284
361d01dd	1285	static __inline
8ad65e08	1286	void
85946b6c	1287	_lwkt_schedule(thread_t td)
8ad65e08	1288	{
37af14fe MD	1289	globaldata_t mygd = mycpu;
37af14fe MD	1290
cf709dd2 MD	1291	KASSERT(td != &td->td_gd->gd_idlethread,
cf709dd2 MD	1292	("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
cfaeae2a	1293	KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
37af14fe	1294	crit_enter_gd(mygd);
4643740a MD	1295	KKASSERT(td->td_lwp == NULL \|\|
	1296	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	1297
37af14fe	1298	if (td == mygd->gd_curthread) {
f1d1c3fa MD	1299	_lwkt_enqueue(td);
f1d1c3fa MD	1300	} else {
f1d1c3fa	1301	/*
7cd8d145 MD	1302	* If we own the thread, there is no race (since we are in a
	1303	* critical section). If we do not own the thread there might
	1304	* be a race but the target cpu will deal with it.
f1d1c3fa	1305	*/
7cd8d145	1306	if (td->td_gd == mygd) {
9d265729	1307	_lwkt_enqueue(td);
85946b6c	1308	_lwkt_schedule_post(mygd, td, 1);
f1d1c3fa	1309	} else {
e381e77c	1310	lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
7cd8d145	1311	}
8ad65e08	1312	}
37af14fe	1313	crit_exit_gd(mygd);
8ad65e08 MD	1314	}
8ad65e08 MD	1315
361d01dd MD	1316	void
	1317	lwkt_schedule(thread_t td)
	1318	{
85946b6c	1319	_lwkt_schedule(td);
361d01dd MD	1320	}
	1321
	1322	void
85946b6c	1323	lwkt_schedule_noresched(thread_t td) /* XXX not impl */
361d01dd	1324	{
85946b6c	1325	_lwkt_schedule(td);
361d01dd MD	1326	}
361d01dd MD	1327
e381e77c MD	1328	/*
	1329	* When scheduled remotely if frame != NULL the IPIQ is being
	1330	* run via doreti or an interrupt then preemption can be allowed.
	1331	*
	1332	* To allow preemption we have to drop the critical section so only
	1333	* one is present in _lwkt_schedule_post.
	1334	*/
	1335	static void
	1336	lwkt_schedule_remote(void arg, int arg2, struct intrframe frame)
	1337	{
	1338	thread_t td = curthread;
	1339	thread_t ntd = arg;
	1340
	1341	if (frame && ntd->td_preemptable) {
	1342	crit_exit_noyield(td);
85946b6c	1343	_lwkt_schedule(ntd);
e381e77c MD	1344	crit_enter_quick(td);
e381e77c MD	1345	} else {
85946b6c	1346	_lwkt_schedule(ntd);
e381e77c MD	1347	}
	1348	}
	1349
d9eea1a5	1350	/*
52eedfb5 MD	1351	* Thread migration using a 'Pull' method. The thread may or may not be
	1352	* the current thread. It MUST be descheduled and in a stable state.
	1353	* lwkt_giveaway() must be called on the cpu owning the thread.
	1354	*
	1355	* At any point after lwkt_giveaway() is called, the target cpu may
	1356	* 'pull' the thread by calling lwkt_acquire().
	1357	*
ae8e83e6 MD	1358	* We have to make sure the thread is not sitting on a per-cpu tsleep
	1359	* queue or it will blow up when it moves to another cpu.
	1360	*
52eedfb5	1361	* MPSAFE - must be called under very specific conditions.
d9eea1a5	1362	*/
52eedfb5 MD	1363	void
	1364	lwkt_giveaway(thread_t td)
	1365	{
3b4192fb	1366	globaldata_t gd = mycpu;
52eedfb5	1367
3b4192fb MD	1368	crit_enter_gd(gd);
	1369	if (td->td_flags & TDF_TSLEEPQ)
	1370	tsleep_remove(td);
	1371	KKASSERT(td->td_gd == gd);
	1372	TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
	1373	td->td_flags \|= TDF_MIGRATING;
	1374	crit_exit_gd(gd);
52eedfb5 MD	1375	}
52eedfb5 MD	1376
a2a5ad0d MD	1377	void
	1378	lwkt_acquire(thread_t td)
	1379	{
37af14fe MD	1380	globaldata_t gd;
37af14fe MD	1381	globaldata_t mygd;
a2a5ad0d	1382
52eedfb5	1383	KKASSERT(td->td_flags & TDF_MIGRATING);
a2a5ad0d	1384	gd = td->td_gd;
37af14fe	1385	mygd = mycpu;
52eedfb5	1386	if (gd != mycpu) {
3a06728e MD	1387	#ifdef LOOPMASK
	1388	uint64_t tsc_base = rdtsc();
	1389	#endif
35238fa5	1390	cpu_lfence();
52eedfb5	1391	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
37af14fe	1392	crit_enter_gd(mygd);
cfaeae2a	1393	DEBUG_PUSH_INFO("lwkt_acquire");
df910c23	1394	while (td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) {
df910c23	1395	lwkt_process_ipiq();
52eedfb5	1396	cpu_lfence();
a86ce0cd	1397	#ifdef _KERNEL_VIRTUAL
eb8c4738	1398	vkernel_yield();
3a06728e MD	1399	#endif
	1400	#ifdef LOOPMASK
	1401	if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
	1402	kprintf("lwkt_acquire: stuck td %p td->td_flags %08x\n",
	1403	td, td->td_flags);
	1404	tsc_base = rdtsc();
	1405	}
a86ce0cd	1406	#endif
df910c23	1407	}
cfaeae2a	1408	DEBUG_POP_INFO();
562273ea	1409	cpu_mfence();
37af14fe	1410	td->td_gd = mygd;
52eedfb5 MD	1411	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1412	td->td_flags &= ~TDF_MIGRATING;
	1413	crit_exit_gd(mygd);
	1414	} else {
	1415	crit_enter_gd(mygd);
	1416	TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
	1417	td->td_flags &= ~TDF_MIGRATING;
37af14fe	1418	crit_exit_gd(mygd);
a2a5ad0d MD	1419	}
	1420	}
	1421
f1d1c3fa MD	1422	/*
f1d1c3fa MD	1423	* Generic deschedule. Descheduling threads other then your own should be
dae65060	1424	* done only in carefully controlled circumstances. Descheduling is
dae65060	1425	* asynchronous.
f1d1c3fa MD	1426	*
f1d1c3fa MD	1427	* This function may block if the cpu has run out of messages.
8ad65e08 MD	1428	*/
	1429	void
	1430	lwkt_deschedule(thread_t td)
	1431	{
f1d1c3fa MD	1432	crit_enter();
	1433	if (td == curthread) {
	1434	_lwkt_dequeue(td);
	1435	} else {
a72187e9	1436	if (td->td_gd == mycpu) {
f1d1c3fa MD	1437	_lwkt_dequeue(td);
f1d1c3fa MD	1438	} else {
b8a98473	1439	lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
f1d1c3fa MD	1440	}
	1441	}
	1442	crit_exit();
	1443	}
	1444
4b5f931b MD	1445	/*
	1446	* Set the target thread's priority. This routine does not automatically
	1447	* switch to a higher priority thread, LWKT threads are not designed for
	1448	* continuous priority changes. Yield if you want to switch.
4b5f931b MD	1449	*/
	1450	void
	1451	lwkt_setpri(thread_t td, int pri)
	1452	{
f9235b6d MD	1453	if (td->td_pri != pri) {
	1454	KKASSERT(pri >= 0);
	1455	crit_enter();
	1456	if (td->td_flags & TDF_RUNQ) {
d2d8515b	1457	KKASSERT(td->td_gd == mycpu);
f9235b6d MD	1458	_lwkt_dequeue(td);
	1459	td->td_pri = pri;
	1460	_lwkt_enqueue(td);
	1461	} else {
	1462	td->td_pri = pri;
	1463	}
	1464	crit_exit();
26a0694b	1465	}
26a0694b MD	1466	}
26a0694b MD	1467
03bd0a5e MD	1468	/*
	1469	* Set the initial priority for a thread prior to it being scheduled for
	1470	* the first time. The thread MUST NOT be scheduled before or during
	1471	* this call. The thread may be assigned to a cpu other then the current
	1472	* cpu.
	1473	*
	1474	* Typically used after a thread has been created with TDF_STOPPREQ,
	1475	* and before the thread is initially scheduled.
	1476	*/
	1477	void
	1478	lwkt_setpri_initial(thread_t td, int pri)
	1479	{
	1480	KKASSERT(pri >= 0);
	1481	KKASSERT((td->td_flags & TDF_RUNQ) == 0);
f9235b6d	1482	td->td_pri = pri;
03bd0a5e MD	1483	}
03bd0a5e MD	1484
26a0694b MD	1485	void
	1486	lwkt_setpri_self(int pri)
	1487	{
	1488	thread_t td = curthread;
	1489
4b5f931b MD	1490	KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
	1491	crit_enter();
	1492	if (td->td_flags & TDF_RUNQ) {
	1493	_lwkt_dequeue(td);
f9235b6d	1494	td->td_pri = pri;
4b5f931b MD	1495	_lwkt_enqueue(td);
4b5f931b MD	1496	} else {
f9235b6d	1497	td->td_pri = pri;
4b5f931b MD	1498	}
	1499	crit_exit();
	1500	}
	1501
f9235b6d	1502	/*
85946b6c	1503	* hz tick scheduler clock for LWKT threads
f9235b6d MD	1504	*/
f9235b6d MD	1505	void
85946b6c	1506	lwkt_schedulerclock(thread_t td)
f9235b6d	1507	{
85946b6c MD	1508	globaldata_t gd = td->td_gd;
85946b6c MD	1509	thread_t xtd;
2a418930	1510
c6a766f4 MD	1511	xtd = TAILQ_FIRST(&gd->gd_tdrunq);
c6a766f4 MD	1512	if (xtd == td) {
85946b6c MD	1513	/*
	1514	* If the current thread is at the head of the runq shift it to the
	1515	* end of any equal-priority threads and request a LWKT reschedule
	1516	* if it moved.
d992c377 MD	1517	*
	1518	* Ignore upri in this situation. There will only be one user thread
	1519	* in user mode, all others will be user threads running in kernel
	1520	* mode and we have to make sure they get some cpu.
85946b6c MD	1521	*/
	1522	xtd = TAILQ_NEXT(td, td_threadq);
	1523	if (xtd && xtd->td_pri == td->td_pri) {
	1524	TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
	1525	while (xtd && xtd->td_pri == td->td_pri)
	1526	xtd = TAILQ_NEXT(xtd, td_threadq);
	1527	if (xtd)
	1528	TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
	1529	else
	1530	TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
	1531	need_lwkt_resched();
f9235b6d	1532	}
c6a766f4	1533	} else if (xtd) {
85946b6c MD	1534	/*
	1535	* If we scheduled a thread other than the one at the head of the
	1536	* queue always request a reschedule every tick.
	1537	*/
	1538	need_lwkt_resched();
f9235b6d	1539	}
c6a766f4	1540	/* else curthread probably the idle thread, no need to reschedule */
f9235b6d MD	1541	}
f9235b6d MD	1542
5d21b981	1543	/*
dae65060	1544	* Migrate the current thread to the specified cpu.
52eedfb5	1545	*
cc9b6223 MD	1546	* This is accomplished by descheduling ourselves from the current cpu
	1547	* and setting td_migrate_gd. The lwkt_switch() code will detect that the
	1548	* 'old' thread wants to migrate after it has been completely switched out
	1549	* and will complete the migration.
	1550	*
	1551	* TDF_MIGRATING prevents scheduling races while the thread is being migrated.
	1552	*
	1553	* We must be sure to release our current process designation (if a user
	1554	* process) before clearing out any tsleepq we are on because the release
	1555	* code may re-add us.
ae8e83e6 MD	1556	*
	1557	* We must be sure to remove ourselves from the current cpu's tsleepq
	1558	* before potentially moving to another queue. The thread can be on
	1559	* a tsleepq due to a left-over tsleep_interlock().
5d21b981	1560	*/
5d21b981 MD	1561
	1562	void
	1563	lwkt_setcpu_self(globaldata_t rgd)
	1564	{
5d21b981 MD	1565	thread_t td = curthread;
	1566
	1567	if (td->td_gd != rgd) {
	1568	crit_enter_quick(td);
cc9b6223	1569
95858b91 MD	1570	if (td->td_release)
95858b91 MD	1571	td->td_release(td);
ae8e83e6	1572	if (td->td_flags & TDF_TSLEEPQ)
3b4192fb	1573	tsleep_remove(td);
cc9b6223 MD	1574
	1575	/*
	1576	* Set TDF_MIGRATING to prevent a spurious reschedule while we are
	1577	* trying to deschedule ourselves and switch away, then deschedule
	1578	* ourself, remove us from tdallq, and set td_migrate_gd. Finally,
	1579	* call lwkt_switch() to complete the operation.
	1580	*/
5d21b981 MD	1581	td->td_flags \|= TDF_MIGRATING;
5d21b981 MD	1582	lwkt_deschedule_self(td);
52eedfb5	1583	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
cc9b6223	1584	td->td_migrate_gd = rgd;
5d21b981	1585	lwkt_switch();
cc9b6223 MD	1586
	1587	/*
	1588	* We are now on the target cpu
	1589	*/
	1590	KKASSERT(rgd == mycpu);
52eedfb5	1591	TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
5d21b981 MD	1592	crit_exit_quick(td);
5d21b981 MD	1593	}
5d21b981 MD	1594	}
5d21b981 MD	1595
/*
 * Migrate the current thread to the cpu identified by cpuid.  The cpu id
 * is translated with globaldata_find() and handed to lwkt_setcpu_self().
 */
void
lwkt_migratecpu(int cpuid)
{
    lwkt_setcpu_self(globaldata_find(cpuid));
}
ecdefdda MD	1604
5d21b981 MD	1605	/*
5d21b981 MD	1606	* Remote IPI for cpu migration (called while in a critical section so we
cc9b6223 MD	1607	* do not have to enter another one).
	1608	*
	1609	* The thread (td) has already been completely descheduled from the
	1610	* originating cpu and we can simply assert the case. The thread is
	1611	* assigned to the new cpu and enqueued.
5d21b981	1612	*
cc9b6223	1613	* The thread will re-add itself to tdallq when it resumes execution.
5d21b981 MD	1614	*/
	1615	static void
	1616	lwkt_setcpu_remote(void *arg)
	1617	{
	1618	thread_t td = arg;
	1619	globaldata_t gd = mycpu;
	1620
cc9b6223	1621	KKASSERT((td->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) == 0);
5d21b981	1622	td->td_gd = gd;
562273ea	1623	cpu_mfence();
5d21b981	1624	td->td_flags &= ~TDF_MIGRATING;
cc9b6223	1625	KKASSERT(td->td_migrate_gd == NULL);
4643740a MD	1626	KKASSERT(td->td_lwp == NULL \|\|
4643740a MD	1627	(td->td_lwp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
5d21b981 MD	1628	_lwkt_enqueue(td);
	1629	}
	1630
553ea3c8	1631	struct lwp *
4b5f931b MD	1632	lwkt_preempted_proc(void)
4b5f931b MD	1633	{
73e4f7b9	1634	thread_t td = curthread;
4b5f931b MD	1635	while (td->td_preempted)
4b5f931b MD	1636	td = td->td_preempted;
553ea3c8	1637	return(td->td_lwp);
4b5f931b MD	1638	}
4b5f931b MD	1639
99df837e MD	1640	/*
	1641	* Create a kernel process/thread/whatever. It shares it's address space
	1642	* with proc0 - ie: kernel only.
	1643	*
d2d8515b MD	1644	* If the cpu is not specified one will be selected. In the future
	1645	* specifying a cpu of -1 will enable kernel thread migration between
	1646	* cpus.
99df837e MD	1647	*/
99df837e MD	1648	int
c9e9fb21 MD	1649	lwkt_create(void (func)(void ), void arg, struct thread *tdp,
c9e9fb21 MD	1650	thread_t template, int tdflags, int cpu, const char *fmt, ...)
99df837e	1651	{
73e4f7b9	1652	thread_t td;
e2565a42	1653	__va_list ap;
99df837e	1654
d3d32139	1655	td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
dbcd0c9b	1656	tdflags);
a2a5ad0d MD	1657	if (tdp)
a2a5ad0d MD	1658	*tdp = td;
709799ea	1659	cpu_set_thread_handler(td, lwkt_exit, func, arg);
99df837e MD	1660
	1661	/*
	1662	* Set up arg0 for 'ps' etc
	1663	*/
e2565a42	1664	__va_start(ap, fmt);
379210cb	1665	kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
e2565a42	1666	__va_end(ap);
99df837e MD	1667
	1668	/*
	1669	* Schedule the thread to run
	1670	*/
4643740a MD	1671	if (td->td_flags & TDF_NOSTART)
4643740a MD	1672	td->td_flags &= ~TDF_NOSTART;
ef0fdad1	1673	else
4643740a	1674	lwkt_schedule(td);
99df837e MD	1675	return 0;
	1676	}
	1677
	1678	/*
	1679	* Destroy an LWKT thread. Warning! This function is not called when
	1680	* a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
	1681	* uses a different reaping mechanism.
	1682	*/
	1683	void
	1684	lwkt_exit(void)
	1685	{
	1686	thread_t td = curthread;
c070746a	1687	thread_t std;
8826f33a	1688	globaldata_t gd;
99df837e	1689
2883d2d8 MD	1690	/*
	1691	* Do any cleanup that might block here
	1692	*/
2883d2d8 MD	1693	biosched_done(td);
2883d2d8 MD	1694	dsched_exit_thread(td);
c070746a MD	1695
	1696	/*
	1697	* Get us into a critical section to interlock gd_freetd and loop
	1698	* until we can get it freed.
	1699	*
	1700	* We have to cache the current td in gd_freetd because objcache_put()ing
	1701	* it would rip it out from under us while our thread is still active.
2af9d75d MD	1702	*
	1703	* We are the current thread so of course our own TDF_RUNNING bit will
	1704	* be set, so unlike the lwp reap code we don't wait for it to clear.
c070746a MD	1705	*/
c070746a MD	1706	gd = mycpu;
37af14fe	1707	crit_enter_quick(td);
2af9d75d MD	1708	for (;;) {
	1709	if (td->td_refs) {
	1710	tsleep(td, 0, "tdreap", 1);
	1711	continue;
	1712	}
	1713	if ((std = gd->gd_freetd) != NULL) {
	1714	KKASSERT((std->td_flags & (TDF_RUNNING\|TDF_PREEMPT_LOCK)) == 0);
	1715	gd->gd_freetd = NULL;
	1716	objcache_put(thread_cache, std);
	1717	continue;
	1718	}
	1719	break;
c070746a	1720	}
3b4192fb MD	1721
	1722	/*
	1723	* Remove thread resources from kernel lists and deschedule us for
2883d2d8 MD	1724	* the last time. We cannot block after this point or we may end
2883d2d8 MD	1725	* up with a stale td on the tsleepq.
eb2adbf5 MD	1726	*
	1727	* None of this may block, the critical section is the only thing
	1728	* protecting tdallq and the only thing preventing new lwkt_hold()
	1729	* thread refs now.
3b4192fb MD	1730	*/
	1731	if (td->td_flags & TDF_TSLEEPQ)
	1732	tsleep_remove(td);
37af14fe	1733	lwkt_deschedule_self(td);
e56e4dea	1734	lwkt_remove_tdallq(td);
74c9628e	1735	KKASSERT(td->td_refs == 0);
2883d2d8 MD	1736
	1737	/*
	1738	* Final cleanup
	1739	*/
	1740	KKASSERT(gd->gd_freetd == NULL);
c070746a MD	1741	if (td->td_flags & TDF_ALLOCATED_THREAD)
c070746a MD	1742	gd->gd_freetd = td;
99df837e MD	1743	cpu_thread_exit();
	1744	}
	1745
e56e4dea MD	1746	void
	1747	lwkt_remove_tdallq(thread_t td)
	1748	{
	1749	KKASSERT(td->td_gd == mycpu);
	1750	TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
	1751	}
	1752
9cf43f91 MD	1753	/*
	1754	* Code reduction and branch prediction improvements. Call/return
	1755	* overhead on modern cpus often degenerates into 0 cycles due to
	1756	* the cpu's branch prediction hardware and return pc cache. We
	1757	* can take advantage of this by not inlining medium-complexity
	1758	* functions and we can also reduce the branch prediction impact
	1759	* by collapsing perfectly predictable branches into a single
	1760	* procedure instead of duplicating it.
	1761	*
	1762	* Is any of this noticeable? Probably not, so I'll take the
	1763	* smaller code size.
	1764	*/
	1765	void
b6468f56	1766	crit_exit_wrapper(__DEBUG_CRIT_ARG__)
9cf43f91	1767	{
b6468f56	1768	_crit_exit(mycpu __DEBUG_CRIT_PASS_ARG__);
9cf43f91 MD	1769	}
9cf43f91 MD	1770
2d93b37a MD	1771	void
	1772	crit_panic(void)
	1773	{
	1774	thread_t td = curthread;
850634cc	1775	int lcrit = td->td_critcount;
2d93b37a	1776
850634cc	1777	td->td_critcount = 0;
a4d95680	1778	cpu_ccfence();
850634cc	1779	panic("td_critcount is/would-go negative! %p %d", td, lcrit);
4a28fe22	1780	/* NOT REACHED */
2d93b37a MD	1781	}
2d93b37a MD	1782
bd8015ca MD	1783	/*
bd8015ca MD	1784	* Called from debugger/panic on cpus which have been stopped. We must still
b19f40a4	1785	* process the IPIQ while stopped.
bd8015ca MD	1786	*
	1787	* If we are dumping also try to process any pending interrupts. This may
	1788	* or may not work depending on the state of the cpu at the point it was
	1789	* stopped.
	1790	*/
	1791	void
	1792	lwkt_smp_stopped(void)
	1793	{
	1794	globaldata_t gd = mycpu;
	1795
bd8015ca MD	1796	if (dumping) {
bd8015ca MD	1797	lwkt_process_ipiq();
b19f40a4	1798	--gd->gd_intr_nesting_level;
bd8015ca	1799	splz();
b19f40a4	1800	++gd->gd_intr_nesting_level;
bd8015ca MD	1801	} else {
	1802	lwkt_process_ipiq();
	1803	}
63cff036	1804	cpu_smp_stopped();
bd8015ca	1805	}