1/*
2 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $DragonFly: src/sys/kern/lwkt_thread.c,v 1.40 2003/11/03 02:08:35 dillon Exp $
27 */
28
29/*
30 * Each cpu in a system has its own self-contained light weight kernel
31 * thread scheduler, which means that generally speaking we only need
32 * to use a critical section to avoid problems. Foreign thread
33 * scheduling is queued via (async) IPIs.
34 *
35 * NOTE: on UP machines smp_active is defined to be 0. On SMP machines
36 * smp_active is 0 prior to SMP activation, then it is 1. The LWKT module
37 * uses smp_active to optimize UP builds and to avoid sending IPIs during
38 * early boot (primarily interrupt and network thread initialization).
39 */
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/kernel.h>
44#include <sys/proc.h>
45#include <sys/rtprio.h>
46#include <sys/queue.h>
47#include <sys/thread2.h>
48#include <sys/sysctl.h>
49#include <sys/kthread.h>
50#include <machine/cpu.h>
51#include <sys/lock.h>
52
53#include <vm/vm.h>
54#include <vm/vm_param.h>
55#include <vm/vm_kern.h>
56#include <vm/vm_object.h>
57#include <vm/vm_page.h>
58#include <vm/vm_map.h>
59#include <vm/vm_pager.h>
60#include <vm/vm_extern.h>
61#include <vm/vm_zone.h>
62
63#include <machine/stdarg.h>
64#include <machine/ipl.h>
65#include <machine/smp.h>
66
67static int untimely_switch = 0;
68SYSCTL_INT(_lwkt, OID_AUTO, untimely_switch, CTLFLAG_RW, &untimely_switch, 0, "");
69#ifdef INVARIANTS
70static int token_debug = 0;
71SYSCTL_INT(_lwkt, OID_AUTO, token_debug, CTLFLAG_RW, &token_debug, 0, "");
72#endif
73static quad_t switch_count = 0;
74SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, "");
75static quad_t preempt_hit = 0;
76SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, "");
77static quad_t preempt_miss = 0;
78SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, "");
79static quad_t preempt_weird = 0;
80SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, "");
81static quad_t ipiq_count = 0;
82SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_count, CTLFLAG_RW, &ipiq_count, 0, "");
83static quad_t ipiq_fifofull = 0;
84SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_fifofull, CTLFLAG_RW, &ipiq_fifofull, 0, "");
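/*
 * Illustrative note (editorial sketch, not part of the original source):
 * assuming the usual SYSCTL_* name mapping, the counters above appear
 * under the "lwkt" sysctl tree and can be inspected or tuned from
 * userland, e.g.:
 *
 *	sysctl lwkt.switch_count
 *	sysctl lwkt.untimely_switch=1
 *
 * The node names follow the OID_AUTO declarations above.
 */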
85
86/*
87 * These helper procedures handle the runq, they can only be called from
88 * within a critical section.
89 *
90 * WARNING! Prior to SMP being brought up it is possible to enqueue and
91 * dequeue threads belonging to other cpus, so be sure to use td->td_gd
92 * instead of 'mycpu' when referencing the globaldata structure. Once
 93 * SMP is live, enqueuing and dequeuing only occur on the current cpu.
94 */
95static __inline
96void
97_lwkt_dequeue(thread_t td)
98{
99 if (td->td_flags & TDF_RUNQ) {
100 int nq = td->td_pri & TDPRI_MASK;
101 struct globaldata *gd = td->td_gd;
102
103 td->td_flags &= ~TDF_RUNQ;
104 TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq);
105 /* runqmask is passively cleaned up by the switcher */
106 }
107}
108
109static __inline
110void
111_lwkt_enqueue(thread_t td)
112{
113 if ((td->td_flags & TDF_RUNQ) == 0) {
114 int nq = td->td_pri & TDPRI_MASK;
115 struct globaldata *gd = td->td_gd;
116
117 td->td_flags |= TDF_RUNQ;
118 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq);
119 gd->gd_runqmask |= 1 << nq;
120 }
121}
122
123static __inline
124int
125_lwkt_wantresched(thread_t ntd, thread_t cur)
126{
127 return((ntd->td_pri & TDPRI_MASK) > (cur->td_pri & TDPRI_MASK));
128}
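/*
 * Illustrative sketch (editorial addition): gd_runqmask keeps one bit set
 * for every priority queue that holds at least one runnable thread, so the
 * highest priority non-empty queue can be located with a single bsrl():
 *
 *	int nq = bsrl(gd->gd_runqmask);		highest set bit
 *	thread_t ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq]);
 *
 * This is the lookup lwkt_switch() performs below; a stale mask bit is
 * only cleared lazily when the corresponding queue is found to be empty.
 */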
129
130/*
131 * LWKTs operate on a per-cpu basis
132 *
133 * WARNING! Called from early boot, 'mycpu' may not work yet.
134 */
135void
136lwkt_gdinit(struct globaldata *gd)
137{
138 int i;
139
140 for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i)
141 TAILQ_INIT(&gd->gd_tdrunq[i]);
142 gd->gd_runqmask = 0;
143 TAILQ_INIT(&gd->gd_tdallq);
144}
145
146/*
147 * Initialize a thread wait structure prior to first use.
148 *
149 * NOTE! called from low level boot code, we cannot do anything fancy!
150 */
151void
152lwkt_init_wait(lwkt_wait_t w)
153{
154 TAILQ_INIT(&w->wa_waitq);
155}
156
157/*
158 * Create a new thread. The thread must be associated with a process context
159 * or LWKT start address before it can be scheduled. If the target cpu is
160 * -1 the thread will be created on the current cpu.
161 *
162 * If you intend to create a thread without a process context this function
163 * does everything except load the startup and switcher function.
164 */
165thread_t
166lwkt_alloc_thread(struct thread *td, int cpu)
167{
168 void *stack;
169 int flags = 0;
170
171 if (td == NULL) {
172 crit_enter();
173 if (mycpu->gd_tdfreecount > 0) {
174 --mycpu->gd_tdfreecount;
175 td = TAILQ_FIRST(&mycpu->gd_tdfreeq);
176 KASSERT(td != NULL && (td->td_flags & TDF_RUNNING) == 0,
177 ("lwkt_alloc_thread: unexpected NULL or corrupted td"));
178 TAILQ_REMOVE(&mycpu->gd_tdfreeq, td, td_threadq);
179 crit_exit();
180 stack = td->td_kstack;
181 flags = td->td_flags & (TDF_ALLOCATED_STACK|TDF_ALLOCATED_THREAD);
182 } else {
183 crit_exit();
184 td = zalloc(thread_zone);
185 td->td_kstack = NULL;
186 flags |= TDF_ALLOCATED_THREAD;
187 }
188 }
189 if ((stack = td->td_kstack) == NULL) {
190 stack = (void *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE);
191 flags |= TDF_ALLOCATED_STACK;
192 }
193 if (cpu < 0)
194 lwkt_init_thread(td, stack, flags, mycpu);
195 else
196 lwkt_init_thread(td, stack, flags, globaldata_find(cpu));
197 return(td);
198}
199
200/*
201 * Initialize a preexisting thread structure. This function is used by
202 * lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
203 *
204 * All threads start out in a critical section at a priority of
205 * TDPRI_KERN_DAEMON. Higher level code will modify the priority as
206 * appropriate. This function may send an IPI message when the
207 * requested cpu is not the current cpu and consequently gd_tdallq may
208 * not be initialized synchronously from the point of view of the originating
209 * cpu.
210 *
211 * NOTE! we have to be careful in regards to creating threads for other cpus
212 * if SMP has not yet been activated.
213 */
214static void
215lwkt_init_thread_remote(void *arg)
216{
217 thread_t td = arg;
218
219 TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
220}
221
222void
223lwkt_init_thread(thread_t td, void *stack, int flags, struct globaldata *gd)
224{
225 bzero(td, sizeof(struct thread));
226 td->td_kstack = stack;
227 td->td_flags |= flags;
228 td->td_gd = gd;
229 td->td_pri = TDPRI_KERN_DAEMON + TDPRI_CRIT;
230 lwkt_init_port(&td->td_msgport, td);
231 pmap_init_thread(td);
232 if (smp_active == 0 || gd == mycpu) {
233 crit_enter();
234 TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
235 crit_exit();
236 } else {
237 lwkt_send_ipiq(gd->gd_cpuid, lwkt_init_thread_remote, td);
238 }
239}
240
241void
242lwkt_set_comm(thread_t td, const char *ctl, ...)
243{
244 va_list va;
245
246 va_start(va, ctl);
247 vsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
248 va_end(va);
249}
250
251void
252lwkt_hold(thread_t td)
253{
254 ++td->td_refs;
255}
256
257void
258lwkt_rele(thread_t td)
259{
260 KKASSERT(td->td_refs > 0);
261 --td->td_refs;
262}
263
264void
265lwkt_wait_free(thread_t td)
266{
267 while (td->td_refs)
268 tsleep(td, 0, "tdreap", hz);
269}
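/*
 * Illustrative usage sketch (an assumption, not taken from this file):
 * the reference count managed by lwkt_hold()/lwkt_rele() lets one thread
 * pin another thread's structure across a potentially blocking operation:
 *
 *	lwkt_hold(td);
 *	... use td, possibly blocking ...
 *	lwkt_rele(td);
 *
 * lwkt_wait_free(td) can then be used (e.g. by thread reaping code) to
 * sleep until all such references have been dropped before td is freed.
 */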
270
271void
272lwkt_free_thread(thread_t td)
273{
274 struct globaldata *gd = mycpu;
275
276 KASSERT((td->td_flags & TDF_RUNNING) == 0,
277 ("lwkt_free_thread: did not exit! %p", td));
278
279 crit_enter();
280 TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
281 if (gd->gd_tdfreecount < CACHE_NTHREADS &&
282 (td->td_flags & TDF_ALLOCATED_THREAD)
283 ) {
284 ++gd->gd_tdfreecount;
285 TAILQ_INSERT_HEAD(&gd->gd_tdfreeq, td, td_threadq);
286 crit_exit();
287 } else {
288 crit_exit();
289 if (td->td_kstack && (td->td_flags & TDF_ALLOCATED_STACK)) {
290 kmem_free(kernel_map,
291 (vm_offset_t)td->td_kstack, UPAGES * PAGE_SIZE);
292 /* gd invalid */
293 td->td_kstack = NULL;
294 }
295 if (td->td_flags & TDF_ALLOCATED_THREAD)
296 zfree(thread_zone, td);
297 }
298}
299
300
301/*
302 * Switch to the next runnable lwkt. If no LWKTs are runnable then
303 * switch to the idlethread. Switching must occur within a critical
304 * section to avoid races with the scheduling queue.
305 *
306 * We always have full control over our cpu's run queue. Other cpus
307 * that wish to manipulate our queue must use the cpu_*msg() calls to
308 * talk to our cpu, so a critical section is all that is needed and
309 * the result is very, very fast thread switching.
310 *
311 * The LWKT scheduler uses a fixed priority model and round-robins at
312 * each priority level. User process scheduling is a totally
313 * different beast and LWKT priorities should not be confused with
314 * user process priorities.
315 *
316 * The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch()
317 * cleans it up. Note that the td_switch() function cannot do anything that
318 * requires the MP lock since the MP lock will have already been setup for
319 * the target thread (not the current thread). It's nice to have a scheduler
320 * that does not need the MP lock to work because it allows us to do some
321 * really cool high-performance MP lock optimizations.
322 */
323
324void
325lwkt_switch(void)
326{
327 struct globaldata *gd;
328 thread_t td = curthread;
329 thread_t ntd;
330#ifdef SMP
331 int mpheld;
332#endif
333
334 /*
335 * Switching from within a 'fast' (non thread switched) interrupt is
336 * illegal.
337 */
338 if (mycpu->gd_intr_nesting_level && panicstr == NULL) {
339 panic("lwkt_switch: cannot switch from within a fast interrupt, yet\n");
340 }
341
342 /*
343 * Passive release (used to transition from user to kernel mode
 344 * when we block or switch rather than when we enter the kernel).
345 * This function is NOT called if we are switching into a preemption
346 * or returning from a preemption. Typically this causes us to lose
347 * our P_CURPROC designation (if we have one) and become a true LWKT
348 * thread, and may also hand P_CURPROC to another process and schedule
349 * its thread.
350 */
351 if (td->td_release)
352 td->td_release(td);
353
354 crit_enter();
355 ++switch_count;
356
357#ifdef SMP
358 /*
359 * td_mpcount cannot be used to determine if we currently hold the
360 * MP lock because get_mplock() will increment it prior to attempting
361 * to get the lock, and switch out if it can't. Our ownership of
362 * the actual lock will remain stable while we are in a critical section
363 * (but, of course, another cpu may own or release the lock so the
364 * actual value of mp_lock is not stable).
365 */
366 mpheld = MP_LOCK_HELD();
367#endif
368 if ((ntd = td->td_preempted) != NULL) {
369 /*
370 * We had preempted another thread on this cpu, resume the preempted
371 * thread. This occurs transparently, whether the preempted thread
372 * was scheduled or not (it may have been preempted after descheduling
373 * itself).
374 *
375 * We have to setup the MP lock for the original thread after backing
376 * out the adjustment that was made to curthread when the original
377 * was preempted.
378 */
379 KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
380#ifdef SMP
381 if (ntd->td_mpcount && mpheld == 0) {
382 panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d\n",
383 td, ntd, td->td_mpcount, ntd->td_mpcount);
384 }
385 if (ntd->td_mpcount) {
386 td->td_mpcount -= ntd->td_mpcount;
387 KKASSERT(td->td_mpcount >= 0);
388 }
389#endif
390 ntd->td_flags |= TDF_PREEMPT_DONE;
391 /* YYY release mp lock on switchback if original doesn't need it */
392 } else {
393 /*
394 * Priority queue / round-robin at each priority. Note that user
395 * processes run at a fixed, low priority and the user process
396 * scheduler deals with interactions between user processes
397 * by scheduling and descheduling them from the LWKT queue as
398 * necessary.
399 *
400 * We have to adjust the MP lock for the target thread. If we
401 * need the MP lock and cannot obtain it we try to locate a
402 * thread that does not need the MP lock.
403 */
404 gd = mycpu;
405again:
406 if (gd->gd_runqmask) {
407 int nq = bsrl(gd->gd_runqmask);
408 if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) {
409 gd->gd_runqmask &= ~(1 << nq);
410 goto again;
411 }
412#ifdef SMP
413 if (ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) {
414 /*
415 * Target needs MP lock and we couldn't get it, try
416 * to locate a thread which does not need the MP lock
417 * to run. If we cannot locate a thread spin in idle.
418 */
419 u_int32_t rqmask = gd->gd_runqmask;
420 while (rqmask) {
421 TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) {
422 if (ntd->td_mpcount == 0)
423 break;
424 }
425 if (ntd)
426 break;
427 rqmask &= ~(1 << nq);
428 nq = bsrl(rqmask);
429 }
430 if (ntd == NULL) {
431 ntd = &gd->gd_idlethread;
432 ntd->td_flags |= TDF_IDLE_NOHLT;
433 } else {
434 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
435 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
436 }
437 } else {
438 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
439 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
440 }
441#else
442 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
443 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
444#endif
445 } else {
446 /*
447 * Nothing to run but we may still need the BGL to deal with
448 * pending interrupts, spin in idle if so.
449 */
450 ntd = &gd->gd_idlethread;
451 if (gd->gd_reqflags)
452 ntd->td_flags |= TDF_IDLE_NOHLT;
453 }
454 }
455 KASSERT(ntd->td_pri >= TDPRI_CRIT,
456 ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
457
458 /*
459 * Do the actual switch. If the new target does not need the MP lock
460 * and we are holding it, release the MP lock. If the new target requires
461 * the MP lock we have already acquired it for the target.
462 */
463#ifdef SMP
464 if (ntd->td_mpcount == 0 ) {
465 if (MP_LOCK_HELD())
466 cpu_rel_mplock();
467 } else {
468 ASSERT_MP_LOCK_HELD();
469 }
470#endif
471 if (td != ntd) {
472 td->td_switch(ntd);
473 }
474
475 crit_exit();
476}
477
478/*
479 * Switch if another thread has a higher priority. Do not switch to other
480 * threads at the same priority.
481 */
482void
 483lwkt_maybe_switch(void)
484{
485 struct globaldata *gd = mycpu;
486 struct thread *td = gd->gd_curthread;
487
488 if ((td->td_pri & TDPRI_MASK) < bsrl(gd->gd_runqmask)) {
489 lwkt_switch();
490 }
491}
492
493/*
494 * Request that the target thread preempt the current thread. Preemption
495 * only works under a specific set of conditions:
496 *
497 * - We are not preempting ourselves
498 * - The target thread is owned by the current cpu
499 * - We are not currently being preempted
500 * - The target is not currently being preempted
501 * - We are able to satisfy the target's MP lock requirements (if any).
502 *
503 * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
504 * this is called via lwkt_schedule() through the td_preemptable callback.
505 * critpri is the managed critical priority that we should ignore in order
506 * to determine whether preemption is possible (aka usually just the crit
507 * priority of lwkt_schedule() itself).
508 *
509 * XXX at the moment we run the target thread in a critical section during
510 * the preemption in order to prevent the target from taking interrupts
511 * that *WE* can't. Preemption is strictly limited to interrupt threads
512 * and interrupt-like threads, outside of a critical section, and the
513 * preempted source thread will be resumed the instant the target blocks
514 * whether or not the source is scheduled (i.e. preemption is supposed to
515 * be as transparent as possible).
516 *
517 * The target thread inherits our MP count (added to its own) for the
 518 * duration of the preemption in order to preserve the atomicity of the
519 * MP lock during the preemption. Therefore, any preempting targets must be
520 * careful in regards to MP assertions. Note that the MP count may be
521 * out of sync with the physical mp_lock, but we do not have to preserve
 522 * the original ownership of the lock if it was out of sync (that is, we
523 * can leave it synchronized on return).
524 */
525void
526lwkt_preempt(thread_t ntd, int critpri)
527{
528 struct globaldata *gd = mycpu;
529 thread_t td = gd->gd_curthread;
530#ifdef SMP
531 int mpheld;
532 int savecnt;
533#endif
534
535 /*
536 * The caller has put us in a critical section. We can only preempt
537 * if the caller of the caller was not in a critical section (basically
538 * a local interrupt), as determined by the 'critpri' parameter. If
 539 * we are unable to preempt we simply bump preempt_miss and return.
540 *
541 * YYY The target thread must be in a critical section (else it must
542 * inherit our critical section? I dunno yet).
543 */
544 KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri));
545
546 need_resched();
547 if (!_lwkt_wantresched(ntd, td)) {
548 ++preempt_miss;
549 return;
550 }
551 if ((td->td_pri & ~TDPRI_MASK) > critpri) {
552 ++preempt_miss;
553 return;
554 }
555#ifdef SMP
556 if (ntd->td_gd != gd) {
557 ++preempt_miss;
558 return;
559 }
560#endif
561 if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) {
562 ++preempt_weird;
563 return;
564 }
565 if (ntd->td_preempted) {
566 ++preempt_hit;
567 return;
568 }
569#ifdef SMP
570 /*
 571 * note: an interrupt might have occurred just as we were transitioning
 572 * to or from the MP lock. In this case td_mpcount will be predisposed
573 * (non-zero) but not actually synchronized with the actual state of the
574 * lock. We can use it to imply an MP lock requirement for the
575 * preemption but we cannot use it to test whether we hold the MP lock
576 * or not.
577 */
578 savecnt = td->td_mpcount;
579 mpheld = MP_LOCK_HELD();
580 ntd->td_mpcount += td->td_mpcount;
581 if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) {
582 ntd->td_mpcount -= td->td_mpcount;
583 ++preempt_miss;
584 return;
585 }
586#endif
587
588 ++preempt_hit;
589 ntd->td_preempted = td;
590 td->td_flags |= TDF_PREEMPT_LOCK;
591 td->td_switch(ntd);
592 KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
593#ifdef SMP
594 KKASSERT(savecnt == td->td_mpcount);
595 mpheld = MP_LOCK_HELD();
596 if (mpheld && td->td_mpcount == 0)
597 cpu_rel_mplock();
598 else if (mpheld == 0 && td->td_mpcount)
599 panic("lwkt_preempt(): MP lock was not held through");
600#endif
601 ntd->td_preempted = NULL;
602 td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE);
603}
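/*
 * Illustrative call path (editorial sketch only): preemption is normally
 * requested indirectly from lwkt_schedule() via the td_preemptable
 * callback rather than by calling lwkt_preempt() directly, roughly:
 *
 *	crit_enter();
 *	_lwkt_enqueue(ntd);
 *	if (ntd->td_preemptable)
 *		ntd->td_preemptable(ntd, TDPRI_CRIT);	<- ends up here
 *	crit_exit();
 *
 * which satisfies the requirement that the caller of lwkt_preempt() hold
 * a critical section.
 */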
604
605/*
606 * Yield our thread while higher priority threads are pending. This is
607 * typically called when we leave a critical section but it can be safely
608 * called while we are in a critical section.
609 *
610 * This function will not generally yield to equal priority threads but it
611 * can occur as a side effect. Note that lwkt_switch() is called from
612 * inside the critical section to prevent its own crit_exit() from reentering
613 * lwkt_yield_quick().
614 *
615 * gd_reqflags indicates that *something* changed, e.g. an interrupt or softint
616 * came along but was blocked and made pending.
617 *
618 * (self contained on a per cpu basis)
619 */
620void
621lwkt_yield_quick(void)
622{
623 globaldata_t gd = mycpu;
624 thread_t td = gd->gd_curthread;
625
626 /*
627 * gd_reqflags is cleared in splz if the cpl is 0. If we were to clear
628 * it with a non-zero cpl then we might not wind up calling splz after
629 * a task switch when the critical section is exited even though the
630 * new task could accept the interrupt.
631 *
632 * XXX from crit_exit() only called after last crit section is released.
633 * If called directly will run splz() even if in a critical section.
634 *
 635 * td_nest_count prevents deep nesting via splz() or doreti(). Note that
 636 * except for this special case, we MUST call splz() here to handle any
 637 * pending ints, particularly after we switch, or we might accidentally
638 * halt the cpu with interrupts pending.
639 */
640 if (gd->gd_reqflags && td->td_nest_count < 2)
641 splz();
642
643 /*
644 * YYY enabling will cause wakeup() to task-switch, which really
645 * confused the old 4.x code. This is a good way to simulate
646 * preemption and MP without actually doing preemption or MP, because a
647 * lot of code assumes that wakeup() does not block.
648 */
649 if (untimely_switch && td->td_nest_count == 0 &&
650 gd->gd_intr_nesting_level == 0
651 ) {
652 crit_enter();
653 /*
654 * YYY temporary hacks until we disassociate the userland scheduler
655 * from the LWKT scheduler.
656 */
657 if (td->td_flags & TDF_RUNQ) {
658 lwkt_switch(); /* will not reenter yield function */
659 } else {
660 lwkt_schedule_self(); /* make sure we are scheduled */
661 lwkt_switch(); /* will not reenter yield function */
662 lwkt_deschedule_self(); /* make sure we are descheduled */
663 }
664 crit_exit_noyield(td);
665 }
666}
667
668/*
669 * This implements a normal yield which, unlike _quick, will yield to equal
670 * priority threads as well. Note that gd_reqflags tests will be handled by
671 * the crit_exit() call in lwkt_switch().
672 *
673 * (self contained on a per cpu basis)
674 */
675void
676lwkt_yield(void)
677{
678 lwkt_schedule_self();
679 lwkt_switch();
680}
681
682/*
683 * Schedule a thread to run. As the current thread we can always safely
684 * schedule ourselves, and a shortcut procedure is provided for that
685 * function.
686 *
687 * (non-blocking, self contained on a per cpu basis)
688 */
689void
690lwkt_schedule_self(void)
691{
692 thread_t td = curthread;
693
694 crit_enter();
695 KASSERT(td->td_wait == NULL, ("lwkt_schedule_self(): td_wait not NULL!"));
696 _lwkt_enqueue(td);
697 if (td->td_proc && td->td_proc->p_stat == SSLEEP)
698 panic("SCHED SELF PANIC");
699 crit_exit();
700}
701
702/*
703 * Generic schedule. Possibly schedule threads belonging to other cpus and
704 * deal with threads that might be blocked on a wait queue.
705 *
706 * YYY this is one of the best places to implement load balancing code.
707 * Load balancing can be accomplished by requesting other sorts of actions
708 * for the thread in question.
709 */
710void
711lwkt_schedule(thread_t td)
712{
713#ifdef INVARIANTS
714 if ((td->td_flags & TDF_PREEMPT_LOCK) == 0 && td->td_proc
715 && td->td_proc->p_stat == SSLEEP
716 ) {
717 printf("PANIC schedule curtd = %p (%d %d) target %p (%d %d)\n",
718 curthread,
719 curthread->td_proc ? curthread->td_proc->p_pid : -1,
720 curthread->td_proc ? curthread->td_proc->p_stat : -1,
721 td,
 722 td->td_proc ? td->td_proc->p_pid : -1,
 723 td->td_proc ? td->td_proc->p_stat : -1
724 );
725 panic("SCHED PANIC");
726 }
727#endif
728 crit_enter();
729 if (td == curthread) {
730 _lwkt_enqueue(td);
731 } else {
732 lwkt_wait_t w;
733
734 /*
735 * If the thread is on a wait list we have to send our scheduling
736 * request to the owner of the wait structure. Otherwise we send
737 * the scheduling request to the cpu owning the thread. Races
738 * are ok, the target will forward the message as necessary (the
739 * message may chase the thread around before it finally gets
740 * acted upon).
741 *
742 * (remember, wait structures use stable storage)
743 */
744 if ((w = td->td_wait) != NULL) {
745 if (lwkt_trytoken(&w->wa_token)) {
746 TAILQ_REMOVE(&w->wa_waitq, td, td_threadq);
747 --w->wa_count;
748 td->td_wait = NULL;
749 if (smp_active == 0 || td->td_gd == mycpu) {
750 _lwkt_enqueue(td);
751 if (td->td_preemptable) {
752 td->td_preemptable(td, TDPRI_CRIT*2); /* YYY +token */
753 } else if (_lwkt_wantresched(td, curthread)) {
754 need_resched();
755 }
756 } else {
757 lwkt_send_ipiq(td->td_gd->gd_cpuid, (ipifunc_t)lwkt_schedule, td);
758 }
759 lwkt_reltoken(&w->wa_token);
760 } else {
761 lwkt_send_ipiq(w->wa_token.t_cpu, (ipifunc_t)lwkt_schedule, td);
762 }
763 } else {
764 /*
765 * If the wait structure is NULL and we own the thread, there
766 * is no race (since we are in a critical section). If we
767 * do not own the thread there might be a race but the
768 * target cpu will deal with it.
769 */
770 if (smp_active == 0 || td->td_gd == mycpu) {
771 _lwkt_enqueue(td);
772 if (td->td_preemptable) {
773 td->td_preemptable(td, TDPRI_CRIT);
774 } else if (_lwkt_wantresched(td, curthread)) {
775 need_resched();
776 }
777 } else {
778 lwkt_send_ipiq(td->td_gd->gd_cpuid, (ipifunc_t)lwkt_schedule, td);
779 }
780 }
781 }
782 crit_exit();
783}
784
785/*
786 * Managed acquisition. This code assumes that the MP lock is held for
787 * the tdallq operation and that the thread has been descheduled from its
788 * original cpu. We also have to wait for the thread to be entirely switched
789 * out on its original cpu (this is usually fast enough that we never loop)
790 * since the LWKT system does not have to hold the MP lock while switching
791 * and the target may have released it before switching.
792 */
793void
794lwkt_acquire(thread_t td)
795{
796 struct globaldata *gd;
797
798 gd = td->td_gd;
799 KKASSERT((td->td_flags & TDF_RUNQ) == 0);
800 while (td->td_flags & TDF_RUNNING) /* XXX spin */
801 ;
802 if (gd != mycpu) {
803 crit_enter();
804 TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq); /* protected by BGL */
805 gd = mycpu;
806 td->td_gd = gd;
807 TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq); /* protected by BGL */
808 crit_exit();
809 }
810}
811
812/*
813 * Deschedule a thread.
814 *
815 * (non-blocking, self contained on a per cpu basis)
816 */
817void
818lwkt_deschedule_self(void)
819{
820 thread_t td = curthread;
821
822 crit_enter();
 823 KASSERT(td->td_wait == NULL, ("lwkt_deschedule_self(): td_wait not NULL!"));
824 _lwkt_dequeue(td);
825 crit_exit();
826}
827
828/*
 829 * Generic deschedule. Descheduling threads other than your own should be
830 * done only in carefully controlled circumstances. Descheduling is
831 * asynchronous.
832 *
833 * This function may block if the cpu has run out of messages.
834 */
835void
836lwkt_deschedule(thread_t td)
837{
838 crit_enter();
839 if (td == curthread) {
840 _lwkt_dequeue(td);
841 } else {
842 if (td->td_gd == mycpu) {
843 _lwkt_dequeue(td);
844 } else {
845 lwkt_send_ipiq(td->td_gd->gd_cpuid, (ipifunc_t)lwkt_deschedule, td);
846 }
847 }
848 crit_exit();
849}
850
851/*
852 * Set the target thread's priority. This routine does not automatically
 853 * switch to a higher priority thread; LWKT threads are not designed for
854 * continuous priority changes. Yield if you want to switch.
855 *
856 * We have to retain the critical section count which uses the high bits
857 * of the td_pri field. The specified priority may also indicate zero or
858 * more critical sections by adding TDPRI_CRIT*N.
859 */
860void
861lwkt_setpri(thread_t td, int pri)
862{
863 KKASSERT(pri >= 0);
864 KKASSERT(td->td_gd == mycpu);
865 crit_enter();
866 if (td->td_flags & TDF_RUNQ) {
867 _lwkt_dequeue(td);
868 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
869 _lwkt_enqueue(td);
870 } else {
871 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
872 }
873 crit_exit();
874}
875
876void
877lwkt_setpri_self(int pri)
878{
879 thread_t td = curthread;
880
881 KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
882 crit_enter();
883 if (td->td_flags & TDF_RUNQ) {
884 _lwkt_dequeue(td);
885 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
886 _lwkt_enqueue(td);
887 } else {
888 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
889 }
890 crit_exit();
891}
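/*
 * Illustrative example (editorial sketch): td_pri packs the scheduling
 * priority in the low bits (TDPRI_MASK) and the critical section nesting
 * count above it, in units of TDPRI_CRIT.  A thread set up by
 * lwkt_init_thread() starts as TDPRI_KERN_DAEMON + TDPRI_CRIT, so:
 *
 *	pri  = td->td_pri & TDPRI_MASK;		yields TDPRI_KERN_DAEMON
 *	crit = td->td_pri & ~TDPRI_MASK;	yields one TDPRI_CRIT unit
 *
 * The lwkt_setpri*() functions above only replace the TDPRI_MASK portion,
 * preserving the critical section count.
 */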
892
893struct proc *
894lwkt_preempted_proc(void)
895{
896 thread_t td = curthread;
897 while (td->td_preempted)
898 td = td->td_preempted;
899 return(td->td_proc);
900}
901
902typedef struct lwkt_gettoken_req {
903 lwkt_token_t tok;
904 int cpu;
905} lwkt_gettoken_req;
906
907#if 0
908
909/*
910 * This function deschedules the current thread and blocks on the specified
911 * wait queue. We obtain ownership of the wait queue in order to block
912 * on it. A generation number is used to interlock the wait queue in case
913 * it gets signalled while we are blocked waiting on the token.
914 *
915 * Note: alternatively we could dequeue our thread and then message the
916 * target cpu owning the wait queue. YYY implement as sysctl.
917 *
918 * Note: wait queue signals normally ping-pong the cpu as an optimization.
919 */
920
921void
922lwkt_block(lwkt_wait_t w, const char *wmesg, int *gen)
923{
924 thread_t td = curthread;
925
926 lwkt_gettoken(&w->wa_token);
927 if (w->wa_gen == *gen) {
928 _lwkt_dequeue(td);
929 TAILQ_INSERT_TAIL(&w->wa_waitq, td, td_threadq);
930 ++w->wa_count;
931 td->td_wait = w;
932 td->td_wmesg = wmesg;
933again:
934 lwkt_switch();
935 lwkt_regettoken(&w->wa_token);
936 if (td->td_wmesg != NULL) {
937 _lwkt_dequeue(td);
938 goto again;
939 }
940 }
941 /* token might be lost, doesn't matter for gen update */
942 *gen = w->wa_gen;
943 lwkt_reltoken(&w->wa_token);
944}
945
946/*
947 * Signal a wait queue. We gain ownership of the wait queue in order to
948 * signal it. Once a thread is removed from the wait queue we have to
949 * deal with the cpu owning the thread.
950 *
951 * Note: alternatively we could message the target cpu owning the wait
952 * queue. YYY implement as sysctl.
953 */
954void
955lwkt_signal(lwkt_wait_t w, int count)
956{
957 thread_t td;
959
960 lwkt_gettoken(&w->wa_token);
961 ++w->wa_gen;
962 if (count < 0)
963 count = w->wa_count;
964 while ((td = TAILQ_FIRST(&w->wa_waitq)) != NULL && count) {
965 --count;
966 --w->wa_count;
967 TAILQ_REMOVE(&w->wa_waitq, td, td_threadq);
968 td->td_wait = NULL;
969 td->td_wmesg = NULL;
970 if (td->td_gd == mycpu) {
971 _lwkt_enqueue(td);
972 } else {
973 lwkt_send_ipiq(td->td_gd->gd_cpuid, (ipifunc_t)lwkt_schedule, td);
974 }
975 lwkt_regettoken(&w->wa_token);
976 }
977 lwkt_reltoken(&w->wa_token);
978}
979
980#endif
981
982/*
983 * Acquire ownership of a token
984 *
985 * Acquire ownership of a token. The token may have spl and/or critical
986 * section side effects, depending on its purpose. These side effects
 987 * guarantee that you will maintain ownership of the token as long as you
988 * do not block. If you block you may lose access to the token (but you
989 * must still release it even if you lose your access to it).
990 *
991 * YYY for now we use a critical section to prevent IPIs from taking away
992 * a token, but do we really only need to disable IPIs ?
993 *
994 * YYY certain tokens could be made to act like mutexes when performance
995 * would be better (e.g. t_cpu == -1). This is not yet implemented.
996 *
997 * YYY the tokens replace 4.x's simplelocks for the most part, but this
 998 * means that 4.x code does not expect a switch, so for now we cannot switch
 999 * while waiting for an IPI to be returned.
1000 *
1001 * YYY If the token is owned by another cpu we may have to send an IPI to
1002 * it and then block. The IPI causes the token to be given away to the
1003 * requesting cpu, unless it has already changed hands. Since only the
1004 * current cpu can give away a token it owns we do not need a memory barrier.
1005 * This needs serious optimization.
1006 */
1007
1008#ifdef SMP
1009
1010static
1011void
1012lwkt_gettoken_remote(void *arg)
1013{
1014 lwkt_gettoken_req *req = arg;
1015 if (req->tok->t_cpu == mycpu->gd_cpuid) {
1016#ifdef INVARIANTS
1017 if (token_debug)
1018 printf("GT(%d,%d) ", req->tok->t_cpu, req->cpu);
1019#endif
1020 req->tok->t_cpu = req->cpu;
1021 req->tok->t_reqcpu = req->cpu; /* YYY leave owned by target cpu */
1022 /* else set reqcpu to point to current cpu for release */
1023 }
1024}
1025
1026#endif
1027
1028int
1029lwkt_gettoken(lwkt_token_t tok)
1030{
1031 /*
1032 * Prevent preemption so the token can't be taken away from us once
1033 * we gain ownership of it. Use a synchronous request which might
1034 * block. The request will be forwarded as necessary playing catchup
1035 * to the token.
1036 */
1037
1038 crit_enter();
1039#ifdef INVARIANTS
1040 if (curthread->td_pri > 1800) {
1041 printf("lwkt_gettoken: %p called from %p: crit sect nesting warning\n",
1042 tok, ((int **)&tok)[-1]);
1043 }
1044 if (curthread->td_pri > 2000) {
1045 curthread->td_pri = 1000;
1046 panic("too HIGH!");
1047 }
1048#endif
1049#ifdef SMP
1050 while (tok->t_cpu != mycpu->gd_cpuid) {
1051 struct lwkt_gettoken_req req;
1052 int seq;
1053 int dcpu;
1054
1055 req.cpu = mycpu->gd_cpuid;
1056 req.tok = tok;
1057 dcpu = (volatile int)tok->t_cpu;
1058 KKASSERT(dcpu >= 0 && dcpu < ncpus);
1059#ifdef INVARIANTS
1060 if (token_debug)
1061 printf("REQT%d ", dcpu);
1062#endif
1063 seq = lwkt_send_ipiq(dcpu, lwkt_gettoken_remote, &req);
1064 lwkt_wait_ipiq(dcpu, seq);
1065#ifdef INVARIANTS
1066 if (token_debug)
1067 printf("REQR%d ", tok->t_cpu);
1068#endif
1069 }
1070#endif
1071 /*
1072 * leave us in a critical section on return. This will be undone
1073 * by lwkt_reltoken(). Bump the generation number.
1074 */
1075 return(++tok->t_gen);
1076}
1077
1078/*
1079 * Attempt to acquire ownership of a token. Returns 1 on success, 0 on
1080 * failure.
1081 */
1082int
1083lwkt_trytoken(lwkt_token_t tok)
1084{
1085 crit_enter();
1086#ifdef SMP
1087 if (tok->t_cpu != mycpu->gd_cpuid) {
1088 crit_exit();
1089 return(0);
1090 }
1091#endif
1092 /* leave us in the critical section */
1093 ++tok->t_gen;
1094 return(1);
1095}
1096
1097/*
1098 * Release your ownership of a token. Releases must occur in reverse
1099 * order to acquisitions, eventually, so priorities can be unwound properly
1100 * like SPLs. At the moment the actual implementation doesn't care.
1101 *
1102 * We can safely hand a token that we own to another cpu without notifying
1103 * it, but once we do we can't get it back without requesting it (unless
1104 * the other cpu hands it back to us before we check).
1105 *
1106 * We might have lost the token, so check that.
1107 *
1108 * Return the token's generation number. The number is useful to callers
1109 * who may want to know if the token was stolen during potential blockages.
1110 */
1111int
1112lwkt_reltoken(lwkt_token_t tok)
1113{
1114 int gen;
1115
1116 if (tok->t_cpu == mycpu->gd_cpuid) {
1117 tok->t_cpu = tok->t_reqcpu;
1118 }
1119 gen = tok->t_gen;
1120 crit_exit();
1121 return(gen);
1122}
1123
1124/*
1125 * Reacquire a token that might have been lost. 0 is returned if the
1126 * generation has not changed (nobody stole the token from us), -1 is
1127 * returned otherwise. The token is reacquired regardless but the
1128 * generation number is not bumped further if we already own the token.
1129 *
1130 * For efficiency we inline the best-case situation for lwkt_regettoken()
1131 * (i.e. we still own the token).
1132 */
1133int
1134lwkt_gentoken(lwkt_token_t tok, int *gen)
1135{
1136 if (tok->t_cpu == mycpu->gd_cpuid && tok->t_gen == *gen)
1137 return(0);
1138 *gen = lwkt_regettoken(tok);
1139 return(-1);
1140}
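/*
 * Illustrative usage sketch (editorial addition; the token name is
 * hypothetical): a typical pattern for code that may block while holding
 * a token is to record the generation returned by lwkt_gettoken() and
 * revalidate with lwkt_gentoken() afterwards:
 *
 *	static struct lwkt_token my_token;	set up with lwkt_inittoken()
 *	int gen;
 *
 *	gen = lwkt_gettoken(&my_token);
 *	... operation that might block and lose the token ...
 *	if (lwkt_gentoken(&my_token, &gen) != 0) {
 *		... the token was taken away while we blocked; any state
 *		... it guards must be re-evaluated
 *	}
 *	lwkt_reltoken(&my_token);
 */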
1141
1142/*
1143 * Re-acquire a token that might have been lost. The generation number
1144 * is bumped and returned regardless of whether the token had been lost
1145 * or not (because we only have cpu granularity we have to bump the token
1146 * either way).
1147 */
1148int
1149lwkt_regettoken(lwkt_token_t tok)
1150{
1151 /* assert we are in a critical section */
1152 if (tok->t_cpu != mycpu->gd_cpuid) {
1153#ifdef SMP
1154 while (tok->t_cpu != mycpu->gd_cpuid) {
1155 struct lwkt_gettoken_req req;
1156 int seq;
1157 int dcpu;
1158
1159 req.cpu = mycpu->gd_cpuid;
1160 req.tok = tok;
1161 dcpu = (volatile int)tok->t_cpu;
1162 KKASSERT(dcpu >= 0 && dcpu < ncpus);
1163#ifdef INVARIANTS
1164 if (token_debug)
1165 printf("REQT%d ", dcpu);
1166#endif
1167 seq = lwkt_send_ipiq(dcpu, lwkt_gettoken_remote, &req);
1168 lwkt_wait_ipiq(dcpu, seq);
1169#ifdef INVARIANTS
1170 if (token_debug)
1171 printf("REQR%d ", tok->t_cpu);
1172#endif
1173 }
1174#endif
1175 }
1176 ++tok->t_gen;
1177 return(tok->t_gen);
1178}
1179
1180void
1181lwkt_inittoken(lwkt_token_t tok)
1182{
1183 /*
1184 * Zero structure and set cpu owner and reqcpu to cpu 0.
1185 */
1186 bzero(tok, sizeof(*tok));
1187}
1188
1189/*
1190 * Create a kernel process/thread/whatever. It shares its address space
1191 * with proc0 - ie: kernel only.
1192 *
1193 * NOTE! By default new threads are created with the MP lock held. A
1194 * thread which does not require the MP lock should release it by calling
1195 * rel_mplock() at the start of the new thread.
1196 */
1197int
1198lwkt_create(void (*func)(void *), void *arg,
1199 struct thread **tdp, thread_t template, int tdflags, int cpu,
1200 const char *fmt, ...)
1201{
1202 thread_t td;
1203 va_list ap;
1204
1205 td = lwkt_alloc_thread(template, cpu);
1206 if (tdp)
1207 *tdp = td;
1208 cpu_set_thread_handler(td, kthread_exit, func, arg);
1209 td->td_flags |= TDF_VERBOSE | tdflags;
1210#ifdef SMP
1211 td->td_mpcount = 1;
1212#endif
1213
1214 /*
1215 * Set up arg0 for 'ps' etc
1216 */
1217 va_start(ap, fmt);
1218 vsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
1219 va_end(ap);
1220
1221 /*
1222 * Schedule the thread to run
1223 */
1224 if ((td->td_flags & TDF_STOPREQ) == 0)
1225 lwkt_schedule(td);
1226 else
1227 td->td_flags &= ~TDF_STOPREQ;
1228 return 0;
1229}
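/*
 * Illustrative usage sketch (editorial addition; the thread function and
 * variable names are hypothetical):
 *
 *	static struct thread *example_td;
 *
 *	lwkt_create(example_thread_loop, NULL, &example_td, NULL,
 *		    0, -1, "example %d", 0);
 *
 * Passing a cpu of -1 creates the thread on the current cpu.  Because new
 * threads start with the MP lock held, example_thread_loop() should call
 * rel_mplock() early if it does not need the lock.
 */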
1230
1231/*
1232 * Destroy an LWKT thread. Warning! This function is not called when
1233 * a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
1234 * uses a different reaping mechanism.
1235 */
1236void
1237lwkt_exit(void)
1238{
1239 thread_t td = curthread;
1240
1241 if (td->td_flags & TDF_VERBOSE)
1242 printf("kthread %p %s has exited\n", td, td->td_comm);
1243 crit_enter();
1244 lwkt_deschedule_self();
1245 ++mycpu->gd_tdfreecount;
1246 TAILQ_INSERT_TAIL(&mycpu->gd_tdfreeq, td, td_threadq);
1247 cpu_thread_exit();
1248}
1249
1250/*
1251 * Create a kernel process/thread/whatever. It shares its address space
1252 * with proc0 - ie: kernel only. 5.x compatible.
1253 *
1254 * NOTE! By default kthreads are created with the MP lock held. A
1255 * thread which does not require the MP lock should release it by calling
1256 * rel_mplock() at the start of the new thread.
1257 */
1258int
1259kthread_create(void (*func)(void *), void *arg,
1260 struct thread **tdp, const char *fmt, ...)
1261{
1262 thread_t td;
1263 va_list ap;
1264
1265 td = lwkt_alloc_thread(NULL, -1);
1266 if (tdp)
1267 *tdp = td;
1268 cpu_set_thread_handler(td, kthread_exit, func, arg);
1269 td->td_flags |= TDF_VERBOSE;
1270#ifdef SMP
1271 td->td_mpcount = 1;
1272#endif
1273
1274 /*
1275 * Set up arg0 for 'ps' etc
1276 */
1277 va_start(ap, fmt);
1278 vsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
1279 va_end(ap);
1280
1281 /*
1282 * Schedule the thread to run
1283 */
1284 lwkt_schedule(td);
1285 return 0;
1286}
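/*
 * Illustrative usage sketch (editorial addition; names are hypothetical):
 * the 5.x compatible variant omits the template/flags/cpu arguments:
 *
 *	kthread_create(example_thread_loop, NULL, &example_td, "exampled");
 *
 * and otherwise behaves like lwkt_create() with a NULL template and a cpu
 * of -1, scheduling the new thread immediately.
 */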
1287
1288void
1289crit_panic(void)
1290{
1291 thread_t td = curthread;
1292 int lpri = td->td_pri;
1293
1294 td->td_pri = 0;
1295 panic("td_pri is/would-go negative! %p %d", td, lpri);
1296}
1297
1298/*
1299 * Destroy an LWKT thread. Warning! This function is not called when
1300 * a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
1301 * uses a different reaping mechanism.
1302 *
1303 * XXX duplicates lwkt_exit()
1304 */
1305void
1306kthread_exit(void)
1307{
1308 lwkt_exit();
1309}
1310
1311#ifdef SMP
1312
1313/*
1314 * Send a function execution request to another cpu. The request is queued
1315 * on the cpu<->cpu ipiq matrix. Each cpu owns a unique ipiq FIFO for every
1316 * possible target cpu; each FIFO has a single writer (the sending cpu) and a single reader (the target cpu).
1317 *
1318 * YYY If the FIFO fills up we have to enable interrupts and process the
1319 * IPIQ while waiting for it to empty or we may deadlock with another cpu.
1320 * Create a CPU_*() function to do this!
1321 *
1322 * We can safely bump gd_intr_nesting_level because our crit_exit() at the
1323 * end will take care of any pending interrupts.
1324 *
1325 * Must be called from a critical section.
1326 */
1327int
1328lwkt_send_ipiq(int dcpu, ipifunc_t func, void *arg)
1329{
1330 lwkt_ipiq_t ip;
1331 int windex;
1332 struct globaldata *gd = mycpu;
1333
1334 if (dcpu == gd->gd_cpuid) {
1335 func(arg);
1336 return(0);
1337 }
1338 crit_enter();
1339 ++gd->gd_intr_nesting_level;
1340#ifdef INVARIANTS
1341 if (gd->gd_intr_nesting_level > 20)
1342 panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
1343#endif
1344 KKASSERT(curthread->td_pri >= TDPRI_CRIT);
1345 KKASSERT(dcpu >= 0 && dcpu < ncpus);
1346 ++ipiq_count;
1347 ip = &gd->gd_ipiq[dcpu];
1348
1349 /*
1350 * We always drain before the FIFO becomes full so it should never
1351 * become full. We need to leave enough entries to deal with
1352 * reentrancy.
1353 */
1354 KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO);
1355 windex = ip->ip_windex & MAXCPUFIFO_MASK;
1356 ip->ip_func[windex] = func;
1357 ip->ip_arg[windex] = arg;
1358 /* YYY memory barrier */
1359 ++ip->ip_windex;
1360 if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
1361 unsigned int eflags = read_eflags();
1362 cpu_enable_intr();
1363 ++ipiq_fifofull;
1364 while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
1365 KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
1366 lwkt_process_ipiq();
1367 }
1368 write_eflags(eflags);
1369 }
1370 --gd->gd_intr_nesting_level;
1371 cpu_send_ipiq(dcpu); /* issues memory barrier if appropriate */
1372 crit_exit();
1373 return(ip->ip_windex);
1374}
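/*
 * Illustrative usage sketch (editorial addition): a synchronous remote
 * call combines lwkt_send_ipiq() with lwkt_wait_ipiq(), as the token code
 * above does.  some_remote_func and arg are placeholders:
 *
 *	int seq;
 *
 *	crit_enter();
 *	seq = lwkt_send_ipiq(dcpu, some_remote_func, arg);
 *	lwkt_wait_ipiq(dcpu, seq);	spins until the target has run it
 *	crit_exit();
 *
 * The sequence number returned is the FIFO write index, which
 * lwkt_wait_ipiq() compares against the target's completion index
 * (ip_xindex).
 */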
1375
1376/*
1377 * Send a message to several target cpus. Typically used for scheduling.
1378 * The message will not be sent to stopped cpus.
1379 */
1380void
1381lwkt_send_ipiq_mask(u_int32_t mask, ipifunc_t func, void *arg)
1382{
1383 int cpuid;
1384
1385 mask &= ~stopped_cpus;
1386 while (mask) {
1387 cpuid = bsfl(mask);
1388 lwkt_send_ipiq(cpuid, func, arg);
1389 mask &= ~(1 << cpuid);
1390 }
1391}
1392
1393/*
1394 * Wait for the remote cpu to finish processing a function.
1395 *
1396 * YYY we have to enable interrupts and process the IPIQ while waiting
1397 * for it to empty or we may deadlock with another cpu. Create a CPU_*()
1398 * function to do this! YYY we really should 'block' here.
1399 *
1400 * Must be called from a critical section. This routine may be called
1401 * from an interrupt (for example, if an interrupt wakes a foreign thread
1402 * up).
1403 */
1404void
1405lwkt_wait_ipiq(int dcpu, int seq)
1406{
1407 lwkt_ipiq_t ip;
1408 int maxc = 100000000;
1409
1410 if (dcpu != mycpu->gd_cpuid) {
1411 KKASSERT(dcpu >= 0 && dcpu < ncpus);
1412 ip = &mycpu->gd_ipiq[dcpu];
1413 if ((int)(ip->ip_xindex - seq) < 0) {
1414 unsigned int eflags = read_eflags();
1415 cpu_enable_intr();
1416 while ((int)(ip->ip_xindex - seq) < 0) {
1417 lwkt_process_ipiq();
1418 if (--maxc == 0)
1419 printf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n", mycpu->gd_cpuid, dcpu, ip->ip_xindex - seq);
1420 if (maxc < -1000000)
1421 panic("LWKT_WAIT_IPIQ");
1422 }
1423 write_eflags(eflags);
1424 }
1425 }
1426}
1427
1428/*
1429 * Called from IPI interrupt (like a fast interrupt), which has placed
1430 * us in a critical section. The MP lock may or may not be held.
1431 * May also be called from doreti or splz, or be reentrantly called
1432 * indirectly through the ip_func[] we run.
1433 */
1434void
1435lwkt_process_ipiq(void)
1436{
1437 int n;
1438 int cpuid = mycpu->gd_cpuid;
1439
1440 for (n = 0; n < ncpus; ++n) {
1441 lwkt_ipiq_t ip;
1442 int ri;
1443
1444 if (n == cpuid)
1445 continue;
1446 ip = globaldata_find(n)->gd_ipiq;
1447 if (ip == NULL)
1448 continue;
1449 ip = &ip[cpuid];
1450
1451 /*
1452 * Note: xindex is only updated after we are sure the function has
1453 * finished execution. Beware lwkt_process_ipiq() reentrancy! The
1454 * function may send an IPI which may block/drain.
1455 */
1456 while (ip->ip_rindex != ip->ip_windex) {
1457 ri = ip->ip_rindex & MAXCPUFIFO_MASK;
1458 ++ip->ip_rindex;
1459 ip->ip_func[ri](ip->ip_arg[ri]);
1460 /* YYY memory barrier */
1461 ip->ip_xindex = ip->ip_rindex;
1462 }
1463 }
1464}
1465
1466#else
1467
1468int
1469lwkt_send_ipiq(int dcpu, ipifunc_t func, void *arg)
1470{
1471 panic("lwkt_send_ipiq: UP box! (%d,%p,%p)", dcpu, func, arg);
1472 return(0); /* NOT REACHED */
1473}
1474
1475void
1476lwkt_wait_ipiq(int dcpu, int seq)
1477{
1478 panic("lwkt_wait_ipiq: UP box! (%d,%d)", dcpu, seq);
1479}
1480
1481#endif