Forward FAST interrupts to the MP lock holder + minor fixes.
[dragonfly.git] / sys / kern / lwkt_thread.c
1/*
2 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * Each cpu in a system has its own self-contained lightweight kernel
27 * thread scheduler, which means that generally speaking we only need
28 * to use a critical section to avoid problems. Foreign thread
29 * scheduling is queued via (async) IPIs.
30 *
31 * $DragonFly: src/sys/kern/lwkt_thread.c,v 1.22 2003/07/11 22:30:09 dillon Exp $
32 */
33
34#include <sys/param.h>
35#include <sys/systm.h>
36#include <sys/kernel.h>
37#include <sys/proc.h>
38#include <sys/rtprio.h>
39#include <sys/queue.h>
40#include <sys/thread2.h>
41#include <sys/sysctl.h>
42#include <sys/kthread.h>
43#include <machine/cpu.h>
44#include <sys/lock.h>
45
46#include <vm/vm.h>
47#include <vm/vm_param.h>
48#include <vm/vm_kern.h>
49#include <vm/vm_object.h>
50#include <vm/vm_page.h>
51#include <vm/vm_map.h>
52#include <vm/vm_pager.h>
53#include <vm/vm_extern.h>
54#include <vm/vm_zone.h>
55
56#include <machine/stdarg.h>
57#include <machine/ipl.h>
58#ifdef SMP
59#include <machine/smp.h>
60#endif
61
62static int untimely_switch = 0;
63SYSCTL_INT(_lwkt, OID_AUTO, untimely_switch, CTLFLAG_RW, &untimely_switch, 0, "");
64#ifdef INVARIANTS
65static int token_debug = 0;
66SYSCTL_INT(_lwkt, OID_AUTO, token_debug, CTLFLAG_RW, &token_debug, 0, "");
67#endif
68static quad_t switch_count = 0;
69SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, "");
70static quad_t preempt_hit = 0;
71SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, "");
72static quad_t preempt_miss = 0;
73SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, "");
74static quad_t preempt_weird = 0;
75SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, "");
76static quad_t ipiq_count = 0;
77SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_count, CTLFLAG_RW, &ipiq_count, 0, "");
78static quad_t ipiq_fifofull = 0;
79SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_fifofull, CTLFLAG_RW, &ipiq_fifofull, 0, "");
80
81/*
82 * These helper procedures handle the runq; they can only be called from
83 * within a critical section.
84 */
85static __inline
86void
87_lwkt_dequeue(thread_t td)
88{
89 if (td->td_flags & TDF_RUNQ) {
90 int nq = td->td_pri & TDPRI_MASK;
91 struct globaldata *gd = mycpu;
92
93 td->td_flags &= ~TDF_RUNQ;
94 TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq);
95 /* runqmask is passively cleaned up by the switcher */
96 }
97}
98
99static __inline
100void
101_lwkt_enqueue(thread_t td)
102{
103 if ((td->td_flags & TDF_RUNQ) == 0) {
104 int nq = td->td_pri & TDPRI_MASK;
105 struct globaldata *gd = mycpu;
106
107 td->td_flags |= TDF_RUNQ;
108 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq);
109 gd->gd_runqmask |= 1 << nq;
110#if 0
111 /*
112 * YYY needs cli/sti protection? gd_reqpri set by interrupt
113 * when made pending. need better mechanism.
114 */
115 if (gd->gd_reqpri < (td->td_pri & TDPRI_MASK))
116 gd->gd_reqpri = (td->td_pri & TDPRI_MASK);
117#endif
118 }
119}
120
121static __inline
122int
123_lwkt_wantresched(thread_t ntd, thread_t cur)
124{
125 return((ntd->td_pri & TDPRI_MASK) > (cur->td_pri & TDPRI_MASK));
126}
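
/*
 * Illustrative sketch (kept out of compilation): how td_pri is interpreted
 * by the helpers above.  The low bits (TDPRI_MASK) select the per-cpu run
 * queue, while critical section nesting is carried on top in units of
 * TDPRI_CRIT.  The function below is hypothetical and exists only to show
 * the decoding.
 */
#if 0
static void
example_decode_pri(thread_t td)
{
	int nq = td->td_pri & TDPRI_MASK;	 /* run queue index */
	int critbits = td->td_pri & ~TDPRI_MASK; /* crit nesting, TDPRI_CRIT units */

	printf("thread %p: runq %d crit %d\n", td, nq, critbits / TDPRI_CRIT);
}
#endif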
127
128/*
129 * LWKTs operate on a per-cpu basis
130 *
131 * WARNING! Called from early boot, 'mycpu' may not work yet.
132 */
133void
134lwkt_gdinit(struct globaldata *gd)
135{
136 int i;
137
138 for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i)
139 TAILQ_INIT(&gd->gd_tdrunq[i]);
140 gd->gd_runqmask = 0;
141 TAILQ_INIT(&gd->gd_tdallq);
142}
143
144/*
145 * Initialize a thread wait structure prior to first use.
146 *
147 * NOTE! called from low level boot code, we cannot do anything fancy!
148 */
149void
150lwkt_init_wait(lwkt_wait_t w)
151{
152 TAILQ_INIT(&w->wa_waitq);
153}
154
155/*
156 * Create a new thread. The thread must be associated with a process context
157 * or LWKT start address before it can be scheduled.
158 *
159 * If you intend to create a thread without a process context this function
160 * does everything except load the startup and switcher function.
161 */
162thread_t
163lwkt_alloc_thread(struct thread *td)
164{
165 void *stack;
166 int flags = 0;
167
168 if (td == NULL) {
169 crit_enter();
170 if (mycpu->gd_tdfreecount > 0) {
171 --mycpu->gd_tdfreecount;
172 td = TAILQ_FIRST(&mycpu->gd_tdfreeq);
173 KASSERT(td != NULL && (td->td_flags & TDF_RUNNING) == 0,
174 ("lwkt_alloc_thread: unexpected NULL or corrupted td"));
175 TAILQ_REMOVE(&mycpu->gd_tdfreeq, td, td_threadq);
176 crit_exit();
177 stack = td->td_kstack;
178 flags = td->td_flags & (TDF_ALLOCATED_STACK|TDF_ALLOCATED_THREAD);
179 } else {
180 crit_exit();
181 td = zalloc(thread_zone);
182 td->td_kstack = NULL;
183 flags |= TDF_ALLOCATED_THREAD;
184 }
185 }
186 if ((stack = td->td_kstack) == NULL) {
187 stack = (void *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE);
188 flags |= TDF_ALLOCATED_STACK;
189 }
190 lwkt_init_thread(td, stack, flags, mycpu);
191 return(td);
192}
193
194/*
195 * Initialize a preexisting thread structure. This function is used by
196 * lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
197 *
198 * NOTE! called from low level boot code, we cannot do anything fancy!
199 */
200void
201lwkt_init_thread(thread_t td, void *stack, int flags, struct globaldata *gd)
202{
203 bzero(td, sizeof(struct thread));
204 td->td_kstack = stack;
205 td->td_flags |= flags;
206 td->td_gd = gd;
207 td->td_pri = TDPRI_CRIT;
208 td->td_cpu = gd->gd_cpuid; /* YYY don't really need this if have td_gd */
209 pmap_init_thread(td);
210 crit_enter();
211 TAILQ_INSERT_TAIL(&mycpu->gd_tdallq, td, td_allq);
212 crit_exit();
213}
214
215void
216lwkt_set_comm(thread_t td, const char *ctl, ...)
217{
218 va_list va;
219
220 va_start(va, ctl);
221 vsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
222 va_end(va);
223}
224
225void
226lwkt_hold(thread_t td)
227{
228 ++td->td_refs;
229}
230
231void
232lwkt_rele(thread_t td)
233{
234 KKASSERT(td->td_refs > 0);
235 --td->td_refs;
236}
237
238void
239lwkt_wait_free(thread_t td)
240{
241 while (td->td_refs)
242 tsleep(td, PWAIT, "tdreap", hz);
243}
244
245void
246lwkt_free_thread(thread_t td)
247{
248 struct globaldata *gd = mycpu;
249
250 KASSERT((td->td_flags & TDF_RUNNING) == 0,
251 ("lwkt_free_thread: did not exit! %p", td));
252
253 crit_enter();
254 TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
255 if (gd->gd_tdfreecount < CACHE_NTHREADS &&
256 (td->td_flags & TDF_ALLOCATED_THREAD)
257 ) {
258 ++gd->gd_tdfreecount;
259 TAILQ_INSERT_HEAD(&gd->gd_tdfreeq, td, td_threadq);
260 crit_exit();
261 } else {
262 crit_exit();
263 if (td->td_kstack && (td->td_flags & TDF_ALLOCATED_STACK)) {
264 kmem_free(kernel_map,
265 (vm_offset_t)td->td_kstack, UPAGES * PAGE_SIZE);
266 /* gd invalid */
267 td->td_kstack = NULL;
268 }
269 if (td->td_flags & TDF_ALLOCATED_THREAD)
270 zfree(thread_zone, td);
271 }
272}
273
274
275/*
276 * Switch to the next runnable lwkt. If no LWKTs are runnable then
277 * switch to the idlethread. Switching must occur within a critical
278 * section to avoid races with the scheduling queue.
279 *
280 * We always have full control over our cpu's run queue. Other cpus
281 * that wish to manipulate our queue must use the cpu_*msg() calls to
282 * talk to our cpu, so a critical section is all that is needed and
283 * the result is very, very fast thread switching.
284 *
285 * The LWKT scheduler uses a fixed priority model and round-robins at
286 * each priority level. User process scheduling is a totally
287 * different beast and LWKT priorities should not be confused with
288 * user process priorities.
289 *
290 * The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch()
291 * cleans it up. Note that the td_switch() function cannot do anything that
292 * requires the MP lock since the MP lock will have already been setup for
293 * the target thread (not the current thread).
294 */
295
296void
297lwkt_switch(void)
298{
299 struct globaldata *gd;
300 thread_t td = curthread;
301 thread_t ntd;
302#ifdef SMP
303 int mpheld;
304#endif
305
306 if (mycpu->gd_intr_nesting_level &&
307 td->td_preempted == NULL && panicstr == NULL
308 ) {
309 panic("lwkt_switch: cannot switch from within an interrupt, yet\n");
310 }
311
312 /*
313 * Passive release (used to transition from user to kernel mode
314 * when we block or switch rather than when we enter the kernel).
315 * This function is NOT called if we are switching into a preemption
316 * or returning from a preemption. Typically this causes us to lose
317 * our P_CURPROC designation (if we have one) and become a true LWKT
318 * thread, and may also hand P_CURPROC to another process and schedule
319 * its thread.
320 */
321 if (td->td_release)
322 td->td_release(td);
323
324 crit_enter();
325 ++switch_count;
326
327#ifdef SMP
328 /*
329 * td_mpcount cannot be used to determine if we currently hold the
330 * MP lock because get_mplock() will increment it prior to attempting
331 * to get the lock, and switch out if it can't. Look at the actual lock.
332 */
333 mpheld = MP_LOCK_HELD();
334#endif
335 if ((ntd = td->td_preempted) != NULL) {
336 /*
337 * We had preempted another thread on this cpu, resume the preempted
338 * thread. This occurs transparently, whether the preempted thread
339 * was scheduled or not (it may have been preempted after descheduling
340 * itself).
341 *
342 * We have to set up the MP lock for the original thread after backing
343 * out the adjustment that was made to curthread when the original
344 * was preempted.
345 */
346 KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
347#ifdef SMP
348 if (ntd->td_mpcount && mpheld == 0) {
349 panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d\n",
350 td, ntd, td->td_mpcount, ntd->td_mpcount);
351 }
352 if (ntd->td_mpcount) {
353 td->td_mpcount -= ntd->td_mpcount;
354 KKASSERT(td->td_mpcount >= 0);
355 }
356#endif
357 ntd->td_flags |= TDF_PREEMPT_DONE;
358 /* YYY release mp lock on switchback if original doesn't need it */
359 } else {
360 /*
361 * Priority queue / round-robin at each priority. Note that user
362 * processes run at a fixed, low priority and the user process
363 * scheduler deals with interactions between user processes
364 * by scheduling and descheduling them from the LWKT queue as
365 * necessary.
366 *
367 * We have to adjust the MP lock for the target thread. If we
368 * need the MP lock and cannot obtain it we try to locate a
369 * thread that does not need the MP lock.
370 */
371 gd = mycpu;
372again:
373 if (gd->gd_runqmask) {
374 int nq = bsrl(gd->gd_runqmask);
375 if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) {
376 gd->gd_runqmask &= ~(1 << nq);
377 goto again;
378 }
379#ifdef SMP
380 if (ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) {
381 /*
382 * Target needs MP lock and we couldn't get it, try
383 * to locate a thread which does not need the MP lock
384 * to run. If we cannot locate such a thread, spin in idle.
385 */
386 u_int32_t rqmask = gd->gd_runqmask;
387 while (rqmask) {
388 TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) {
389 if (ntd->td_mpcount == 0)
390 break;
391 }
392 if (ntd)
393 break;
394 rqmask &= ~(1 << nq);
395 nq = bsrl(rqmask);
396 }
397 if (ntd == NULL) {
398 ntd = &gd->gd_idlethread;
399 ntd->td_flags |= TDF_IDLE_NOHLT;
400 } else {
401 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
402 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
403 }
404 } else {
405 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
406 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
407 }
408#else
409 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
410 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
411#endif
412 } else {
413 /*
414 * Nothing to run, but we may still need the BGL to deal with
415 * pending interrupts; spin in idle if so.
416 */
417 ntd = &gd->gd_idlethread;
418 if (gd->gd_reqpri)
419 ntd->td_flags |= TDF_IDLE_NOHLT;
420 }
421 }
422 KASSERT(ntd->td_pri >= TDPRI_CRIT,
423 ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
424
425 /*
426 * Do the actual switch. If the new target does not need the MP lock
427 * and we are holding it, release the MP lock. If the new target requires
428 * the MP lock we have already acquired it for the target.
429 */
430#ifdef SMP
431 if (ntd->td_mpcount == 0 ) {
432 if (MP_LOCK_HELD())
433 cpu_rel_mplock();
434 } else {
435 ASSERT_MP_LOCK_HELD();
436 }
437#endif
438 if (td != ntd) {
439 td->td_switch(ntd);
440 }
441
442 crit_exit();
443}
444
445/*
446 * Switch if another thread has a higher priority. Do not switch to other
447 * threads at the same priority.
448 */
449void
450lwkt_maybe_switch()
451{
452 struct globaldata *gd = mycpu;
453 struct thread *td = gd->gd_curthread;
454
455 if ((td->td_pri & TDPRI_MASK) < bsrl(gd->gd_runqmask)) {
456 lwkt_switch();
457 }
458}
459
460/*
461 * Request that the target thread preempt the current thread. Preemption
462 * only works under a specific set of conditions:
463 *
464 * - We are not preempting ourselves
465 * - The target thread is owned by the current cpu
466 * - We are not currently being preempted
467 * - The target is not currently being preempted
468 * - We are able to satisfy the target's MP lock requirements (if any).
469 *
470 * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
471 * this is called via lwkt_schedule() through the td_preemptable callback.
472 * critpri is the managed critical priority that we should ignore in order
473 * to determine whether preemption is possible (aka usually just the crit
474 * priority of lwkt_schedule() itself).
475 *
476 * XXX at the moment we run the target thread in a critical section during
477 * the preemption in order to prevent the target from taking interrupts
478 * that *WE* can't. Preemption is strictly limited to interrupt threads
479 * and interrupt-like threads, outside of a critical section, and the
480 * preempted source thread will be resumed the instant the target blocks
481 * whether or not the source is scheduled (i.e. preemption is supposed to
482 * be as transparent as possible).
483 *
484 * The target thread inherits our MP count (added to its own) for the
485 * duration of the preemption in order to preserve the atomicity of the
486 * MP lock during the preemption. Therefore, any preempting targets must be
487 * careful with regard to MP assertions. Note that the MP count may be
488 * out of sync with the physical mp_lock. If we preempt we have to preserve
489 * the expected situation.
490 */
491void
492lwkt_preempt(thread_t ntd, int critpri)
493{
494 thread_t td = curthread;
495#ifdef SMP
496 int mpheld;
497 int savecnt;
498#endif
499
500 /*
501 * The caller has put us in a critical section. We can only preempt
502 * if the caller of the caller was not in a critical section (basically
503 * a local interrupt), as determined by the 'critpri' parameter. If
504 * we are unable to preempt we simply return; the target stays scheduled.
505 *
506 * YYY The target thread must be in a critical section (else it must
507 * inherit our critical section? I dunno yet).
508 */
509 KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri));
510
511 need_resched();
512 if (!_lwkt_wantresched(ntd, td)) {
513 ++preempt_miss;
514 return;
515 }
516 if ((td->td_pri & ~TDPRI_MASK) > critpri) {
517 ++preempt_miss;
518 return;
519 }
520#ifdef SMP
521 if (ntd->td_cpu != mycpu->gd_cpuid) {
522 ++preempt_miss;
523 return;
524 }
525#endif
526 if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) {
527 ++preempt_weird;
528 return;
529 }
530 if (ntd->td_preempted) {
531 ++preempt_hit;
532 return;
533 }
534#ifdef SMP
535 /*
536 * note: an interrupt might have occurred just as we were transitioning
537 * to the MP lock. In this case td_mpcount will be pre-disposed but
538 * not actually synchronized with the actual state of the lock. We
539 * can use it to imply an MP lock requirement for the preemption but
540 * we cannot use it to test whether we hold the MP lock or not.
541 */
542 mpheld = MP_LOCK_HELD();
543 if (mpheld && td->td_mpcount == 0)
544 panic("lwkt_preempt(): held and no count");
545 savecnt = td->td_mpcount;
546 ntd->td_mpcount += td->td_mpcount;
547 if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) {
548 ntd->td_mpcount -= td->td_mpcount;
549 ++preempt_miss;
550 return;
551 }
552#endif
553
554 ++preempt_hit;
555 ntd->td_preempted = td;
556 td->td_flags |= TDF_PREEMPT_LOCK;
557 td->td_switch(ntd);
558 KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
559#ifdef SMP
560 KKASSERT(savecnt == td->td_mpcount);
561 if (mpheld == 0 && MP_LOCK_HELD())
562 cpu_rel_mplock();
563 else if (mpheld && !MP_LOCK_HELD())
564 panic("lwkt_preempt(): MP lock was not held through");
565#endif
566 ntd->td_preempted = NULL;
567 td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE);
568}
569
570/*
571 * Yield our thread while higher priority threads are pending. This is
572 * typically called when we leave a critical section but it can be safely
573 * called while we are in a critical section.
574 *
575 * This function will not generally yield to equal priority threads but it
576 * can occur as a side effect. Note that lwkt_switch() is called from
577 * inside the critical section to prevent its own crit_exit() from reentering
578 * lwkt_yield_quick().
579 *
580 * gd_reqpri indicates that *something* changed, e.g. an interrupt or softint
581 * came along but was blocked and made pending.
582 *
583 * (self contained on a per cpu basis)
584 */
585void
586lwkt_yield_quick(void)
587{
588 thread_t td = curthread;
589
590 /*
591 * gd_reqpri is cleared in splz if the cpl is 0. If we were to clear
592 * it with a non-zero cpl then we might not wind up calling splz after
593 * a task switch when the critical section is exited even though the
594 * new task could accept the interrupt. YYY alternative is to have
595 * lwkt_switch() just call splz unconditionally.
596 *
597 * XXX from crit_exit() only called after last crit section is released.
598 * If called directly will run splz() even if in a critical section.
599 */
600 if ((td->td_pri & TDPRI_MASK) < mycpu->gd_reqpri) {
601 splz();
602 }
603
604 /*
605 * YYY enabling will cause wakeup() to task-switch, which really
606 * confused the old 4.x code. This is a good way to simulate
607 * preemption and MP without actually doing preemption or MP, because a
608 * lot of code assumes that wakeup() does not block.
609 */
610 if (untimely_switch && mycpu->gd_intr_nesting_level == 0) {
611 crit_enter();
612 /*
613 * YYY temporary hacks until we disassociate the userland scheduler
614 * from the LWKT scheduler.
615 */
616 if (td->td_flags & TDF_RUNQ) {
617 lwkt_switch(); /* will not reenter yield function */
618 } else {
619 lwkt_schedule_self(); /* make sure we are scheduled */
620 lwkt_switch(); /* will not reenter yield function */
621 lwkt_deschedule_self(); /* make sure we are descheduled */
622 }
623 crit_exit_noyield();
624 }
625}
626
627/*
628 * This implements a normal yield which, unlike _quick, will yield to equal
629 * priority threads as well. Note that gd_reqpri tests will be handled by
630 * the crit_exit() call in lwkt_switch().
631 *
632 * (self contained on a per cpu basis)
633 */
634void
635lwkt_yield(void)
636{
637 lwkt_schedule_self();
638 lwkt_switch();
639}
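
/*
 * Usage sketch (kept out of compilation): a cpu-bound kernel loop can call
 * lwkt_yield() periodically to let other runnable LWKTs, including those at
 * the same priority, get cpu time.  The work done inside the loop is
 * hypothetical.
 */
#if 0
static void
example_cpu_bound_loop(void)
{
	int i;

	for (i = 0; i < 1000000; ++i) {
		/* ... do one unit of work here (hypothetical) ... */
		if ((i & 255) == 0)
			lwkt_yield();	/* reschedule self and switch */
	}
}
#endif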
640
641/*
642 * Schedule a thread to run. As the current thread we can always safely
643 * schedule ourselves, and a shortcut procedure is provided for that
644 * function.
645 *
646 * (non-blocking, self contained on a per cpu basis)
647 */
648void
649lwkt_schedule_self(void)
650{
651 thread_t td = curthread;
652
653 crit_enter();
654 KASSERT(td->td_wait == NULL, ("lwkt_schedule_self(): td_wait not NULL!"));
655 _lwkt_enqueue(td);
656 if (td->td_proc && td->td_proc->p_stat == SSLEEP)
657 panic("SCHED SELF PANIC");
658 crit_exit();
659}
660
661/*
662 * Generic schedule. Possibly schedule threads belonging to other cpus and
663 * deal with threads that might be blocked on a wait queue.
664 *
665 * YYY this is one of the best places to implement load balancing code.
666 * Load balancing can be accomplished by requesting other sorts of actions
667 * for the thread in question.
668 */
669void
670lwkt_schedule(thread_t td)
671{
672#ifdef INVARIANTS
673 if ((td->td_flags & TDF_PREEMPT_LOCK) == 0 && td->td_proc
674 && td->td_proc->p_stat == SSLEEP
675 ) {
676 printf("PANIC schedule curtd = %p (%d %d) target %p (%d %d)\n",
677 curthread,
678 curthread->td_proc ? curthread->td_proc->p_pid : -1,
679 curthread->td_proc ? curthread->td_proc->p_stat : -1,
680 td,
681 td->td_proc ? td->td_proc->p_pid : -1,
682 td->td_proc ? td->td_proc->p_stat : -1
683 );
684 panic("SCHED PANIC");
685 }
686#endif
687 crit_enter();
688 if (td == curthread) {
689 _lwkt_enqueue(td);
690 } else {
691 lwkt_wait_t w;
692
693 /*
694 * If the thread is on a wait list we have to send our scheduling
695 * request to the owner of the wait structure. Otherwise we send
696 * the scheduling request to the cpu owning the thread. Races
697 * are ok, the target will forward the message as necessary (the
698 * message may chase the thread around before it finally gets
699 * acted upon).
700 *
701 * (remember, wait structures use stable storage)
702 */
703 if ((w = td->td_wait) != NULL) {
704 if (lwkt_trytoken(&w->wa_token)) {
705 TAILQ_REMOVE(&w->wa_waitq, td, td_threadq);
706 --w->wa_count;
707 td->td_wait = NULL;
708 if (td->td_cpu == mycpu->gd_cpuid) {
709 _lwkt_enqueue(td);
710 if (td->td_preemptable) {
711 td->td_preemptable(td, TDPRI_CRIT*2); /* YYY +token */
712 } else if (_lwkt_wantresched(td, curthread)) {
713 need_resched();
714 }
715 } else {
716 lwkt_send_ipiq(td->td_cpu, (ipifunc_t)lwkt_schedule, td);
717 }
718 lwkt_reltoken(&w->wa_token);
719 } else {
720 lwkt_send_ipiq(w->wa_token.t_cpu, (ipifunc_t)lwkt_schedule, td);
721 }
722 } else {
723 /*
724 * If the wait structure is NULL and we own the thread, there
725 * is no race (since we are in a critical section). If we
726 * do not own the thread there might be a race but the
727 * target cpu will deal with it.
728 */
729 if (td->td_cpu == mycpu->gd_cpuid) {
730 _lwkt_enqueue(td);
731 if (td->td_preemptable) {
732 td->td_preemptable(td, TDPRI_CRIT);
733 } else if (_lwkt_wantresched(td, curthread)) {
734 need_resched();
735 }
736 } else {
737 lwkt_send_ipiq(td->td_cpu, (ipifunc_t)lwkt_schedule, td);
738 }
739 }
740 }
741 crit_exit();
742}
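
/*
 * Usage sketch (kept out of compilation): waking a worker thread from
 * arbitrary kernel code.  lwkt_schedule() deals with the cross-cpu case
 * itself by forwarding the request via an IPI, so the caller does not need
 * to know which cpu owns the thread.  'example_worker_td' is hypothetical
 * and would be set up when the worker is created.
 */
#if 0
static thread_t example_worker_td;

static void
example_kick_worker(void)
{
	if (example_worker_td != NULL)
		lwkt_schedule(example_worker_td);
}
#endif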
743
744/*
745 * Managed acquisition. This code assumes that the MP lock is held for
746 * the tdallq operation and that the thread has been descheduled from its
747 * original cpu. We also have to wait for the thread to be entirely switched
748 * out on its original cpu (this is usually fast enough that we never loop)
749 * since the LWKT system does not have to hold the MP lock while switching
750 * and the target may have released it before switching.
751 */
752void
753lwkt_acquire(thread_t td)
754{
755 struct globaldata *gd;
756 int ocpu;
757
758 gd = td->td_gd;
759 KKASSERT((td->td_flags & TDF_RUNQ) == 0);
760 while (td->td_flags & TDF_RUNNING) /* XXX spin */
761 ;
762 if (gd != mycpu) {
763 ocpu = td->td_cpu;
764 crit_enter();
765 TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq); /* protected by BGL */
766 gd = mycpu;
767 td->td_gd = gd;
768 td->td_cpu = gd->gd_cpuid;
769 TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq); /* protected by BGL */
770 crit_exit();
771 }
772}
773
774/*
775 * Deschedule a thread.
776 *
777 * (non-blocking, self contained on a per cpu basis)
778 */
779void
780lwkt_deschedule_self(void)
781{
782 thread_t td = curthread;
783
784 crit_enter();
785 KASSERT(td->td_wait == NULL, ("lwkt_deschedule_self(): td_wait not NULL!"));
786 _lwkt_dequeue(td);
787 crit_exit();
788}
789
790/*
791 * Generic deschedule. Descheduling threads other than your own should be
792 * done only in carefully controlled circumstances. Descheduling is
793 * asynchronous.
794 *
795 * This function may block if the cpu has run out of messages.
796 */
797void
798lwkt_deschedule(thread_t td)
799{
800 crit_enter();
801 if (td == curthread) {
802 _lwkt_dequeue(td);
803 } else {
804 if (td->td_cpu == mycpu->gd_cpuid) {
805 _lwkt_dequeue(td);
806 } else {
807 lwkt_send_ipiq(td->td_cpu, (ipifunc_t)lwkt_deschedule, td);
808 }
809 }
810 crit_exit();
811}
812
813/*
814 * Set the target thread's priority. This routine does not automatically
815 * switch to a higher priority thread, LWKT threads are not designed for
816 * continuous priority changes. Yield if you want to switch.
817 *
818 * We have to retain the critical section count which uses the high bits
819 * of the td_pri field. The specified priority may also indicate zero or
820 * more critical sections by adding TDPRI_CRIT*N.
821 */
822void
823lwkt_setpri(thread_t td, int pri)
824{
825 KKASSERT(pri >= 0);
826 KKASSERT(td->td_cpu == mycpu->gd_cpuid);
827 crit_enter();
828 if (td->td_flags & TDF_RUNQ) {
829 _lwkt_dequeue(td);
830 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
831 _lwkt_enqueue(td);
832 } else {
833 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
834 }
835 crit_exit();
836}
837
838void
839lwkt_setpri_self(int pri)
840{
841 thread_t td = curthread;
842
843 KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
844 crit_enter();
845 if (td->td_flags & TDF_RUNQ) {
846 _lwkt_dequeue(td);
847 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
848 _lwkt_enqueue(td);
849 } else {
850 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
851 }
852 crit_exit();
853}
854
855struct proc *
856lwkt_preempted_proc(void)
857{
858 thread_t td = curthread;
859 while (td->td_preempted)
860 td = td->td_preempted;
861 return(td->td_proc);
862}
863
864
865/*
866 * This function deschedules the current thread and blocks on the specified
867 * wait queue. We obtain ownership of the wait queue in order to block
868 * on it. A generation number is used to interlock the wait queue in case
869 * it gets signalled while we are blocked waiting on the token.
870 *
871 * Note: alternatively we could dequeue our thread and then message the
872 * target cpu owning the wait queue. YYY implement as sysctl.
873 *
874 * Note: wait queue signals normally ping-pong the cpu as an optimization.
875 */
876typedef struct lwkt_gettoken_req {
877 lwkt_token_t tok;
878 int cpu;
879} lwkt_gettoken_req;
880
881void
882lwkt_block(lwkt_wait_t w, const char *wmesg, int *gen)
883{
884 thread_t td = curthread;
885
886 lwkt_gettoken(&w->wa_token);
887 if (w->wa_gen == *gen) {
888 _lwkt_dequeue(td);
889 TAILQ_INSERT_TAIL(&w->wa_waitq, td, td_threadq);
890 ++w->wa_count;
891 td->td_wait = w;
892 td->td_wmesg = wmesg;
893 lwkt_switch();
894 }
895 /* token might be lost, doesn't matter for gen update */
896 *gen = w->wa_gen;
897 lwkt_reltoken(&w->wa_token);
898}
899
900/*
901 * Signal a wait queue. We gain ownership of the wait queue in order to
902 * signal it. Once a thread is removed from the wait queue we have to
903 * deal with the cpu owning the thread.
904 *
905 * Note: alternatively we could message the target cpu owning the wait
906 * queue. YYY implement as sysctl.
907 */
908void
909lwkt_signal(lwkt_wait_t w)
910{
911 thread_t td;
912 int count;
913
914 lwkt_gettoken(&w->wa_token);
915 ++w->wa_gen;
916 count = w->wa_count;
917 while ((td = TAILQ_FIRST(&w->wa_waitq)) != NULL && count) {
918 --count;
919 --w->wa_count;
920 TAILQ_REMOVE(&w->wa_waitq, td, td_threadq);
921 td->td_wait = NULL;
922 td->td_wmesg = NULL;
923 if (td->td_cpu == mycpu->gd_cpuid) {
924 _lwkt_enqueue(td);
925 } else {
926 lwkt_send_ipiq(td->td_cpu, (ipifunc_t)lwkt_schedule, td);
927 }
928 lwkt_regettoken(&w->wa_token);
929 }
930 lwkt_reltoken(&w->wa_token);
931}
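
/*
 * Usage sketch (kept out of compilation): a simple condition wait built on
 * lwkt_block()/lwkt_signal().  The generation number passed through 'gen'
 * interlocks the wait against a signal that arrives while we are acquiring
 * the wait token, so the caller simply re-tests its predicate in a loop.
 * 'example_wait' and 'example_ready' are hypothetical; example_wait is
 * assumed to have been initialized with lwkt_init_wait().
 */
#if 0
static struct lwkt_wait example_wait;
static int example_ready;

static void
example_consumer(void)
{
	int gen = example_wait.wa_gen;	/* snapshot before testing predicate */

	while (example_ready == 0)
		lwkt_block(&example_wait, "exwait", &gen);
	example_ready = 0;
}

static void
example_producer(void)
{
	example_ready = 1;
	lwkt_signal(&example_wait);	/* bumps wa_gen and wakes waiters */
}
#endif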
932
933/*
934 * Acquire ownership of a token
935 *
936 * Acquire ownership of a token. The token may have spl and/or critical
937 * section side effects, depending on its purpose. These side effects
938 * guarantee that you will maintain ownership of the token as long as you
939 * do not block. If you block you may lose access to the token (but you
940 * must still release it even if you lose your access to it).
941 *
942 * YYY for now we use a critical section to prevent IPIs from taking away
943 * a token, but do we really only need to disable IPIs ?
944 *
945 * YYY certain tokens could be made to act like mutexes when performance
946 * would be better (e.g. t_cpu == -1). This is not yet implemented.
947 *
948 * YYY the tokens replace 4.x's simplelocks for the most part, but 4.x
949 * code does not expect a switch, so for now we cannot switch while
950 * waiting for an IPI to be returned.
951 *
952 * YYY If the token is owned by another cpu we may have to send an IPI to
953 * it and then block. The IPI causes the token to be given away to the
954 * requesting cpu, unless it has already changed hands. Since only the
955 * current cpu can give away a token it owns we do not need a memory barrier.
956 * This needs serious optimization.
957 */
958
959#ifdef SMP
960
961static
962void
963lwkt_gettoken_remote(void *arg)
964{
965 lwkt_gettoken_req *req = arg;
966 if (req->tok->t_cpu == mycpu->gd_cpuid) {
967 if (token_debug)
968 printf("GT(%d,%d) ", req->tok->t_cpu, req->cpu);
969 req->tok->t_cpu = req->cpu;
970 req->tok->t_reqcpu = req->cpu; /* YYY leave owned by target cpu */
971 /* else set reqcpu to point to current cpu for release */
972 }
973}
974
975#endif
976
977int
978lwkt_gettoken(lwkt_token_t tok)
979{
980 /*
981 * Prevent preemption so the token can't be taken away from us once
982 * we gain ownership of it. Use a synchronous request which might
983 * block. The request will be forwarded as necessary playing catchup
984 * to the token.
985 */
986
987 crit_enter();
988#ifdef INVARIANTS
989 if (curthread->td_pri > 2000) {
990 curthread->td_pri = 1000;
991 panic("too HIGH!");
992 }
993#endif
994#ifdef SMP
995 while (tok->t_cpu != mycpu->gd_cpuid) {
996 struct lwkt_gettoken_req req;
997 int seq;
998 int dcpu;
999
1000 req.cpu = mycpu->gd_cpuid;
1001 req.tok = tok;
1002 dcpu = (volatile int)tok->t_cpu;
1003 KKASSERT(dcpu >= 0 && dcpu < ncpus);
1004 if (token_debug)
1005 printf("REQT%d ", dcpu);
1006 seq = lwkt_send_ipiq(dcpu, lwkt_gettoken_remote, &req);
1007 lwkt_wait_ipiq(dcpu, seq);
1008 if (token_debug)
1009 printf("REQR%d ", tok->t_cpu);
1010 }
1011#endif
1012 /*
1013 * leave us in a critical section on return. This will be undone
1014 * by lwkt_reltoken(). Bump the generation number.
1015 */
1016 return(++tok->t_gen);
1017}
1018
1019/*
1020 * Attempt to acquire ownership of a token. Returns 1 on success, 0 on
1021 * failure.
1022 */
1023int
1024lwkt_trytoken(lwkt_token_t tok)
1025{
1026 crit_enter();
1027#ifdef SMP
1028 if (tok->t_cpu != mycpu->gd_cpuid) {
     crit_exit();	/* must not leak the critical section on failure */
1029 return(0);
1030 }
1031#endif
1032 /* leave us in the critical section */
1033 ++tok->t_gen;
1034 return(1);
1035}
1036
1037/*
1038 * Release your ownership of a token. Releases should eventually occur in
1039 * reverse order of acquisition so that priorities can be unwound properly,
1040 * like SPLs. At the moment the actual implementation doesn't care.
1041 *
1042 * We can safely hand a token that we own to another cpu without notifying
1043 * it, but once we do we can't get it back without requesting it (unless
1044 * the other cpu hands it back to us before we check).
1045 *
1046 * We might have lost the token, so check that.
1047 */
1048void
1049lwkt_reltoken(lwkt_token_t tok)
1050{
1051 if (tok->t_cpu == mycpu->gd_cpuid) {
1052 tok->t_cpu = tok->t_reqcpu;
1053 }
1054 crit_exit();
1055}
1056
1057/*
1058 * Reacquire a token that might have been lost and compare and update the
1059 * generation number. 0 is returned if the generation has not changed
1060 * (nobody else obtained the token while we were blocked, on this cpu or
1061 * any other cpu).
1062 *
1063 * This function returns with the token re-held whether the generation
1064 * number changed or not.
1065 */
1066int
1067lwkt_gentoken(lwkt_token_t tok, int *gen)
1068{
1069 if (lwkt_regettoken(tok) == *gen) {
1070 return(0);
1071 } else {
1072 *gen = tok->t_gen;
1073 return(-1);
1074 }
1075}
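
/*
 * Usage sketch (kept out of compilation): protecting per-subsystem state
 * with a token.  A holder that blocks (e.g. in tsleep()) may lose the token;
 * the generation number from lwkt_gettoken()/lwkt_gentoken() tells it whether
 * anyone else held the token in the meantime, in which case any cached view
 * of the protected state must be re-read.  'example_token' and
 * 'example_state' are hypothetical; the token is assumed to have been set up
 * with lwkt_inittoken().
 */
#if 0
static struct lwkt_token example_token;
static int example_state;		/* protected by example_token */

static void
example_token_user(void)
{
	int gen;
	int saved;

	gen = lwkt_gettoken(&example_token);
	saved = example_state;			/* snapshot protected state */
	tsleep(&example_token, PWAIT, "extok", hz); /* may lose the token */
	if (lwkt_gentoken(&example_token, &gen) != 0) {
		/*
		 * Somebody else held the token while we slept; our snapshot
		 * may be stale and must be re-read under the token.
		 */
		saved = example_state;
	}
	example_state = saved + 1;
	lwkt_reltoken(&example_token);
}
#endif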
1076
1077
1078/*
1079 * Re-acquire a token that might have been lost. Returns the generation
1080 * number of the token.
1081 */
1082int
1083lwkt_regettoken(lwkt_token_t tok)
1084{
1085 /* assert we are in a critical section */
1086 if (tok->t_cpu != mycpu->gd_cpuid) {
1087#ifdef SMP
1088 while (tok->t_cpu != mycpu->gd_cpuid) {
1089 struct lwkt_gettoken_req req;
1090 int seq;
1091 int dcpu;
1092
1093 req.cpu = mycpu->gd_cpuid;
1094 req.tok = tok;
1095 dcpu = (volatile int)tok->t_cpu;
1096 KKASSERT(dcpu >= 0 && dcpu < ncpus);
1097 if (token_debug)
1098 printf("REQT%d ", dcpu);
1099 seq = lwkt_send_ipiq(dcpu, lwkt_gettoken_remote, &req);
1100 lwkt_wait_ipiq(dcpu, seq);
1101 if (token_debug)
1102 printf("REQR%d ", tok->t_cpu);
1103 }
1104#endif
1105 ++tok->t_gen;
1106 }
1107 return(tok->t_gen);
1108}
1109
1110void
1111lwkt_inittoken(lwkt_token_t tok)
1112{
1113 /*
1114 * Zero structure and set cpu owner and reqcpu to cpu 0.
1115 */
1116 bzero(tok, sizeof(*tok));
1117}
1118
1119/*
1120 * Create a kernel process/thread/whatever. It shares its address space
1121 * with proc0, i.e. it is kernel-only.
1122 *
1124 *
1125 * The thread will be entered with the MP lock held.
1126 */
1127int
1128lwkt_create(void (*func)(void *), void *arg,
1129 struct thread **tdp, thread_t template, int tdflags,
1130 const char *fmt, ...)
1131{
1132 thread_t td;
1133 va_list ap;
1134
1135 td = lwkt_alloc_thread(template);
1136 if (tdp)
1137 *tdp = td;
1138 cpu_set_thread_handler(td, kthread_exit, func, arg);
1139 td->td_flags |= TDF_VERBOSE | tdflags;
1140#ifdef SMP
1141 td->td_mpcount = 1;
1142#endif
1143
1144 /*
1145 * Set up arg0 for 'ps' etc
1146 */
1147 va_start(ap, fmt);
1148 vsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
1149 va_end(ap);
1150
1151 /*
1152 * Schedule the thread to run
1153 */
1154 if ((td->td_flags & TDF_STOPREQ) == 0)
1155 lwkt_schedule(td);
1156 else
1157 td->td_flags &= ~TDF_STOPREQ;
1158 return 0;
1159}
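
/*
 * Usage sketch (kept out of compilation): starting a simple kernel thread
 * with lwkt_create().  The names, the sleep interval, and the thread body
 * are hypothetical; a NULL template and 0 tdflags give the defaults used by
 * kthread_create() below.
 */
#if 0
static void
example_thread_main(void *arg)
{
	for (;;) {
		/* ... hypothetical periodic work ... */
		tsleep(curthread, PWAIT, "exidle", hz);
	}
}

static void
example_start_thread(void)
{
	thread_t td;

	lwkt_create(example_thread_main, NULL, &td, NULL, 0, "example");
}
#endif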
1160
1161/*
1162 * Destroy an LWKT thread. Warning! This function is not called when
1163 * a process exits; cpu_proc_exit() directly calls cpu_thread_exit() and
1164 * uses a different reaping mechanism.
1165 */
1166void
1167lwkt_exit(void)
1168{
1169 thread_t td = curthread;
1170
1171 if (td->td_flags & TDF_VERBOSE)
1172 printf("kthread %p %s has exited\n", td, td->td_comm);
1173 crit_enter();
1174 lwkt_deschedule_self();
1175 ++mycpu->gd_tdfreecount;
1176 TAILQ_INSERT_TAIL(&mycpu->gd_tdfreeq, td, td_threadq);
1177 cpu_thread_exit();
1178}
1179
1180/*
1181 * Create a kernel process/thread/whatever. It shares its address space
1182 * with proc0, i.e. it is kernel-only. 5.x compatible.
1183 */
1184int
1185kthread_create(void (*func)(void *), void *arg,
1186 struct thread **tdp, const char *fmt, ...)
1187{
1188 thread_t td;
1189 va_list ap;
1190
1191 td = lwkt_alloc_thread(NULL);
1192 if (tdp)
1193 *tdp = td;
1194 cpu_set_thread_handler(td, kthread_exit, func, arg);
1195 td->td_flags |= TDF_VERBOSE;
1196#ifdef SMP
1197 td->td_mpcount = 1;
1198#endif
1199
1200 /*
1201 * Set up arg0 for 'ps' etc
1202 */
1203 va_start(ap, fmt);
1204 vsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
1205 va_end(ap);
1206
1207 /*
1208 * Schedule the thread to run
1209 */
1210 lwkt_schedule(td);
1211 return 0;
1212}
1213
1214void
1215crit_panic(void)
1216{
1217 thread_t td = curthread;
1218 int lpri = td->td_pri;
1219
1220 td->td_pri = 0;
1221 panic("td_pri is/would-go negative! %p %d", td, lpri);
1222}
1223
1224/*
1225 * Destroy an LWKT thread. Warning! This function is not called when
1226 * a process exits; cpu_proc_exit() directly calls cpu_thread_exit() and
1227 * uses a different reaping mechanism.
1228 *
1229 * XXX duplicates lwkt_exit()
1230 */
1231void
1232kthread_exit(void)
1233{
1234 lwkt_exit();
1235}
1236
1237#ifdef SMP
1238
1239/*
1240 * Send a function execution request to another cpu. The request is queued
1241 * on the cpu<->cpu ipiq matrix. Each cpu owns a unique ipiq FIFO for every
1242 * possible target cpu. The FIFO is written only by the owning cpu and drained only by the target cpu.
1243 *
1244 * YYY If the FIFO fills up we have to enable interrupts and process the
1245 * IPIQ while waiting for it to empty or we may deadlock with another cpu.
1246 * Create a CPU_*() function to do this!
1247 *
1248 * Must be called from a critical section.
1249 */
1250int
1251lwkt_send_ipiq(int dcpu, ipifunc_t func, void *arg)
1252{
1253 lwkt_ipiq_t ip;
1254 int windex;
1255 struct globaldata *gd = mycpu;
1256
1257 if (dcpu == gd->gd_cpuid) {
1258 func(arg);
1259 return(0);
1260 }
1261 crit_enter();
1262 ++gd->gd_intr_nesting_level;
1263#ifdef INVARIANTS
1264 if (gd->gd_intr_nesting_level > 20)
1265 panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
1266#endif
1267 KKASSERT(curthread->td_pri >= TDPRI_CRIT);
1268 KKASSERT(dcpu >= 0 && dcpu < ncpus);
1269 ++ipiq_count;
1270 ip = &gd->gd_ipiq[dcpu];
1271
1272 /*
1273 * We always drain before the FIFO becomes full so it should never
1274 * become full. We need to leave enough entries to deal with
1275 * reentrancy.
1276 */
1277 KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO);
1278 windex = ip->ip_windex & MAXCPUFIFO_MASK;
1279 ip->ip_func[windex] = func;
1280 ip->ip_arg[windex] = arg;
1281 /* YYY memory barrier */
1282 ++ip->ip_windex;
1283 if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
1284 unsigned int eflags = read_eflags();
1285 cpu_enable_intr();
1286 ++ipiq_fifofull;
1287 while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
1288 KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
1289 lwkt_process_ipiq();
1290 }
1291 write_eflags(eflags);
1292 }
1293 --gd->gd_intr_nesting_level;
1294 cpu_send_ipiq(dcpu); /* issues memory barrier if appropriate */
1295 crit_exit();
1296 return(ip->ip_windex);
1297}
1298
1299/*
1300 * Send a message to several target cpus. Typically used for scheduling.
1301 */
1302void
1303lwkt_send_ipiq_mask(u_int32_t mask, ipifunc_t func, void *arg)
1304{
1305 int cpuid;
1306
1307 while (mask) {
1308 cpuid = bsfl(mask);
1309 lwkt_send_ipiq(cpuid, func, arg);
1310 mask &= ~(1 << cpuid);
1311 }
1312}
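
/*
 * Usage sketch (kept out of compilation): running a small function on
 * another cpu and waiting for it to complete, per the critical-section
 * requirements of lwkt_send_ipiq() and lwkt_wait_ipiq().
 * 'example_remote_func' and 'example_run_remote' are hypothetical.
 */
#if 0
static void
example_remote_func(void *arg)
{
	++*(int *)arg;		/* runs on the target cpu */
}

static void
example_run_remote(int dcpu, int *counter)
{
	int seq;

	crit_enter();
	seq = lwkt_send_ipiq(dcpu, example_remote_func, counter);
	lwkt_wait_ipiq(dcpu, seq);	/* spin until the target has run it */
	crit_exit();
}
#endif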
1313
1314/*
1315 * Wait for the remote cpu to finish processing a function.
1316 *
1317 * YYY we have to enable interrupts and process the IPIQ while waiting
1318 * for it to empty or we may deadlock with another cpu. Create a CPU_*()
1319 * function to do this! YYY we really should 'block' here.
1320 *
1321 * Must be called from a critical section. This routine may be called
1322 * from an interrupt (for example, if an interrupt wakes a foreign thread
1323 * up).
1324 */
1325void
1326lwkt_wait_ipiq(int dcpu, int seq)
1327{
1328 lwkt_ipiq_t ip;
1329 int maxc = 100000000;
1330
1331 if (dcpu != mycpu->gd_cpuid) {
1332 KKASSERT(dcpu >= 0 && dcpu < ncpus);
1333 ip = &mycpu->gd_ipiq[dcpu];
1334 if ((int)(ip->ip_xindex - seq) < 0) {
1335 unsigned int eflags = read_eflags();
1336 cpu_enable_intr();
1337 while ((int)(ip->ip_xindex - seq) < 0) {
1338 lwkt_process_ipiq();
1339 if (--maxc == 0)
1340 printf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n", mycpu->gd_cpuid, dcpu, ip->ip_xindex - seq);
1341 if (maxc < -1000000)
1342 panic("LWKT_WAIT_IPIQ");
1343 }
1344 write_eflags(eflags);
1345 }
1346 }
1347}
1348
1349/*
1350 * Called from IPI interrupt (like a fast interrupt), which has placed
1351 * us in a critical section. The MP lock may or may not be held.
1352 * May also be called from doreti or splz, or be reentrantly called
1353 * indirectly through the ip_func[] we run.
1354 */
1355void
1356lwkt_process_ipiq(void)
1357{
1358 int n;
1359 int cpuid = mycpu->gd_cpuid;
1360
1361 for (n = 0; n < ncpus; ++n) {
1362 lwkt_ipiq_t ip;
1363 int ri;
1364
1365 if (n == cpuid)
1366 continue;
1367 ip = globaldata_find(n)->gd_ipiq;
1368 if (ip == NULL)
1369 continue;
1370 ip = &ip[cpuid];
1371
1372 /*
1373 * Note: xindex is only updated after we are sure the function has
1374 * finished execution. Beware lwkt_process_ipiq() reentrancy! The
1375 * function may send an IPI which may block/drain.
1376 */
1377 while (ip->ip_rindex != ip->ip_windex) {
1378 ri = ip->ip_rindex & MAXCPUFIFO_MASK;
1379 ++ip->ip_rindex;
1380 ip->ip_func[ri](ip->ip_arg[ri]);
1381 /* YYY memory barrier */
1382 ip->ip_xindex = ip->ip_rindex;
1383 }
1384 }
1385}
1386
1387#else
1388
1389int
1390lwkt_send_ipiq(int dcpu, ipifunc_t func, void *arg)
1391{
1392 panic("lwkt_send_ipiq: UP box! (%d,%p,%p)", dcpu, func, arg);
1393 return(0); /* NOT REACHED */
1394}
1395
1396void
1397lwkt_wait_ipiq(int dcpu, int seq)
1398{
1399 panic("lwkt_wait_ipiq: UP box! (%d,%d)", dcpu, seq);
1400}
1401
1402#endif