The syncer is not a process any more, deal with it as a thread.
[dragonfly.git] / sys / kern / lwkt_thread.c
1/*
2 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * Each cpu in a system has its own self-contained light weight kernel
27 * thread scheduler, which means that generally speaking we only need
28 * to use a critical section to avoid problems. Foreign thread
29 * scheduling is queued via (async) IPIs.
30 *
31 * $DragonFly: src/sys/kern/lwkt_thread.c,v 1.17 2003/07/08 09:57:13 dillon Exp $
32 */
33
34#include <sys/param.h>
35#include <sys/systm.h>
36#include <sys/kernel.h>
37#include <sys/proc.h>
38#include <sys/rtprio.h>
39#include <sys/queue.h>
40#include <sys/thread2.h>
41#include <sys/sysctl.h>
42#include <sys/kthread.h>
43#include <machine/cpu.h>
44#include <sys/lock.h>
45
46#include <vm/vm.h>
47#include <vm/vm_param.h>
48#include <vm/vm_kern.h>
49#include <vm/vm_object.h>
50#include <vm/vm_page.h>
51#include <vm/vm_map.h>
52#include <vm/vm_pager.h>
53#include <vm/vm_extern.h>
54#include <vm/vm_zone.h>
55
56#include <machine/stdarg.h>
57#include <machine/ipl.h>
58#ifdef SMP
59#include <machine/smp.h>
60#endif
61
62static int untimely_switch = 0;
63SYSCTL_INT(_lwkt, OID_AUTO, untimely_switch, CTLFLAG_RW, &untimely_switch, 0, "");
64#ifdef INVARIANTS
65static int token_debug = 0;
66SYSCTL_INT(_lwkt, OID_AUTO, token_debug, CTLFLAG_RW, &token_debug, 0, "");
67#endif
68static quad_t switch_count = 0;
69SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, "");
70static quad_t preempt_hit = 0;
71SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, "");
72static quad_t preempt_miss = 0;
73SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, "");
74static quad_t preempt_weird = 0;
75SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, "");
76static quad_t ipiq_count = 0;
77SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_count, CTLFLAG_RW, &ipiq_count, 0, "");
78static quad_t ipiq_fifofull = 0;
79SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_fifofull, CTLFLAG_RW, &ipiq_fifofull, 0, "");
80
81/*
82 * These helper procedures handle the runq; they can only be called from
83 * within a critical section.
84 */
85static __inline
86void
87_lwkt_dequeue(thread_t td)
88{
89 if (td->td_flags & TDF_RUNQ) {
90 int nq = td->td_pri & TDPRI_MASK;
91 struct globaldata *gd = mycpu;
92
93 td->td_flags &= ~TDF_RUNQ;
94 TAILQ_REMOVE(&gd->gd_tdrunq[nq], td, td_threadq);
95 /* runqmask is passively cleaned up by the switcher */
96 }
97}
98
99static __inline
100void
101_lwkt_enqueue(thread_t td)
102{
103 if ((td->td_flags & TDF_RUNQ) == 0) {
104 int nq = td->td_pri & TDPRI_MASK;
105 struct globaldata *gd = mycpu;
106
107 td->td_flags |= TDF_RUNQ;
108 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], td, td_threadq);
109 gd->gd_runqmask |= 1 << nq;
110#if 0
111 /*
112 * YYY needs cli/sti protection? gd_reqpri set by interrupt
113 * when made pending. need better mechanism.
114 */
115 if (gd->gd_reqpri < (td->td_pri & TDPRI_MASK))
116 gd->gd_reqpri = (td->td_pri & TDPRI_MASK);
117#endif
118 }
119}
120
121static __inline
122int
123_lwkt_wantresched(thread_t ntd, thread_t cur)
124{
125 return((ntd->td_pri & TDPRI_MASK) > (cur->td_pri & TDPRI_MASK));
126}
127
128/*
129 * LWKTs operate on a per-cpu basis
130 *
131 * WARNING! Called from early boot, 'mycpu' may not work yet.
132 */
133void
134lwkt_gdinit(struct globaldata *gd)
135{
136 int i;
137
138 for (i = 0; i < sizeof(gd->gd_tdrunq)/sizeof(gd->gd_tdrunq[0]); ++i)
139 TAILQ_INIT(&gd->gd_tdrunq[i]);
140 gd->gd_runqmask = 0;
141 TAILQ_INIT(&gd->gd_tdallq);
142}
143
144/*
145 * Initialize a thread wait structure prior to first use.
146 *
147 * NOTE! called from low level boot code, we cannot do anything fancy!
148 */
149void
150lwkt_init_wait(lwkt_wait_t w)
151{
152 TAILQ_INIT(&w->wa_waitq);
153}
154
155/*
156 * Create a new thread. The thread must be associated with a process context
157 * or LWKT start address before it can be scheduled.
158 *
159 * If you intend to create a thread without a process context this function
160 * does everything except load the startup and switcher function.
161 */
162thread_t
163lwkt_alloc_thread(struct thread *td)
164{
165 void *stack;
166 int flags = 0;
167
168 if (td == NULL) {
169 crit_enter();
170 if (mycpu->gd_tdfreecount > 0) {
171 --mycpu->gd_tdfreecount;
172 td = TAILQ_FIRST(&mycpu->gd_tdfreeq);
173 KASSERT(td != NULL && (td->td_flags & TDF_EXITED),
174 ("lwkt_alloc_thread: unexpected NULL or corrupted td"));
175 TAILQ_REMOVE(&mycpu->gd_tdfreeq, td, td_threadq);
176 crit_exit();
177 stack = td->td_kstack;
178 flags = td->td_flags & (TDF_ALLOCATED_STACK|TDF_ALLOCATED_THREAD);
179 } else {
180 crit_exit();
181 td = zalloc(thread_zone);
182 td->td_kstack = NULL;
183 flags |= TDF_ALLOCATED_THREAD;
184 }
185 }
186 if ((stack = td->td_kstack) == NULL) {
187 stack = (void *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE);
188 flags |= TDF_ALLOCATED_STACK;
189 }
190 lwkt_init_thread(td, stack, flags, mycpu);
191 return(td);
192}
193
194/*
195 * Initialize a preexisting thread structure. This function is used by
196 * lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
197 *
198 * NOTE! called from low level boot code, we cannot do anything fancy!
199 */
200void
201lwkt_init_thread(thread_t td, void *stack, int flags, struct globaldata *gd)
202{
203 bzero(td, sizeof(struct thread));
204 td->td_kstack = stack;
205 td->td_flags |= flags;
206 td->td_gd = gd;
207 td->td_pri = TDPRI_CRIT;
208 td->td_cpu = gd->gd_cpuid; /* YYY don't need this if have td_gd */
209 pmap_init_thread(td);
210 crit_enter();
211 TAILQ_INSERT_TAIL(&mycpu->gd_tdallq, td, td_allq);
212 crit_exit();
213}
214
215void
216lwkt_set_comm(thread_t td, const char *ctl, ...)
217{
218 va_list va;
219
220 va_start(va, ctl);
221 vsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
222 va_end(va);
223}
224
225void
226lwkt_hold(thread_t td)
227{
228 ++td->td_refs;
229}
230
231void
232lwkt_rele(thread_t td)
233{
234 KKASSERT(td->td_refs > 0);
235 --td->td_refs;
236}
237
238void
239lwkt_wait_free(thread_t td)
240{
241 while (td->td_refs)
242 tsleep(td, PWAIT, "tdreap", hz);
243}
244
245void
246lwkt_free_thread(thread_t td)
247{
248 struct globaldata *gd = mycpu;
249
250 KASSERT(td->td_flags & TDF_EXITED,
251 ("lwkt_free_thread: did not exit! %p", td));
252
253 crit_enter();
254 TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
255 if (gd->gd_tdfreecount < CACHE_NTHREADS &&
256 (td->td_flags & TDF_ALLOCATED_THREAD)
257 ) {
258 ++gd->gd_tdfreecount;
259 TAILQ_INSERT_HEAD(&gd->gd_tdfreeq, td, td_threadq);
260 crit_exit();
261 } else {
262 crit_exit();
263 if (td->td_kstack && (td->td_flags & TDF_ALLOCATED_STACK)) {
264 kmem_free(kernel_map,
265 (vm_offset_t)td->td_kstack, UPAGES * PAGE_SIZE);
266 /* gd invalid */
267 td->td_kstack = NULL;
268 }
269 if (td->td_flags & TDF_ALLOCATED_THREAD)
270 zfree(thread_zone, td);
271 }
272}
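/*
 * A minimal usage sketch for the reference counting and reaping helpers
 * above, kept under #if 0 so it does not affect the build.  The example_*
 * names are hypothetical; lwkt_free_thread() additionally requires that the
 * target thread has already set TDF_EXITED.
 */
#if 0
static void
example_borrow(thread_t td)
{
    lwkt_hold(td);		/* pin the thread structure */
    /* ... hand td to some other context ... */
    lwkt_rele(td);		/* drop the reference; lwkt_wait_free() polls */
}

static void
example_reap(thread_t td)
{
    lwkt_wait_free(td);		/* sleep until all lwkt_hold() refs are gone */
    lwkt_free_thread(td);	/* cache or free the stack and thread */
}
#endif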
273
274
275/*
276 * Switch to the next runnable lwkt. If no LWKTs are runnable then
277 * switch to the idlethread. Switching must occur within a critical
278 * section to avoid races with the scheduling queue.
279 *
280 * We always have full control over our cpu's run queue. Other cpus
281 * that wish to manipulate our queue must use the cpu_*msg() calls to
282 * talk to our cpu, so a critical section is all that is needed and
283 * the result is very, very fast thread switching.
284 *
285 * The LWKT scheduler uses a fixed priority model and round-robins at
286 * each priority level. User process scheduling is a totally
287 * different beast and LWKT priorities should not be confused with
288 * user process priorities.
289 *
290 * The MP lock may be out of sync with the thread's td_mpcount. lwkt_switch()
291 * cleans it up. Note that the td_switch() function cannot do anything that
292 * requires the MP lock since the MP lock will have already been setup for
293 * the target thread (not the current thread).
294 */
295
296void
297lwkt_switch(void)
298{
299 struct globaldata *gd;
300 thread_t td = curthread;
301 thread_t ntd;
302#ifdef SMP
303 int mpheld;
304#endif
305
306 if (mycpu->gd_intr_nesting_level &&
307 td->td_preempted == NULL && panicstr == NULL
308 ) {
309 panic("lwkt_switch: cannot switch from within an interrupt, yet\n");
310 }
311
312 crit_enter();
313 ++switch_count;
314
315#ifdef SMP
316 /*
317 * td_mpcount cannot be used to determine if we currently hold the
318 * MP lock because get_mplock() will increment it prior to attempting
319 * to get the lock, and switch out if it can't. Look at the actual lock.
320 */
321 mpheld = MP_LOCK_HELD();
322#endif
323 if ((ntd = td->td_preempted) != NULL) {
324 /*
325 * We had preempted another thread on this cpu, resume the preempted
326 * thread. This occurs transparently, whether the preempted thread
327 * was scheduled or not (it may have been preempted after descheduling
328 * itself).
329 *
330 * We have to setup the MP lock for the original thread after backing
331 * out the adjustment that was made to curthread when the original
332 * was preempted.
333 */
334 KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
335#ifdef SMP
336 if (ntd->td_mpcount && mpheld == 0) {
337 panic("MPLOCK NOT HELD ON RETURN: %p %p %d %d\n",
338 td, ntd, td->td_mpcount, ntd->td_mpcount);
339 }
340 if (ntd->td_mpcount) {
341 td->td_mpcount -= ntd->td_mpcount;
342 KKASSERT(td->td_mpcount >= 0);
343 }
344#endif
345 ntd->td_flags |= TDF_PREEMPT_DONE;
346 /* YYY release mp lock on switchback if original doesn't need it */
347 } else {
348 /*
349 * Priority queue / round-robin at each priority. Note that user
350 * processes run at a fixed, low priority and the user process
351 * scheduler deals with interactions between user processes
352 * by scheduling and descheduling them from the LWKT queue as
353 * necessary.
354 *
355 * We have to adjust the MP lock for the target thread. If we
356 * need the MP lock and cannot obtain it we try to locate a
357 * thread that does not need the MP lock.
358 */
359 gd = mycpu;
360again:
361 if (gd->gd_runqmask) {
362 int nq = bsrl(gd->gd_runqmask);
363 if ((ntd = TAILQ_FIRST(&gd->gd_tdrunq[nq])) == NULL) {
364 gd->gd_runqmask &= ~(1 << nq);
365 goto again;
366 }
367#ifdef SMP
368 if (ntd->td_mpcount && mpheld == 0 && !cpu_try_mplock()) {
369 /*
370 * Target needs MP lock and we couldn't get it, try
371 * to locate a thread which does not need the MP lock
372 * to run.
373 */
374 u_int32_t rqmask = gd->gd_runqmask;
375 while (rqmask) {
376 TAILQ_FOREACH(ntd, &gd->gd_tdrunq[nq], td_threadq) {
377 if (ntd->td_mpcount == 0)
378 break;
379 }
380 if (ntd)
381 break;
382 rqmask &= ~(1 << nq);
383 nq = bsrl(rqmask);
384 }
385 if (ntd == NULL) {
386 ntd = gd->gd_idletd;
387 } else {
388 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
389 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
390 }
391 } else {
392 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
393 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
394 }
395#else
396 TAILQ_REMOVE(&gd->gd_tdrunq[nq], ntd, td_threadq);
397 TAILQ_INSERT_TAIL(&gd->gd_tdrunq[nq], ntd, td_threadq);
398#endif
399 } else {
400 ntd = gd->gd_idletd;
401 }
402 }
403 KASSERT(ntd->td_pri >= TDPRI_CRIT,
404 ("priority problem in lwkt_switch %d %d", td->td_pri, ntd->td_pri));
405
406 /*
407 * Do the actual switch. If the new target does not need the MP lock
408 * and we are holding it, release the MP lock. If the new target requires
409 * the MP lock we have already acquired it for the target.
410 */
411#ifdef SMP
412 if (ntd->td_mpcount == 0 ) {
413 if (MP_LOCK_HELD())
414 cpu_rel_mplock();
415 } else {
416 ASSERT_MP_LOCK_HELD();
417 }
418#endif
419 if (td != ntd) {
420 td->td_switch(ntd);
421 }
422
423 crit_exit();
424}
425
426/*
427 * Request that the target thread preempt the current thread. Preemption
428 * only works under a specific set of conditions:
429 *
430 * - We are not preempting ourselves
431 * - The target thread is owned by the current cpu
432 * - We are not currently being preempted
433 * - The target is not currently being preempted
434 * - We are able to satisfy the target's MP lock requirements (if any).
435 *
436 * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically
437 * this is called via lwkt_schedule() through the td_preemptable callback.
438 * critpri is the managed critical priority that we should ignore in order
439 * to determine whether preemption is possible (aka usually just the crit
440 * priority of lwkt_schedule() itself).
441 *
442 * XXX at the moment we run the target thread in a critical section during
443 * the preemption in order to prevent the target from taking interrupts
444 * that *WE* can't. Preemption is strictly limited to interrupt threads
445 * and interrupt-like threads, outside of a critical section, and the
446 * preempted source thread will be resumed the instant the target blocks
447 * whether or not the source is scheduled (i.e. preemption is supposed to
448 * be as transparent as possible).
449 *
450 * The target thread inherits our MP count (added to its own) for the
451 * duration of the preemption in order to preserve the atomicity of the
452 * MP lock during the preemption. Therefore, any preempting targets must be
453 * careful in regards to MP assertions. Note that the MP count may be
454 * out of sync with the physical mp_lock. If we preempt we have to preserve
455 * the expected situation.
456 */
457void
458lwkt_preempt(thread_t ntd, int critpri)
459{
460 thread_t td = curthread;
461#ifdef SMP
462 int mpheld;
463 int savecnt;
464#endif
465
466 /*
467 * The caller has put us in a critical section. We can only preempt
468 * if the caller of the caller was not in a critical section (basically
469 * a local interrupt), as determined by the 'critpri' parameter. If we
470 * are unable to preempt we count it as a miss and at most set need_resched().
471 *
472 * YYY The target thread must be in a critical section (else it must
473 * inherit our critical section? I dunno yet).
474 */
475 KASSERT(ntd->td_pri >= TDPRI_CRIT, ("BADCRIT0 %d", ntd->td_pri));
476
477 if (!_lwkt_wantresched(ntd, td)) {
478 ++preempt_miss;
479 return;
480 }
481 if ((td->td_pri & ~TDPRI_MASK) > critpri) {
482 ++preempt_miss;
483 need_resched();
484 return;
485 }
486#ifdef SMP
487 if (ntd->td_cpu != mycpu->gd_cpuid) {
488 ++preempt_miss;
489 return;
490 }
491#endif
492 if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) {
493 ++preempt_weird;
494 need_resched();
495 return;
496 }
497 if (ntd->td_preempted) {
498 ++preempt_hit;
499 need_resched();
500 return;
501 }
502#ifdef SMP
503 mpheld = MP_LOCK_HELD();
504 if (mpheld && td->td_mpcount == 0)
505 panic("lwkt_preempt(): held and no count");
506 savecnt = td->td_mpcount;
507 ntd->td_mpcount += td->td_mpcount;
508 if (mpheld == 0 && ntd->td_mpcount && !cpu_try_mplock()) {
509 ntd->td_mpcount -= td->td_mpcount;
510 ++preempt_miss;
511 need_resched();
512 return;
513 }
514#endif
515
516 ++preempt_hit;
517 ntd->td_preempted = td;
518 td->td_flags |= TDF_PREEMPT_LOCK;
519 td->td_switch(ntd);
520 KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
521#ifdef SMP
522 KKASSERT(savecnt == td->td_mpcount);
523 if (mpheld == 0 && MP_LOCK_HELD())
524 cpu_rel_mplock();
525 else if (mpheld && !MP_LOCK_HELD())
526 panic("lwkt_preempt(): MP lock was not held through");
527#endif
528 ntd->td_preempted = NULL;
529 td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE);
530}
531
532/*
533 * Yield our thread while higher priority threads are pending. This is
534 * typically called when we leave a critical section but it can be safely
535 * called while we are in a critical section.
536 *
537 * This function will not generally yield to equal priority threads but it
538 * can occur as a side effect. Note that lwkt_switch() is called from
539 * inside the critical section to prevent its own crit_exit() from reentering
540 * lwkt_yield_quick().
541 *
542 * gd_reqpri indicates that *something* changed, e.g. an interrupt or softint
543 * came along but was blocked and made pending.
544 *
545 * (self contained on a per cpu basis)
546 */
547void
548lwkt_yield_quick(void)
549{
550 thread_t td = curthread;
551
552 if ((td->td_pri & TDPRI_MASK) < mycpu->gd_reqpri) {
553 mycpu->gd_reqpri = 0;
554 splz();
555 }
556
557 /*
558 * YYY enabling will cause wakeup() to task-switch, which really
559 * confused the old 4.x code. This is a good way to simulate
560 * preemption and MP without actually doing preemption or MP, because a
561 * lot of code assumes that wakeup() does not block.
562 */
563 if (untimely_switch && mycpu->gd_intr_nesting_level == 0) {
564 crit_enter();
565 /*
566 * YYY temporary hacks until we disassociate the userland scheduler
567 * from the LWKT scheduler.
568 */
569 if (td->td_flags & TDF_RUNQ) {
570 lwkt_switch(); /* will not reenter yield function */
571 } else {
572 lwkt_schedule_self(); /* make sure we are scheduled */
573 lwkt_switch(); /* will not reenter yield function */
574 lwkt_deschedule_self(); /* make sure we are descheduled */
575 }
576 crit_exit_noyield();
577 }
578}
579
580/*
581 * This implements a normal yield which, unlike _quick, will yield to equal
582 * priority threads as well. Note that gd_reqpri tests will be handled by
583 * the crit_exit() call in lwkt_switch().
584 *
585 * (self contained on a per cpu basis)
586 */
587void
588lwkt_yield(void)
589{
590 lwkt_schedule_self();
591 lwkt_switch();
592}
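/*
 * A minimal sketch of a long-running kernel thread using lwkt_yield() to
 * avoid monopolizing its cpu, kept under #if 0 so it does not affect the
 * build.  The example_* names are hypothetical.
 */
#if 0
static void
example_worker(void *dummy)
{
    for (;;) {
	example_do_some_work();		/* a bounded chunk of work */
	lwkt_yield();			/* lets equal priority threads run too */
    }
}
#endif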
593
594/*
595 * Schedule a thread to run. As the current thread we can always safely
596 * schedule ourselves, and a shortcut procedure is provided for that
597 * purpose.
598 *
599 * (non-blocking, self contained on a per cpu basis)
600 */
601void
602lwkt_schedule_self(void)
603{
604 thread_t td = curthread;
605
606 crit_enter();
607 KASSERT(td->td_wait == NULL, ("lwkt_schedule_self(): td_wait not NULL!"));
608 _lwkt_enqueue(td);
609 if (td->td_proc && td->td_proc->p_stat == SSLEEP)
610 panic("SCHED SELF PANIC");
611 crit_exit();
612}
613
614/*
615 * Generic schedule. Possibly schedule threads belonging to other cpus and
616 * deal with threads that might be blocked on a wait queue.
617 *
618 * YYY this is one of the best places to implement load balancing code.
619 * Load balancing can be accomplished by requesting other sorts of actions
620 * for the thread in question.
621 */
622void
623lwkt_schedule(thread_t td)
624{
625#ifdef INVARIANTS
626 if ((td->td_flags & TDF_PREEMPT_LOCK) == 0 && td->td_proc
627 && td->td_proc->p_stat == SSLEEP
628 ) {
629 printf("PANIC schedule curtd = %p (%d %d) target %p (%d %d)\n",
630 curthread,
631 curthread->td_proc ? curthread->td_proc->p_pid : -1,
632 curthread->td_proc ? curthread->td_proc->p_stat : -1,
633 td,
634 td->td_proc ? td->td_proc->p_pid : -1,
635 td->td_proc ? td->td_proc->p_stat : -1
636 );
637 panic("SCHED PANIC");
638 }
639#endif
640 crit_enter();
641 if (td == curthread) {
642 _lwkt_enqueue(td);
643 } else {
644 lwkt_wait_t w;
645
646 /*
647 * If the thread is on a wait list we have to send our scheduling
648 * request to the owner of the wait structure. Otherwise we send
649 * the scheduling request to the cpu owning the thread. Races
650 * are ok, the target will forward the message as necessary (the
651 * message may chase the thread around before it finally gets
652 * acted upon).
653 *
654 * (remember, wait structures use stable storage)
655 */
656 if ((w = td->td_wait) != NULL) {
657 if (lwkt_trytoken(&w->wa_token)) {
658 TAILQ_REMOVE(&w->wa_waitq, td, td_threadq);
659 --w->wa_count;
660 td->td_wait = NULL;
661 if (td->td_cpu == mycpu->gd_cpuid) {
662 _lwkt_enqueue(td);
663 if (td->td_preemptable) {
664 td->td_preemptable(td, TDPRI_CRIT*2); /* YYY +token */
665 } else if (_lwkt_wantresched(td, curthread)) {
666 need_resched();
667 }
668 } else {
669 lwkt_send_ipiq(td->td_cpu, (ipifunc_t)lwkt_schedule, td);
670 }
671 lwkt_reltoken(&w->wa_token);
672 } else {
673 lwkt_send_ipiq(w->wa_token.t_cpu, (ipifunc_t)lwkt_schedule, td);
674 }
675 } else {
676 /*
677 * If the wait structure is NULL and we own the thread, there
678 * is no race (since we are in a critical section). If we
679 * do not own the thread there might be a race but the
680 * target cpu will deal with it.
681 */
682 if (td->td_cpu == mycpu->gd_cpuid) {
683 _lwkt_enqueue(td);
684 if (td->td_preemptable) {
685 td->td_preemptable(td, TDPRI_CRIT);
686 } else if (_lwkt_wantresched(td, curthread)) {
687 need_resched();
688 }
689 } else {
690 lwkt_send_ipiq(td->td_cpu, (ipifunc_t)lwkt_schedule, td);
691 }
692 }
693 }
694 crit_exit();
695}
696
697/*
698 * Deschedule a thread.
699 *
700 * (non-blocking, self contained on a per cpu basis)
701 */
702void
703lwkt_deschedule_self(void)
704{
705 thread_t td = curthread;
706
707 crit_enter();
708 KASSERT(td->td_wait == NULL, ("lwkt_deschedule_self(): td_wait not NULL!"));
709 _lwkt_dequeue(td);
710 crit_exit();
711}
712
713/*
714 * Generic deschedule. Descheduling threads other than your own should be
715 * done only in carefully controlled circumstances. Descheduling is
716 * asynchronous.
717 *
718 * This function may block if the cpu has run out of messages.
719 */
720void
721lwkt_deschedule(thread_t td)
722{
723 crit_enter();
724 if (td == curthread) {
725 _lwkt_dequeue(td);
726 } else {
727 if (td->td_cpu == mycpu->gd_cpuid) {
728 _lwkt_dequeue(td);
729 } else {
730 lwkt_send_ipiq(td->td_cpu, (ipifunc_t)lwkt_deschedule, td);
731 }
732 }
733 crit_exit();
734}
735
736/*
737 * Set the target thread's priority. This routine does not automatically
738 * switch to a higher priority thread, LWKT threads are not designed for
739 * continuous priority changes. Yield if you want to switch.
740 *
741 * We have to retain the critical section count which uses the high bits
742 * of the td_pri field. The specified priority may also indicate zero or
743 * more critical sections by adding TDPRI_CRIT*N.
744 */
745void
746lwkt_setpri(thread_t td, int pri)
747{
748 KKASSERT(pri >= 0);
749 KKASSERT(td->td_cpu == mycpu->gd_cpuid);
750 crit_enter();
751 if (td->td_flags & TDF_RUNQ) {
752 _lwkt_dequeue(td);
753 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
754 _lwkt_enqueue(td);
755 } else {
756 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
757 }
758 crit_exit();
759}
760
761void
762lwkt_setpri_self(int pri)
763{
764 thread_t td = curthread;
765
766 KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
767 crit_enter();
768 if (td->td_flags & TDF_RUNQ) {
769 _lwkt_dequeue(td);
770 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
771 _lwkt_enqueue(td);
772 } else {
773 td->td_pri = (td->td_pri & ~TDPRI_MASK) + pri;
774 }
775 crit_exit();
776}
777
778struct proc *
779lwkt_preempted_proc(void)
780{
781 thread_t td = curthread;
782 while (td->td_preempted)
783 td = td->td_preempted;
784 return(td->td_proc);
785}
786
787
788/*
789 * This function deschedules the current thread and blocks on the specified
790 * wait queue. We obtain ownership of the wait queue in order to block
791 * on it. A generation number is used to interlock the wait queue in case
792 * it gets signalled while we are blocked waiting on the token.
793 *
794 * Note: alternatively we could dequeue our thread and then message the
795 * target cpu owning the wait queue. YYY implement as sysctl.
796 *
797 * Note: wait queue signals normally ping-pong the cpu as an optimization.
798 */
799typedef struct lwkt_gettoken_req {
800 lwkt_token_t tok;
801 int cpu;
802} lwkt_gettoken_req;
803
804void
805lwkt_block(lwkt_wait_t w, const char *wmesg, int *gen)
806{
807 thread_t td = curthread;
808
809 lwkt_gettoken(&w->wa_token);
810 if (w->wa_gen == *gen) {
811 _lwkt_dequeue(td);
812 TAILQ_INSERT_TAIL(&w->wa_waitq, td, td_threadq);
813 ++w->wa_count;
814 td->td_wait = w;
815 td->td_wmesg = wmesg;
816 lwkt_switch();
817 }
818 /* token might be lost, doesn't matter for gen update */
819 *gen = w->wa_gen;
820 lwkt_reltoken(&w->wa_token);
821}
822
823/*
824 * Signal a wait queue. We gain ownership of the wait queue in order to
825 * signal it. Once a thread is removed from the wait queue we have to
826 * deal with the cpu owning the thread.
827 *
828 * Note: alternatively we could message the target cpu owning the wait
829 * queue. YYY implement as sysctl.
830 */
831void
832lwkt_signal(lwkt_wait_t w)
833{
834 thread_t td;
835 int count;
836
837 lwkt_gettoken(&w->wa_token);
838 ++w->wa_gen;
839 count = w->wa_count;
840 while ((td = TAILQ_FIRST(&w->wa_waitq)) != NULL && count) {
841 --count;
842 --w->wa_count;
843 TAILQ_REMOVE(&w->wa_waitq, td, td_threadq);
844 td->td_wait = NULL;
845 td->td_wmesg = NULL;
846 if (td->td_cpu == mycpu->gd_cpuid) {
847 _lwkt_enqueue(td);
848 } else {
849 lwkt_send_ipiq(td->td_cpu, (ipifunc_t)lwkt_schedule, td);
850 }
851 lwkt_regettoken(&w->wa_token);
852 }
853 lwkt_reltoken(&w->wa_token);
854}
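/*
 * A minimal usage sketch for the wait queue API above, kept under #if 0 so
 * it does not affect the build.  The wait structure, the
 * example_work_ready() predicate and the other example_* names are
 * hypothetical.  The generation number closes the window between testing
 * the predicate and blocking: if a signal bumps wa_gen in that window,
 * lwkt_block() returns immediately and the predicate is simply re-tested.
 */
#if 0
static struct lwkt_wait example_w;	/* lwkt_init_wait(&example_w) at init */

static void
example_consumer(void)
{
    int gen = example_w.wa_gen;		/* sample before testing the predicate */

    while (example_work_ready() == 0)
	lwkt_block(&example_w, "exwait", &gen);
}

static void
example_producer(void)
{
    example_queue_work();		/* make the predicate true first */
    lwkt_signal(&example_w);		/* bumps wa_gen and wakes the waiters */
}
#endif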
855
856/*
857 * Acquire ownership of a token
858 *
859 * Acquire ownership of a token. The token may have spl and/or critical
860 * section side effects, depending on its purpose. These side effects
861 * guarantee that you will maintain ownership of the token as long as you
862 * do not block. If you block you may lose access to the token (but you
863 * must still release it even if you lose your access to it).
864 *
865 * YYY for now we use a critical section to prevent IPIs from taking away
866 * a token, but we really only need to disable IPIs ?
867 *
868 * YYY certain tokens could be made to act like mutexes when performance
869 * would be better (e.g. t_cpu == -1). This is not yet implemented.
870 *
871 * If the token is owned by another cpu we may have to send an IPI to
872 * it and then block. The IPI causes the token to be given away to the
873 * requesting cpu, unless it has already changed hands. Since only the
874 * current cpu can give away a token it owns we do not need a memory barrier.
875 */
876
877#ifdef SMP
878
879static
880void
881lwkt_gettoken_remote(void *arg)
882{
883 lwkt_gettoken_req *req = arg;
884 if (req->tok->t_cpu == mycpu->gd_cpuid) {
885 req->tok->t_cpu = req->cpu;
886 }
887}
888
889#endif
890
891int
892lwkt_gettoken(lwkt_token_t tok)
893{
894 /*
895 * Prevent preemption so the token can't be taken away from us once
896 * we gain ownership of it. Use a synchronous request which might
897 * block. The request will be forwarded as necessary playing catchup
898 * to the token.
899 */
900
901 crit_enter();
902#ifdef INVARIANTS
903 if (token_debug) {
904 printf("gettoken %p %d/%d\n", ((int **)&tok)[-1], (curthread->td_proc?curthread->td_proc->p_pid:-1), curthread->td_pri);
905 if (curthread->td_pri > 2000) {
906 curthread->td_pri = 1000;
907 panic("too HIGH!");
908 }
909 }
910#endif
911#ifdef SMP
912 while (tok->t_cpu != mycpu->gd_cpuid) {
913 struct lwkt_gettoken_req req;
914 int seq;
915 int dcpu;
916
917 req.cpu = mycpu->gd_cpuid;
918 req.tok = tok;
919 dcpu = (volatile int)tok->t_cpu;
920 seq = lwkt_send_ipiq(dcpu, lwkt_gettoken_remote, &req);
921 lwkt_wait_ipiq(dcpu, seq);
922 }
923#endif
924 /*
925 * leave us in a critical section on return. This will be undone
926 * by lwkt_reltoken(). Bump the generation number.
927 */
928 return(++tok->t_gen);
929}
930
931/*
932 * Attempt to acquire ownership of a token. Returns 1 on success, 0 on
933 * failure.
934 */
935int
936lwkt_trytoken(lwkt_token_t tok)
937{
938 crit_enter();
939#ifdef SMP
940 if (tok->t_cpu != mycpu->gd_cpuid) {
941 crit_exit(); /* balance the crit_enter() above on the failure path */
 return(0);
942 }
943#endif
944 /* leave us in the critical section */
945 ++tok->t_gen;
946 return(1);
947}
948
949/*
950 * Release your ownership of a token. Releases must occur in reverse
951 * order to acquisitions, eventually, so priorities can be unwound properly
952 * like SPLs. At the moment the actual implementation doesn't care.
953 *
954 * We can safely hand a token that we own to another cpu without notifying
955 * it, but once we do we can't get it back without requesting it (unless
956 * the other cpu hands it back to us before we check).
957 *
958 * We might have lost the token, so check that.
959 */
960void
961lwkt_reltoken(lwkt_token_t tok)
962{
963 if (tok->t_cpu == mycpu->gd_cpuid) {
964 tok->t_cpu = tok->t_reqcpu;
965 }
966 crit_exit();
967}
968
969/*
970 * Reacquire a token that might have been lost and compare and update the
971 * generation number. 0 is returned if the generation has not changed
972 * (nobody else obtained the token while we were blocked, on this cpu or
973 * any other cpu).
974 *
975 * This function returns with the token re-held whether the generation
976 * number changed or not.
977 */
978int
979lwkt_gentoken(lwkt_token_t tok, int *gen)
980{
981 if (lwkt_regettoken(tok) == *gen) {
982 return(0);
983 } else {
984 *gen = tok->t_gen;
985 return(-1);
986 }
987}
988
989
990/*
991 * Re-acquire a token that might have been lost. Returns the generation
992 * number of the token.
993 */
994int
995lwkt_regettoken(lwkt_token_t tok)
996{
997 /* assert we are in a critical section */
998 if (tok->t_cpu != mycpu->gd_cpuid) {
999#ifdef SMP
1000 while (tok->t_cpu != mycpu->gd_cpuid) {
1001 struct lwkt_gettoken_req req;
1002 int seq;
1003 int dcpu;
1004
1005 req.cpu = mycpu->gd_cpuid;
1006 req.tok = tok;
1007 dcpu = (volatile int)tok->t_cpu;
1008 seq = lwkt_send_ipiq(dcpu, lwkt_gettoken_remote, &req);
1009 lwkt_wait_ipiq(dcpu, seq);
1010 }
1011#endif
1012 ++tok->t_gen;
1013 }
1014 return(tok->t_gen);
1015}
1016
1017void
1018lwkt_inittoken(lwkt_token_t tok)
1019{
1020 /*
1021 * Zero structure and set cpu owner and reqcpu to cpu 0.
1022 */
1023 bzero(tok, sizeof(*tok));
1024}
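/*
 * One plausible usage pattern for the token functions above, sketched under
 * #if 0 so it does not affect the build.  The example_* names are
 * hypothetical.  Because blocking may silently cost us the token, the
 * generation number returned by lwkt_gettoken() is revalidated with
 * lwkt_gentoken() and the update is retried if another holder got in while
 * we were blocked.
 */
#if 0
static struct lwkt_token example_tok;	/* lwkt_inittoken(&example_tok) at init */

static void
example_update(void)
{
    int gen;

    gen = lwkt_gettoken(&example_tok);	/* also enters a critical section */
    do {
	example_modify_shared_state();	/* may block and lose the token */
    } while (lwkt_gentoken(&example_tok, &gen) != 0);
    lwkt_reltoken(&example_tok);	/* undoes the critical section */
}
#endif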
1025
1026/*
1027 * Create a kernel process/thread/whatever. It shares its address space
1028 * with proc0 - ie: kernel only.
1029 *
1030 * XXX should be renamed to lwkt_create()
1031 *
1032 * The thread will be entered with the MP lock held.
1033 */
1034int
1035lwkt_create(void (*func)(void *), void *arg,
1036 struct thread **tdp, thread_t template, int tdflags,
1037 const char *fmt, ...)
1038{
1039 thread_t td;
1040 va_list ap;
1041
1042 td = *tdp = lwkt_alloc_thread(template);
1043 cpu_set_thread_handler(td, kthread_exit, func, arg);
1044 td->td_flags |= TDF_VERBOSE | tdflags;
1045#ifdef SMP
1046 td->td_mpcount = 1;
1047#endif
1048
1049 /*
1050 * Set up arg0 for 'ps' etc
1051 */
1052 va_start(ap, fmt);
1053 vsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
1054 va_end(ap);
1055
1056 /*
1057 * Schedule the thread to run
1058 */
1059 if ((td->td_flags & TDF_STOPREQ) == 0)
1060 lwkt_schedule(td);
1061 else
1062 td->td_flags &= ~TDF_STOPREQ;
1063 return 0;
1064}
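/*
 * A minimal sketch of creating a kernel thread with lwkt_create(), kept
 * under #if 0 so it does not affect the build.  The example_* names are
 * hypothetical; passing a NULL template simply makes lwkt_alloc_thread()
 * allocate a fresh thread and stack, and omitting TDF_STOPREQ means the
 * thread is scheduled immediately.
 */
#if 0
static struct thread *example_td;

static void
example_main(void *arg)
{
    for (;;) {
	/* ... periodic work ... */
	tsleep(&example_td, PWAIT, "exidle", hz);
    }
}

static void
example_start(void)
{
    lwkt_create(example_main, NULL, &example_td, NULL, 0, "example");
}
#endif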
1065
1066/*
1067 * Destroy an LWKT thread. Warning! This function is not called when
1068 * a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
1069 * uses a different reaping mechanism.
1070 */
1071void
1072lwkt_exit(void)
1073{
1074 thread_t td = curthread;
1075
1076 if (td->td_flags & TDF_VERBOSE)
1077 printf("kthread %p %s has exited\n", td, td->td_comm);
1078 crit_enter();
1079 lwkt_deschedule_self();
1080 ++mycpu->gd_tdfreecount;
1081 TAILQ_INSERT_TAIL(&mycpu->gd_tdfreeq, td, td_threadq);
1082 cpu_thread_exit();
1083}
1084
1085/*
1086 * Create a kernel process/thread/whatever. It shares its address space
1087 * with proc0 - ie: kernel only. 5.x compatible.
1088 */
1089int
1090kthread_create(void (*func)(void *), void *arg,
1091 struct thread **tdp, const char *fmt, ...)
1092{
1093 thread_t td;
1094 va_list ap;
1095
1096 td = *tdp = lwkt_alloc_thread(NULL);
1097 cpu_set_thread_handler(td, kthread_exit, func, arg);
1098 td->td_flags |= TDF_VERBOSE;
1099#ifdef SMP
1100 td->td_mpcount = 1;
1101#endif
1102
1103 /*
1104 * Set up arg0 for 'ps' etc
1105 */
1106 va_start(ap, fmt);
1107 vsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
1108 va_end(ap);
1109
1110 /*
1111 * Schedule the thread to run
1112 */
1113 lwkt_schedule(td);
1114 return 0;
1115}
1116
1117void
1118crit_panic(void)
1119{
1120 thread_t td = curthread;
1121 int lpri = td->td_pri;
1122
1123 td->td_pri = 0;
1124 panic("td_pri is/would-go negative! %p %d", td, lpri);
1125}
1126
1127/*
1128 * Destroy an LWKT thread. Warning! This function is not called when
1129 * a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
1130 * uses a different reaping mechanism.
1131 *
1132 * XXX duplicates lwkt_exit()
1133 */
1134void
1135kthread_exit(void)
1136{
1137 lwkt_exit();
1138}
1139
1140#ifdef SMP
1141
1142/*
1143 * Send a function execution request to another cpu. The request is queued
1144 * on the cpu<->cpu ipiq matrix. Each cpu owns a unique ipiq FIFO for every
1145 * possible target cpu. Only the owning (source) cpu ever writes to a FIFO.
1146 *
1147 * YYY If the FIFO fills up we have to enable interrupts and process the
1148 * IPIQ while waiting for it to empty or we may deadlock with another cpu.
1149 * Create a CPU_*() function to do this!
1150 *
1151 * Must be called from a critical section.
1152 */
1153int
1154lwkt_send_ipiq(int dcpu, ipifunc_t func, void *arg)
1155{
1156 lwkt_ipiq_t ip;
1157 int windex;
1158
1159 if (dcpu == mycpu->gd_cpuid) {
1160 func(arg);
1161 return(0);
1162 }
1163 KKASSERT(curthread->td_pri >= TDPRI_CRIT);
1164 KKASSERT(dcpu >= 0 && dcpu < ncpus);
1165 ++ipiq_count;
1166 ip = &mycpu->gd_ipiq[dcpu];
1167 if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
1168 unsigned int eflags = read_eflags();
1169 cpu_enable_intr();
1170 ++ipiq_fifofull;
1171 while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
1172 KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
1173 lwkt_process_ipiq();
1174 }
1175 write_eflags(eflags);
1176 }
1177 KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
1178 windex = ip->ip_windex & MAXCPUFIFO_MASK;
1179 ip->ip_func[windex] = func;
1180 ip->ip_arg[windex] = arg;
1181 /* YYY memory barrier */
1182 ++ip->ip_windex;
1183 cpu_send_ipiq(dcpu); /* issues memory barrier if appropriate */
1184 return(ip->ip_windex);
1185}
1186
1187/*
1188 * Wait for the remote cpu to finish processing a function.
1189 *
1190 * YYY we have to enable interrupts and process the IPIQ while waiting
1191 * for it to empty or we may deadlock with another cpu. Create a CPU_*()
1192 * function to do this! YYY we really should 'block' here.
1193 *
1194 * Must be called from a critical section. This routine may be called
1195 * from an interrupt (for example, if an interrupt wakes a foreign thread
1196 * up).
1197 */
1198void
1199lwkt_wait_ipiq(int dcpu, int seq)
1200{
1201 lwkt_ipiq_t ip;
1202
1203 if (dcpu != mycpu->gd_cpuid) {
1204 KKASSERT(dcpu >= 0 && dcpu < ncpus);
1205 ip = &mycpu->gd_ipiq[dcpu];
1206 if ((int)(ip->ip_rindex - seq) < 0) {
1207 unsigned int eflags = read_eflags();
1208 cpu_enable_intr();
1209 while ((int)(ip->ip_rindex - seq) < 0) {
1210 lwkt_process_ipiq();
1211#if 0
1212 lwkt_switch(); /* YYY fixme */
1213#endif
1214 }
1215 write_eflags(eflags);
1216 }
1217 }
1218}
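/*
 * A minimal sketch of a synchronous cross-cpu call built from the two
 * primitives above, kept under #if 0 so it does not affect the build.  The
 * example_* names are hypothetical.  The caller must already be in a
 * critical section; lwkt_wait_ipiq() services our own IPIQ while it spins,
 * which is what keeps two cpus calling each other from deadlocking.
 */
#if 0
static void
example_remote_func(void *arg)
{
    ++*(int *)arg;			/* runs on the target cpu */
}

static void
example_sync_call(int dcpu, int *counter)
{
    int seq;

    seq = lwkt_send_ipiq(dcpu, example_remote_func, counter);
    lwkt_wait_ipiq(dcpu, seq);		/* returns once the target consumed it */
}
#endif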
1219
1220/*
1221 * Called from IPI interrupt (like a fast interrupt), which has placed
1222 * us in a critical section. The MP lock may or may not be held.
1223 * May also be called from doreti or splz.
1224 */
1225void
1226lwkt_process_ipiq(void)
1227{
1228 int n;
1229 int cpuid = mycpu->gd_cpuid;
1230
1231 for (n = 0; n < ncpus; ++n) {
1232 lwkt_ipiq_t ip;
1233 int ri;
1234
1235 if (n == cpuid)
1236 continue;
1237 ip = globaldata_find(n)->gd_ipiq;
1238 if (ip == NULL)
1239 continue;
1240 ip = &ip[cpuid];
1241 while (ip->ip_rindex != ip->ip_windex) {
1242 ri = ip->ip_rindex & MAXCPUFIFO_MASK;
1243 ip->ip_func[ri](ip->ip_arg[ri]);
1244 ++ip->ip_rindex;
1245 }
1246 }
1247}
1248
1249#else
1250
1251int
1252lwkt_send_ipiq(int dcpu, ipifunc_t func, void *arg)
1253{
1254 panic("lwkt_send_ipiq: UP box! (%d,%p,%p)", dcpu, func, arg);
1255 return(0); /* NOT REACHED */
1256}
1257
1258void
1259lwkt_wait_ipiq(int dcpu, int seq)
1260{
1261 panic("lwkt_wait_ipiq: UP box! (%d,%d)", dcpu, seq);
1262}
1263
1264#endif