gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*-
	2	* Copyright (c) 1982, 1986, 1990, 1991, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	* (c) UNIX System Laboratories, Inc.
	5	* All or some portions of this file are derived from material licensed
	6	* to the University of California by American Telephone and Telegraph
	7	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	8	* the permission of UNIX System Laboratories, Inc.
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	* 1. Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* 2. Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* 3. All advertising materials mentioning features or use of this software
	19	* must display the following acknowledgement:
	20	* This product includes software developed by the University of
	21	* California, Berkeley and its contributors.
	22	* 4. Neither the name of the University nor the names of its contributors
	23	* may be used to endorse or promote products derived from this software
	24	* without specific prior written permission.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	27	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	28	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	29	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	30	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
	39	* $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
	40	* $DragonFly: src/sys/kern/kern_synch.c,v 1.20 2003/08/03 10:07:41 hmp Exp $
	41	*/
	42
	43	#include "opt_ktrace.h"
	44
	45	#include <sys/param.h>
	46	#include <sys/systm.h>
	47	#include <sys/proc.h>
	48	#include <sys/kernel.h>
	49	#include <sys/signalvar.h>
	50	#include <sys/resourcevar.h>
	51	#include <sys/vmmeter.h>
	52	#include <sys/sysctl.h>
	53	#include <sys/thread2.h>
	54	#ifdef KTRACE
	55	#include <sys/uio.h>
	56	#include <sys/ktrace.h>
	57	#endif
	58	#include <sys/xwait.h>
	59
	60	#include <machine/cpu.h>
	61	#include <machine/ipl.h>
	62	#include <machine/smp.h>
	63
	64	static void sched_setup __P((void *dummy));
	65	SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
	66
	67	int hogticks;
	68	int lbolt;
	69	int sched_quantum; /* Roundrobin scheduling quantum in ticks. */
	70	int ncpus;
	71
	72	static struct callout loadav_callout;
	73
	74	struct loadavg averunnable =
	75	{ {0, 0, 0}, FSCALE }; /* load average, of runnable procs */
	76	/*
	77	* Constants for averages over 1, 5, and 15 minutes
	78	* when sampling at 5 second intervals.
	79	*/
	80	static fixpt_t cexp[3] = {
	81	0.9200444146293232 * FSCALE, /* exp(-1/12) */
	82	0.9834714538216174 * FSCALE, /* exp(-1/60) */
	83	0.9944598480048967 * FSCALE, /* exp(-1/180) */
	84	};
	85
	/* Forward declarations of the file-local helpers defined below. */
	static void endtsleep(void *arg);
	static void loadav(void *arg);
	static void maybe_resched(struct proc *chk);
	static void roundrobin(void *arg);
	static void schedcpu(void *arg);
	static void updatepri(struct proc *p);
	static void crit_panicints(void);
	93
	94	static int
	95	sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
	96	{
	97	int error, new_val;
	98
	99	new_val = sched_quantum * tick;
	100	error = sysctl_handle_int(oidp, &new_val, 0, req);
	101	if (error != 0 \|\| req->newptr == NULL)
	102	return (error);
	103	if (new_val < tick)
	104	return (EINVAL);
	105	sched_quantum = new_val / tick;
	106	hogticks = 2 * sched_quantum;
	107	return (0);
	108	}
	109
	110	SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT\|CTLFLAG_RW,
	111	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
	112
	113	/*
	114	* Arrange to reschedule if necessary by checking to see if the current
	115	* process is on the highest priority user scheduling queue. This may
	116	* be run from an interrupt so we have to follow any preemption chains
	117	* back to the original process.
	118	*/
	119	static void
	120	maybe_resched(struct proc *chk)
	121	{
	122	struct proc *cur = lwkt_preempted_proc();
	123
	124	if (cur == NULL)
	125	return;
	126
	127	/*
	128	* Check the user queue (realtime, normal, idle). Lower numbers
	129	* indicate higher priority queues. Lower numbers are also better
	130	* for p_priority.
	131	*/
	132	if (chk->p_rtprio.type < cur->p_rtprio.type) {
	133	need_resched();
	134	} else if (chk->p_rtprio.type == cur->p_rtprio.type) {
	135	if (chk->p_rtprio.type == RTP_PRIO_NORMAL) {
	136	if (chk->p_priority / PPQ < cur->p_priority / PPQ)
	137	need_resched();
	138	} else {
	139	if (chk->p_rtprio.prio < cur->p_rtprio.prio)
	140	need_resched();
	141	}
	142	}
	143	}
	144
	145	int
	146	roundrobin_interval(void)
	147	{
	148	return (sched_quantum);
	149	}
	150
	151	/*
	152	* Force switch among equal priority processes every sched_quantum ticks (100ms by default).
	153	*/
	#ifdef SMP

	/*
	 * Per-cpu half of the round-robin tick; this is the function handed to
	 * lwkt_send_ipiq_mask().  A reschedule is requested when no process is
	 * found by lwkt_preempted_proc() or when that process belongs to a
	 * scheduling class for which RTP_PRIO_NEED_RR() is true.
	 */
	static void
	roundrobin_remote(void *arg)
	{
		struct proc *p = lwkt_preempted_proc();

		if (p == NULL || RTP_PRIO_NEED_RR(p->p_rtprio.type))
			need_resched();
	}

	#endif
	165
	166	static void
	167	roundrobin(void *arg)
	168	{
	169	struct proc *p = lwkt_preempted_proc();
	170	if (p == NULL \|\| RTP_PRIO_NEED_RR(p->p_rtprio.type))
	171	need_resched();
	172	#ifdef SMP
	173	lwkt_send_ipiq_mask(mycpu->gd_other_cpus, roundrobin_remote, NULL);
	174	#endif
	175	timeout(roundrobin, NULL, sched_quantum);
	176	}
	177
	#ifdef SMP

	/*
	 * Send the round-robin reschedule check (roundrobin_remote) to every
	 * cpu selected by `mask'.
	 */
	void
	resched_cpus(u_int32_t mask)
	{
		lwkt_send_ipiq_mask(mask, roundrobin_remote, NULL);
	}

	#endif
	187
	188	/*
	189	* Constants for digital decay and forget:
	190	* 90% of (p_estcpu) usage in 5 * loadav time
	191	* 95% of (p_pctcpu) usage in 60 seconds (load insensitive)
	192	* Note that, as ps(1) mentions, this can let percentages
	193	* total over 100% (I've seen 137.9% for 3 processes).
	194	*
	195	* Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
	196	*
	197	* We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
	198	* That is, the system wants to compute a value of decay such
	199	* that the following for loop:
	200	* for (i = 0; i < (5 * loadavg); i++)
	201	* p_estcpu *= decay;
	202	* will compute
	203	* p_estcpu *= 0.1;
	204	* for all values of loadavg:
	205	*
	206	* Mathematically this loop can be expressed by saying:
	207	* decay ** (5 * loadavg) ~= .1
	208	*
	209	* The system computes decay as:
	210	* decay = (2 * loadavg) / (2 * loadavg + 1)
	211	*
	212	* We wish to prove that the system's computation of decay
	213	* will always fulfill the equation:
	214	* decay ** (5 * loadavg) ~= .1
	215	*
	216	* If we compute b as:
	217	* b = 2 * loadavg
	218	* then
	219	* decay = b / (b + 1)
	220	*
	221	* We now need to prove two things:
	222	* 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
	223	* 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
	224	*
	225	* Facts:
	226	* For x close to zero, exp(x) =~ 1 + x, since
	227	* exp(x) = 0! + x**1/1! + x**2/2! + ... .
	228	* therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
	229	* For x close to zero, ln(1+x) =~ x, since
	230	* ln(1+x) = x - x**2/2 + x**3/3 - ... for -1 < x < 1
	231	* therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
	232	* ln(.1) =~ -2.30
	233	*
	234	* Proof of (1):
	235	* Solve (factor)**(power) =~ .1 given power (5*loadav):
	236	* solving for factor,
	237	* ln(factor) =~ (-2.30/(5*loadav)), or
	238	* factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
	239	* exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
	240	*
	241	* Proof of (2):
	242	* Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
	243	* solving for power,
	244	* power*ln(b/(b+1)) =~ -2.30, or
	245	* power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
	246	*
	247	* Actual power values for the implemented algorithm are as follows:
	248	* loadav: 1 2 3 4
	249	* power: 5.68 10.32 14.94 19.55
	250	*/
	251
	/*
	 * Calculations for digital decay to forget 90% of usage in 5*loadav sec.
	 *
	 * loadfactor(loadav) yields b = 2 * loadav.
	 * decay_cpu(loadfac, cpu) yields cpu * loadfac / (loadfac + FSCALE),
	 * which is cpu * b / (b + 1) when loadfac is a fixed-point value
	 * scaled by FSCALE.  Both macros evaluate each argument exactly as
	 * written, so avoid arguments with side effects in `loadfac'.
	 */
	#define loadfactor(loadav)	(2 * (loadav))
	#define decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
	255
	256	/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
	257	static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
	258	SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
	259
	260	/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
	261	static int fscale __unused = FSCALE;
	262	SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
	263
	264	/*
	265	* If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
	266	* faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
	267	* and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
	268	*
	269	* To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
	270	* 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, 11 bits).
	271	*
	272	* If you don't want to bother with the faster/more-accurate formula, you
	273	* can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
	274	* (more general) method of calculating the %age of CPU used by a process.
	275	*/
	276	#define CCPU_SHIFT 11
	277
	278	/*
	279	* Recompute process priorities, every hz ticks.
	280	*/
	281	/* ARGSUSED */
	282	static void
	283	schedcpu(void *arg)
	284	{
	285	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	286	struct proc *p;
	287	struct proc *curp;
	288	int realstathz, s;
	289
	290	curp = lwkt_preempted_proc(); /* YYY temporary hack */
	291
	292	realstathz = stathz ? stathz : hz;
	293	FOREACH_PROC_IN_SYSTEM(p) {
	294	/*
	295	* Increment time in/out of memory and sleep time
	296	* (if sleeping). We ignore overflow; with 16-bit int's
	297	* (remember them?) overflow takes 45 days.
	298	*/
	299	p->p_swtime++;
	300	if (p->p_stat == SSLEEP \|\| p->p_stat == SSTOP)
	301	p->p_slptime++;
	302	p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
	303	/*
	304	* If the process has slept the entire second,
	305	* stop recalculating its priority until it wakes up.
	306	*/
	307	if (p->p_slptime > 1)
	308	continue;
	309	s = splhigh(); /* prevent state changes and protect run queue */
	310	/*
	311	* p_pctcpu is only for ps.
	312	*/
	313	#if (FSHIFT >= CCPU_SHIFT)
	314	p->p_pctcpu += (realstathz == 100)?
	315	((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
	316	100 * (((fixpt_t) p->p_cpticks)
	317	<< (FSHIFT - CCPU_SHIFT)) / realstathz;
	318	#else
	319	p->p_pctcpu += ((FSCALE - ccpu) *
	320	(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
	321	#endif
	322	p->p_cpticks = 0;
	323	p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
	324	resetpriority(p);
	325	splx(s);
	326	}
	327	wakeup((caddr_t)&lbolt);
	328	timeout(schedcpu, (void *)0, hz);
	329	}
	330
	331	/*
	332	* Recalculate the priority of a process after it has slept for a while.
	333	* For all load averages >= 1 and max p_estcpu of 255, sleeping for at
	334	* least six times the loadfactor will decay p_estcpu to zero.
	335	*/
	336	static void
	337	updatepri(struct proc *p)
	338	{
	339	unsigned int newcpu = p->p_estcpu;
	340	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	341
	342	if (p->p_slptime > 5 * loadfac) {
	343	p->p_estcpu = 0;
	344	} else {
	345	p->p_slptime--; /* the first time was done in schedcpu */
	346	while (newcpu && --p->p_slptime)
	347	newcpu = decay_cpu(loadfac, newcpu);
	348	p->p_estcpu = newcpu;
	349	}
	350	resetpriority(p);
	351	}
	352
	/*
	 * Sleep queue hash.  Threads sleeping on an identifier are kept on one
	 * of TABLESIZE tail queues, selected by LOOKUP(ident).
	 *
	 * We're only looking at 7 bits of the address; everything is
	 * aligned to 4, lots of things are aligned to greater powers
	 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
	 *
	 * The address is hashed as an unsigned integer so that the shift is
	 * never applied to a negative value.
	 */
	#define TABLESIZE	128
	static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE];
	#define LOOKUP(x)	(((uintptr_t)(x) >> 8) & (TABLESIZE - 1))
	361
	/*
	 * Priority used when a sleep is requested during autoconfiguration or
	 * after a panic: instead of sleeping, the priority is briefly lowered
	 * to this level to allow interrupts, then the call returns.
	 *
	 * The value is machine-dependent and is initialized and maintained in
	 * the machine-dependent layers.  It will typically be 0, or the lowest
	 * priority that is safe for use on the interrupt stack; it can be made
	 * higher to block network software interrupts after panics.
	 */
	int safepri;
	372
	373	void
	374	sleepinit(void)
	375	{
	376	int i;
	377
	378	sched_quantum = hz/10;
	379	hogticks = 2 * sched_quantum;
	380	for (i = 0; i < TABLESIZE; i++)
	381	TAILQ_INIT(&slpque[i]);
	382	}
	383
	384	/*
	385	* General sleep call. Suspends the current process until a wakeup is
	386	* performed on the specified identifier. The process will then be made
	387	* runnable with the specified priority. Sleeps at most timo/hz seconds
	388	* (0 means no timeout). If flags includes PCATCH flag, signals are checked
	389	* before and after sleeping, else signals are not checked. Returns 0 if
	390	* awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
	391	* signal needs to be delivered, ERESTART is returned if the current system
	392	* call should be restarted if possible, and EINTR is returned if the system
	393	* call should be interrupted by the signal (return EINTR).
	394	*
	395	* If the process has P_CURPROC set mi_switch() will not re-queue it to
	396	* the userland scheduler queues because we are in a SSLEEP state. If
	397	* we are not the current process then we have to remove ourselves from
	398	* the scheduler queues.
	399	*
	400	* YYY priority now unused
	401	*/
	402	int
	403	tsleep(ident, flags, wmesg, timo)
	404	void *ident;
	405	int flags, timo;
	406	const char *wmesg;
	407	{
	408	struct thread *td = curthread;
	409	struct proc p = td->td_proc; / may be NULL */
	410	int s, sig = 0, catch = flags & PCATCH;
	411	int id = LOOKUP(ident);
	412	struct callout_handle thandle;
	413
	414	/*
	415	* NOTE: removed KTRPOINT, it could cause races due to blocking
	416	* even in stable. Just scrap it for now.
	417	*/
	418	if (cold \|\| panicstr) {
	419	/*
	420	* After a panic, or during autoconfiguration,
	421	* just give interrupts a chance, then just return;
	422	* don't run any other procs or panic below,
	423	* in case this is the idle process and already asleep.
	424	*/
	425	crit_panicints();
	426	return (0);
	427	}
	428	KKASSERT(td != &mycpu->gd_idlethread); /* you must be kidding! */
	429	s = splhigh();
	430	KASSERT(ident != NULL, ("tsleep: no ident"));
	431	KASSERT(p == NULL \|\| p->p_stat == SRUN, ("tsleep %p %s %d",
	432	ident, wmesg, p->p_stat));
	433
	434	crit_enter();
	435	td->td_wchan = ident;
	436	td->td_wmesg = wmesg;
	437	if (p)
	438	p->p_slptime = 0;
	439	lwkt_deschedule_self();
	440	TAILQ_INSERT_TAIL(&slpque[id], td, td_threadq);
	441	if (timo)
	442	thandle = timeout(endtsleep, (void *)td, timo);
	443	/*
	444	* We put ourselves on the sleep queue and start our timeout
	445	* before calling CURSIG, as we could stop there, and a wakeup
	446	* or a SIGCONT (or both) could occur while we were stopped.
	447	* A SIGCONT would cause us to be marked as SSLEEP
	448	* without resuming us, thus we must be ready for sleep
	449	* when CURSIG is called. If the wakeup happens while we're
	450	* stopped, td->td_wchan will be 0 upon return from CURSIG.
	451	*/
	452	if (p) {
	453	if (catch) {
	454	p->p_flag \|= P_SINTR;
	455	if ((sig = CURSIG(p))) {
	456	if (td->td_wchan) {
	457	unsleep(td);
	458	lwkt_schedule_self();
	459	}
	460	p->p_stat = SRUN;
	461	goto resume;
	462	}
	463	if (td->td_wchan == NULL) {
	464	catch = 0;
	465	goto resume;
	466	}
	467	} else {
	468	sig = 0;
	469	}
	470
	471	/*
	472	* If we are not the current process we have to remove ourself
	473	* from the run queue.
	474	*/
	475	KASSERT(p->p_stat == SRUN, ("PSTAT NOT SRUN %d %d", p->p_pid, p->p_stat));
	476	/*
	477	* If this is the current 'user' process schedule another one.
	478	*/
	479	clrrunnable(p, SSLEEP);
	480	p->p_stats->p_ru.ru_nvcsw++;
	481	KKASSERT(td->td_release \|\| (p->p_flag & P_CURPROC) == 0);
	482	mi_switch();
	483	KASSERT(p->p_stat == SRUN, ("tsleep: stat not srun"));
	484	} else {
	485	lwkt_switch();
	486	}
	487	resume:
	488	crit_exit();
	489	if (p)
	490	p->p_flag &= ~P_SINTR;
	491	splx(s);
	492	if (td->td_flags & TDF_TIMEOUT) {
	493	td->td_flags &= ~TDF_TIMEOUT;
	494	if (sig == 0)
	495	return (EWOULDBLOCK);
	496	} else if (timo) {
	497	untimeout(endtsleep, (void *)td, thandle);
	498	}
	499	if (p) {
	500	if (catch && (sig != 0 \|\| (sig = CURSIG(p)))) {
	501	if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
	502	return (EINTR);
	503	return (ERESTART);
	504	}
	505	}
	506	return (0);
	507	}
	508
	509	/*
	510	* Implement the timeout for tsleep. We interlock against
	511	* wchan when setting TDF_TIMEOUT. For processes we remove
	512	* the sleep if the process is stopped rather then sleeping,
	513	* so it remains stopped.
	514	*/
	515	static void
	516	endtsleep(void *arg)
	517	{
	518	thread_t td = arg;
	519	struct proc *p;
	520	int s;
	521
	522	s = splhigh();
	523	if (td->td_wchan) {
	524	td->td_flags \|= TDF_TIMEOUT;
	525	if ((p = td->td_proc) != NULL) {
	526	if (p->p_stat == SSLEEP)
	527	setrunnable(p);
	528	else
	529	unsleep(td);
	530	} else {
	531	unsleep(td);
	532	lwkt_schedule(td);
	533	}
	534	}
	535	splx(s);
	536	}
	537
	538	/*
	539	* Remove a process from its wait queue
	540	*/
	541	void
	542	unsleep(struct thread *td)
	543	{
	544	int s;
	545
	546	s = splhigh();
	547	if (td->td_wchan) {
	548	#if 0
	549	if (p->p_flag & P_XSLEEP) {
	550	struct xwait *w = p->p_wchan;
	551	TAILQ_REMOVE(&w->waitq, p, p_procq);
	552	p->p_flag &= ~P_XSLEEP;
	553	} else
	554	#endif
	555	TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_threadq);
	556	td->td_wchan = NULL;
	557	}
	558	splx(s);
	559	}
	560
	#if 0
	/*
	 * Make all processes sleeping on the explicit lock structure runnable.
	 *
	 * NOTE: compiled out; it still refers to the per-process wait fields
	 * (p_wchan, p_procq) rather than the per-thread sleep queues used by
	 * the rest of this file.
	 */
	void
	xwakeup(struct xwait *w)
	{
		struct proc *p;
		int s;

		s = splhigh();
		++w->gen;
		while ((p = TAILQ_FIRST(&w->waitq)) != NULL) {
			TAILQ_REMOVE(&w->waitq, p, p_procq);
			KASSERT(p->p_wchan == w && (p->p_flag & P_XSLEEP),
			    ("xwakeup: wchan mismatch for %p (%p/%p) %08x", p, p->p_wchan, w, p->p_flag & P_XSLEEP));
			p->p_wchan = NULL;
			p->p_flag &= ~P_XSLEEP;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					maybe_resched(p);
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
			}
		}
		splx(s);
	}
	#endif
	597
	598	/*
	599	* Make all processes sleeping on the specified identifier runnable.
	600	*/
	601	static void
	602	_wakeup(void *ident, int count)
	603	{
	604	struct slpquehead *qp;
	605	struct thread *td;
	606	struct thread *ntd;
	607	struct proc *p;
	608	int s;
	609	int id = LOOKUP(ident);
	610
	611	s = splhigh();
	612	qp = &slpque[id];
	613	restart:
	614	for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
	615	ntd = TAILQ_NEXT(td, td_threadq);
	616	if (td->td_wchan == ident) {
	617	TAILQ_REMOVE(qp, td, td_threadq);
	618	td->td_wchan = NULL;
	619	if ((p = td->td_proc) != NULL && p->p_stat == SSLEEP) {
	620	/* OPTIMIZED EXPANSION OF setrunnable(p); */
	621	if (p->p_slptime > 1)
	622	updatepri(p);
	623	p->p_slptime = 0;
	624	p->p_stat = SRUN;
	625	if (p->p_flag & P_INMEM) {
	626	setrunqueue(p);
	627	if (p->p_flag & P_CURPROC)
	628	maybe_resched(p);
	629	} else {
	630	p->p_flag \|= P_SWAPINREQ;
	631	wakeup((caddr_t)&proc0);
	632	}
	633	/* END INLINE EXPANSION */
	634	} else if (p == NULL) {
	635	lwkt_schedule(td);
	636	}
	637	if (--count == 0)
	638	break;
	639	goto restart;
	640	}
	641	}
	642	splx(s);
	643	}
	644
	/*
	 * Wake every thread sleeping on `ident' (a count of 0 means no limit).
	 */
	void
	wakeup(void *ident)
	{
		const int no_limit = 0;

		_wakeup(ident, no_limit);
	}
	650
	/*
	 * Wake at most one thread sleeping on `ident'.
	 */
	void
	wakeup_one(void *ident)
	{
		const int single = 1;

		_wakeup(ident, single);
	}
	656
	657	/*
	658	* The machine independent parts of mi_switch().
	659	* Must be called at splstatclock() or higher.
	660	*/
	661	void
	662	mi_switch()
	663	{
	664	struct thread *td = curthread;
	665	struct proc p = td->td_proc; / XXX */
	666	struct rlimit *rlim;
	667	int x;
	668	u_int64_t ttime;
	669
	670	/*
	671	* XXX this spl is almost unnecessary. It is partly to allow for
	672	* sloppy callers that don't do it (issignal() via CURSIG() is the
	673	* main offender). It is partly to work around a bug in the i386
	674	* cpu_switch() (the ipl is not preserved). We ran for years
	675	* without it. I think there was only a interrupt latency problem.
	676	* The main caller, tsleep(), does an splx() a couple of instructions
	677	* after calling here. The buggy caller, issignal(), usually calls
	678	* here at spl0() and sometimes returns at splhigh(). The process
	679	* then runs for a little too long at splhigh(). The ipl gets fixed
	680	* when the process returns to user mode (or earlier).
	681	*
	682	* It would probably be better to always call here at spl0(). Callers
	683	* are prepared to give up control to another process, so they must
	684	* be prepared to be interrupted. The clock stuff here may not
	685	* actually need splstatclock().
	686	*/
	687	x = splstatclock();
	688	clear_resched();
	689
	690	/*
	691	* Check if the process exceeds its cpu resource allocation.
	692	* If over max, kill it. Time spent in interrupts is not
	693	* included. YYY 64 bit match is expensive. Ick.
	694	*/
	695	ttime = td->td_sticks + td->td_uticks;
	696	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
	697	ttime > p->p_limit->p_cpulimit) {
	698	rlim = &p->p_rlimit[RLIMIT_CPU];
	699	if (ttime / (rlim_t)1000000 >= rlim->rlim_max) {
	700	killproc(p, "exceeded maximum CPU limit");
	701	} else {
	702	psignal(p, SIGXCPU);
	703	if (rlim->rlim_cur < rlim->rlim_max) {
	704	/* XXX: we should make a private copy */
	705	rlim->rlim_cur += 5;
	706	}
	707	}
	708	}
	709
	710	/*
	711	* Pick a new current process and record its start time. If we
	712	* are in a SSTOPped state we deschedule ourselves. YYY this needs
	713	* to be cleaned up, remember that LWKTs stay on their run queue
	714	* which works differently then the user scheduler which removes
	715	* the process from the runq when it runs it.
	716	*/
	717	mycpu->gd_cnt.v_swtch++;
	718	if (p->p_stat == SSTOP)
	719	lwkt_deschedule_self();
	720	lwkt_switch();
	721
	722	splx(x);
	723	}
	724
	725	/*
	726	* Change process state to be runnable,
	727	* placing it on the run queue if it is in memory,
	728	* and awakening the swapper if it isn't in memory.
	729	*/
	730	void
	731	setrunnable(struct proc *p)
	732	{
	733	int s;
	734
	735	s = splhigh();
	736	switch (p->p_stat) {
	737	case 0:
	738	case SRUN:
	739	case SZOMB:
	740	default:
	741	panic("setrunnable");
	742	case SSTOP:
	743	case SSLEEP:
	744	unsleep(p->p_thread); /* e.g. when sending signals */
	745	break;
	746
	747	case SIDL:
	748	break;
	749	}
	750	p->p_stat = SRUN;
	751	if (p->p_flag & P_INMEM)
	752	setrunqueue(p);
	753	splx(s);
	754	if (p->p_slptime > 1)
	755	updatepri(p);
	756	p->p_slptime = 0;
	757	if ((p->p_flag & P_INMEM) == 0) {
	758	p->p_flag \|= P_SWAPINREQ;
	759	wakeup((caddr_t)&proc0);
	760	} else {
	761	maybe_resched(p);
	762	}
	763	}
	764
	765	/*
	766	* Change the process state to NOT be runnable, removing it from the run
	767	* queue. If P_CURPROC is not set and we are in SRUN the process is on the
	768	* run queue (If P_INMEM is not set then it isn't because it is swapped).
	769	*/
	770	void
	771	clrrunnable(struct proc *p, int stat)
	772	{
	773	int s;
	774
	775	s = splhigh();
	776	switch(p->p_stat) {
	777	case SRUN:
	778	if (p->p_flag & P_ONRUNQ)
	779	remrunqueue(p);
	780	break;
	781	default:
	782	break;
	783	}
	784	p->p_stat = stat;
	785	splx(s);
	786	}
	787
	788	/*
	789	* Compute the priority of a process when running in user mode.
	790	* Arrange to reschedule if the resulting priority is better
	791	* than that of the current process.
	792	*
	793	* YYY real time / idle procs do not use p_priority XXX
	794	*/
	795	void
	796	resetpriority(struct proc *p)
	797	{
	798	unsigned int newpriority;
	799	int opq;
	800	int npq;
	801
	802	if (p->p_rtprio.type != RTP_PRIO_NORMAL)
	803	return;
	804	newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
	805	NICE_WEIGHT * p->p_nice;
	806	newpriority = min(newpriority, MAXPRI);
	807	npq = newpriority / PPQ;
	808	crit_enter();
	809	opq = p->p_priority / PPQ;
	810	if (p->p_stat == SRUN && (p->p_flag & P_ONRUNQ) && opq != npq) {
	811	/*
	812	* We have to move the process to another queue
	813	*/
	814	remrunqueue(p);
	815	p->p_priority = newpriority;
	816	setrunqueue(p);
	817	} else {
	818	/*
	819	* We can just adjust the priority and it will be picked
	820	* up later.
	821	*/
	822	KKASSERT(opq == npq \|\| (p->p_flag & P_ONRUNQ) == 0);
	823	p->p_priority = newpriority;
	824	}
	825	crit_exit();
	826	maybe_resched(p);
	827	}
	828
	829	/*
	830	* Compute a tenex style load average of a quantity on
	831	* 1, 5 and 15 minute intervals.
	832	*/
	833	static void
	834	loadav(void *arg)
	835	{
	836	int i, nrun;
	837	struct loadavg *avg;
	838	struct proc *p;
	839
	840	avg = &averunnable;
	841	nrun = 0;
	842	FOREACH_PROC_IN_SYSTEM(p) {
	843	switch (p->p_stat) {
	844	case SRUN:
	845	case SIDL:
	846	nrun++;
	847	}
	848	}
	849	for (i = 0; i < 3; i++)
	850	avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
	851	nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
	852
	853	/*
	854	* Schedule the next update to occur after 5 seconds, but add a
	855	* random variation to avoid synchronisation with processes that
	856	* run at regular intervals.
	857	*/
	858	callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
	859	loadav, NULL);
	860	}
	861
	862	/* ARGSUSED */
	863	static void
	864	sched_setup(dummy)
	865	void *dummy;
	866	{
	867
	868	callout_init(&loadav_callout);
	869
	870	/* Kick off timeout driven events by calling first time. */
	871	roundrobin(NULL);
	872	schedcpu(NULL);
	873	loadav(NULL);
	874	}
	875
	876	/*
	877	* We adjust the priority of the current process. The priority of
	878	* a process gets worse as it accumulates CPU time. The cpu usage
	879	* estimator (p_estcpu) is increased here. resetpriority() will
	880	* compute a different priority each time p_estcpu increases by
	881	* INVERSE_ESTCPU_WEIGHT
	882	* (until MAXPRI is reached). The cpu usage estimator ramps up
	883	* quite quickly when the process is running (linearly), and decays
	884	* away exponentially, at a rate which is proportionally slower when
	885	* the system is busy. The basic principle is that the system will
	886	* 90% forget that the process used a lot of CPU time in 5 * loadav
	887	* seconds. This causes the system to favor processes which haven't
	888	* run much recently, and to round-robin among other processes.
	889	*/
	890	void
	891	schedclock(p)
	892	struct proc *p;
	893	{
	894
	895	p->p_cpticks++;
	896	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	897	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0)
	898	resetpriority(p);
	899	}
	900
	901	static
	902	void
	903	crit_panicints(void)
	904	{
	905	int s;
	906	int cpri;
	907
	908	s = splhigh();
	909	cpri = crit_panic_save();
	910	splx(safepri);
	911	crit_panic_restore(cpri);
	912	splx(s);
	913	}
	914