/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/kern/kern_switch.c,v 1.3.2.1 2000/05/16 06:58:12 dillon Exp $
 * $DragonFly: src/sys/kern/Attic/kern_switch.c,v 1.9 2003/07/25 05:26:50 dillon Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/thread2.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <machine/ipl.h>
#include <machine/cpu.h>
/*
 * debugging only YYY Remove me!  define to schedule user processes only
 * on the BSP.  Interrupts can still be taken on the APs.
 */
#undef ONLY_ONE_USER_CPU
/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
static struct rq queues[NQS];
static struct rq rtqueues[NQS];
static struct rq idqueues[NQS];
static u_int32_t queuebits;
static u_int32_t rtqueuebits;
static u_int32_t idqueuebits;
static u_int32_t curprocmask = -1;      /* currently running a user process */
static u_int32_t rdyprocmask;           /* ready to accept a user process */
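/*
 * Illustrative sketch of the bitmask technique described above (editor's
 * addition, not part of this file's logic): each non-empty queue sets its
 * bit, and a forward bit scan finds the lowest-numbered busy queue in one
 * step.  The kernel uses the x86 bsfl() primitive; the portable userland
 * equivalent used in this standalone demo is ffs(mask) - 1.
 */
#if 0
#include <stdio.h>
#include <strings.h>

int
main(void)
{
        unsigned int mask = 0;

        mask |= 1 << 5;         /* queue 5 gains a process */
        mask |= 1 << 17;        /* queue 17 gains a process */
        if (mask)
                printf("first busy queue: %d\n", ffs(mask) - 1);  /* 5 */
        mask &= ~(1 << 5);      /* queue 5 drained: clear its bit */
        if (mask)
                printf("first busy queue: %d\n", ffs(mask) - 1);  /* 17 */
        return 0;
}
#endif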
static int runqcount;
SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
static int usched_steal;
SYSCTL_INT(_debug, OID_AUTO, usched_steal, CTLFLAG_RW,
        &usched_steal, 0, "Passive Release was nonoptimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
        &usched_optimal, 0, "Passive Release was optimal");
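/*
 * Count whether the thread being scheduled already sits on the current
 * cpu (optimal) or has to be pulled over from another cpu (steal).
 */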
#define USCHED_COUNTER(td) ((td->td_gd == mycpu) ? ++usched_optimal : ++usched_steal)
/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
        int i;

        for (i = 0; i < NQS; i++) {
                TAILQ_INIT(&queues[i]);
                TAILQ_INIT(&rtqueues[i]);
                TAILQ_INIT(&idqueues[i]);
        }
}
SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
/*
 * test_resched() returns non-zero if the newly runnable process newp
 * should be scheduled in preference to the currently designated
 * process curp.  Lower rtprio types and lower priority values win.
 */
static int
test_resched(struct proc *curp, struct proc *newp)
{
        if (newp->p_rtprio.type < curp->p_rtprio.type)
                return(1);
        if (newp->p_rtprio.type == curp->p_rtprio.type) {
                if (newp->p_rtprio.type == RTP_PRIO_NORMAL) {
                        if (newp->p_priority / PPQ <= curp->p_priority / PPQ)
                                return(1);
                } else if (newp->p_rtprio.prio < curp->p_rtprio.prio) {
                        return(1);
                }
        }
        return(0);
}
/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule.
 * chooseproc() will select a user process and return it.
 */
static
struct proc *
chooseproc(void)
{
        struct proc *p;
        struct rq *q;
        u_int32_t *which;
        u_int32_t pri;

        if (rtqueuebits) {
                pri = bsfl(rtqueuebits);
                q = &rtqueues[pri];
                which = &rtqueuebits;
        } else if (queuebits) {
                pri = bsfl(queuebits);
                q = &queues[pri];
                which = &queuebits;
        } else if (idqueuebits) {
                pri = bsfl(idqueuebits);
                q = &idqueues[pri];
                which = &idqueuebits;
        } else {
                return(NULL);
        }
        p = TAILQ_FIRST(q);
        KASSERT(p, ("chooseproc: no proc on busy queue"));
        TAILQ_REMOVE(q, p, p_procq);
        --runqcount;
        if (TAILQ_EMPTY(q))
                *which &= ~(1 << pri);
        KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
        p->p_flag &= ~P_ONRUNQ;
        return(p);
}
/*
 * setrunqueue() 'wakes up' a 'user' process, which can mean several things.
 *
 * If P_CP_RELEASED is set the user process is under the control of the
 * LWKT subsystem and we simply wake the thread up.  This is ALWAYS the
 * case when setrunqueue() is called from wakeup() and, in fact, wakeup()
 * asserts that P_CP_RELEASED is set.
 *
 * Note that acquire_curproc() already optimizes making the current process
 * P_CURPROC, so setrunqueue() does not need to.
 *
 * If P_CP_RELEASED is not set we place the process on the run queue and we
 * signal other cpus in the system that may need to be woken up to service
 * the new 'user' process.
 *
 * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 * cpus in an attempt to keep the process on the current cpu at least for
 * a little while to take advantage of locality of reference (e.g. fork/exec
 * or short fork/exit).
 *
 * WARNING!  A thread can be acquired by another cpu the moment it is put
 * on the user scheduler's run queue AND we release the MP lock.  Since we
 * release the MP lock before switching out another cpu may begin stealing
 * our current thread before we are completely switched out!  The
 * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 * thread before stealing it.
 *
 * The associated thread must NOT be scheduled.
 * The process must be runnable.
 * This must be called at splhigh().
 */
void
setrunqueue(struct proc *p)
{
        struct rq *q;
        u_int8_t pri;
        int cpuid;
        u_int32_t mask;

        crit_enter();
        KASSERT(p->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
        KASSERT((p->p_flag & (P_ONRUNQ|P_CURPROC)) == 0,
            ("process %d already on runq! flag %08x", p->p_pid, p->p_flag));
        KKASSERT((p->p_thread->td_flags & TDF_RUNQ) == 0);

        /*
         * If we have been released from the userland scheduler we
         * directly schedule its thread.
         */
        if (p->p_flag & P_CP_RELEASED) {
                lwkt_schedule(p->p_thread);
                crit_exit();
                return;
        }

        /*
         * If our cpu is available to run a user process we short cut and
         * schedule it directly.
         */
        cpuid = mycpu->gd_cpuid;
        if ((curprocmask & (1 << cpuid)) == 0) {
                curprocmask |= 1 << cpuid;
                p->p_flag |= P_CURPROC;
                USCHED_COUNTER(p->p_thread);
                lwkt_acquire(p->p_thread);
                lwkt_schedule(p->p_thread);
                crit_exit();
                return;
        }
        /*
         * Otherwise place this process on the userland scheduler's run
         * queue for action.
         */
        ++runqcount;
        p->p_flag |= P_ONRUNQ;
        if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
                pri = p->p_priority >> 2;
                q = &queues[pri];
                queuebits |= 1 << pri;
        } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
                   p->p_rtprio.type == RTP_PRIO_FIFO) {
                pri = (u_int8_t)p->p_rtprio.prio;
                q = &rtqueues[pri];
                rtqueuebits |= 1 << pri;
        } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
                pri = (u_int8_t)p->p_rtprio.prio;
                q = &idqueues[pri];
                idqueuebits |= 1 << pri;
        } else {
                panic("setrunqueue: invalid rtprio type");
        }
        p->p_rqindex = pri;             /* remember the queue index */
        TAILQ_INSERT_TAIL(q, p, p_procq);
        /*
         * Wakeup other cpus to schedule the newly available thread.
         * XXX doesn't really have to be in a critical section.
         * We own giant after all.
         *
         * We use rdyprocmask to avoid unnecessarily waking up the scheduler
         * thread when it is already running.
         */
        if ((mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
            (p->p_flag & P_PASSIVE_ACQ) == 0) {
                int count = runqcount;
                if (!mask)      /* XXX cannot occur, excluded by the test above */
                        printf("PROC %d nocpu to schedule it on\n", p->p_pid);
                while (mask && count) {
                        cpuid = bsfl(mask);
                        KKASSERT((curprocmask & (1 << cpuid)) == 0);
                        rdyprocmask &= ~(1 << cpuid);
                        lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
                        --count;
                        mask &= ~(1 << cpuid);
                }
        }
        crit_exit();
}
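/*
 * Worked example of the normal-class priority scaling used above (editor's
 * sketch, standalone and hypothetical): 128 priorities collapse onto 32
 * queues, PPQ (priorities per queue) is 4, and p_priority >> 2 picks the
 * queue, so priorities within the same group of 4 share one FIFO.
 */
#if 0
#include <stdio.h>

int
main(void)
{
        int prio;

        for (prio = 0; prio < 128; prio += 31)
                printf("p_priority %3d -> queue %2d\n", prio, prio >> 2);
        /* priorities 0, 31, 62, 93, 124 map to queues 0, 7, 15, 23, 31 */
        return 0;
}
#endif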
/*
 * remrunqueue() removes a given process from the run queue that it is on,
 * clearing the queue busy bit if it becomes empty.  This function is called
 * when a userland process is selected for LWKT scheduling.  Note that
 * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 * several userland processes whose threads are scheduled or otherwise in
 * a special state, and such processes are NOT on the userland scheduler's
 * run queue.
 *
 * This must be called at splhigh().
 */
void
remrunqueue(struct proc *p)
{
        struct rq *q;
        u_int32_t *which;
        u_int8_t pri;

        crit_enter();
        KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
        p->p_flag &= ~P_ONRUNQ;
        --runqcount;
        KKASSERT(runqcount >= 0);
        pri = p->p_rqindex;
        if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
                q = &queues[pri];
                which = &queuebits;
        } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
                   p->p_rtprio.type == RTP_PRIO_FIFO) {
                q = &rtqueues[pri];
                which = &rtqueuebits;
        } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
                q = &idqueues[pri];
                which = &idqueuebits;
        } else {
                panic("remrunqueue: invalid rtprio type");
        }
        TAILQ_REMOVE(q, p, p_procq);
        if (TAILQ_EMPTY(q)) {
                KASSERT((*which & (1 << pri)) != 0,
                    ("remrunqueue: remove from empty queue"));
                *which &= ~(1 << pri);
        }
        crit_exit();
}
/*
 * Release the P_CURPROC designation on the CURRENT process only.  This
 * will allow another userland process to be scheduled.  If we do not
 * have or cannot get the MP lock we just wakeup the scheduler thread for
 * this cpu and let it do the work.
 *
 * WARNING!  The MP lock may be in an unsynchronized state due to the
 * way get_mplock() works and the fact that this function may be called
 * from a passive release during a lwkt_switch().  try_mplock() will deal
 * with this for us but you should be aware that td_mpcount may not be
 * what you expect on entry.
 */
void
release_curproc(struct proc *p)
{
        int cpuid;
        struct proc *np;
#ifdef ONLY_ONE_USER_CPU
        KKASSERT(mycpu->gd_cpuid == 0 && p->p_thread->td_gd == mycpu);
#endif

        crit_enter();
        cpuid = p->p_thread->td_gd->gd_cpuid;
        p->p_flag |= P_CP_RELEASED;
        if (p->p_flag & P_CURPROC) {
                p->p_flag &= ~P_CURPROC;
                if (try_mplock()) {
                        KKASSERT(curprocmask & (1 << cpuid));
                        if ((np = chooseproc()) != NULL) {
                                np->p_flag |= P_CURPROC;
                                USCHED_COUNTER(np->p_thread);
                                lwkt_acquire(np->p_thread);
                                lwkt_schedule(np->p_thread);
                        } else {
                                curprocmask &= ~(1 << cpuid);
                        }
                        rel_mplock();
                } else {
                        curprocmask &= ~(1 << cpuid);
                        if (rdyprocmask & (1 << cpuid))
                                lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
                }
        }
        crit_exit();
}
/*
 * Acquire the P_CURPROC designation on the CURRENT process only.  This
 * function is called prior to returning to userland.  If the system
 * call or trap did not block and if no reschedule was requested it is
 * highly likely that the P_CURPROC flag is still set in the proc, and
 * we do almost nothing here.
 */
void
acquire_curproc(struct proc *p)
{
        int cpuid;
        struct proc *np;
        /*
         * Short cut, we've already acquired the designation or we never
         * lost it in the first place.
         */
        if ((p->p_flag & P_CURPROC) != 0)
                return;

        /*
         * Long cut.  This pulls in a bit of the userland scheduler as
         * an optimization.  If our cpu has not scheduled a userland
         * process we gladly fill the slot, otherwise we choose the best
         * candidate from the run queue and compare it against ourselves,
         * scheduling either us or him depending.
         *
         * If our cpu's slot isn't free we put ourselves on the userland
         * run queue and switch away.  We should have P_CURPROC when we
         * come back.  Note that a cpu change can occur when we come back.
         *
         * YYY don't need critical section, we hold giant and no interrupt
         * will mess w/ this proc?  Or will it?  What about curprocmask?
         */
#ifdef ONLY_ONE_USER_CPU
        KKASSERT(mycpu->gd_cpuid == 0 && p->p_thread->td_gd == mycpu);
#endif
        crit_enter();
        p->p_flag &= ~P_CP_RELEASED;
        while ((p->p_flag & P_CURPROC) == 0) {
                cpuid = p->p_thread->td_gd->gd_cpuid;   /* load/reload cpuid */
                if ((curprocmask & (1 << cpuid)) == 0) {
                        curprocmask |= 1 << cpuid;
                        if ((np = chooseproc()) != NULL) {
                                KKASSERT((np->p_flag & P_CP_RELEASED) == 0);
                                if (test_resched(p, np)) {
                                        np->p_flag |= P_CURPROC;
                                        USCHED_COUNTER(np->p_thread);
                                        lwkt_acquire(np->p_thread);
                                        lwkt_schedule(np->p_thread);
                                } else {
                                        p->p_flag |= P_CURPROC;
                                        setrunqueue(np);
                                }
                        } else {
                                p->p_flag |= P_CURPROC;
                        }
                }
                if ((p->p_flag & P_CURPROC) == 0) {
                        lwkt_deschedule_self();
                        setrunqueue(p);
                        lwkt_switch();          /* cpu may have changed */
                        KKASSERT((p->p_flag & (P_ONRUNQ|P_CURPROC|P_CP_RELEASED)) == P_CURPROC);
                }
        }
        crit_exit();
}
/*
 * Yield / synchronous reschedule.  This is a bit tricky because the trap
 * code might have set a lazy release on the switch function.  The first
 * thing we do is call lwkt_switch() to resolve the lazy release (if any).
 * Then, if we are a process, we want to allow another process to run.
 *
 * The only way to do that is to acquire and then release P_CURPROC.  We
 * have to release it because the kernel expects it to be released as a
 * sanity check when it goes to sleep.
 *
 * XXX we need a way to ensure that we wake up eventually from a yield,
 * even if we are an idprio process.
 */
void
uio_yield(void)
{
        struct thread *td = curthread;
        struct proc *p = td->td_proc;

        lwkt_switch();
        if (p) {
                p->p_flag |= P_PASSIVE_ACQ;
                acquire_curproc(p);
                release_curproc(p);
                p->p_flag &= ~P_PASSIVE_ACQ;
        }
}
/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().  UP systems do not
 * need the helper since there is only one cpu.  We can't use the idle
 * thread for this because we need to hold the MP lock.  Additionally,
 * doing things this way allows us to HLT idle cpus on MP systems.
 */
static void
sched_thread(void *dummy)
{
        int cpuid = mycpu->gd_cpuid;    /* doesn't change */
        u_int32_t cpumask = 1 << cpuid; /* doesn't change */
        struct proc *np;

#ifdef ONLY_ONE_USER_CPU
        KKASSERT(cpuid == 0);
#endif

        get_mplock();                   /* hold the MP lock */
        for (;;) {
                lwkt_deschedule_self(); /* interlock */
                rdyprocmask |= cpumask;
                if ((curprocmask & cpumask) == 0 && (np = chooseproc()) != NULL) {
                        curprocmask |= cpumask;
                        np->p_flag |= P_CURPROC;
                        USCHED_COUNTER(np->p_thread);
                        lwkt_acquire(np->p_thread);
                        lwkt_schedule(np->p_thread);
                }
                lwkt_switch();
        }
}
static void
sched_thread_init(void)
{
        int cpuid = mycpu->gd_cpuid;

        lwkt_create(sched_thread, NULL, NULL, &mycpu->gd_schedthread,
            TDF_STOPREQ, "usched %d", cpuid);
        curprocmask &= ~(1 << cpuid);   /* schedule user proc on cpu */
#ifdef ONLY_ONE_USER_CPU
        if (cpuid)
                curprocmask |= 1 << cpuid;      /* DISABLE USER PROCS */
#endif
        rdyprocmask |= 1 << cpuid;
}