/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/kern/kern_switch.c,v 1.3.2.1 2000/05/16 06:58:12 dillon Exp $
 * $DragonFly: src/sys/kern/Attic/kern_switch.c,v 1.18 2004/03/01 06:33:17 dillon Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <machine/ipl.h>
#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * debugging only YYY Remove me!  define to schedule user processes only
 * on the BSP.  Interrupts can still be taken on the APs.
 */
#undef ONLY_ONE_USER_CPU

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
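/*
 * e.g. if queuebits == 0x00000208, queues 3 and 9 are non-empty and
 * bsfl(queuebits) == 3 locates the best (lowest index) busy queue in
 * constant time regardless of how many queues are populated.
 */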
static struct rq queues[NQS];
static struct rq rtqueues[NQS];
static struct rq idqueues[NQS];
static u_int32_t queuebits;
static u_int32_t rtqueuebits;
static u_int32_t idqueuebits;
static cpumask_t curprocmask = -1;	/* currently running a user process */
static cpumask_t rdyprocmask;		/* ready to accept a user process */
static int	 runqcount;		/* processes on the run queues */
static int	 scancpu;		/* rotor for round-robin remote resched */

SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
static int usched_steal;
SYSCTL_INT(_debug, OID_AUTO, usched_steal, CTLFLAG_RW,
	&usched_steal, 0, "Passive Release was nonoptimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
	&usched_optimal, 0, "Passive Release was optimal");
static int remote_resched = 1;
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
	&remote_resched, 0, "Resched to another cpu");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
	&remote_resched_nonaffinity, 0, "Number of remote rescheds without cpu affinity");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
	&remote_resched_affinity, 0, "Number of remote rescheds with cpu affinity");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
	&choose_affinity, 0, "chooseproc() was smart");

#ifdef SMP
static void sched_thread_cpu_init(void);
#endif

#define USCHED_COUNTER(td)	((td->td_gd == mycpu) ? ++usched_optimal : ++usched_steal)
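/*
 * USCHED_COUNTER() tallies whether a thread was acquired on the cpu it
 * already resides on (optimal) or on some other cpu (a steal), feeding
 * the debug.usched_optimal and debug.usched_steal sysctls above.
 */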

/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
	int i;

	for (i = 0; i < NQS; i++) {
		TAILQ_INIT(&queues[i]);
		TAILQ_INIT(&rtqueues[i]);
		TAILQ_INIT(&idqueues[i]);
	}
	curprocmask &= ~1;
}
SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)

/*
 * Returns 1 if newp is in the same or a better (numerically lower)
 * priority queue than curp, i.e. a reschedule is warranted.
 */
static __inline int
test_resched(struct proc *curp, struct proc *newp)
{
	if (newp->p_priority / PPQ <= curp->p_priority / PPQ)
		return(1);
	return(0);
}
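/*
 * e.g. with PPQ == 4 (128 priorities folded onto 32 queues), p_priority
 * values 0-3 all compare equal here, so a process must be a full queue
 * ahead before it is considered strictly better than the incumbent.
 */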

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule.
 * chooseproc() will select a user process and return it.  If chkp is
 * non-NULL the returned process must be a better choice than chkp or
 * NULL is returned instead.
 */
static struct proc *
chooseproc(struct proc *chkp)
{
	struct proc *p;
	struct rq *q;
	u_int32_t *which;
	u_int32_t pri;

	if (rtqueuebits) {
		pri = bsfl(rtqueuebits);
		q = &rtqueues[pri];
		which = &rtqueuebits;
	} else if (queuebits) {
		pri = bsfl(queuebits);
		q = &queues[pri];
		which = &queuebits;
	} else if (idqueuebits) {
		pri = bsfl(idqueuebits);
		q = &idqueues[pri];
		which = &idqueuebits;
	} else {
		return(NULL);
	}
	p = TAILQ_FIRST(q);
	KASSERT(p, ("chooseproc: no proc on busy queue"));

	/*
	 * If the chosen process is not at a higher priority than chkp
	 * then return NULL without dequeueing a new process.
	 */
	if (chkp && !test_resched(chkp, p))
		return(NULL);

#ifdef SMP
	/*
	 * If the chosen process does not reside on this cpu spend a few
	 * cycles looking for a better candidate at the same priority level.
	 * This is a fallback check, setrunqueue() tries to wakeup the
	 * correct cpu and is our front-line affinity.
	 */
	if (p->p_thread->td_gd != mycpu &&
	    (chkp = TAILQ_NEXT(p, p_procq)) != NULL
	) {
		if (chkp->p_thread->td_gd == mycpu) {
			++choose_affinity;
			p = chkp;
		}
	}
#endif

	TAILQ_REMOVE(q, p, p_procq);
	--runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
	p->p_flag &= ~P_ONRUNQ;
	return(p);
}
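/*
 * Usage note: chooseproc(NULL) unconditionally dequeues the best runnable
 * process, while chooseproc(p) leaves the queues untouched and returns
 * NULL when p itself remains the best choice.
 */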

#ifdef SMP
/*
 * called via an ipi message to reschedule on another cpu.
 */
static void
need_resched_remote(void *dummy)
{
	need_resched();
}
#endif

/*
 * setrunqueue() 'wakes up' a 'user' process, which can mean several things.
 *
 * If P_CP_RELEASED is set the user process is under the control of the
 * LWKT subsystem and we simply wake the thread up.  This is ALWAYS the
 * case when setrunqueue() is called from wakeup() and, in fact, wakeup()
 * asserts that P_CP_RELEASED is set.
 *
 * Note that acquire_curproc() already optimizes making the current process
 * P_CURPROC, so setrunqueue() does not need to.
 *
 * If P_CP_RELEASED is not set we place the process on the run queue and we
 * signal other cpus in the system that may need to be woken up to service
 * the new 'user' process.
 *
 * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 * cpus in an attempt to keep the process on the current cpu at least for
 * a little while to take advantage of locality of reference (e.g. fork/exec
 * or short fork/exit).
 *
 * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 * or (user level) preempt on the same cpu that a process was previously
 * scheduled to.  If we cannot do this but we are at a sufficiently higher
 * priority than the processes running on other cpus, we will allow the
 * process to be stolen by another cpu.
 *
 * WARNING!  A thread can be acquired by another cpu the moment it is put
 * on the user scheduler's run queue AND we release the MP lock.  Since we
 * release the MP lock before switching out, another cpu may begin stealing
 * our current thread before we are completely switched out!  The
 * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 * thread before stealing it.
 *
 * The associated thread must NOT be scheduled.
 * The process must be runnable.
 * This must be called at splhigh().
 */
void
setrunqueue(struct proc *p)
{
	struct rq *q;
	struct globaldata *gd;
	int pri;
	int cpuid;
#ifdef SMP
	int count;
	cpumask_t mask;
#endif

	crit_enter();
	KASSERT(p->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
	KASSERT((p->p_flag & (P_ONRUNQ|P_CURPROC)) == 0,
	    ("process %d already on runq! flag %08x", p->p_pid, p->p_flag));
	KKASSERT((p->p_thread->td_flags & TDF_RUNQ) == 0);

	/*
	 * If we have been released from the userland scheduler we
	 * directly schedule its thread.
	 */
	if (p->p_flag & P_CP_RELEASED) {
		lwkt_schedule(p->p_thread);
		crit_exit();
		return;
	}

	/*
	 * Check cpu affinity.  The associated thread is stable at the
	 * moment.  Note that we may be checking another cpu here so we
	 * have to be careful.  Note that gd_upri only counts when the
	 * curprocmask bit is set for the cpu in question, and since it is
	 * only a hint we can modify it on another cpu's globaldata structure.
	 * We use it to prevent unnecessary IPIs (hence the - PPQ).
	 */
	gd = p->p_thread->td_gd;
	cpuid = gd->gd_cpuid;

	if ((curprocmask & (1 << cpuid)) == 0) {
		curprocmask |= 1 << cpuid;
		p->p_flag |= P_CURPROC;
		gd->gd_upri = p->p_priority;
		USCHED_COUNTER(p->p_thread);
		lwkt_schedule(p->p_thread);
		/* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
		crit_exit();
		if (gd != mycpu)
			++remote_resched_affinity;
		return;
	}

	/*
	 * gd and cpuid may still 'hint' at another cpu.  Even so we have
	 * to place this process on the userland scheduler's run queue for
	 * action by the target cpu.
	 */
	++runqcount;
	p->p_flag |= P_ONRUNQ;
	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
		pri = (p->p_priority & PRIMASK) >> 2;
		q = &queues[pri];
		queuebits |= 1 << pri;
	} else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
		   p->p_rtprio.type == RTP_PRIO_FIFO) {
		pri = (u_int8_t)p->p_rtprio.prio;
		q = &rtqueues[pri];
		rtqueuebits |= 1 << pri;
	} else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
		pri = (u_int8_t)p->p_rtprio.prio;
		q = &idqueues[pri];
		idqueuebits |= 1 << pri;
	} else {
		panic("setrunqueue: invalid rtprio type");
	}
	p->p_rqindex = pri;		/* remember the queue index */
	TAILQ_INSERT_TAIL(q, p, p_procq);
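
	/*
	 * e.g. a normal-class process with p_priority 37 lands on queue
	 * 37 >> 2 == 9 (with PPQ == 4, four adjacent priorities share a
	 * queue), and bit 9 of queuebits is set so chooseproc() can find it.
	 */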

#ifdef SMP
	/*
	 * Either wakeup other cpus user thread scheduler or request
	 * preemption on other cpus (which will also wakeup a HLT).
	 *
	 * NOTE!  gd and cpuid may still be our 'hint', not our current
	 * cpu info.
	 */
	count = runqcount;

	/*
	 * Check cpu affinity for user preemption (when the curprocmask bit
	 * is set).
	 */
	if (gd == mycpu) {
		if (p->p_priority / PPQ < gd->gd_upri / PPQ) {
			need_resched();
			--count;
		}
	} else if (remote_resched) {
		if (p->p_priority / PPQ < gd->gd_upri / PPQ) {
			gd->gd_upri = p->p_priority;
			lwkt_send_ipiq(gd, need_resched_remote, NULL);
			--count;
			++remote_resched_affinity;
		}
	}

	/*
	 * No affinity, first schedule to any cpus that do not have a current
	 * process.  If there is a free cpu we always schedule to it.
	 */
	if (count &&
	    (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
	    (p->p_flag & P_PASSIVE_ACQ) == 0) {
		if (!mask)
			printf("PROC %d nocpu to schedule it on\n", p->p_pid);
		while (mask && count) {
			cpuid = bsfl(mask);
			KKASSERT((curprocmask & (1 << cpuid)) == 0);
			rdyprocmask &= ~(1 << cpuid);
			lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
			--count;
			mask &= ~(1 << cpuid);
		}
	}

	/*
	 * If there are still runnable processes try to wakeup a random
	 * cpu that is running a much lower priority process in order to
	 * preempt on it.  Note that gd_upri is only a hint, so we can
	 * overwrite it from the wrong cpu.  If we can't find one, we
	 * are SOL.
	 *
	 * We depress the priority check so multiple cpu bound programs
	 * do not bounce between cpus.  Remember that the clock interrupt
	 * will also cause all cpus to reschedule.
	 *
	 * We must mask against rdyprocmask or we will race in the boot
	 * code (before all cpus have working scheduler helpers), plus
	 * some cpus might not be operational and/or not configured to
	 * handle user processes.
	 */
	if (count && remote_resched && ncpus > 1) {
		cpuid = scancpu;
		do {
			if (++cpuid == ncpus)
				cpuid = 0;
		} while (cpuid == mycpu->gd_cpuid);
		scancpu = cpuid;

		if (rdyprocmask & (1 << cpuid)) {
			gd = globaldata_find(cpuid);

			if (p->p_priority / PPQ < gd->gd_upri / PPQ - 2) {
				gd->gd_upri = p->p_priority;
				lwkt_send_ipiq(gd, need_resched_remote, NULL);
				++remote_resched_nonaffinity;
			}
		}
	}
#else
	if (p->p_priority / PPQ < gd->gd_upri / PPQ)
		need_resched();
#endif
	crit_exit();
}
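/*
 * Note the "- 2" band depression above: with PPQ == 4 a process in
 * priority band 10 only IPIs a remote cpu whose gd_upri hint is in band
 * 13 or worse, so two roughly equal cpu-bound programs do not ping-pong
 * between cpus.
 */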

/*
 * remrunqueue() removes a given process from the run queue that it is on,
 * clearing the queue busy bit if it becomes empty.  This function is called
 * when a userland process is selected for LWKT scheduling.  Note that
 * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 * several userland processes whose threads are scheduled or otherwise in
 * a special state, and such processes are NOT on the userland scheduler's
 * run queue.
 *
 * This must be called at splhigh().
 */
void
remrunqueue(struct proc *p)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	crit_enter();
	KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
	p->p_flag &= ~P_ONRUNQ;
	--runqcount;
	KKASSERT(runqcount >= 0);
	pri = p->p_rqindex;
	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
		q = &queues[pri];
		which = &queuebits;
	} else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
		   p->p_rtprio.type == RTP_PRIO_FIFO) {
		q = &rtqueues[pri];
		which = &rtqueuebits;
	} else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
		q = &idqueues[pri];
		which = &idqueuebits;
	} else {
		panic("remrunqueue: invalid rtprio type");
	}
	TAILQ_REMOVE(q, p, p_procq);
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
			("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
	crit_exit();
}

/*
 * Release the P_CURPROC designation on the current process for this cpu
 * and attempt to assign a new current process from the run queue.
 *
 * If we do not have or cannot get the MP lock we just wakeup the userland
 * helper scheduler thread for this cpu.
 *
 * WARNING!  The MP lock may be in an unsynchronized state due to the
 * way get_mplock() works and the fact that this function may be called
 * from a passive release during a lwkt_switch().  try_mplock() will deal
 * with this for us but you should be aware that td_mpcount may not be
 * useable.
 */
void
release_curproc(struct proc *p)
{
	int cpuid;
	struct proc *np;

#ifdef ONLY_ONE_USER_CPU
	KKASSERT(mycpu->gd_cpuid == 0 && p->p_thread->td_gd == mycpu);
#endif
	crit_enter();
	cpuid = p->p_thread->td_gd->gd_cpuid;
	if ((p->p_flag & P_CP_RELEASED) == 0) {
		p->p_flag |= P_CP_RELEASED;
		lwkt_setpri_self(TDPRI_KERN_USER);
	}
	if (p->p_flag & P_CURPROC) {
		p->p_flag &= ~P_CURPROC;
		curprocmask &= ~(1 << cpuid);
		if (try_mplock()) {
			/*
			 * Choose the next process to assign P_CURPROC to.
			 * Note that we cannot schedule gd_schedthread
			 * if runqcount is 0 without creating a scheduling
			 * loop.
			 */
			if ((np = chooseproc(NULL)) != NULL) {
				curprocmask |= 1 << cpuid;
				np->p_flag |= P_CURPROC;
				mycpu->gd_upri = np->p_priority;
				USCHED_COUNTER(np->p_thread);
				lwkt_acquire(np->p_thread);
				lwkt_schedule(np->p_thread);
			} else if (runqcount && (rdyprocmask & (1 << cpuid))) {
				rdyprocmask &= ~(1 << cpuid);
				lwkt_schedule(&mycpu->gd_schedthread);
			}
			rel_mplock();
		} else {
			KKASSERT(0);	/* MP LOCK ALWAYS HELD AT THE MOMENT */
			if (runqcount && (rdyprocmask & (1 << cpuid))) {
				rdyprocmask &= ~(1 << cpuid);
				lwkt_schedule(&mycpu->gd_schedthread);
			}
		}
	}
	crit_exit();
}

/*
 * Acquire the P_CURPROC designation on the CURRENT process only.  This
 * function is called prior to returning to userland.  If the system
 * call or trap did not block and if no reschedule was requested it is
 * highly likely that the P_CURPROC flag is still set in the proc, and
 * we do almost nothing here.
 */
void
acquire_curproc(struct proc *p)
{
	int cpuid;
	struct proc *np;

	/*
	 * Short cut, we've already acquired the designation or we never
	 * lost it in the first place.  P_CP_RELEASED is cleared, meaning
	 * that the process is again under the control of the userland
	 * scheduler.  We do not have to fiddle with the LWKT priority,
	 * the trap code (userret/userexit) will do that for us.
	 */
	if ((p->p_flag & P_CURPROC) != 0) {
		p->p_flag &= ~P_CP_RELEASED;
		return;
	}

	/*
	 * Long cut.  This pulls in a bit of the userland scheduler as
	 * an optimization.  If our cpu has not scheduled a userland
	 * process we gladly fill the slot, otherwise we choose the best
	 * candidate from the run queue and compare it against ourselves,
	 * scheduling either us or him depending.
	 *
	 * If our cpu's slot isn't free we put ourselves on the userland
	 * run queue and switch away.  We should have P_CURPROC when we
	 * come back.  Note that a cpu change can occur when we come back.
	 *
	 * YYY don't need critical section, we hold giant and no interrupt
	 * will mess w/ this proc?  Or will it?  What about curprocmask?
	 */
#ifdef ONLY_ONE_USER_CPU
	KKASSERT(mycpu->gd_cpuid == 0 && p->p_thread->td_gd == mycpu);
#endif
	crit_enter();

	while ((p->p_flag & P_CURPROC) == 0) {
		cpuid = p->p_thread->td_gd->gd_cpuid;

		/*
		 * (broken out from setrunqueue() as an optimization that
		 * allows us to avoid descheduling and rescheduling ourself)
		 *
		 * Interlock against the helper scheduler thread by setting
		 * curprocmask while we choose a new process.  Check our
		 * process against the new process to shortcut setrunqueue()
		 * and remrunqueue() operations.
		 */
		if ((curprocmask & (1 << cpuid)) == 0) {
			curprocmask |= 1 << cpuid;

			if ((np = chooseproc(p)) != NULL) {
				KKASSERT((np->p_flag & P_CP_RELEASED) == 0);
				np->p_flag |= P_CURPROC;
				mycpu->gd_upri = np->p_priority;
				USCHED_COUNTER(np->p_thread);
				lwkt_acquire(np->p_thread);
				lwkt_schedule(np->p_thread);
			} else {
				p->p_flag |= P_CURPROC;
			}
		}
		if ((p->p_flag & P_CURPROC) == 0) {
			lwkt_deschedule_self();
			p->p_flag &= ~P_CP_RELEASED;
			setrunqueue(p);
			lwkt_switch();	/* CPU CAN CHANGE DUE TO SETRUNQUEUE() */
			KASSERT((p->p_flag & (P_ONRUNQ|P_CURPROC|P_CP_RELEASED)) == P_CURPROC,
			    ("unexpected p_flag %08x acquiring P_CURPROC\n", p->p_flag));
		}
	}
	crit_exit();
}

/*
 * Yield / synchronous reschedule.  This is a bit tricky because the trap
 * code might have set a lazy release on the switch function.  Setting
 * P_PASSIVE_ACQ will ensure that the lazy release executes when we call
 * switch, and that we will not be rescheduled to another cpu when we attempt
 * to re-acquire P_CURPROC.
 *
 * We have to release P_CURPROC (by calling lwkt_switch()), and acquire it
 * again to yield to another user process.  Note that the release will
 * ensure that we are running at a kernel LWKT priority, and this priority
 * is not lowered through the reacquisition and rerelease sequence to ensure
 * that we do not deadlock against a higher priority *user* process.
 */
void
uio_yield(void)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;

	if (p) {
		p->p_flag |= P_PASSIVE_ACQ;
		lwkt_switch();
		p->p_flag &= ~P_PASSIVE_ACQ;
	} else {
		lwkt_switch();
	}
}
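/*
 * uio_yield() is typically called from long-running kernel loops (large
 * uiomove()-style transfers, for example) so that a cpu-bound system
 * call does not starve other runnable user processes.
 */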

#ifdef SMP

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().  UP systems do not
 * need the helper since there is only one cpu.  We can't use the idle
 * thread for this because we need to hold the MP lock.  Additionally,
 * doing things this way allows us to HLT idle cpus on MP systems.
 */
static void
sched_thread(void *dummy)
{
	int cpuid = mycpu->gd_cpuid;	/* doesn't change */
	u_int32_t cpumask = 1 << cpuid;	/* doesn't change */

#ifdef ONLY_ONE_USER_CPU
	KKASSERT(cpuid == 0);
#endif

	get_mplock();			/* hold the MP lock */
	for (;;) {
		struct proc *np;

		lwkt_deschedule_self();	/* interlock */
		rdyprocmask |= cpumask;
		if ((curprocmask & cpumask) == 0 && (np = chooseproc(NULL)) != NULL) {
			curprocmask |= cpumask;
			np->p_flag |= P_CURPROC;
			mycpu->gd_upri = np->p_priority;
			USCHED_COUNTER(np->p_thread);
			lwkt_acquire(np->p_thread);
			lwkt_schedule(np->p_thread);
		}
		lwkt_switch();
	}
}
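/*
 * Wakeup path recap: setrunqueue() clears the target cpu's rdyprocmask
 * bit and lwkt_schedule()s its gd_schedthread; the loop above re-arms
 * rdyprocmask and acquires the best process, so a HLTed idle cpu only
 * needs the wakeup generated by the remote lwkt_schedule() to resume
 * running user code.
 */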

/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
	int i;

	printf("start scheduler helpers on cpus:");

	for (i = 0; i < ncpus; ++i) {
		globaldata_t dgd = globaldata_find(i);
		cpumask_t mask = 1 << i;

		if ((mask & smp_active_mask) == 0)
			continue;

		printf(" %d", i);
		lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
			    TDF_STOPREQ, i, "usched %d", i);
#ifdef ONLY_ONE_USER_CPU
		if (i)
			curprocmask |= mask;	/* DISABLE USER PROCS */
#else
		if (i)
			curprocmask &= ~mask;	/* schedule user proc on cpu */
#endif
		rdyprocmask |= mask;
	}
	printf("\n");
}
SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)

#endif	/* SMP */