/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.3 2005/10/11 09:59:56 corecode Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/rtprio.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <machine/ipl.h>
#include <machine/cpu.h>
#include <machine/smp.h>
/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define NQS	32			/* 32 run queues. */
#define PPQ	(MAXPRI / NQS)		/* priorities per queue */
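
/*
 * With MAXPRI 128 and NQS 32, PPQ evaluates to 128 / 32 = 4, i.e. each
 * run queue covers four adjacent priority levels.
 */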
/*
 * NICEPPQ	- number of nice units per priority queue
 * ESTCPURAMP	- number of scheduler ticks for estcpu to switch queues
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 * ESTCPUINCR	- amount we have to increment p_estcpu per scheduling tick at
 *		  100% cpu.
 */
#define ESTCPUMAX	(ESTCPUPPQ * NQS)
#define ESTCPUINCR	(ESTCPUPPQ / ESTCPURAMP)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)
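
/*
 * Since ESTCPUINCR is ESTCPUPPQ / ESTCPURAMP, a process running at 100%
 * cpu accumulates one queue's worth of estcpu (ESTCPUPPQ) in ESTCPURAMP
 * scheduler ticks, and ESTCPULIM() caps the total at ESTCPUMAX
 * (ESTCPUPPQ * NQS).
 */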
#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_rqindex	lwp_usdata.bsd4.rqindex
#define lwp_origcpu	lwp_usdata.bsd4.origcpu
#define lwp_estcpu	lwp_usdata.bsd4.estcpu
static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_remrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
		sysclock_t cpstamp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *plp, struct lwp *lp);
static void bsd4_recalculate_estcpu(struct lwp *lp);
struct usched usched_bsd4 = {
	"bsd4", "Original DragonFly Scheduler",
	bsd4_acquire_curproc,
	bsd4_release_curproc,
	bsd4_recalculate_estcpu,
};
/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
static struct rq queues[NQS];
static struct rq rtqueues[NQS];
static struct rq idqueues[NQS];
static u_int32_t queuebits;
static u_int32_t rtqueuebits;
static u_int32_t idqueuebits;
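
/*
 * Example: if queuebits == 0x00000012, queues 1 and 4 of the normal class
 * are non-empty and bsfl(queuebits) == 1, so chooseproc() scans queue 1
 * (the better priority) first.
 */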
static cpumask_t curprocmask = -1;	/* currently running a user process */
static cpumask_t rdyprocmask;		/* ready to accept a user process */
static int	 runqcount;
SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");

static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
        &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
        &usched_optimal, 0, "acquire_curproc() was optimal");

static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
static int remote_resched = 1;
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
        &remote_resched, 0, "Resched to another cpu");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
        &remote_resched_nonaffinity, 0, "Number of remote rescheds without cpu affinity");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
        &remote_resched_affinity, 0, "Number of remote rescheds with cpu affinity");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
        &choose_affinity, 0, "chooseproc() was smart");
static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
        &usched_bsd4_rrinterval, 0, "");
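/*
 * (ESTCPUFREQ + 9) / 10 rounds ESTCPUFREQ / 10 up, so the default
 * round-robin interval works out to roughly ten round-robins per second
 * (see bsd4_schedulerclock() below).
 */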
static int usched_bsd4_decay = ESTCPUINCR / 2;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
        &usched_bsd4_decay, 0, "");
/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
	int i;

	for (i = 0; i < NQS; i++) {
		TAILQ_INIT(&queues[i]);
		TAILQ_INIT(&rtqueues[i]);
		TAILQ_INIT(&idqueues[i]);
	}
}
SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 */
static struct lwp *
chooseproc(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which;
	u_int32_t pri;

	if (rtqueuebits) {
		pri = bsfl(rtqueuebits);
		q = &rtqueues[pri];
		which = &rtqueuebits;
	} else if (queuebits) {
		pri = bsfl(queuebits);
		q = &queues[pri];
		which = &queuebits;
	} else if (idqueuebits) {
		pri = bsfl(idqueuebits);
		q = &idqueues[pri];
		which = &idqueuebits;
	} else {
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return NULL;
	}
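
	/*
	 * Example: with PPQ == 4 and a selected lp at lwp_priority 48, a
	 * chklp at priority 51 is kept (51 < 48 + PPQ) while a chklp at 52
	 * is not, i.e. chklp must be at least a full queue worse before it
	 * loses the current process designation.
	 */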

	/*
	 * If the chosen lwp does not reside on this cpu spend a few
	 * cycles looking for a better candidate at the same priority level.
	 * This is a fallback check, setrunqueue() tries to wakeup the
	 * correct cpu and is our front-line affinity.
	 */
	if (lp->lwp_thread->td_gd != mycpu &&
	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
	) {
		if (chklp->lwp_thread->td_gd == mycpu) {
			++choose_affinity;
			lp = chklp;
		}
	}

	TAILQ_REMOVE(q, lp, lwp_procq);
	--runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
	lp->lwp_proc->p_flag &= ~P_ONRUNQ;
	return lp;
}
/*
 * called via an ipi message to reschedule on another cpu.
 */
static void
need_user_resched_remote(void *dummy)
{
	need_user_resched();
}
/*
 * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
 * user process may represent any user process, including the current
 * process.
 *
 * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 * cpus in an attempt to keep the process on the current cpu at least for
 * a little while to take advantage of locality of reference (e.g. fork/exec
 * or short fork/exit, and uio_yield()).
 *
 * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 * or (user level) preempt on the same cpu that a process was previously
 * scheduled to.  If we cannot do this but we are at enough of a higher
 * priority than the processes running on other cpus, we will allow the
 * process to be stolen by another cpu.
 *
 * WARNING! a thread can be acquired by another cpu the moment it is put
 * on the user scheduler's run queue AND we release the MP lock.  Since we
 * release the MP lock before switching out another cpu may begin stealing
 * our current thread before we are completely switched out!  The
 * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 * thread before stealing it.
 *
 * NOTE on need_user_resched() calls: we have to call need_user_resched()
 * if the new process is more important than the current process, or if
 * the new process is the current process and is now less important than
 * other processes.
 *
 * The associated thread must NOT be scheduled.
 * The process must be runnable.
 * This must be called at splhigh().
 */
static void
bsd4_setrunqueue(struct lwp *lp)
{
	struct rq *q;
	struct globaldata *gd;
	int pri;
	int cpuid;
	u_int32_t needresched;
	int count;
	cpumask_t mask;

	KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0,
	    ("lwp %d/%d already on runq! flag %08x", lp->lwp_proc->p_pid,
	     lp->lwp_tid, lp->lwp_proc->p_flag));
	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

	/*
	 * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
	 */
	gd = lp->lwp_thread->td_gd;

	/*
	 * Because recalculate is only called once or twice for long sleeps,
	 * not every second forever while the process is sleeping, we have
	 * to manually call it to resynchronize p_cpbase on wakeup or it
	 * will wrap if the process was sleeping long enough (e.g. ~10 min
	 * with the ACPI timer) and really mess up the nticks calculation.
	 */
	if (lp->lwp_slptime) {
		bsd4_recalculate_estcpu(lp);
	}

	/*
	 * We have not been released, make sure that we are not the currently
	 * designated process.
	 */
	KKASSERT(gd->gd_uschedcp != lp);

	/*
	 * Check cpu affinity.  The associated thread is stable at the
	 * moment.  Note that we may be checking another cpu here so we
	 * have to be careful.  We are currently protected by the BGL.
	 *
	 * This allows us to avoid actually queueing the process.
	 * acquire_curproc() will handle any threads we mistakenly schedule.
	 */
	cpuid = gd->gd_cpuid;

	if ((curprocmask & (1 << cpuid)) == 0) {
		curprocmask |= 1 << cpuid;
		gd->gd_uschedcp = lp;
		gd->gd_upri = lp->lwp_priority;
		lwkt_schedule(lp->lwp_thread);
		/* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
		if (gd != mycpu)
			++remote_resched_affinity;
		return;
	}

	/*
	 * gd and cpuid may still 'hint' at another cpu.  Even so we have
	 * to place this process on the userland scheduler's run queue for
	 * action by the target cpu.
	 */
	++runqcount;
	lp->lwp_proc->p_flag |= P_ONRUNQ;
	if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
		pri = (lp->lwp_priority & PRIMASK) / PPQ;
		q = &queues[pri];
		queuebits |= 1 << pri;
		needresched = (queuebits & ((1 << pri) - 1));
	} else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
		   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
		pri = (u_int8_t)lp->lwp_rtprio.prio;
		q = &rtqueues[pri];
		rtqueuebits |= 1 << pri;
		needresched = (rtqueuebits & ((1 << pri) - 1));
	} else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
		pri = (u_int8_t)lp->lwp_rtprio.prio;
		q = &idqueues[pri];
		idqueuebits |= 1 << pri;
		needresched = (idqueuebits & ((1 << pri) - 1));
	} else {
		panic("setrunqueue: invalid rtprio type");
	}
	lp->lwp_rqindex = pri;		/* remember the queue index */
	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
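
	/*
	 * Note on needresched: (1 << pri) - 1 masks the bits for all queues
	 * with a numerically lower (i.e. better priority) index than the one
	 * we just inserted into, so needresched is non-zero when a better
	 * queue is already populated.  It is consulted below when lp happens
	 * to be the target cpu's currently designated user process.
	 */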

	/*
	 * Either wakeup other cpus user thread scheduler or request
	 * preemption on other cpus (which will also wakeup a HLT).
	 *
	 * NOTE!  gd and cpuid may still be our 'hint', not our current
	 * cpu info.
	 */
	count = runqcount;

	/*
	 * Check cpu affinity for user preemption (when the curprocmask bit
	 * is set).  Note that gd_upri is a speculative field (we modify
	 * another cpu's gd_upri to avoid sending ipiq storms).
	 */
	if (gd == mycpu) {
		if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
			if (lp->lwp_priority < gd->gd_upri - PPQ) {
				gd->gd_upri = lp->lwp_priority;
				need_user_resched();
				--count;
			} else if (gd->gd_uschedcp == lp && needresched) {
				need_user_resched();
				--count;
			}
		}
	} else if (remote_resched) {
		if (lp->lwp_priority < gd->gd_upri - PPQ) {
			gd->gd_upri = lp->lwp_priority;
			lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
			--count;
			++remote_resched_affinity;
		}
	}

	/*
	 * No affinity, first schedule to any cpus that do not have a current
	 * process.  If there is a free cpu we always schedule to it.
	 */
	if (count &&
	    (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
	    (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) {
		printf("lwp %d/%d nocpu to schedule it on\n",
		       lp->lwp_proc->p_pid, lp->lwp_tid);
		while (mask && count) {
			cpuid = bsfl(mask);
			KKASSERT((curprocmask & (1 << cpuid)) == 0);
			rdyprocmask &= ~(1 << cpuid);
			lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
			--count;
			mask &= ~(1 << cpuid);
		}
	}

	/*
	 * If there are still runnable processes try to wakeup a random
	 * cpu that is running a much lower priority process in order to
	 * preempt on it.  Note that gd_upri is only a hint, so we can
	 * overwrite it from the wrong cpu.  If we can't find one, we
	 * are out of luck.
	 *
	 * We depress the priority check so multiple cpu bound programs
	 * do not bounce between cpus.  Remember that the clock interrupt
	 * will also cause all cpus to reschedule.
	 *
	 * We must mask against rdyprocmask or we will race in the boot
	 * code (before all cpus have working scheduler helpers), plus
	 * some cpus might not be operational and/or not configured to
	 * handle user processes.
	 */
	if (count && remote_resched && ncpus > 1) {
		do {
			if (++cpuid == ncpus)
				cpuid = 0;
		} while (cpuid == mycpu->gd_cpuid);

		if (rdyprocmask & (1 << cpuid)) {
			gd = globaldata_find(cpuid);
			if (lp->lwp_priority < gd->gd_upri - PPQ) {
				gd->gd_upri = lp->lwp_priority;
				lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
				++remote_resched_nonaffinity;
			}
		}
	}

	if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
		if (lp->lwp_priority < gd->gd_upri - PPQ) {
			gd->gd_upri = lp->lwp_priority;
			need_user_resched();
		} else if (gd->gd_uschedcp == lp && needresched) {
			need_user_resched();
		}
	}
}
/*
 * remrunqueue() removes a given process from the run queue that it is on,
 * clearing the queue busy bit if it becomes empty.  This function is called
 * when a userland process is selected for LWKT scheduling.  Note that
 * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 * several userland processes whose threads are scheduled or otherwise in
 * a special state, and such processes are NOT on the userland scheduler's
 * run queue.
 *
 * This must be called at splhigh().
 */
static void
bsd4_remrunqueue(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
	lp->lwp_proc->p_flag &= ~P_ONRUNQ;
	--runqcount;
	KKASSERT(runqcount >= 0);
	pri = lp->lwp_rqindex;
	if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
		q = &queues[pri];
		which = &queuebits;
	} else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
		   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
		q = &rtqueues[pri];
		which = &rtqueuebits;
	} else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
		q = &idqueues[pri];
		which = &idqueuebits;
	} else {
		panic("remrunqueue: invalid rtprio type");
	}
	TAILQ_REMOVE(q, lp, lwp_procq);
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
		    ("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
}
/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
 */
static void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;

	/*
	 * Do we need to round-robin?  We round-robin 10 times a second.
	 * This should only occur for cpu-bound batch processes.
	 */
	if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
		gd->gd_rrcount = 0;
		need_user_resched();
	}

	/*
	 * As the process accumulates cpu time p_estcpu is bumped and may
	 * push the process into another scheduling queue.  It typically
	 * takes 4 ticks to bump the queue.
	 */
	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
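
	/*
	 * Each tick adds ESTCPUINCR estcpu (clamped to ESTCPUMAX).  The
	 * bsd4_resetpriority() call below notices when the accumulated
	 * estcpu has pushed the lwp into a different priority queue and
	 * requeues it accordingly.
	 */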

	/*
	 * Reducing p_origcpu over time causes more of our estcpu to be
	 * returned to the parent when we exit.  This is a small tweak
	 * for the batch detection heuristic.
	 */
	if (lp->lwp_origcpu)
		--lp->lwp_origcpu;

	/* XXX optimize, avoid lock if no reset is required */
	if (try_mplock()) {
		bsd4_resetpriority(lp);
		rel_mplock();
	}
}
/*
 * Release the current process designation on p.  P MUST BE CURPROC.
 * Attempt to assign a new current process from the run queue.
 *
 * This function is called from exit1(), tsleep(), and the passive
 * release code setup in <arch>/<arch>/trap.c
 *
 * If we do not have or cannot get the MP lock we just wakeup the userland
 * helper scheduler thread for this cpu to do the work for us.
 *
 * WARNING!  The MP lock may be in an unsynchronized state due to the
 * way get_mplock() works and the fact that this function may be called
 * from a passive release during a lwkt_switch().  try_mplock() will deal
 * with this for us but you should be aware that td_mpcount may not be
 * usable.
 */
static void
bsd4_release_curproc(struct lwp *lp)
{
	int cpuid;
	globaldata_t gd = mycpu;

	KKASSERT(lp->lwp_thread->td_gd == gd);
	cpuid = gd->gd_cpuid;

	if (gd->gd_uschedcp == lp) {
		if (try_mplock()) {
			/*
			 * YYY when the MP lock is not assumed (see else) we
			 * will have to check that gd_uschedcp is still == lp
			 * after acquisition of the MP lock
			 */
			gd->gd_uschedcp = NULL;
			gd->gd_upri = PRIBASE_NULL;
			bsd4_select_curproc(gd);
			rel_mplock();
		} else {
			KKASSERT(0);	/* MP LOCK ALWAYS HELD AT THE MOMENT */
			gd->gd_uschedcp = NULL;
			gd->gd_upri = PRIBASE_NULL;
			/* YYY uschedcp and curprocmask */
			if (runqcount && (rdyprocmask & (1 << cpuid))) {
				rdyprocmask &= ~(1 << cpuid);
				lwkt_schedule(&mycpu->gd_schedthread);
			}
		}
	}
}
/*
 * Select a new current process, potentially retaining gd_uschedcp.  However,
 * be sure to round-robin.  This routine is generally only called if a
 * reschedule is requested and that typically only occurs if a new process
 * has a better priority or when we are round-robining.
 *
 * NOTE: Must be called with giant held and the current cpu's gd.
 * NOTE: The caller must handle the situation where it loses a
 *	 uschedcp designation that it previously held, typically by
 *	 calling acquire_curproc() again.
 * NOTE: May not block
 */
static void
bsd4_select_curproc(globaldata_t gd)
{
	struct lwp *nlp;
	struct lwp *old;
	int cpuid = gd->gd_cpuid;

	clear_user_resched();

	/*
	 * Choose the next designated current user process.
	 * Note that we cannot schedule gd_schedthread
	 * if runqcount is 0 without creating a scheduling
	 * deadlock.
	 *
	 * We do not clear the user resched request here,
	 * we need to test it later when we re-acquire.
	 *
	 * NOTE: chooseproc returns NULL if the chosen lwp
	 * is gd_uschedcp. XXX needs cleanup.
	 */
	old = gd->gd_uschedcp;
	if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) {
		curprocmask |= 1 << cpuid;
		gd->gd_upri = nlp->lwp_priority;
		gd->gd_uschedcp = nlp;
		lwkt_acquire(nlp->lwp_thread);
		lwkt_schedule(nlp->lwp_thread);
	} else if (gd->gd_uschedcp) {
		gd->gd_upri = gd->gd_uschedcp->lwp_priority;
		KKASSERT(curprocmask & (1 << cpuid));
	} else if (runqcount && (rdyprocmask & (1 << cpuid))) {
		/*gd->gd_uschedcp = NULL;*/
		curprocmask &= ~(1 << cpuid);
		rdyprocmask &= ~(1 << cpuid);
		lwkt_schedule(&gd->gd_schedthread);
	} else {
		/*gd->gd_uschedcp = NULL;*/
		curprocmask &= ~(1 << cpuid);
	}
}
/*
 * Acquire the current process designation on the CURRENT process only.
 * This function is called at kernel-user priority (not userland priority)
 * when curlwp does not match gd_uschedcp.
 *
 * Basically we recalculate our estcpu to hopefully give us a more
 * favorable disposition, setrunqueue, then wait for the curlwp
 * designation to be handed to us (if the setrunqueue didn't do it).
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;

	++lp->lwp_stats->p_ru.ru_nivcsw;

	/*
	 * Loop until we become the current process.
	 */
	do {
		KKASSERT(lp == gd->gd_curthread->td_lwp);
		bsd4_recalculate_estcpu(lp);
		lwkt_deschedule_self(gd->gd_curthread);
		bsd4_setrunqueue(lp);
		lwkt_switch();

		/*
		 * WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU, RELOAD GD.
		 */
		gd = mycpu;
	} while (gd->gd_uschedcp != lp);

	/*
	 * That's it.  Cleanup, we are done.  The caller can return to
	 * user mode now.
	 */
	KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
}
/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
	int newpriority;
	int opq;
	int npq;

	/*
	 * Set p_priority for general process comparisons
	 */
	switch(lp->lwp_rtprio.type) {
	case RTP_PRIO_REALTIME:
		lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_NORMAL:
		break;
	case RTP_PRIO_IDLE:
		lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_THREAD:
		lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
		return;
	}

	/*
	 * NORMAL priorities fall through.  These are based on niceness
	 * and cpu use.  Lower numbers == higher priorities.
	 *
	 * Calculate our priority based on our niceness and estimated cpu.
	 * Note that the nice value adjusts the baseline, which affects
	 * cpu bursts but does not affect overall cpu use between cpu-bound
	 * processes.  The use of the nice field in the decay calculation
	 * controls the overall cpu use.
	 *
	 * This isn't an exact calculation.  We fit the full nice and
	 * estcpu range into the priority range so the actual PPQ value
	 * is incorrect, but it's still a reasonable way to think about it.
	 */
	newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
	newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ;
	newpriority = newpriority * MAXPRI /
	    (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
	newpriority = MIN(newpriority, MAXPRI - 1);	/* sanity */
	newpriority = MAX(newpriority, 0);		/* sanity */
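
	/*
	 * The scaling step above fits the combined nice + estcpu sum, whose
	 * maximum is roughly PRIO_RANGE * PPQ / NICEPPQ +
	 * ESTCPUMAX * PPQ / ESTCPUPPQ, into the 0..MAXPRI-1 range; the
	 * MIN/MAX clamps catch any rounding spill before the value is
	 * reduced to a queue index below.
	 */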
	npq = newpriority / PPQ;
	opq = (lp->lwp_priority & PRIMASK) / PPQ;
	if (lp->lwp_proc->p_stat == SRUN &&
	    (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) {
		/*
		 * We have to move the process to another queue
		 */
		bsd4_remrunqueue(lp);
		lp->lwp_priority = PRIBASE_NORMAL + newpriority;
		bsd4_setrunqueue(lp);
	} else {
		/*
		 * We can just adjust the priority and it will be picked
		 * up later.
		 */
		KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
		lp->lwp_priority = PRIBASE_NORMAL + newpriority;
	}
}
/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batch than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * Interactive processes will decay the boosted estcpu quickly while batch
 * processes will tend to compound it.
 * XXX lwp should be "spawning" instead of "forking"
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
	lp->lwp_origcpu = lp->lwp_estcpu;
	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
}
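
/*
 * Net effect: parent and child are each charged one full queue's worth
 * (ESTCPUPPQ) of estcpu at fork time, and the child records its starting
 * estcpu in lwp_origcpu.  bsd4_exiting() below returns only the estcpu
 * the child accumulated beyond that starting point to the parent.
 */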
/*
 * Called when the parent reaps a child.  Propagate cpu use by the child
 * back to the parent.
 */
static void
bsd4_exiting(struct lwp *plp, struct lwp *lp)
{
	int delta;

	if (plp->lwp_proc->p_pid != 1) {
		delta = lp->lwp_estcpu - lp->lwp_origcpu;
		if (delta > 0)
			plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta);
	}
}
/*
 * Called from acquire and from kern_synch's one-second timer with a
 * critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 */
static void
bsd4_recalculate_estcpu(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	sysclock_t cpbase;
	int loadfac;
	int ndecay;
	int nticks;
	int nleft;

	/*
	 * We have to subtract periodic to get the last schedclock
	 * timeout time, otherwise we would get the upcoming timeout.
	 * Keep in mind that a process can migrate between cpus and
	 * while the scheduler clock should be very close, boundary
	 * conditions could lead to a small negative delta.
	 */
	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

	if (lp->lwp_slptime > 1) {
		/*
		 * Too much time has passed, do a coarse correction.
		 */
		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
	} else if (lp->lwp_cpbase != cpbase) {
		/*
		 * Adjust estcpu if we are in a different tick.  Don't waste
		 * time if we are in the same tick.
		 *
		 * First calculate the number of ticks in the measurement
		 * interval.
		 */
		nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic;
		updatepcpu(lp, lp->lwp_cpticks, nticks);

		if ((nleft = nticks - lp->lwp_cpticks) < 0)
			nleft = 0;
		if (usched_debug == lp->lwp_proc->p_pid) {
			printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d",
			    lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu,
			    lp->lwp_cpticks, nticks, nleft);
		}

		/*
		 * Calculate a decay value based on ticks remaining scaled
		 * down by the instantaneous load and p_nice.
		 */
		if ((loadfac = runqcount) < 2)
			loadfac = 2;
		ndecay = nleft * usched_bsd4_decay * 2 *
		    (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2);
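
		/*
		 * Directionally: more idle ticks (nleft), a smaller load
		 * factor, and a more negative nice value all yield a larger
		 * ndecay, so lightly loaded systems and high-priority
		 * processes shed estcpu faster.
		 */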

		/*
		 * Adjust p_estcpu.  Handle a border case where batch jobs
		 * can get stalled long enough to decay to zero when they
		 * are not making progress.
		 */
		if (lp->lwp_estcpu > ndecay * 2)
			lp->lwp_estcpu -= ndecay;
		else
			lp->lwp_estcpu >>= 1;

		if (usched_debug == lp->lwp_proc->p_pid)
			printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu);

		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
	}
}
/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().  UP systems do not
 * need the helper since there is only one cpu.  We can't use the idle
 * thread for this because we need to hold the MP lock.  Additionally,
 * doing things this way allows us to HLT idle cpus on MP systems.
 */
static void
sched_thread(void *dummy)
{
	globaldata_t gd = mycpu;
	int cpuid = gd->gd_cpuid;		/* doesn't change */
	u_int32_t cpumask = 1 << cpuid;		/* doesn't change */

	get_mplock();				/* hold the MP lock */
	for (;;) {
		struct lwp *nlp;

		lwkt_deschedule_self(gd->gd_curthread);	/* interlock */
		rdyprocmask |= cpumask;
		crit_enter_quick(gd->gd_curthread);
		if ((curprocmask & cpumask) == 0 &&
		    (nlp = chooseproc(NULL)) != NULL) {
			curprocmask |= cpumask;
			gd->gd_upri = nlp->lwp_priority;
			gd->gd_uschedcp = nlp;
			lwkt_acquire(nlp->lwp_thread);
			lwkt_schedule(nlp->lwp_thread);
		}
		crit_exit_quick(gd->gd_curthread);
		lwkt_switch();
	}
}
/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
	int i;

	printf("start scheduler helpers on cpus:");

	for (i = 0; i < ncpus; ++i) {
		globaldata_t dgd = globaldata_find(i);
		cpumask_t mask = 1 << i;

		if ((mask & smp_active_mask) == 0)
			continue;

		lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
		    TDF_STOPREQ, i, "usched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has already
		 * been enabled in rqinit().
		 */
		curprocmask &= ~mask;
		rdyprocmask |= mask;
	}
}
SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)