/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 *
 * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.2 2005/09/27 18:03:32 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/rtprio.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <machine/ipl.h>
#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */
#define PRIMASK                 (MAXPRI - 1)
#define PRIBASE_REALTIME        0
#define PRIBASE_NORMAL          MAXPRI
#define PRIBASE_IDLE            (MAXPRI * 2)
#define PRIBASE_THREAD          (MAXPRI * 3)
#define PRIBASE_NULL            (MAXPRI * 4)

#define NQS     32              /* 32 run queues. */
#define PPQ     (MAXPRI / NQS)  /* priorities per queue */
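
/*
 * Worked example (illustrative only; assumes MAXPRI is 128, which is what
 * the "four priority levels" note above implies given NQS == 32):
 *
 *      PPQ     = MAXPRI / NQS = 128 / 32 = 4 priorities per queue
 *      PRIMASK = MAXPRI - 1   = 0x7f
 *
 * (p_priority & PRIMASK) extracts the 0-127 within-class priority while the
 * PRIBASE_* constants (0, 128, 256, ...) select the scheduling class, so a
 * normal-class process with p_priority == PRIBASE_NORMAL + 37 lands in run
 * queue ((PRIBASE_NORMAL + 37) & PRIMASK) / PPQ == 37 / 4 == 9.
 */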
/*
 * NICEPPQ      - number of nice units per priority queue
 * ESTCPURAMP   - number of scheduler ticks for estcpu to switch queues
 * ESTCPUPPQ    - number of estcpu units per priority queue
 * ESTCPUMAX    - number of estcpu units
 * ESTCPUINCR   - amount we have to increment p_estcpu per scheduling tick at
 */
#define ESTCPUMAX       (ESTCPUPPQ * NQS)
#define ESTCPUINCR      (ESTCPUPPQ / ESTCPURAMP)
#define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)    min((v), ESTCPUMAX)

#define p_priority      p_usdata.bsd4.priority
#define p_rqindex       p_usdata.bsd4.rqindex
#define p_origcpu       p_usdata.bsd4.origcpu
#define p_estcpu        p_usdata.bsd4.estcpu

static void bsd4_acquire_curproc(struct proc *p);
static void bsd4_release_curproc(struct proc *p);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct proc *p);
static void bsd4_remrunqueue(struct proc *p);
static void bsd4_schedulerclock(struct proc *p, sysclock_t period,
                                sysclock_t cpstamp);
static void bsd4_resetpriority(struct proc *p);
static void bsd4_forking(struct proc *pp, struct proc *p);
static void bsd4_exiting(struct proc *pp, struct proc *p);
static void bsd4_recalculate_estcpu(struct proc *p);

struct usched usched_bsd4 = {
        "bsd4", "Original DragonFly Scheduler",
        bsd4_acquire_curproc,
        bsd4_release_curproc,
        bsd4_recalculate_estcpu,

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 */
static struct rq queues[NQS];
static struct rq rtqueues[NQS];
static struct rq idqueues[NQS];
static u_int32_t queuebits;
static u_int32_t rtqueuebits;
static u_int32_t idqueuebits;
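
/*
 * Illustrative example of the queuebits bookkeeping described above
 * (assumes the x86-style bsfl(), which returns the index of the lowest
 * set bit):
 *
 *      queuebits == 0x00000028         queues 3 and 5 are non-empty
 *      bsfl(queuebits) == 3            queue 3 is the best candidate
 *
 * When queue 3 is emptied, chooseproc() clears bit 3 with
 * "*which &= ~(1 << 3)" and the next bsfl() returns 5.
 */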
static cpumask_t curprocmask = -1;      /* currently running a user process */
static cpumask_t rdyprocmask;           /* ready to accept a user process */
static int runqcount;

SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");

static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
        &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
        &usched_optimal, 0, "acquire_curproc() was optimal");

static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");

static int remote_resched = 1;
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
        &remote_resched, 0, "Resched to another cpu");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
        &remote_resched_nonaffinity, 0, "Number of non-affinity remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
        &remote_resched_affinity, 0, "Number of affinity remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
        &choose_affinity, 0, "chooseproc() was smart");

static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
        &usched_bsd4_rrinterval, 0, "");
static int usched_bsd4_decay = ESTCPUINCR / 2;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
        &usched_bsd4_decay, 0, "");
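
/*
 * Worked example for the default above (illustrative; ESTCPUFREQ is the
 * rate at which bsd4_schedulerclock() runs):
 *
 *      usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10
 *
 * i.e. roughly one tenth of a second's worth of scheduler ticks, rounded
 * up, matching the "round-robin 10 times a second" note in
 * bsd4_schedulerclock().  With a hypothetical ESTCPUFREQ of 50 the
 * interval would be (50 + 9) / 10 == 5 ticks.
 */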
/*
 * Initialize the run queues at boot time.
 */
        for (i = 0; i < NQS; i++) {
                TAILQ_INIT(&queues[i]);
                TAILQ_INIT(&rtqueues[i]);
                TAILQ_INIT(&idqueues[i]);

SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 * it selects a user process and returns it.  If chkp is non-NULL and chkp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chkp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 */
chooseproc(struct proc *chkp)
                pri = bsfl(rtqueuebits);
                which = &rtqueuebits;
        } else if (queuebits) {
                pri = bsfl(queuebits);
        } else if (idqueuebits) {
                pri = bsfl(idqueuebits);
                which = &idqueuebits;
        KASSERT(p, ("chooseproc: no proc on busy queue"));

        /*
         * If the passed process <chkp> is reasonably close to the selected
         * process <p>, return NULL (indicating that <chkp> should be kept).
         *
         * Note that we must err on the side of <chkp> to avoid bouncing
         * between threads in the acquire code.
         */
        if (chkp->p_priority < p->p_priority + PPQ)

        /*
         * If the chosen process does not reside on this cpu spend a few
         * cycles looking for a better candidate at the same priority level.
         * This is a fallback check, setrunqueue() tries to wakeup the
         * correct cpu and is our front-line affinity.
         */
        if (p->p_thread->td_gd != mycpu &&
            (chkp = TAILQ_NEXT(p, p_procq)) != NULL
        ) {
                if (chkp->p_thread->td_gd == mycpu) {
        TAILQ_REMOVE(q, p, p_procq);
        *which &= ~(1 << pri);
        KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
        p->p_flag &= ~P_ONRUNQ;

/*
 * Called via an ipi message to reschedule on another cpu.
 */
need_user_resched_remote(void *dummy)

/*
 * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
 * user process may represent any user process, including the current
 *
 * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 * cpus in an attempt to keep the process on the current cpu at least for
 * a little while to take advantage of locality of reference (e.g. fork/exec
 * or short fork/exit, and uio_yield()).
 *
 * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 * or (user level) preempt on the same cpu that a process was previously
 * scheduled to.  If we cannot do this but we are at enough of a higher
 * priority than the processes running on other cpus, we will allow the
 * process to be stolen by another cpu.
 *
 * WARNING! a thread can be acquired by another cpu the moment it is put
 * on the user scheduler's run queue AND we release the MP lock.  Since we
 * release the MP lock before switching out another cpu may begin stealing
 * our current thread before we are completely switched out!  The
 * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 * thread before stealing it.
 *
 * NOTE on need_user_resched() calls: we have to call need_user_resched()
 * if the new process is more important than the current process, or if
 * the new process is the current process and is now less important than
 *
 * The associated thread must NOT be scheduled.
 * The process must be runnable.
 * This must be called at splhigh().
 */
bsd4_setrunqueue(struct proc *p)
        struct globaldata *gd;
        u_int32_t needresched;

        KASSERT(p->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
        KASSERT((p->p_flag & P_ONRUNQ) == 0,
            ("process %d already on runq! flag %08x", p->p_pid, p->p_flag));
        KKASSERT((p->p_thread->td_flags & TDF_RUNQ) == 0);

        /*
         * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
         */
        gd = p->p_thread->td_gd;

        /*
         * Because recalculate is only called once or twice for long sleeps,
         * not every second forever while the process is sleeping, we have
         * to manually call it to resynchronize p_cpbase on wakeup or it
         * will wrap if the process was sleeping long enough (e.g. ~10 min
         * with the ACPI timer) and really mess up the nticks calculation.
         */
        bsd4_recalculate_estcpu(p);

        /*
         * We have not been released, make sure that we are not the currently
         * designated process.
         */
        KKASSERT(gd->gd_uschedcp != p);

        /*
         * Check cpu affinity.  The associated thread is stable at the
         * moment.  Note that we may be checking another cpu here so we
         * have to be careful.  We are currently protected by the BGL.
         *
         * This allows us to avoid actually queueing the process.
         * acquire_curproc() will handle any threads we mistakenly schedule.
         */
        cpuid = gd->gd_cpuid;

        if ((curprocmask & (1 << cpuid)) == 0) {
                curprocmask |= 1 << cpuid;
                gd->gd_upri = p->p_priority;
                lwkt_schedule(p->p_thread);
                /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
                ++remote_resched_affinity;

        /*
         * gd and cpuid may still 'hint' at another cpu.  Even so we have
         * to place this process on the userland scheduler's run queue for
         * action by the target cpu.
         */
        p->p_flag |= P_ONRUNQ;
        if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
                pri = (p->p_priority & PRIMASK) / PPQ;
                queuebits |= 1 << pri;
                needresched = (queuebits & ((1 << pri) - 1));
        } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
                   p->p_rtprio.type == RTP_PRIO_FIFO) {
                pri = (u_int8_t)p->p_rtprio.prio;
                rtqueuebits |= 1 << pri;
                needresched = (rtqueuebits & ((1 << pri) - 1));
        } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
                pri = (u_int8_t)p->p_rtprio.prio;
                idqueuebits |= 1 << pri;
                needresched = (idqueuebits & ((1 << pri) - 1));
                panic("setrunqueue: invalid rtprio type");
        p->p_rqindex = pri;             /* remember the queue index */
        TAILQ_INSERT_TAIL(q, p, p_procq);
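
        /*
         * Illustrative note on the needresched test above: ((1 << pri) - 1)
         * masks the queue bits for every queue with a better (numerically
         * lower) index than the one we just used.  For example, with
         * pri == 5:
         *
         *      (1 << 5) - 1 == 0x1f            covers queues 0-4
         *
         * so needresched is non-zero only if a better-priority queue already
         * has work pending, which the gd_uschedcp == p tests below key off.
         */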
        /*
         * Either wake up other cpus' user thread scheduler or request
         * preemption on other cpus (which will also wakeup a HLT).
         *
         * NOTE! gd and cpuid may still be our 'hint', not our current
         */

        /*
         * Check cpu affinity for user preemption (when the curprocmask bit
         * is set).  Note that gd_upri is a speculative field (we modify
         * another cpu's gd_upri to avoid sending ipiq storms).
         */
        if ((p->p_thread->td_flags & TDF_NORESCHED) == 0) {
                if (p->p_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = p->p_priority;
                } else if (gd->gd_uschedcp == p && needresched) {
        } else if (remote_resched) {
                if (p->p_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = p->p_priority;
                        lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
                        ++remote_resched_affinity;

        /*
         * No affinity, first schedule to any cpus that do not have a current
         * process.  If there is a free cpu we always schedule to it.
         */
            (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
            (p->p_flag & P_PASSIVE_ACQ) == 0) {
                printf("PROC %d nocpu to schedule it on\n", p->p_pid);
                while (mask && count) {
                        KKASSERT((curprocmask & (1 << cpuid)) == 0);
                        rdyprocmask &= ~(1 << cpuid);
                        lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
                        mask &= ~(1 << cpuid);

        /*
         * If there are still runnable processes try to wakeup a random
         * cpu that is running a much lower priority process in order to
         * preempt on it.  Note that gd_upri is only a hint, so we can
         * overwrite it from the wrong cpu.  If we can't find one, we
         *
         * We depress the priority check so multiple cpu bound programs
         * do not bounce between cpus.  Remember that the clock interrupt
         * will also cause all cpus to reschedule.
         *
         * We must mask against rdyprocmask or we will race in the boot
         * code (before all cpus have working scheduler helpers), plus
         * some cpus might not be operational and/or not configured to
         * handle user processes.
         */
        if (count && remote_resched && ncpus > 1) {
                        if (++cpuid == ncpus)
                } while (cpuid == mycpu->gd_cpuid);
                if (rdyprocmask & (1 << cpuid)) {
                        gd = globaldata_find(cpuid);
                        if (p->p_priority < gd->gd_upri - PPQ) {
                                gd->gd_upri = p->p_priority;
                                lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
                                ++remote_resched_nonaffinity;
        if ((p->p_thread->td_flags & TDF_NORESCHED) == 0) {
                if (p->p_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = p->p_priority;
        } else if (gd->gd_uschedcp == p && needresched) {

/*
 * remrunqueue() removes a given process from the run queue that it is on,
 * clearing the queue busy bit if it becomes empty.  This function is called
 * when a userland process is selected for LWKT scheduling.  Note that
 * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 * several userland processes whose threads are scheduled or otherwise in
 * a special state, and such processes are NOT on the userland scheduler's
 *
 * This must be called at splhigh().
 */
bsd4_remrunqueue(struct proc *p)
        KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
        p->p_flag &= ~P_ONRUNQ;
        KKASSERT(runqcount >= 0);

        if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
        } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
                   p->p_rtprio.type == RTP_PRIO_FIFO) {
                which = &rtqueuebits;
        } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
                which = &idqueuebits;
                panic("remrunqueue: invalid rtprio type");
        TAILQ_REMOVE(q, p, p_procq);
        if (TAILQ_EMPTY(q)) {
                KASSERT((*which & (1 << pri)) != 0,
                    ("remrunqueue: remove from empty queue"));
                *which &= ~(1 << pri);

/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
 */
bsd4_schedulerclock(struct proc *p, sysclock_t period, sysclock_t cpstamp)
        globaldata_t gd = mycpu;

        /*
         * Do we need to round-robin?  We round-robin 10 times a second.
         * This should only occur for cpu-bound batch processes.
         */
        if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {

        /*
         * As the process accumulates cpu time p_estcpu is bumped and may
         * push the process into another scheduling queue.  It typically
         * takes 4 ticks to bump the queue.
         */
        p->p_estcpu = ESTCPULIM(p->p_estcpu + ESTCPUINCR);
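
        /*
         * Illustrative arithmetic for the ramp described above: ESTCPUINCR
         * is ESTCPUPPQ / ESTCPURAMP, so it takes ESTCPURAMP scheduler ticks
         * to accumulate one queue's worth (ESTCPUPPQ) of estcpu.  The
         * "4 ticks" figure corresponds to ESTCPURAMP == 4; the actual values
         * come from the ESTCPU* defines near the top of the file.
         */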
        /*
         * Reducing p_origcpu over time causes more of our estcpu to be
         * returned to the parent when we exit.  This is a small tweak
         * for the batch detection heuristic.
         */

        /* XXX optimize, avoid lock if no reset is required */
        bsd4_resetpriority(p);

/*
 * Release the current process designation on p.  P MUST BE CURPROC.
 * Attempt to assign a new current process from the run queue.
 *
 * This function is called from exit1(), tsleep(), and the passive
 * release code setup in <arch>/<arch>/trap.c
 *
 * If we do not have or cannot get the MP lock we just wakeup the userland
 * helper scheduler thread for this cpu to do the work for us.
 *
 * WARNING! The MP lock may be in an unsynchronized state due to the
 * way get_mplock() works and the fact that this function may be called
 * from a passive release during a lwkt_switch().  try_mplock() will deal
 * with this for us but you should be aware that td_mpcount may not be
 */
bsd4_release_curproc(struct proc *p)
        globaldata_t gd = mycpu;

        KKASSERT(p->p_thread->td_gd == gd);

        cpuid = gd->gd_cpuid;

        if (gd->gd_uschedcp == p) {
                /*
                 * YYY when the MP lock is not assumed (see else) we
                 * will have to check that gd_uschedcp is still == p
                 * after acquisition of the MP lock
                 */
                gd->gd_uschedcp = NULL;
                gd->gd_upri = PRIBASE_NULL;
                bsd4_select_curproc(gd);
                KKASSERT(0);    /* MP LOCK ALWAYS HELD AT THE MOMENT */
                gd->gd_uschedcp = NULL;
                gd->gd_upri = PRIBASE_NULL;
                /* YYY uschedcp and curprocmask */
                if (runqcount && (rdyprocmask & (1 << cpuid))) {
                        rdyprocmask &= ~(1 << cpuid);
                        lwkt_schedule(&mycpu->gd_schedthread);

/*
 * Select a new current process, potentially retaining gd_uschedcp.  However,
 * be sure to round-robin.  This routine is generally only called if a
 * reschedule is requested and that typically only occurs if a new process
 * has a better priority or when we are round-robining.
 *
 * NOTE: Must be called with giant held and the current cpu's gd.
 * NOTE: The caller must handle the situation where it loses a
 *       uschedcp designation that it previously held, typically by
 *       calling acquire_curproc() again.
 * NOTE: May not block
 */
bsd4_select_curproc(globaldata_t gd)
        int cpuid = gd->gd_cpuid;

        clear_user_resched();

        /*
         * Choose the next designated current user process.
         * Note that we cannot schedule gd_schedthread
         * if runqcount is 0 without creating a scheduling
         *
         * We do not clear the user resched request here,
         * we need to test it later when we re-acquire.
         *
         * NOTE: chooseproc returns NULL if the chosen proc
         *       is gd_uschedcp.  XXX needs cleanup.
         */
        old = gd->gd_uschedcp;
        if ((np = chooseproc(gd->gd_uschedcp)) != NULL) {
                curprocmask |= 1 << cpuid;
                gd->gd_upri = np->p_priority;
                gd->gd_uschedcp = np;
                lwkt_acquire(np->p_thread);
                lwkt_schedule(np->p_thread);
        } else if (gd->gd_uschedcp) {
                gd->gd_upri = gd->gd_uschedcp->p_priority;
                KKASSERT(curprocmask & (1 << cpuid));
        } else if (runqcount && (rdyprocmask & (1 << cpuid))) {
                /*gd->gd_uschedcp = NULL;*/
                curprocmask &= ~(1 << cpuid);
                rdyprocmask &= ~(1 << cpuid);
                lwkt_schedule(&gd->gd_schedthread);
                /*gd->gd_uschedcp = NULL;*/
                curprocmask &= ~(1 << cpuid);

/*
 * Acquire the current process designation on the CURRENT process only.
 * This function is called at kernel-user priority (not userland priority)
 * when curproc does not match gd_uschedcp.
 *
 * Basically we recalculate our estcpu to hopefully give us a more
 * favorable disposition, setrunqueue, then wait for the curproc
 * designation to be handed to us (if the setrunqueue didn't do it).
 */
bsd4_acquire_curproc(struct proc *p)
        globaldata_t gd = mycpu;

        ++p->p_stats->p_ru.ru_nivcsw;

        /*
         * Loop until we become the current process.
         */
                KKASSERT(p == gd->gd_curthread->td_proc);
                bsd4_recalculate_estcpu(p);
                lwkt_deschedule_self(gd->gd_curthread);

                /*
                 * WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU, RELOAD GD.
                 */
        } while (gd->gd_uschedcp != p);

        /*
         * That's it.  Cleanup, we are done.  The caller can return to
         */
        KKASSERT((p->p_flag & P_ONRUNQ) == 0);

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
bsd4_resetpriority(struct proc *p)
        /*
         * Set p_priority for general process comparisons
         */
        switch(p->p_rtprio.type) {
        case RTP_PRIO_REALTIME:
                p->p_priority = PRIBASE_REALTIME + p->p_rtprio.prio;
        case RTP_PRIO_NORMAL:
                break;
        case RTP_PRIO_IDLE:
                p->p_priority = PRIBASE_IDLE + p->p_rtprio.prio;
        case RTP_PRIO_THREAD:
                p->p_priority = PRIBASE_THREAD + p->p_rtprio.prio;

        /*
         * NORMAL priorities fall through.  These are based on niceness
         * and cpu use.  Lower numbers == higher priorities.
         *
         * Calculate our priority based on our niceness and estimated cpu.
         * Note that the nice value adjusts the baseline, which affects
         * cpu bursts but does not affect overall cpu use between cpu-bound
         * processes.  The use of the nice field in the decay calculation
         * controls the overall cpu use.
         *
         * This isn't an exact calculation.  We fit the full nice and
         * estcpu range into the priority range so the actual PPQ value
         * is incorrect, but it's still a reasonable way to think about it.
         */
        newpriority = (p->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
        newpriority += p->p_estcpu * PPQ / ESTCPUPPQ;
        newpriority = newpriority * MAXPRI /
                    (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
        newpriority = MIN(newpriority, MAXPRI - 1);     /* sanity */
        newpriority = MAX(newpriority, 0);              /* sanity */
        npq = newpriority / PPQ;
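
        /*
         * Worked example of the scaling above (illustrative only; assumes
         * the usual PRIO_MIN == -20 and PRIO_MAX == 20, i.e. PRIO_RANGE
         * == 41):
         *
         *      nice == PRIO_MIN, estcpu == 0           -> newpriority == 0
         *      nice == PRIO_MAX, estcpu == ESTCPUMAX   -> newpriority ~= MAXPRI
         *
         * The numerator never exceeds the denominator, so after the MIN/MAX
         * clamps newpriority always falls in 0..MAXPRI-1 and npq selects one
         * of the NQS normal-class run queues.
         */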
        opq = (p->p_priority & PRIMASK) / PPQ;
        if (p->p_stat == SRUN && (p->p_flag & P_ONRUNQ) && opq != npq) {
                /*
                 * We have to move the process to another queue
                 */
                p->p_priority = PRIBASE_NORMAL + newpriority;
                /*
                 * We can just adjust the priority and it will be picked
                 */
                KKASSERT(opq == npq || (p->p_flag & P_ONRUNQ) == 0);
                p->p_priority = PRIBASE_NORMAL + newpriority;

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batch than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * Interactive processes will decay the boosted estcpu quickly while batch
 * processes will tend to compound it.
 */
bsd4_forking(struct proc *pp, struct proc *p)
        p->p_estcpu = ESTCPULIM(pp->p_estcpu + ESTCPUPPQ);
        p->p_origcpu = p->p_estcpu;
        pp->p_estcpu = ESTCPULIM(pp->p_estcpu + ESTCPUPPQ);

/*
 * Called when the parent reaps a child.  Propagate cpu use by the child
 * back to the parent.
 */
bsd4_exiting(struct proc *pp, struct proc *p)
        if (pp->p_pid != 1) {
                delta = p->p_estcpu - p->p_origcpu;
                pp->p_estcpu = ESTCPULIM(pp->p_estcpu + delta);
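
/*
 * Illustrative walk-through of the fork/exit heuristic above (hypothetical
 * numbers): at fork the child starts with p_estcpu equal to the parent's
 * plus ESTCPUPPQ, and p_origcpu remembers that starting value.  If the
 * child burns cpu and exits with p_estcpu grown by, say, 2 * ESTCPUPPQ
 * over p_origcpu, that positive delta is folded back into the parent's
 * p_estcpu.  A shell repeatedly spawning cpu-bound children therefore
 * accumulates estcpu and drifts toward batch treatment, while an
 * interactive parent decays the boost quickly.
 */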
/*
 * Called from acquire and from kern_synch's one-second timer with a
 * critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 */
bsd4_recalculate_estcpu(struct proc *p)
        globaldata_t gd = mycpu;

        /*
         * We have to subtract periodic to get the last schedclock
         * timeout time, otherwise we would get the upcoming timeout.
         * Keep in mind that a process can migrate between cpus and
         * while the scheduler clock should be very close, boundary
         * conditions could lead to a small negative delta.
         */
        cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

        if (p->p_slptime > 1) {
                /*
                 * Too much time has passed, do a coarse correction.
                 */
                p->p_estcpu = p->p_estcpu >> 1;
                bsd4_resetpriority(p);
                p->p_cpbase = cpbase;
        } else if (p->p_cpbase != cpbase) {
                /*
                 * Adjust estcpu if we are in a different tick.  Don't waste
                 * time if we are in the same tick.
                 *
                 * First calculate the number of ticks in the measurement
                 * interval.
                 */
                nticks = (cpbase - p->p_cpbase) / gd->gd_schedclock.periodic;
                updatepcpu(p, p->p_cpticks, nticks);
                if ((nleft = nticks - p->p_cpticks) < 0)
                if (usched_debug == p->p_pid) {
                        printf("pid %d estcpu %d cpticks %d nticks %d nleft %d",
                                p->p_pid, p->p_estcpu,
                                p->p_cpticks, nticks, nleft);

                /*
                 * Calculate a decay value based on ticks remaining scaled
                 * down by the instantaneous load and p_nice.
                 */
                if ((loadfac = runqcount) < 2)
                ndecay = nleft * usched_bsd4_decay * 2 *
                        (PRIO_MAX * 2 - p->p_nice) / (loadfac * PRIO_MAX * 2);
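
                /*
                 * Illustrative reading of the formula above (worked example,
                 * not additional policy): with p_nice == 0 the nice factor
                 * (PRIO_MAX * 2 - 0) / (PRIO_MAX * 2) is exactly 1, so the
                 * decay reduces to
                 *
                 *      ndecay ~= nleft * usched_bsd4_decay * 2 / loadfac
                 *
                 * A positive nice value shrinks the factor (less decay), a
                 * negative one enlarges it, and loadfac (taken from
                 * runqcount) scales the whole correction down as the
                 * instantaneous load rises.
                 */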
                /*
                 * Adjust p_estcpu.  Handle a border case where batch jobs
                 * can get stalled long enough to decay to zero when they
                 */
                if (p->p_estcpu > ndecay * 2)
                        p->p_estcpu -= ndecay;

                if (usched_debug == p->p_pid)
                        printf(" ndecay %d estcpu %d\n", ndecay, p->p_estcpu);
                bsd4_resetpriority(p);
                p->p_cpbase = cpbase;

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wake up another for the purposes of
 * scheduling userland threads from setrunqueue().  UP systems do not
 * need the helper since there is only one cpu.  We can't use the idle
 * thread for this because we need to hold the MP lock.  Additionally,
 * doing things this way allows us to HLT idle cpus on MP systems.
 */
sched_thread(void *dummy)
        globaldata_t gd = mycpu;
        int cpuid = gd->gd_cpuid;               /* doesn't change */
        u_int32_t cpumask = 1 << cpuid;         /* doesn't change */

        get_mplock();                   /* hold the MP lock */
                lwkt_deschedule_self(gd->gd_curthread);         /* interlock */
                rdyprocmask |= cpumask;
                crit_enter_quick(gd->gd_curthread);
                if ((curprocmask & cpumask) == 0 && (np = chooseproc(NULL)) != NULL) {
                        curprocmask |= cpumask;
                        gd->gd_upri = np->p_priority;
                        gd->gd_uschedcp = np;
                        lwkt_acquire(np->p_thread);
                        lwkt_schedule(np->p_thread);
                crit_exit_quick(gd->gd_curthread);
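
/*
 * Illustrative summary of the handshake above (descriptive only): the
 * helper deschedules itself and advertises availability by setting its bit
 * in rdyprocmask; bsd4_setrunqueue() and bsd4_release_curproc() clear that
 * bit and lwkt_schedule() the cpu's gd_schedthread, waking the loop; the
 * helper then claims a process with chooseproc(NULL) only if this cpu is
 * not already running a user process (its curprocmask bit is clear).
 */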
/*
 * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
sched_thread_cpu_init(void)
        printf("start scheduler helpers on cpus:");

        for (i = 0; i < ncpus; ++i) {
                globaldata_t dgd = globaldata_find(i);
                cpumask_t mask = 1 << i;

                if ((mask & smp_active_mask) == 0)

                lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
                            TDF_STOPREQ, i, "usched %d", i);

                /*
                 * Allow user scheduling on the target cpu.  cpu #0 has already
                 * been enabled in rqinit().
                 */
                curprocmask &= ~mask;
                rdyprocmask |= mask;

SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)