/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.5 2005/11/16 02:24:30 dillon Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/rtprio.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <machine/ipl.h>
#include <machine/cpu.h>
#include <machine/smp.h>
/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */
#define PRIMASK (MAXPRI - 1)
#define PRIBASE_REALTIME 0
#define PRIBASE_NORMAL MAXPRI
#define PRIBASE_IDLE (MAXPRI * 2)
#define PRIBASE_THREAD (MAXPRI * 3)
#define PRIBASE_NULL (MAXPRI * 4)

#define NQS 32 /* 32 run queues. */
#define PPQ (MAXPRI / NQS) /* priorities per queue */
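
/*
 * Illustrative sketch (added; not part of the original file): with MAXPRI
 * priorities folded onto NQS run queues, a normal-class lwp_priority maps
 * to its run queue index as below.  This mirrors the computation used in
 * bsd4_setrunqueue() further down.
 */
static __inline int
bsd4_normal_rqindex(int priority)
{
        return ((priority & PRIMASK) / PPQ);    /* yields 0 .. NQS-1 */
}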
/*
 * NICEPPQ      - number of nice units per priority queue
 * ESTCPURAMP   - number of scheduler ticks for estcpu to switch queues
 * ESTCPUPPQ    - number of estcpu units per priority queue
 * ESTCPUMAX    - number of estcpu units
 * ESTCPUINCR   - amount we have to increment p_estcpu per scheduling tick
 */
#define ESTCPUMAX (ESTCPUPPQ * NQS)
#define ESTCPUINCR (ESTCPUPPQ / ESTCPURAMP)
#define PRIO_RANGE (PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v) min((v), ESTCPUMAX)
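
/*
 * Sketch (added for illustration, not in the original source): each
 * scheduler tick bumps lwp_estcpu by ESTCPUINCR and ESTCPULIM() caps it
 * at ESTCPUMAX, so -- per the ESTCPURAMP description above -- a purely
 * cpu-bound process takes roughly ESTCPURAMP ticks to cross one priority
 * queue and ESTCPURAMP * NQS ticks to saturate.
 */
static __inline int
bsd4_estcpu_tick(int estcpu)
{
        return (ESTCPULIM(estcpu + ESTCPUINCR));        /* saturating bump */
}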
#define lwp_priority lwp_usdata.bsd4.priority
#define lwp_rqindex lwp_usdata.bsd4.rqindex
#define lwp_origcpu lwp_usdata.bsd4.origcpu
#define lwp_estcpu lwp_usdata.bsd4.estcpu
static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_remrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
    sysclock_t cpstamp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *plp, struct lwp *lp);

static void bsd4_recalculate_estcpu(struct lwp *lp);
struct usched usched_bsd4 = {
        "bsd4", "Original DragonFly Scheduler",
        NULL, /* default registration */
        NULL, /* default deregistration */
        bsd4_acquire_curproc,
        bsd4_release_curproc,
        bsd4_recalculate_estcpu,
        NULL /* setcpumask not supported */
/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
static struct rq queues[NQS];
static struct rq rtqueues[NQS];
static struct rq idqueues[NQS];
static u_int32_t queuebits;
static u_int32_t rtqueuebits;
static u_int32_t idqueuebits;
static cpumask_t curprocmask = -1;      /* currently running a user process */
static cpumask_t rdyprocmask;           /* ready to accept a user process */
static int runqcount;
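
/*
 * Illustrative sketch (added; not in the original file): how queuebits is
 * consulted to find the best non-empty normal-class queue, mirroring the
 * selection logic in chooseproc() below.  bsfl() returns the index of the
 * lowest set bit, and lower queue indices correspond to better priorities.
 */
static __inline struct lwp *
bsd4_peek_best_normal(void)
{
        if (queuebits == 0)
                return (NULL);          /* no runnable normal-class lwps */
        return (TAILQ_FIRST(&queues[bsfl(queuebits)]));
}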
SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");

static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
    &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
    &usched_optimal, 0, "acquire_curproc() was optimal");

static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");

static int remote_resched = 1;
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
    &remote_resched, 0, "Resched to another cpu");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
    &remote_resched_nonaffinity, 0, "Number of remote rescheds without cpu affinity");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
    &remote_resched_affinity, 0, "Number of remote rescheds with cpu affinity");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
    &choose_affinity, 0, "chooseproc() was smart");

static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
    &usched_bsd4_rrinterval, 0, "");
static int usched_bsd4_decay = ESTCPUINCR / 2;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
    &usched_bsd4_decay, 0, "");
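
/*
 * Added note: with the scheduler clock running at ESTCPUFREQ ticks per
 * second, the default usched_bsd4_rrinterval of (ESTCPUFREQ + 9) / 10
 * trips roughly ten times a second, matching the round-robin comment in
 * bsd4_schedulerclock() below, and the default decay rate is half of the
 * per-tick ESTCPUINCR increment.
 */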
/*
 * Initialize the run queues at boot time.
 */
        for (i = 0; i < NQS; i++) {
                TAILQ_INIT(&queues[i]);
                TAILQ_INIT(&rtqueues[i]);
                TAILQ_INIT(&idqueues[i]);
SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule;
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 */
chooseproc(struct lwp *chklp)
                pri = bsfl(rtqueuebits);
                which = &rtqueuebits;
        } else if (queuebits) {
                pri = bsfl(queuebits);
        } else if (idqueuebits) {
                pri = bsfl(idqueuebits);
                which = &idqueuebits;
        KASSERT(lp, ("chooseproc: no lwp on busy queue"));
        /*
         * If the passed lwp <chklp> is reasonably close to the selected
         * lwp <lp>, return NULL (indicating that <chklp> should be kept).
         *
         * Note that we must err on the side of <chklp> to avoid bouncing
         * between threads in the acquire code.
         */
        if (chklp->lwp_priority < lp->lwp_priority + PPQ)
        /*
         * If the chosen lwp does not reside on this cpu spend a few
         * cycles looking for a better candidate at the same priority level.
         * This is a fallback check; setrunqueue() tries to wake up the
         * correct cpu and is our front-line affinity.
         */
        if (lp->lwp_thread->td_gd != mycpu &&
            (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
                if (chklp->lwp_thread->td_gd == mycpu) {
        TAILQ_REMOVE(q, lp, lwp_procq);
        *which &= ~(1 << pri);
        KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
        lp->lwp_proc->p_flag &= ~P_ONRUNQ;
/*
 * called via an ipi message to reschedule on another cpu.
 */
need_user_resched_remote(void *dummy)
/*
 * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
 * user process may represent any user process, including the current
 * process.
 *
 * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 * cpus in an attempt to keep the process on the current cpu at least for
 * a little while to take advantage of locality of reference (e.g. fork/exec
 * or short fork/exit, and uio_yield()).
 *
 * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 * or (user level) preempt on the same cpu that a process was previously
 * scheduled to.  If we cannot do this but we are at a high enough priority
 * relative to the processes running on other cpus, we will allow the
 * process to be stolen by another cpu.
 *
 * WARNING! a thread can be acquired by another cpu the moment it is put
 * on the user scheduler's run queue AND we release the MP lock.  Since we
 * release the MP lock before switching out another cpu may begin stealing
 * our current thread before we are completely switched out!  The
 * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 * thread before stealing it.
 *
 * NOTE on need_user_resched() calls: we have to call need_user_resched()
 * if the new process is more important than the current process, or if
 * the new process is the current process and is now less important than
 * another queued process.
 *
 * The associated thread must NOT be scheduled.
 * The process must be runnable.
 * This must be called at splhigh().
 */
bsd4_setrunqueue(struct lwp *lp)
        struct globaldata *gd;
        u_int32_t needresched;

        KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
        KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0,
            ("lwp %d/%d already on runq! flag %08x", lp->lwp_proc->p_pid,
            lp->lwp_tid, lp->lwp_proc->p_flag));
        KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
        /*
         * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
         */
        gd = lp->lwp_thread->td_gd;

        /*
         * Because recalculate is only called once or twice for long sleeps,
         * not every second forever while the process is sleeping, we have
         * to manually call it to resynchronize p_cpbase on wakeup or it
         * will wrap if the process was sleeping long enough (e.g. ~10 min
         * with the ACPI timer) and really mess up the nticks calculation.
         */
        if (lp->lwp_slptime) {
                bsd4_recalculate_estcpu(lp);

        /*
         * We have not been released; make sure that we are not the currently
         * designated process.
         */
        KKASSERT(gd->gd_uschedcp != lp);
        /*
         * Check cpu affinity.  The associated thread is stable at the
         * moment.  Note that we may be checking another cpu here so we
         * have to be careful.  We are currently protected by the BGL.
         *
         * This allows us to avoid actually queueing the process.
         * acquire_curproc() will handle any threads we mistakenly schedule.
         */
        cpuid = gd->gd_cpuid;

        if ((curprocmask & (1 << cpuid)) == 0) {
                curprocmask |= 1 << cpuid;
                gd->gd_uschedcp = lp;
                gd->gd_upri = lp->lwp_priority;
                lwkt_schedule(lp->lwp_thread);
                /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
                ++remote_resched_affinity;
        /*
         * gd and cpuid may still 'hint' at another cpu.  Even so we have
         * to place this process on the userland scheduler's run queue for
         * action by the target cpu.
         */
        lp->lwp_proc->p_flag |= P_ONRUNQ;
        if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
                pri = (lp->lwp_priority & PRIMASK) / PPQ;
                queuebits |= 1 << pri;
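                /*
                 * Added note: bits 0..pri-1 of the queue bitmask cover
                 * better-priority (lower-index) queues, so a non-zero
                 * needresched means a better process is already queued.
                 * It is used further below to request a user reschedule
                 * when the newly queued lwp is the cpu's designated
                 * current process.
                 */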
                needresched = (queuebits & ((1 << pri) - 1));
        } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
            lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
                pri = (u_int8_t)lp->lwp_rtprio.prio;
                rtqueuebits |= 1 << pri;
                needresched = (rtqueuebits & ((1 << pri) - 1));
        } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
                pri = (u_int8_t)lp->lwp_rtprio.prio;
                idqueuebits |= 1 << pri;
                needresched = (idqueuebits & ((1 << pri) - 1));
                panic("setrunqueue: invalid rtprio type");
        lp->lwp_rqindex = pri;          /* remember the queue index */
        TAILQ_INSERT_TAIL(q, lp, lwp_procq);
        /*
         * Either wake up other cpus' user thread schedulers or request
         * preemption on other cpus (which will also wake up a HLTed cpu).
         *
         * NOTE! gd and cpuid may still be our 'hint', not our current
         * cpu.
         */

        /*
         * Check cpu affinity for user preemption (when the curprocmask bit
         * is set).  Note that gd_upri is a speculative field (we modify
         * another cpu's gd_upri to avoid sending ipiq storms).
         */
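        /*
         * Added note: the "- PPQ" margin below acts as hysteresis -- we only
         * preempt when the new lwp is better by at least a full priority
         * queue, which (per the comment further down) keeps multiple
         * cpu-bound programs from bouncing between cpus.
         */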
        if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
                if (lp->lwp_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = lp->lwp_priority;
                } else if (gd->gd_uschedcp == lp && needresched) {
        } else if (remote_resched) {
                if (lp->lwp_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = lp->lwp_priority;
                        lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
                        ++remote_resched_affinity;
        /*
         * No affinity, first schedule to any cpus that do not have a current
         * process.  If there is a free cpu we always schedule to it.
         */
            (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
            (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) {
                        printf("lwp %d/%d nocpu to schedule it on\n",
                            lp->lwp_proc->p_pid, lp->lwp_tid);
                while (mask && count) {
                        KKASSERT((curprocmask & (1 << cpuid)) == 0);
                        rdyprocmask &= ~(1 << cpuid);
                        lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
                        mask &= ~(1 << cpuid);
        /*
         * If there are still runnable processes try to wake up a random
         * cpu that is running a much lower priority process in order to
         * preempt on it.  Note that gd_upri is only a hint, so we can
         * overwrite it from the wrong cpu.  If we can't find one, we
         * are done.
         *
         * We depress the priority check so multiple cpu bound programs
         * do not bounce between cpus.  Remember that the clock interrupt
         * will also cause all cpus to reschedule.
         *
         * We must mask against rdyprocmask or we will race in the boot
         * code (before all cpus have working scheduler helpers), plus
         * some cpus might not be operational and/or not configured to
         * handle user processes.
         */
        if (count && remote_resched && ncpus > 1) {
                        if (++cpuid == ncpus)
                } while (cpuid == mycpu->gd_cpuid);
                if (rdyprocmask & (1 << cpuid)) {
                        gd = globaldata_find(cpuid);
                        if (lp->lwp_priority < gd->gd_upri - PPQ) {
                                gd->gd_upri = lp->lwp_priority;
                                lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
                                ++remote_resched_nonaffinity;
        if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
                if (lp->lwp_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = lp->lwp_priority;
                } else if (gd->gd_uschedcp == lp && needresched) {
/*
 * remrunqueue() removes a given process from the run queue that it is on,
 * clearing the queue busy bit if it becomes empty.  This function is called
 * when a userland process is selected for LWKT scheduling.  Note that
 * LWKT scheduling is an abstraction of 'curproc'... there could very well be
 * several userland processes whose threads are scheduled or otherwise in
 * a special state, and such processes are NOT on the userland scheduler's
 * run queue.
 *
 * This must be called at splhigh().
 */
bsd4_remrunqueue(struct lwp *lp)
        KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
        lp->lwp_proc->p_flag &= ~P_ONRUNQ;
        KKASSERT(runqcount >= 0);
        pri = lp->lwp_rqindex;
        if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
        } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
            lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
                which = &rtqueuebits;
        } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
                which = &idqueuebits;
                panic("remrunqueue: invalid rtprio type");
        TAILQ_REMOVE(q, lp, lwp_procq);
        if (TAILQ_EMPTY(q)) {
                KASSERT((*which & (1 << pri)) != 0,
                    ("remrunqueue: remove from empty queue"));
                *which &= ~(1 << pri);
/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
 */
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
        globaldata_t gd = mycpu;

        /*
         * Do we need to round-robin?  We round-robin 10 times a second.
         * This should only occur for cpu-bound batch processes.
         */
        if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
        /*
         * As the process accumulates cpu time p_estcpu is bumped and may
         * push the process into another scheduling queue.  It typically
         * takes 4 ticks to bump the queue.
         */
        lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);

        /*
         * Reducing p_origcpu over time causes more of our estcpu to be
         * returned to the parent when we exit.  This is a small tweak
         * for the batch detection heuristic.
         */

        /* XXX optimize, avoid lock if no reset is required */
        bsd4_resetpriority(lp);
/*
 * Release the current process designation on p.  P MUST BE CURPROC.
 * Attempt to assign a new current process from the run queue.
 *
 * This function is called from exit1(), tsleep(), and the passive
 * release code setup in <arch>/<arch>/trap.c
 *
 * If we do not have or cannot get the MP lock we just wakeup the userland
 * helper scheduler thread for this cpu to do the work for us.
 *
 * WARNING!  The MP lock may be in an unsynchronized state due to the
 * way get_mplock() works and the fact that this function may be called
 * from a passive release during a lwkt_switch().  try_mplock() will deal
 * with this for us but you should be aware that td_mpcount may not be
 * useable.
 */
bsd4_release_curproc(struct lwp *lp)
        globaldata_t gd = mycpu;

        KKASSERT(lp->lwp_thread->td_gd == gd);
        cpuid = gd->gd_cpuid;

        if (gd->gd_uschedcp == lp) {
                /*
                 * YYY when the MP lock is not assumed (see else) we
                 * will have to check that gd_uschedcp is still == lp
                 * after acquisition of the MP lock
                 */
                gd->gd_uschedcp = NULL;
                gd->gd_upri = PRIBASE_NULL;
                bsd4_select_curproc(gd);
                KKASSERT(0);    /* MP LOCK ALWAYS HELD AT THE MOMENT */
                gd->gd_uschedcp = NULL;
                gd->gd_upri = PRIBASE_NULL;
                /* YYY uschedcp and curprocmask */
                if (runqcount && (rdyprocmask & (1 << cpuid))) {
                        rdyprocmask &= ~(1 << cpuid);
                        lwkt_schedule(&mycpu->gd_schedthread);
/*
 * Select a new current process, potentially retaining gd_uschedcp.  However,
 * be sure to round-robin.  This routine is generally only called if a
 * reschedule is requested and that typically only occurs if a new process
 * has a better priority or when we are round-robining.
 *
 * NOTE: Must be called with giant held and the current cpu's gd.
 * NOTE: The caller must handle the situation where it loses a
 *       uschedcp designation that it previously held, typically by
 *       calling acquire_curproc() again.
 * NOTE: May not block
 */
bsd4_select_curproc(globaldata_t gd)
        int cpuid = gd->gd_cpuid;

        clear_user_resched();

        /*
         * Choose the next designated current user process.
         * Note that we cannot schedule gd_schedthread
         * if runqcount is 0 without creating a scheduling
         *
         * We do not clear the user resched request here,
         * we need to test it later when we re-acquire.
         *
         * NOTE: chooseproc returns NULL if the chosen lwp
         * is gd_uschedcp.  XXX needs cleanup.
         */
        old = gd->gd_uschedcp;
        if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) {
                curprocmask |= 1 << cpuid;
                gd->gd_upri = nlp->lwp_priority;
                gd->gd_uschedcp = nlp;
                lwkt_acquire(nlp->lwp_thread);
                lwkt_schedule(nlp->lwp_thread);
        } else if (gd->gd_uschedcp) {
                gd->gd_upri = gd->gd_uschedcp->lwp_priority;
                KKASSERT(curprocmask & (1 << cpuid));
        } else if (runqcount && (rdyprocmask & (1 << cpuid))) {
                /*gd->gd_uschedcp = NULL;*/
                curprocmask &= ~(1 << cpuid);
                rdyprocmask &= ~(1 << cpuid);
                lwkt_schedule(&gd->gd_schedthread);
                /*gd->gd_uschedcp = NULL;*/
                curprocmask &= ~(1 << cpuid);
/*
 * Acquire the current process designation on the CURRENT process only.
 * This function is called at kernel-user priority (not userland priority)
 * when curlwp does not match gd_uschedcp.
 *
 * This function is only called just prior to returning to user mode.
 *
 * Basically we recalculate our estcpu to hopefully give us a more
 * favorable disposition, setrunqueue, then wait for the curlwp
 * designation to be handed to us (if the setrunqueue didn't do it).
 *
 * WARNING! THIS FUNCTION MAY CAUSE THE CURRENT THREAD TO MIGRATE TO
 * ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 */
bsd4_acquire_curproc(struct lwp *lp)
        globaldata_t gd = mycpu;

        /*
         * Recalculate our priority and put us back on the userland
         * scheduler's run queue.
         *
         * Only increment the involuntary context switch count if the
         * setrunqueue call did not immediately schedule us.
         */
        KKASSERT(lp == gd->gd_curthread->td_lwp);
        bsd4_recalculate_estcpu(lp);
        lwkt_deschedule_self(gd->gd_curthread);
        bsd4_setrunqueue(lp);
        if ((gd->gd_curthread->td_flags & TDF_RUNQ) == 0)
                ++lp->lwp_stats->p_ru.ru_nivcsw;

        /*
         * Because we put ourselves back on the userland scheduler's run
         * queue, WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU
         */

        /*
         * We better be the current process when we wake up, and we had
         * better not be on the run queue.
         */
        KKASSERT(gd->gd_uschedcp == lp);
        KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
bsd4_resetpriority(struct lwp *lp)
        /*
         * Set p_priority for general process comparisons
         */
        switch(lp->lwp_rtprio.type) {
        case RTP_PRIO_REALTIME:
                lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
        case RTP_PRIO_NORMAL:
                break;
        case RTP_PRIO_IDLE:
                lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
        case RTP_PRIO_THREAD:
                lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
        /*
         * NORMAL priorities fall through.  These are based on niceness
         * and cpu use.  Lower numbers == higher priorities.
         *
         * Calculate our priority based on our niceness and estimated cpu.
         * Note that the nice value adjusts the baseline, which affects
         * cpu bursts but does not affect overall cpu use between cpu-bound
         * processes.  The use of the nice field in the decay calculation
         * controls the overall cpu use.
         *
         * This isn't an exact calculation.  We fit the full nice and
         * estcpu range into the priority range so the actual PPQ value
         * is incorrect, but it's still a reasonable way to think about it.
         */
        newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
        newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ;
        newpriority = newpriority * MAXPRI /
            (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
        newpriority = MIN(newpriority, MAXPRI - 1);     /* sanity */
        newpriority = MAX(newpriority, 0);              /* sanity */
        npq = newpriority / PPQ;
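        /*
         * Worked example (added): with estcpu == 0 and p_nice == PRIO_MIN
         * the numerator above is 0, so newpriority is 0 and npq selects the
         * best normal queue; with estcpu == ESTCPUMAX and p_nice == PRIO_MAX
         * the numerator nearly equals the scaling denominator, so newpriority
         * approaches MAXPRI and ends up in the last (worst) normal queue.
         */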
        opq = (lp->lwp_priority & PRIMASK) / PPQ;
        if (lp->lwp_proc->p_stat == SRUN &&
            (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) {
                /*
                 * We have to move the process to another queue
                 */
                bsd4_remrunqueue(lp);
                lp->lwp_priority = PRIBASE_NORMAL + newpriority;
                bsd4_setrunqueue(lp);
                /*
                 * We can just adjust the priority and it will be picked
                 * up later.
                 */
                KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
                lp->lwp_priority = PRIBASE_NORMAL + newpriority;
/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batchy than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * Interactive processes will decay the boosted estcpu quickly while batch
 * processes will tend to compound it.
 * XXX lwp should be "spawning" instead of "forking"
 */
bsd4_forking(struct lwp *plp, struct lwp *lp)
        lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
        lp->lwp_origcpu = lp->lwp_estcpu;
        plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
/*
 * Called when the parent reaps a child.  Propagate cpu use by the child
 * back to the parent.
 */
bsd4_exiting(struct lwp *plp, struct lwp *lp)
        if (plp->lwp_proc->p_pid != 1) {
                delta = lp->lwp_estcpu - lp->lwp_origcpu;
                plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta);
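                /*
                 * Added note: together with bsd4_forking() above, this forms
                 * the batch-detection loop -- the parent is docked ESTCPUPPQ
                 * at fork time and, when it reaps the child, inherits
                 * whatever estcpu the child accumulated beyond its starting
                 * lwp_origcpu value, so fork/exec-heavy parents compound
                 * estcpu while interactive parents decay it away quickly.
                 */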
/*
 * Called from acquire and from kern_synch's one-second timer with a
 * critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 */
bsd4_recalculate_estcpu(struct lwp *lp)
        globaldata_t gd = mycpu;

        /*
         * We have to subtract periodic to get the last schedclock
         * timeout time, otherwise we would get the upcoming timeout.
         * Keep in mind that a process can migrate between cpus and
         * while the scheduler clock should be very close, boundary
         * conditions could lead to a small negative delta.
         */
        cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

        if (lp->lwp_slptime > 1) {
                /*
                 * Too much time has passed, do a coarse correction.
                 */
                lp->lwp_estcpu = lp->lwp_estcpu >> 1;
                bsd4_resetpriority(lp);
                lp->lwp_cpbase = cpbase;
        } else if (lp->lwp_cpbase != cpbase) {
                /*
                 * Adjust estcpu if we are in a different tick.  Don't waste
                 * time if we are in the same tick.
                 *
                 * First calculate the number of ticks in the measurement
                 * interval.
                 */
                nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic;
                updatepcpu(lp, lp->lwp_cpticks, nticks);
                if ((nleft = nticks - lp->lwp_cpticks) < 0)
                if (usched_debug == lp->lwp_proc->p_pid) {
                        printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d",
                            lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu,
                            lp->lwp_cpticks, nticks, nleft);
                /*
                 * Calculate a decay value based on ticks remaining scaled
                 * down by the instantaneous load and p_nice.
                 */
                if ((loadfac = runqcount) < 2)
                ndecay = nleft * usched_bsd4_decay * 2 *
                    (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2);
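                /*
                 * Worked example (added): with p_nice == 0 and a load factor
                 * of 2 the expression above reduces exactly to
                 * nleft * usched_bsd4_decay, i.e. each idle tick decays
                 * estcpu by the usched_bsd4_decay tunable.
                 */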
                /*
                 * Adjust p_estcpu.  Handle a border case where batch jobs
                 * can get stalled long enough to decay to zero when they
                 * shouldn't.
                 */
                if (lp->lwp_estcpu > ndecay * 2)
                        lp->lwp_estcpu -= ndecay;
                        lp->lwp_estcpu >>= 1;

                if (usched_debug == lp->lwp_proc->p_pid)
                        printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu);
                bsd4_resetpriority(lp);
                lp->lwp_cpbase = cpbase;
/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wake up another for the purposes of
 * scheduling userland threads from setrunqueue().  UP systems do not
 * need the helper since there is only one cpu.  We can't use the idle
 * thread for this because we need to hold the MP lock.  Additionally,
 * doing things this way allows us to HLT idle cpus on MP systems.
 */
sched_thread(void *dummy)
        globaldata_t gd = mycpu;
        int cpuid = gd->gd_cpuid;               /* doesn't change */
        u_int32_t cpumask = 1 << cpuid;         /* doesn't change */

        get_mplock();                           /* hold the MP lock */
                lwkt_deschedule_self(gd->gd_curthread);         /* interlock */
                rdyprocmask |= cpumask;
                crit_enter_quick(gd->gd_curthread);
                if ((curprocmask & cpumask) == 0 && (nlp = chooseproc(NULL)) != NULL) {
                        curprocmask |= cpumask;
                        gd->gd_upri = nlp->lwp_priority;
                        gd->gd_uschedcp = nlp;
                        lwkt_acquire(nlp->lwp_thread);
                        lwkt_schedule(nlp->lwp_thread);
                crit_exit_quick(gd->gd_curthread);
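
/*
 * Added note: the handshake with bsd4_setrunqueue() is visible above --
 * setrunqueue() clears a cpu's rdyprocmask bit and lwkt_schedule()s that
 * cpu's gd_schedthread; the helper then marks itself ready again and, if it
 * is not already running a designated user process, pulls work off the run
 * queues with chooseproc(NULL).
 */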
/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
sched_thread_cpu_init(void)
        printf("start scheduler helpers on cpus:");

        for (i = 0; i < ncpus; ++i) {
                globaldata_t dgd = globaldata_find(i);
                cpumask_t mask = 1 << i;

                if ((mask & smp_active_mask) == 0)
                lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
                    TDF_STOPREQ, i, "usched %d", i);
                /*
                 * Allow user scheduling on the target cpu.  cpu #0 has
                 * already been enabled in rqinit().
                 */
                curprocmask &= ~mask;
                rdyprocmask |= mask;
SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)