/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.26 2008/11/01 23:31:19 dillon Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <machine/cpu.h>
#include <machine/smp.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define NQS	32			/* 32 run queues. */
#define PPQ	(MAXPRI / NQS)		/* priorities per queue */
#define PPQMASK	(PPQ - 1)
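
/*
 * For illustration: with MAXPRI 128 and NQS 32, PPQ is 4, so a
 * normal-class priority p selects run queue (p & PRIMASK) / PPQ;
 * priorities 0-3 share queue 0 and priorities 124-127 share queue 31.
 */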

/*
 * NICEPPQ	- number of nice units per priority queue
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 */
#define NICEPPQ		2
#define ESTCPUPPQ	512
#define ESTCPUMAX	(ESTCPUPPQ * NQS)
#define BATCHMAX	(ESTCPUFREQ * 30)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)
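
/*
 * For illustration: ESTCPUMAX is 512 * 32 == 16384, so one run queue
 * step costs ESTCPUPPQ (512) estcpu, and BATCHMAX is 30 seconds worth
 * of scheduler ticks (1500 assuming the stock ESTCPUFREQ of 50).  This
 * matches the "30 cpu seconds to traverse the entire range" note in
 * bsd4_recalculate_estcpu().
 */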

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_rqindex	lwp_usdata.bsd4.rqindex
#define lwp_estcpu	lwp_usdata.bsd4.estcpu
#define lwp_batch	lwp_usdata.bsd4.batch
#define lwp_rqtype	lwp_usdata.bsd4.rqtype

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void bsd4_recalculate_estcpu(struct lwp *lp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *lp, struct proc *);
static void bsd4_yield(struct lwp *lp);

#ifdef SMP
static void need_user_resched_remote(void *dummy);
#endif
static struct lwp *chooseproc_locked(struct lwp *chklp);
static void bsd4_remrunqueue_locked(struct lwp *lp);
static void bsd4_setrunqueue_locked(struct lwp *lp);

struct usched usched_bsd4 = {
	{ NULL },
	"bsd4", "Original DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	bsd4_acquire_curproc,
	bsd4_release_curproc,
	bsd4_setrunqueue,
	bsd4_schedulerclock,
	bsd4_recalculate_estcpu,
	bsd4_resetpriority,
	bsd4_forking,
	bsd4_exiting,
	NULL,			/* setcpumask not supported */
	bsd4_yield
};

struct usched_bsd4_pcpu {
	struct thread	helper_thread;
	short		rrcount;
	short		upri;
	struct lwp	*uschedcp;
};

typedef struct usched_bsd4_pcpu	*bsd4_pcpu_t;

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
static struct rq bsd4_queues[NQS];
static struct rq bsd4_rtqueues[NQS];
static struct rq bsd4_idqueues[NQS];
static u_int32_t bsd4_queuebits;
static u_int32_t bsd4_rtqueuebits;
static u_int32_t bsd4_idqueuebits;
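
/*
 * For illustration of the queuebits lookup: if normal-class queues 3
 * and 17 are the only non-empty queues then bsd4_queuebits is
 * (1 << 3) | (1 << 17), and bsfl(bsd4_queuebits) returns 3, the index
 * of the best non-empty queue.  See chooseproc_locked().
 */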
static cpumask_t bsd4_curprocmask = -1;	/* currently running a user process */
static cpumask_t bsd4_rdyprocmask;	/* ready to accept a user process */
static int	 bsd4_runqcount;
#ifdef SMP
static volatile int bsd4_scancpu;
#endif
static struct spinlock bsd4_spin;
static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];

SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0,
	   "Number of runnable lwps on the run queues");

static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
	   &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
	   &usched_optimal, 0, "acquire_curproc() was optimal");

static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0,
	   "Print debug information for this pid");

static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
	   &remote_resched_nonaffinity, 0, "Number of remote rescheds without affinity");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
	   &remote_resched_affinity, 0, "Number of remote rescheds with affinity");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
	   &choose_affinity, 0, "chooseproc() was smart");

static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
	   &usched_bsd4_rrinterval, 0, "Round-robin interval in scheduler ticks");
static int usched_bsd4_decay = 8;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
	   &usched_bsd4_decay, 0, "Extra decay when not running");
static int usched_bsd4_batch_time = 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_batch_time, CTLFLAG_RW,
	   &usched_bsd4_batch_time, 0, "Minimum batch counter value");
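
/*
 * The tunables above are exported as kern.usched_bsd4_rrinterval,
 * kern.usched_bsd4_decay and kern.usched_bsd4_batch_time and may be
 * adjusted at runtime, e.g. "sysctl kern.usched_bsd4_decay=16" to
 * slow the estcpu collapse in bsd4_recalculate_estcpu().
 */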

/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
	int i;

	spin_init(&bsd4_spin);
	for (i = 0; i < NQS; i++) {
		TAILQ_INIT(&bsd4_queues[i]);
		TAILQ_INIT(&bsd4_rtqueues[i]);
		TAILQ_INIT(&bsd4_idqueues[i]);
	}
	atomic_clear_cpumask(&bsd4_curprocmask, 1);
}
SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL)

/*
 * BSD4_ACQUIRE_CURPROC
 *
 * This function is called when the kernel intends to return to userland.
 * It is responsible for making the thread the current designated userland
 * thread for this cpu, blocking if necessary.
 *
 * The kernel has already depressed our LWKT priority so we must not switch
 * until we have either assigned or disposed of the thread.
 *
 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 *
 * MPSAFE
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	struct lwp *olp;

	crit_enter();
	bsd4_recalculate_estcpu(lp);

	/*
	 * If a reschedule was requested give another thread the
	 * driver's seat.
	 */
	if (user_resched_wanted()) {
		clear_user_resched();
		bsd4_release_curproc(lp);
	}

	/*
	 * Loop until we are the current user thread
	 */
	gd = mycpu;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	do {
		/*
		 * Process any pending events and higher priority threads.
		 */
		lwkt_yield();

		/*
		 * Become the currently scheduled user thread for this cpu
		 * if we can do so trivially.
		 *
		 * We can steal another thread's current thread designation
		 * on this cpu since if we are running that other thread
		 * must not be, so we can safely deschedule it.
		 */
		if (dd->uschedcp == lp) {
			/*
			 * We are already the current lwp (hot path).
			 */
			dd->upri = lp->lwp_priority;
		} else if (dd->uschedcp == NULL) {
			/*
			 * We can trivially become the current lwp.
			 */
			atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
		} else if (dd->upri > lp->lwp_priority) {
			/*
			 * We can steal the current cpu's lwp designation
			 * away simply by replacing it.  The other thread
			 * will stall when it tries to return to userland.
			 */
			olp = dd->uschedcp;
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
			lwkt_deschedule(olp->lwp_thread);
			bsd4_setrunqueue(olp);
		} else {
			/*
			 * We cannot become the current lwp, place the lp
			 * on the bsd4 run-queue and deschedule ourselves.
			 *
			 * When we are reactivated we will have another
			 * chance.
			 */
			lwkt_deschedule(lp->lwp_thread);
			bsd4_setrunqueue(lp);
			lwkt_switch();
			/*
			 * Reload after a switch or setrunqueue/switch possibly
			 * moved us to another cpu.
			 */
			gd = mycpu;
			dd = &bsd4_pcpu[gd->gd_cpuid];
		}
	} while (dd->uschedcp != lp);

	crit_exit();
	KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
}

/*
 * BSD4_RELEASE_CURPROC
 *
 * This routine detaches the current thread from the userland scheduler,
 * usually because the thread needs to run or block in the kernel (at
 * kernel priority) for a while.
 *
 * This routine is also responsible for selecting a new thread to
 * make the current thread.
 *
 * NOTE: This implementation differs from the dummy example in that
 * bsd4_select_curproc() is able to select the current process, whereas
 * dummy_select_curproc() is not able to select the current process.
 * This means we have to NULL out uschedcp.
 *
 * Additionally, note that we may already be on a run queue if releasing
 * via the lwkt_switch() in bsd4_setrunqueue().
 *
 * MPSAFE
 */
static void
bsd4_release_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	if (dd->uschedcp == lp) {
		crit_enter();
		KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
		dd->uschedcp = NULL;	/* don't let lp be selected */
		dd->upri = PRIBASE_NULL;
		atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
		bsd4_select_curproc(gd);
		crit_exit();
	}
}

/*
 * BSD4_SELECT_CURPROC
 *
 * Select a new current process for this cpu and clear any pending user
 * reschedule request.  The cpu currently has no current process.
 *
 * This routine is also responsible for equal-priority round-robining,
 * typically triggered from bsd4_schedulerclock().  In our dummy example
 * all the 'user' threads are LWKT scheduled all at once and we just
 * call lwkt_switch().
 *
 * The calling process is not on the queue and cannot be selected.
 *
 * MPSAFE
 */
static
void
bsd4_select_curproc(globaldata_t gd)
{
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
	struct lwp *nlp;
	int cpuid = gd->gd_cpuid;

	crit_enter_gd(gd);

	spin_lock(&bsd4_spin);
	if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
		atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid));
		dd->upri = nlp->lwp_priority;
		dd->uschedcp = nlp;
		spin_unlock(&bsd4_spin);
#ifdef SMP
		lwkt_acquire(nlp->lwp_thread);
#endif
		lwkt_schedule(nlp->lwp_thread);
	} else if (dd->uschedcp) {
		dd->upri = dd->uschedcp->lwp_priority;
		spin_unlock(&bsd4_spin);
		KKASSERT(bsd4_curprocmask & CPUMASK(cpuid));
	} else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) {
		atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
		spin_unlock(&bsd4_spin);
		lwkt_schedule(&dd->helper_thread);
	} else {
		spin_unlock(&bsd4_spin);
	}
	crit_exit_gd(gd);
}

/*
 * BSD4_SETRUNQUEUE
 *
 * Place the specified lwp on the user scheduler's run queue.  This routine
 * must be called with the thread descheduled.  The lwp must be runnable.
 *
 * The thread may be the current thread as a special case.
 *
 * MPSAFE
 */
static void
bsd4_setrunqueue(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
#ifdef SMP
	int cpuid;
	cpumask_t mask;
	cpumask_t tmpmask;
#endif

	/*
	 * First validate the process state relative to the current cpu.
	 * We don't need the spinlock for this, just a critical section.
	 * We are in control of the process.
	 */
	crit_enter();
	KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
	KASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0,
	    ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
	     lp->lwp_tid, lp->lwp_proc->p_flag, lp->lwp_flag));
	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

	/*
	 * Note: gd and dd are relative to the target thread's last cpu,
	 * NOT our current cpu.
	 */
	gd = lp->lwp_thread->td_gd;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * This process is not supposed to be scheduled anywhere or assigned
	 * as the current process anywhere.  Assert the condition.
	 */
	KKASSERT(dd->uschedcp != lp);

#ifndef SMP
	/*
	 * If we are not SMP we do not have a scheduler helper to kick
	 * and must directly activate the process if none are scheduled.
	 *
	 * This is really only an issue when bootstrapping init since
	 * the caller in all other cases will be a user process, and
	 * even if released (dd->uschedcp == NULL), that process will
	 * kickstart the scheduler when it returns to user mode from
	 * the kernel.
	 */
	if (dd->uschedcp == NULL) {
		atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
		dd->uschedcp = lp;
		dd->upri = lp->lwp_priority;
		lwkt_schedule(lp->lwp_thread);
		crit_exit();
		return;
	}
#endif

#ifdef SMP
	/*
	 * XXX fixme.  Could be part of a remrunqueue/setrunqueue
	 * operation when the priority is recalculated, so TDF_MIGRATING
	 * may already be set.
	 */
	if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
		lwkt_giveaway(lp->lwp_thread);
#endif

	/*
	 * We lose control of lp the moment we release the spinlock after
	 * having placed lp on the queue.  i.e. another cpu could pick it
	 * up and it could exit, or its priority could be further adjusted,
	 * or something like that.
	 */
	spin_lock(&bsd4_spin);
	bsd4_setrunqueue_locked(lp);

#ifdef SMP
	/*
	 * Kick the scheduler helper on one of the other cpus and request
	 * a reschedule if appropriate.
	 *
	 * NOTE: We check all cpus whose rdyprocmask is set.  First we
	 *	 look for cpus without designated lps, then we look for
	 *	 cpus with designated lps with a worse priority than our
	 *	 process.
	 */
	++bsd4_scancpu;
	cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
	mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
	       smp_active_mask & usched_global_cpumask;

	while (mask) {
		tmpmask = ~(CPUMASK(cpuid) - 1);
		if (mask & tmpmask)
			cpuid = BSFCPUMASK(mask & tmpmask);
		else
			cpuid = BSFCPUMASK(mask);
		gd = globaldata_find(cpuid);
		dd = &bsd4_pcpu[cpuid];

		if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK))
			goto found;
		mask &= ~CPUMASK(cpuid);
	}
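
	/*
	 * For illustration of the rotating scan above: if cpuid is 2 then
	 * CPUMASK(cpuid) - 1 is 0x3 and ~(CPUMASK(cpuid) - 1) selects
	 * cpus >= 2, so the scan starts at the rotor position and wraps
	 * back to the lowest set bit in mask when no higher cpu qualifies.
	 */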

	/*
	 * Then cpus which might have a currently running lp
	 */
	mask = bsd4_curprocmask & bsd4_rdyprocmask &
	       lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;

	while (mask) {
		tmpmask = ~(CPUMASK(cpuid) - 1);
		if (mask & tmpmask)
			cpuid = BSFCPUMASK(mask & tmpmask);
		else
			cpuid = BSFCPUMASK(mask);
		gd = globaldata_find(cpuid);
		dd = &bsd4_pcpu[cpuid];

		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
			goto found;
		mask &= ~CPUMASK(cpuid);
	}

	/*
	 * If we cannot find a suitable cpu we reload from bsd4_scancpu
	 * and round-robin.  Other cpus will pickup as they release their
	 * current lwps or become ready.
	 *
	 * Avoid a degenerate system lockup case if usched_global_cpumask
	 * is set to 0 or otherwise does not cover lwp_cpumask.
	 *
	 * We only kick the target helper thread in this case, we do not
	 * set the user resched flag because
	 */
	cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
	if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) {
		cpuid = 0;
	}
	gd = globaldata_find(cpuid);
	dd = &bsd4_pcpu[cpuid];
found:
	if (cpuid == mycpu->gd_cpuid) {
		spin_unlock(&bsd4_spin);
		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
			if (dd->uschedcp == NULL) {
				lwkt_schedule(&dd->helper_thread);
			} else {
				need_user_resched();
			}
		}
	} else {
		atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
		spin_unlock(&bsd4_spin);
		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
			lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
		else
			lwkt_schedule(&dd->helper_thread);
	}
#else
	/*
	 * Request a reschedule if appropriate.
	 */
	spin_unlock(&bsd4_spin);
	if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
		need_user_resched();
	}
#endif
	crit_exit();
}

/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
 * each cpu.
 *
 * MPSAFE
 */
static
void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * Do we need to round-robin?  We round-robin 10 times a second.
	 * This should only occur for cpu-bound batch processes.
	 */
	if (++dd->rrcount >= usched_bsd4_rrinterval) {
		dd->rrcount = 0;
		need_user_resched();
	}

	/*
	 * Adjust estcpu upward using a real time equivalent calculation.
	 */
	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);
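
	/*
	 * For illustration: with ESTCPUMAX 16384 the bump above is
	 * ESTCPUMAX / ESTCPUFREQ + 1 per scheduler tick (328 per tick
	 * assuming the stock ESTCPUFREQ of 50), so an lwp that never
	 * yields saturates estcpu in roughly one second of cpu time.
	 */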

	/*
	 * Spinlocks also hold a critical section so there should not be
	 * any active spinlocks.
	 */
	KKASSERT(gd->gd_spinlocks_wr == 0);

	bsd4_resetpriority(lp);
#if 0
	/*
	 * if we can't call bsd4_resetpriority for some reason we must call
	 * need_user_resched().
	 */
	need_user_resched();
#endif
}

/*
 * Called from acquire and from kern_synch's one-second timer (one of the
 * callout helper threads) with a critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 *
 * MPSAFE
 */
static
void
bsd4_recalculate_estcpu(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	sysclock_t cpbase;
	sysclock_t ttlticks;
	int estcpu;
	int decay_factor;

	/*
	 * We have to subtract periodic to get the last schedclock
	 * timeout time, otherwise we would get the upcoming timeout.
	 * Keep in mind that a process can migrate between cpus and
	 * while the scheduler clock should be very close, boundary
	 * conditions could lead to a small negative delta.
	 */
	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

	if (lp->lwp_slptime > 1) {
		/*
		 * Too much time has passed, do a coarse correction.
		 */
		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
		lp->lwp_cpticks = 0;
		lp->lwp_batch -= ESTCPUFREQ;
		if (lp->lwp_batch < 0)
			lp->lwp_batch = 0;
	} else if (lp->lwp_cpbase != cpbase) {
		/*
		 * Adjust estcpu if we are in a different tick.  Don't waste
		 * time if we are in the same tick.
		 *
		 * First calculate the number of ticks in the measurement
		 * interval.  The ttlticks calculation can wind up 0 due to
		 * a bug in the handling of lwp_slptime (as yet not found),
		 * so make sure we do not get a divide by 0 panic.
		 */
		ttlticks = (cpbase - lp->lwp_cpbase) /
			   gd->gd_schedclock.periodic;
		if (ttlticks < 0) {
			ttlticks = 0;
			lp->lwp_cpbase = cpbase;
		}
		if (ttlticks == 0)
			return;
		updatepcpu(lp, lp->lwp_cpticks, ttlticks);

		/*
		 * Calculate the percentage of one cpu used factoring in ncpus
		 * and the load and adjust estcpu.  Handle degenerate cases
		 * by adding 1 to bsd4_runqcount.
		 *
		 * estcpu is scaled by ESTCPUMAX.
		 *
		 * bsd4_runqcount is the excess number of user processes
		 * that cannot be immediately scheduled to cpus.  We want
		 * to count these as running to avoid range compression
		 * in the base calculation (which is the actual percentage
		 * of one cpu used).
		 */
		estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
			 (bsd4_runqcount + ncpus) / (ncpus * ttlticks);
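
		/*
		 * For illustration: a single cpu-bound lwp on an otherwise
		 * idle single-cpu system has lwp_cpticks == ttlticks and
		 * bsd4_runqcount == 0, giving estcpu == ESTCPUMAX (100%).
		 * The same lwp running half the time yields ESTCPUMAX / 2,
		 * exactly the batch threshold tested below.
		 */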

		/*
		 * If estcpu is > 50% we become more batch-like
		 * If estcpu is <= 50% we become less batch-like
		 *
		 * It takes 30 cpu seconds to traverse the entire range.
		 */
		if (estcpu > ESTCPUMAX / 2) {
			lp->lwp_batch += ttlticks;
			if (lp->lwp_batch > BATCHMAX)
				lp->lwp_batch = BATCHMAX;
		} else {
			lp->lwp_batch -= ttlticks;
			if (lp->lwp_batch < 0)
				lp->lwp_batch = 0;
		}

		if (usched_debug == lp->lwp_proc->p_pid) {
			kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
				lp->lwp_proc->p_pid, lp,
				estcpu, lp->lwp_estcpu,
				lp->lwp_batch,
				lp->lwp_cpticks, ttlticks);
		}

		/*
		 * Adjust lp->lwp_estcpu.  The decay factor determines how
		 * quickly lwp_estcpu collapses to its realtime calculation.
		 * A slower collapse gives us a more accurate number but
		 * can cause a cpu hog to eat too much cpu before the
		 * scheduler decides to downgrade it.
		 *
		 * NOTE: p_nice is accounted for in bsd4_resetpriority(),
		 *	 and not here, but we must still ensure that a
		 *	 cpu-bound nice -20 process does not completely
		 *	 override a cpu-bound nice +20 process.
		 *
		 * NOTE: We must use ESTCPULIM() here to deal with any
		 *	 overshoot.
		 */
		decay_factor = usched_bsd4_decay;
		if (decay_factor < 1)
			decay_factor = 1;
		if (decay_factor > 1024)
			decay_factor = 1024;

		lp->lwp_estcpu = ESTCPULIM(
			(lp->lwp_estcpu * decay_factor + estcpu) /
			(decay_factor + 1));
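
		/*
		 * For illustration: with the default decay of 8 this is
		 * (old * 8 + new) / 9, i.e. each recalculation moves
		 * lwp_estcpu about 1/9th of the way toward the
		 * instantaneous estcpu computed above.
		 */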

		if (usched_debug == lp->lwp_proc->p_pid)
			kprintf(" finalestcpu %d\n", lp->lwp_estcpu);

		bsd4_resetpriority(lp);
		lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
		lp->lwp_cpticks = 0;
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 *
 * This routine may be called with any process.
 *
 * This routine is called by fork1() for initial setup with the process
 * off the run queue, and also may be called normally with the process on or
 * off the run queue.
 *
 * MPSAFE
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
	bsd4_pcpu_t dd;
	int newpriority;
	u_short newrqtype;
	int reschedcpu;
	int checkpri;
	int estcpu;

	/*
	 * Calculate the new priority and queue type
	 */
	crit_enter();
	spin_lock(&bsd4_spin);

	newrqtype = lp->lwp_rtprio.type;

	switch(newrqtype) {
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		newpriority = PRIBASE_REALTIME +
			      (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_NORMAL:
		/*
		 * Detune estcpu based on batchiness.  lwp_batch ranges
		 * from 0 to BATCHMAX.  Limit estcpu for the sake of
		 * the priority calculation to between 50% and 100%.
		 */
		estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
			 (BATCHMAX * 2);

		/*
		 * p_nice piece		Adds (0-40) * 2		0-80
		 * estcpu		Adds 16384  * 4 / 512	0-128
		 */
		newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
		newpriority += estcpu * PPQ / ESTCPUPPQ;
		newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
			      NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
		newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
		break;
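
		/*
		 * For illustration of the formula above: nice 0 and
		 * estcpu 8192 (50%) gives (0 - PRIO_MIN) * PPQ / NICEPPQ
		 * == 20 * 4 / 2 == 40, plus 8192 * 4 / 512 == 64, for a
		 * raw value of 104.  Scaling by MAXPRI / 210 yields 63,
		 * so the final priority is PRIBASE_NORMAL + 63 == 191,
		 * i.e. normal-class queue 63 / PPQ == 15.
		 */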
	case RTP_PRIO_IDLE:
		newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_THREAD:
		newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	default:
		panic("Bad RTP_PRIO %d", newrqtype);
		/* NOT REACHED */
	}

	/*
	 * The newpriority incorporates the queue type so do a simple masked
	 * check to determine if the process has moved to another queue.  If
	 * it has, and it is currently on a run queue, then move it.
	 */
	if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
		lp->lwp_priority = newpriority;
		if (lp->lwp_flag & LWP_ONRUNQ) {
			bsd4_remrunqueue_locked(lp);
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			bsd4_setrunqueue_locked(lp);
			checkpri = 1;
		} else {
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			checkpri = 0;
		}
		reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
	} else {
		lp->lwp_priority = newpriority;
		reschedcpu = -1;
		checkpri = 1;
	}

	/*
	 * Determine if we need to reschedule the target cpu.  This only
	 * occurs if the LWP is already on a scheduler queue, which means
	 * that idle cpu notification has already occurred.  At most we
	 * need only issue a need_user_resched() on the appropriate cpu.
	 *
	 * The LWP may be owned by a CPU different from the current one,
	 * in which case dd->uschedcp may be modified without an MP lock
	 * or a spinlock held.  The worst that happens is that the code
	 * below causes a spurious need_user_resched() on the target CPU
	 * and dd->upri to be wrong for a short period of time, both of
	 * which are harmless.
	 *
	 * If checkpri is 0 we are adjusting the priority of the current
	 * process, possibly higher (less desirable), so ignore the upri
	 * check which will fail in that case.
	 */
	if (reschedcpu >= 0) {
		dd = &bsd4_pcpu[reschedcpu];
		if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
		    (checkpri == 0 ||
		     (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
#ifdef SMP
			if (reschedcpu == mycpu->gd_cpuid) {
				spin_unlock(&bsd4_spin);
				need_user_resched();
			} else {
				spin_unlock(&bsd4_spin);
				atomic_clear_cpumask(&bsd4_rdyprocmask,
						     CPUMASK(reschedcpu));
				lwkt_send_ipiq(lp->lwp_thread->td_gd,
					       need_user_resched_remote, NULL);
			}
#else
			spin_unlock(&bsd4_spin);
			need_user_resched();
#endif
		} else {
			spin_unlock(&bsd4_spin);
		}
	} else {
		spin_unlock(&bsd4_spin);
	}
	crit_exit();
}

/*
 * MPSAFE
 */
static
void
bsd4_yield(struct lwp *lp)
{
#if 0
	/* FUTURE (or something similar) */
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
		break;
	default:
		break;
	}
#endif
	need_user_resched();
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batch than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * XXX lwp should be "spawning" instead of "forking"
 *
 * MPSAFE
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
	/*
	 * Put the child 4 queue slots (out of 32) higher than the parent
	 * (less desirable than the parent).
	 */
	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);

	/*
	 * The batch status of children always starts out centerline
	 * and will inch-up or inch-down as appropriate.  It takes roughly
	 * ~15 seconds of >50% cpu to hit the limit.
	 */
	lp->lwp_batch = BATCHMAX / 2;

	/*
	 * Dock the parent a cost for the fork, protecting us from fork
	 * bombs.  If the parent is forking quickly make the child more
	 * batchy.
	 */
	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
}
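
/*
 * For illustration of the docking above: each fork charges the parent
 * ESTCPUPPQ / 16 == 32 estcpu, so roughly 16 rapid forks cost the
 * parent one full priority queue step, while each child starts
 * ESTCPUPPQ * 4 == 2048 estcpu (four queues) worse than its parent.
 */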

/*
 * Called when a parent waits for a child.
 *
 * MPSAFE
 */
static void
bsd4_exiting(struct lwp *lp, struct proc *child_proc)
{
}

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 *
 * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
 *	    left intact through the entire routine.
 */
static
struct lwp *
chooseproc_locked(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which, *which2;
	u_int32_t pri;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;
	cpumask_t cpumask;

	rtqbits = bsd4_rtqueuebits;
	tsqbits = bsd4_queuebits;
	idqbits = bsd4_idqueuebits;
	cpumask = mycpu->gd_cpumask;

#ifdef SMP
again:
#endif
	if (rtqbits) {
		pri = bsfl(rtqbits);
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		which2 = &rtqbits;
	} else if (tsqbits) {
		pri = bsfl(tsqbits);
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		which2 = &tsqbits;
	} else if (idqbits) {
		pri = bsfl(idqbits);
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		which2 = &idqbits;
	} else {
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

#ifdef SMP
	while ((lp->lwp_cpumask & cpumask) == 0) {
		lp = TAILQ_NEXT(lp, lwp_procq);
		if (lp == NULL) {
			*which2 &= ~(1 << pri);
			goto again;
		}
	}
#endif

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return(NULL);
	}

#ifdef SMP
	/*
	 * If the chosen lwp does not reside on this cpu spend a few
	 * cycles looking for a better candidate at the same priority level.
	 * This is a fallback check, setrunqueue() tries to wakeup the
	 * correct cpu and is our front-line affinity.
	 */
	if (lp->lwp_thread->td_gd != mycpu &&
	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
	) {
		if (chklp->lwp_thread->td_gd == mycpu) {
			++choose_affinity;
			lp = chklp;
		}
	}
#endif

	TAILQ_REMOVE(q, lp, lwp_procq);
	--bsd4_runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_flag & LWP_ONRUNQ) != 0, ("not on runq6!"));
	lp->lwp_flag &= ~LWP_ONRUNQ;
	return lp;
}

#ifdef SMP

static
void
need_user_resched_remote(void *dummy)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	need_user_resched();
	lwkt_schedule(&dd->helper_thread);
}

#endif

/*
 * bsd4_remrunqueue_locked() removes a given process from the run queue
 * that it is on, clearing the queue busy bit if it becomes empty.
 *
 * Note that the user process scheduler is different from the LWKT
 * scheduler.  The user process scheduler only manages user processes
 * but it uses LWKT underneath, and a user process operating in the
 * kernel will often be 'released' from our management.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_remrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	KKASSERT(lp->lwp_flag & LWP_ONRUNQ);
	lp->lwp_flag &= ~LWP_ONRUNQ;
	--bsd4_runqcount;
	KKASSERT(bsd4_runqcount >= 0);

	pri = lp->lwp_rqindex;
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("remrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}
	TAILQ_REMOVE(q, lp, lwp_procq);
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
			("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
}

/*
 * bsd4_setrunqueue_locked()
 *
 * Add a process whose rqtype and rqindex have previously been calculated
 * onto the appropriate run queue.
 *
 * NOTE: Lower priorities are better priorities.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_setrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	int pri;

	KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
	lp->lwp_flag |= LWP_ONRUNQ;
	++bsd4_runqcount;

	pri = lp->lwp_rqindex;

	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("setrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}

	/*
	 * Add to the correct queue and set the appropriate bit.  If no
	 * lower priority (i.e. better) processes are in the queue then
	 * we want a reschedule, calculate the best cpu for the job.
	 *
	 * Always run reschedules on the LWPs original cpu.
	 */
	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
	*which |= 1 << pri;
}

#ifdef SMP

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().
 *
 * UP systems do not need the helper since there is only one cpu.
 *
 * We can't use the idle thread for this because we might block.
 * Additionally, doing things this way allows us to HLT idle cpus
 * on MP systems.
 *
 * MPSAFE
 */
static void
sched_thread(void *dummy)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	bsd4_pcpu_t tmpdd;
	struct lwp *nlp;
	cpumask_t mask;
	cpumask_t tmpmask;
	int cpuid;
	int tmpid;

	gd = mycpu;
	cpuid = gd->gd_cpuid;	/* doesn't change */
	mask = gd->gd_cpumask;	/* doesn't change */
	dd = &bsd4_pcpu[cpuid];

	/*
	 * Since we are woken up only when no user processes are scheduled
	 * on a cpu, we can run at an ultra low priority.
	 */
	lwkt_setpri_self(TDPRI_USER_SCHEDULER);

	for (;;) {
		/*
		 * We use the LWKT deschedule-interlock trick to avoid racing
		 * bsd4_rdyprocmask.  This means we cannot block through to the
		 * manual lwkt_switch() call we make below.
		 */
		crit_enter_gd(gd);
		lwkt_deschedule_self(gd->gd_curthread);
		spin_lock(&bsd4_spin);
		atomic_set_cpumask(&bsd4_rdyprocmask, mask);

		clear_user_resched();	/* This satisfied the reschedule request */
		dd->rrcount = 0;	/* Reset the round-robin counter */

		if ((bsd4_curprocmask & mask) == 0) {
			/*
			 * No thread is currently scheduled.
			 */
			KKASSERT(dd->uschedcp == NULL);
			if ((nlp = chooseproc_locked(NULL)) != NULL) {
				atomic_set_cpumask(&bsd4_curprocmask, mask);
				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
				spin_unlock(&bsd4_spin);
				lwkt_acquire(nlp->lwp_thread);
				lwkt_schedule(nlp->lwp_thread);
			} else {
				spin_unlock(&bsd4_spin);
			}
		} else if (bsd4_runqcount) {
			if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
				spin_unlock(&bsd4_spin);
				lwkt_acquire(nlp->lwp_thread);
				lwkt_schedule(nlp->lwp_thread);
			} else {
				/*
				 * CHAINING CONDITION TRAIN
				 *
				 * We could not deal with the scheduler wakeup
				 * request on this cpu, locate a ready scheduler
				 * with no current lp assignment and chain to it.
				 *
				 * This ensures that a wakeup race which fails due
				 * to priority test does not leave other unscheduled
				 * cpus idle when the runqueue is not empty.
				 */
				tmpmask = ~bsd4_curprocmask & bsd4_rdyprocmask &
					  smp_active_mask;
				if (tmpmask) {
					tmpid = BSFCPUMASK(tmpmask);
					tmpdd = &bsd4_pcpu[tmpid];
					atomic_clear_cpumask(&bsd4_rdyprocmask,
							     CPUMASK(tmpid));
					spin_unlock(&bsd4_spin);
					lwkt_schedule(&tmpdd->helper_thread);
				} else {
					spin_unlock(&bsd4_spin);
				}
			}
		} else {
			/*
			 * The runq is empty.
			 */
			spin_unlock(&bsd4_spin);
		}

		/*
		 * We're descheduled unless someone scheduled us.  Switch away.
		 * Exiting the critical section will cause splz() to be called
		 * for us if interrupts and such are pending.
		 */
		crit_exit_gd(gd);
		lwkt_switch();
	}
}

/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
	int i;

	if (bootverbose)
		kprintf("start scheduler helpers on cpus:");

	for (i = 0; i < ncpus; ++i) {
		bsd4_pcpu_t dd = &bsd4_pcpu[i];
		cpumask_t mask = CPUMASK(i);

		if ((mask & smp_active_mask) == 0)
			continue;

		if (bootverbose)
			kprintf(" %d", i);

		lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
			    TDF_STOPREQ, i, "usched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has already
		 * been enabled in rqinit().
		 */
		if (i)
			atomic_clear_cpumask(&bsd4_curprocmask, mask);
		atomic_set_cpumask(&bsd4_rdyprocmask, mask);
		dd->upri = PRIBASE_NULL;
	}
	if (bootverbose)
		kprintf("\n");
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
	sched_thread_cpu_init, NULL)

#endif	/* SMP */