2 * Copyright (c) 2012 The DragonFly Project. All rights reserved.
3 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>. All rights reserved.
5 * This code is derived from software contributed to The DragonFly Project
6 * by Matthew Dillon <dillon@backplane.com>,
7 * by Mihai Carabas <mihai.carabas@gmail.com>
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
36 #include <sys/queue.h>
38 #include <sys/rtprio.h>
40 #include <sys/sysctl.h>
41 #include <sys/resourcevar.h>
42 #include <sys/spinlock.h>
43 #include <sys/cpu_topology.h>
44 #include <sys/thread2.h>
45 #include <sys/spinlock2.h>
46 #include <sys/mplock2.h>
50 #include <machine/cpu.h>
51 #include <machine/smp.h>
54 * Priorities. Note that with 32 run queues per scheduler each queue
55 * represents four priority levels.
59 #define PRIMASK (MAXPRI - 1)
60 #define PRIBASE_REALTIME 0
61 #define PRIBASE_NORMAL MAXPRI
62 #define PRIBASE_IDLE (MAXPRI * 2)
63 #define PRIBASE_THREAD (MAXPRI * 3)
64 #define PRIBASE_NULL (MAXPRI * 4)
66 #define NQS 32 /* 32 run queues. */
67 #define PPQ (MAXPRI / NQS) /* priorities per queue */
68 #define PPQMASK (PPQ - 1)
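/*
 * Illustrative note (a sketch, assuming MAXPRI is 128, which matches the
 * "32 queues x four levels" note above): PRIBASE_NORMAL is then 128,
 * PRIBASE_IDLE 256, and so on, so an lwp_priority value splits into its
 * class base (lwp_priority & ~PRIMASK) and, for run queue placement, the
 * index (lwp_priority & PRIMASK) / PPQ used by bsd4_resetpriority(). For
 * example, a normal-class priority of PRIBASE_NORMAL + 53 selects queue
 * index (53 & PRIMASK) / PPQ == 13.
 */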
71 * NICEPPQ - number of nice units per priority queue
73 * ESTCPUPPQ - number of estcpu units per priority queue
74 * ESTCPUMAX - number of estcpu units
78 #define ESTCPUMAX (ESTCPUPPQ * NQS)
79 #define BATCHMAX (ESTCPUFREQ * 30)
80 #define PRIO_RANGE (PRIO_MAX - PRIO_MIN + 1)
82 #define ESTCPULIM(v) min((v), ESTCPUMAX)
86 #define lwp_priority lwp_usdata.bsd4.priority
87 #define lwp_rqindex lwp_usdata.bsd4.rqindex
88 #define lwp_estcpu lwp_usdata.bsd4.estcpu
89 #define lwp_batch lwp_usdata.bsd4.batch
90 #define lwp_rqtype lwp_usdata.bsd4.rqtype
92 static void bsd4_acquire_curproc(struct lwp *lp);
93 static void bsd4_release_curproc(struct lwp *lp);
94 static void bsd4_select_curproc(globaldata_t gd);
95 static void bsd4_setrunqueue(struct lwp *lp);
96 static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
98 static void bsd4_recalculate_estcpu(struct lwp *lp);
99 static void bsd4_resetpriority(struct lwp *lp);
100 static void bsd4_forking(struct lwp *plp, struct lwp *lp);
101 static void bsd4_exiting(struct lwp *lp, struct proc *);
102 static void bsd4_uload_update(struct lwp *lp);
103 static void bsd4_yield(struct lwp *lp);
106 static void bsd4_need_user_resched_remote(void *dummy);
107 static int bsd4_batchy_looser_pri_test(struct lwp* lp);
108 static struct lwp *bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp);
109 static void bsd4_kick_helper(struct lwp *lp);
111 static struct lwp *bsd4_chooseproc_locked(struct lwp *chklp);
112 static void bsd4_remrunqueue_locked(struct lwp *lp);
113 static void bsd4_setrunqueue_locked(struct lwp *lp);
115 struct usched usched_bsd4 = {
117 "bsd4", "Original DragonFly Scheduler",
118 NULL, /* default registration */
119 NULL, /* default deregistration */
120 bsd4_acquire_curproc,
121 bsd4_release_curproc,
124 bsd4_recalculate_estcpu,
129 NULL, /* setcpumask not supported */
133 struct usched_bsd4_pcpu {
134 struct thread helper_thread;
137 struct lwp *uschedcp;
138 struct lwp *old_uschedcp;
144 typedef struct usched_bsd4_pcpu *bsd4_pcpu_t;
147 * We have NQS (32) run queues per scheduling class. For the normal
148 * class, there are 128 priorities scaled onto these 32 queues. New
149 * processes are added to the last entry in each queue, and processes
150 * are selected for running by taking them from the head and maintaining
151 * a simple FIFO arrangement. Realtime and Idle priority processes have
152 * an explicit 0-31 priority which maps directly onto their class queue
153 * index. When a queue has something in it, the corresponding bit is
154 * set in the queuebits variable, allowing a single read to determine
155 * the state of all 32 queues and then a ffs() to find the first busy
158 static struct rq bsd4_queues[NQS];
159 static struct rq bsd4_rtqueues[NQS];
160 static struct rq bsd4_idqueues[NQS];
161 static u_int32_t bsd4_queuebits;
162 static u_int32_t bsd4_rtqueuebits;
163 static u_int32_t bsd4_idqueuebits;
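/*
 * Illustrative sketch (not part of the scheduler proper): the invariant on
 * the *queuebits words is that bit N is set iff run queue N is non-empty,
 * so finding the best non-empty normal-class queue is a single bit scan,
 * the same bsfl() idiom used in bsd4_chooseproc_locked():
 *
 *	if (bsd4_queuebits) {
 *		int pri = bsfl(bsd4_queuebits);	// lowest set bit == best queue
 *		struct rq *q = &bsd4_queues[pri];
 *		struct lwp *lp = TAILQ_FIRST(q);
 *		// ...dequeue lp and clear bit pri if the queue became empty
 *	}
 */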
164 static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */
165 static cpumask_t bsd4_rdyprocmask; /* ready to accept a user process */
166 static int bsd4_runqcount;
168 static volatile int bsd4_scancpu;
170 static struct spinlock bsd4_spin;
171 static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];
172 static struct sysctl_ctx_list usched_bsd4_sysctl_ctx;
173 static struct sysctl_oid *usched_bsd4_sysctl_tree;
175 /* Debug info exposed through debug.* sysctl */
177 SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD,
179 "Number of run queues");
181 static int usched_bsd4_debug = -1;
182 SYSCTL_INT(_debug, OID_AUTO, bsd4_scdebug, CTLFLAG_RW,
183 &usched_bsd4_debug, 0,
184 "Print debug information for this pid");
186 static int usched_bsd4_pid_debug = -1;
187 SYSCTL_INT(_debug, OID_AUTO, bsd4_pid_debug, CTLFLAG_RW,
188 &usched_bsd4_pid_debug, 0,
189 "Print KTR debug information for this pid");
191 /* Tuning usched_bsd4 - configurable through kern.usched_bsd4.* */
193 static int usched_bsd4_smt = 0;
194 static int usched_bsd4_cache_coherent = 0;
195 static int usched_bsd4_upri_affinity = 16; /* 32 queues - half-way */
196 static int usched_bsd4_queue_checks = 5;
197 static int usched_bsd4_stick_to_level = 0;
198 static long usched_bsd4_kicks;
200 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
201 static int usched_bsd4_decay = 8;
202 static int usched_bsd4_batch_time = 10;
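/*
 * Sanity check on the defaults above (illustrative; ESTCPUFREQ is platform
 * configuration, assumed here to be 50): rrinterval = (ESTCPUFREQ + 9) / 10
 * rounds up to roughly one tenth of the scheduler clock rate, e.g.
 * (50 + 9) / 10 == 5 ticks, which is what makes bsd4_schedulerclock()
 * round-robin about 10 times per second.
 */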
204 /* KTR debug printings */
206 KTR_INFO_MASTER_EXTERN(usched);
208 #if !defined(KTR_USCHED_BSD4)
209 #define KTR_USCHED_BSD4 KTR_ALL
212 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0,
213 "USCHED_BSD4(bsd4_acquire_curproc in user_reseched_wanted "
214 "after release: pid %d, cpuid %d, curr_cpuid %d)",
215 pid_t pid, int cpuid, int curr);
216 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0,
217 "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, "
219 pid_t pid, int cpuid, int curr);
220 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0,
221 "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after "
222 "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)",
223 pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid);
224 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0,
225 "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, "
226 "cpuid %d, curr_cpuid %d)",
227 pid_t pid, int cpuid, int curr);
229 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0,
230 "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
231 "cpuid %d, curr_cpuid %d)",
232 pid_t pid, int cpuid, int curr);
234 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0,
235 "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
236 "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)",
237 pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr);
240 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0,
241 "USCHED_BSD4(batchy_looser_pri_test false: pid %d, "
242 "cpuid %d, verify_mask %lu)",
243 pid_t pid, int cpuid, cpumask_t mask);
244 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0,
245 "USCHED_BSD4(batchy_looser_pri_test true: pid %d, "
246 "cpuid %d, verify_mask %lu)",
247 pid_t pid, int cpuid, cpumask_t mask);
249 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0,
250 "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, "
251 "mask %lu, curr_cpuid %d)",
252 pid_t pid, int cpuid, cpumask_t mask, int curr);
253 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0,
254 "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, "
255 "cpuid %d, mask %lu, curr_cpuid %d)",
256 pid_t pid, int cpuid, cpumask_t mask, int curr);
257 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0,
258 "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, "
259 "cpuid %d, mask %lu, curr_cpuid %d)",
260 pid_t pid, int cpuid, cpumask_t mask, int curr);
261 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0,
262 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, "
263 "mask %lu, found_cpuid %d, curr_cpuid %d)",
264 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
265 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0,
266 "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, "
267 "try_cpuid %d, curr_cpuid %d)",
268 pid_t pid, int cpuid, int try_cpuid, int curr);
269 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0,
270 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, "
271 "mask %lu, found_cpuid %d, curr_cpuid %d)",
272 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
275 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc, 0,
276 "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
277 pid_t pid, int old_cpuid, int curr);
279 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0,
280 "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)",
281 pid_t pid, int old_cpuid, int curr);
282 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0,
283 "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, "
284 "sibling_mask %lu, curr_cpumask %lu)",
285 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
286 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0,
287 "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, "
288 "sibling_mask %lu, curr_cpumask: %lu)",
289 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
291 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0,
292 "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)",
293 int id, pid_t pid, int cpuid);
294 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0,
295 "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)",
296 int id, pid_t pid, int cpuid);
297 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0,
298 "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)",
299 int id, cpumask_t tmpmask);
303 * Initialize the run queues at boot time.
306 bsd4_rqinit(void *dummy)
310 spin_init(&bsd4_spin);
311 for (i = 0; i < NQS; i++) {
312 TAILQ_INIT(&bsd4_queues[i]);
313 TAILQ_INIT(&bsd4_rtqueues[i]);
314 TAILQ_INIT(&bsd4_idqueues[i]);
316 atomic_clear_cpumask(&bsd4_curprocmask, 1);
318 SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, bsd4_rqinit, NULL)
321 * BSD4_ACQUIRE_CURPROC
323 * This function is called when the kernel intends to return to userland.
324 * It is responsible for making the thread the current designated userland
325 * thread for this cpu, blocking if necessary.
327 * The kernel has already depressed our LWKT priority so we must not switch
328 * until we have either assigned or disposed of the thread.
330 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
331 * TO ANOTHER CPU! Because most of the kernel assumes that no migration will
332 * occur, this function is called only under very controlled circumstances.
337 bsd4_acquire_curproc(struct lwp *lp)
347 * Make sure we aren't sitting on a tsleep queue.
350 crit_enter_quick(td);
351 if (td->td_flags & TDF_TSLEEPQ)
353 bsd4_recalculate_estcpu(lp);
356 * If a reschedule was requested give another thread the
359 if (user_resched_wanted()) {
360 clear_user_resched();
361 bsd4_release_curproc(lp);
363 KTR_COND_LOG(usched_bsd4_acquire_curproc_urw,
364 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
366 lp->lwp_thread->td_gd->gd_cpuid,
371 * Loop until we are the current user thread
374 dd = &bsd4_pcpu[gd->gd_cpuid];
376 KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop,
377 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
379 lp->lwp_thread->td_gd->gd_cpuid,
384 * Process any pending events and higher priority threads.
389 * Become the currently scheduled user thread for this cpu
390 * if we can do so trivially.
392 * We can steal another thread's current thread designation
393 * on this cpu since, if we are running, that other thread
394 * must not be, so we can safely deschedule it.
396 if (dd->uschedcp == lp) {
398 * We are already the current lwp (hot path).
400 dd->upri = lp->lwp_priority;
401 } else if (dd->uschedcp == NULL) {
403 * We can trivially become the current lwp.
405 atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
407 dd->upri = lp->lwp_priority;
408 } else if (dd->upri > lp->lwp_priority) {
410 * We can steal the current cpu's lwp designation
411 * away simply by replacing it. The other thread
412 * will stall when it tries to return to userland.
415 dd->upri = lp->lwp_priority;
417 lwkt_deschedule(olp->lwp_thread);
418 bsd4_setrunqueue(olp);
422 * We cannot become the current lwp, place the lp
423 * on the bsd4 run-queue and deschedule ourselves.
425 * When we are reactivated we will have another
428 lwkt_deschedule(lp->lwp_thread);
430 bsd4_setrunqueue(lp);
432 KTR_COND_LOG(usched_bsd4_acquire_curproc_not,
433 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
435 lp->lwp_thread->td_gd->gd_cpuid,
436 dd->uschedcp->lwp_proc->p_pid,
443 * Reload after a switch or setrunqueue/switch possibly
444 * moved us to another cpu.
447 dd = &bsd4_pcpu[gd->gd_cpuid];
449 KTR_COND_LOG(usched_bsd4_acquire_curproc_switch,
450 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
452 lp->lwp_thread->td_gd->gd_cpuid,
455 } while (dd->uschedcp != lp);
458 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
462 * BSD4_RELEASE_CURPROC
464 * This routine detaches the current thread from the userland scheduler,
465 * usually because the thread needs to run or block in the kernel (at
466 * kernel priority) for a while.
468 * This routine is also responsible for selecting a new thread to
469 * make the current thread.
471 * NOTE: This implementation differs from the dummy example in that
472 * bsd4_select_curproc() is able to select the current process, whereas
473 * dummy_select_curproc() is not able to select the current process.
474 * This means we have to NULL out uschedcp.
476 * Additionally, note that we may already be on a run queue if releasing
477 * via the lwkt_switch() in bsd4_setrunqueue().
483 bsd4_release_curproc(struct lwp *lp)
485 globaldata_t gd = mycpu;
486 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
488 if (dd->uschedcp == lp) {
490 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
492 KTR_COND_LOG(usched_bsd4_release_curproc,
493 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
495 lp->lwp_thread->td_gd->gd_cpuid,
498 dd->uschedcp = NULL; /* don't let lp be selected */
499 dd->upri = PRIBASE_NULL;
500 atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
501 dd->old_uschedcp = lp; /* used only for KTR debug prints */
502 bsd4_select_curproc(gd);
508 * BSD4_SELECT_CURPROC
510 * Select a new current process for this cpu and clear any pending user
511 * reschedule request. The cpu currently has no current process.
513 * This routine is also responsible for equal-priority round-robining,
514 * typically triggered from bsd4_schedulerclock(). In our dummy example
515 * all the 'user' threads are LWKT scheduled all at once and we just
516 * call lwkt_switch().
518 * The calling process is not on the queue and cannot be selected.
524 bsd4_select_curproc(globaldata_t gd)
526 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
528 int cpuid = gd->gd_cpuid;
532 spin_lock(&bsd4_spin);
534 if(usched_bsd4_cache_coherent)
535 nlp = bsd4_chooseproc_locked_cache_coherent(dd->uschedcp);
538 nlp = bsd4_chooseproc_locked(dd->uschedcp);
542 KTR_COND_LOG(usched_bsd4_select_curproc,
543 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
544 nlp->lwp_proc->p_pid,
545 nlp->lwp_thread->td_gd->gd_cpuid,
546 dd->old_uschedcp->lwp_proc->p_pid,
547 dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid,
550 atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid));
551 dd->upri = nlp->lwp_priority;
553 dd->rrcount = 0; /* reset round robin */
554 spin_unlock(&bsd4_spin);
556 lwkt_acquire(nlp->lwp_thread);
558 lwkt_schedule(nlp->lwp_thread);
560 spin_unlock(&bsd4_spin);
564 } else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) {
565 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
566 spin_unlock(&bsd4_spin);
567 lwkt_schedule(&dd->helper_thread);
569 spin_unlock(&bsd4_spin);
577 * bsd4_batchy_looser_pri_test() - determine if a process is batchy or not
578 * relative to the other processes running in the system
581 bsd4_batchy_looser_pri_test(struct lwp* lp)
584 bsd4_pcpu_t other_dd;
587 /* Current running processes */
588 mask = bsd4_curprocmask & smp_active_mask
589 & usched_global_cpumask;
592 cpu = BSFCPUMASK(mask);
593 other_dd = &bsd4_pcpu[cpu];
594 if (other_dd->upri - lp->lwp_priority > usched_bsd4_upri_affinity * PPQ) {
596 KTR_COND_LOG(usched_batchy_test_false,
597 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
599 lp->lwp_thread->td_gd->gd_cpuid,
600 (unsigned long)mask);
604 mask &= ~CPUMASK(cpu);
607 KTR_COND_LOG(usched_batchy_test_true,
608 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
610 lp->lwp_thread->td_gd->gd_cpuid,
611 (unsigned long)mask);
621 * Place the specified lwp on the user scheduler's run queue. This routine
622 * must be called with the thread descheduled. The lwp must be runnable.
624 * The thread may be the current thread as a special case.
629 bsd4_setrunqueue(struct lwp *lp)
640 * First validate the process state relative to the current cpu.
641 * We don't need the spinlock for this, just a critical section.
642 * We are in control of the process.
645 KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
646 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
647 ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
648 lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
649 KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
652 * Note: gd and dd are relative to the target thread's last cpu,
653 * NOT our current cpu.
655 gd = lp->lwp_thread->td_gd;
656 dd = &bsd4_pcpu[gd->gd_cpuid];
659 * This process is not supposed to be scheduled anywhere or assigned
660 * as the current process anywhere. Assert the condition.
662 KKASSERT(dd->uschedcp != lp);
666 * If we are not SMP we do not have a scheduler helper to kick
667 * and must directly activate the process if none are scheduled.
669 * This is really only an issue when bootstrapping init since
670 * the caller in all other cases will be a user process, and
671 * even if released (dd->uschedcp == NULL), that process will
672 * kickstart the scheduler when it returns to user mode from
675 if (dd->uschedcp == NULL) {
676 atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
678 dd->upri = lp->lwp_priority;
679 lwkt_schedule(lp->lwp_thread);
687 * XXX fixme. Could be part of a remrunqueue/setrunqueue
688 * operation when the priority is recalculated, so TDF_MIGRATING
689 * may already be set.
691 if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
692 lwkt_giveaway(lp->lwp_thread);
696 * We lose control of lp the moment we release the spinlock after
697 * having placed lp on the queue. i.e. another cpu could pick it
698 * up and it could exit, or its priority could be further adjusted,
699 * or something like that.
701 spin_lock(&bsd4_spin);
702 bsd4_setrunqueue_locked(lp);
703 lp->lwp_rebal_ticks = sched_ticks;
707 * Kick the scheduler helper on one of the other cpu's
708 * and request a reschedule if appropriate.
710 * NOTE: We check all cpus whos rdyprocmask is set. First we
711 * look for cpus without designated lps, then we look for
712 * cpus with designated lps with a worse priority than our
717 if (usched_bsd4_smt) {
720 * SMT heuristic - Try to schedule on a free physical core.
721 * If no free physical core is found, then choose the one that has
722 * an interactive thread.
726 int min_prio = MAXPRI * MAXPRI;
729 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
730 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
731 smp_active_mask & usched_global_cpumask;
733 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt,
734 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
736 lp->lwp_thread->td_gd->gd_cpuid,
741 tmpmask = ~(CPUMASK(cpuid) - 1);
743 cpuid = BSFCPUMASK(mask & tmpmask);
745 cpuid = BSFCPUMASK(mask);
746 gd = globaldata_find(cpuid);
747 dd = &bsd4_pcpu[cpuid];
749 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) {
750 if (dd->cpunode->parent_node->members & ~dd->cpunode->members & mask) {
752 KTR_COND_LOG(usched_bsd4_setrunqueue_found,
753 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
755 lp->lwp_thread->td_gd->gd_cpuid,
762 sibling = BSFCPUMASK(dd->cpunode->parent_node->members &
763 ~dd->cpunode->members);
764 if (min_prio > bsd4_pcpu[sibling].upri) {
765 min_prio = bsd4_pcpu[sibling].upri;
770 mask &= ~CPUMASK(cpuid);
773 if (best_cpuid != -1) {
775 gd = globaldata_find(cpuid);
776 dd = &bsd4_pcpu[cpuid];
778 KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid,
779 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
781 lp->lwp_thread->td_gd->gd_cpuid,
789 /* Fall back to the original heuristic */
790 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
791 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
792 smp_active_mask & usched_global_cpumask;
794 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt,
795 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
797 lp->lwp_thread->td_gd->gd_cpuid,
802 tmpmask = ~(CPUMASK(cpuid) - 1);
804 cpuid = BSFCPUMASK(mask & tmpmask);
806 cpuid = BSFCPUMASK(mask);
807 gd = globaldata_find(cpuid);
808 dd = &bsd4_pcpu[cpuid];
810 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) {
812 KTR_COND_LOG(usched_bsd4_setrunqueue_found,
813 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
815 lp->lwp_thread->td_gd->gd_cpuid,
822 mask &= ~CPUMASK(cpuid);
827 * Then cpus which might have a currently running lp
829 mask = bsd4_curprocmask & bsd4_rdyprocmask &
830 lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;
832 KTR_COND_LOG(usched_bsd4_setrunqueue_rc,
833 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
835 lp->lwp_thread->td_gd->gd_cpuid,
840 tmpmask = ~(CPUMASK(cpuid) - 1);
842 cpuid = BSFCPUMASK(mask & tmpmask);
844 cpuid = BSFCPUMASK(mask);
845 gd = globaldata_find(cpuid);
846 dd = &bsd4_pcpu[cpuid];
848 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
850 KTR_COND_LOG(usched_bsd4_setrunqueue_found,
851 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
853 lp->lwp_thread->td_gd->gd_cpuid,
860 mask &= ~CPUMASK(cpuid);
864 * If we cannot find a suitable cpu we reload from bsd4_scancpu
865 * and round-robin. Other cpus will pick up as they release their
866 * current lwps or become ready.
868 * Avoid a degenerate system lockup case if usched_global_cpumask
869 * is set to 0 or otherwise does not cover lwp_cpumask.
871 * We only kick the target helper thread in this case, we do not
872 * set the user resched flag because
874 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
875 if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) {
878 gd = globaldata_find(cpuid);
879 dd = &bsd4_pcpu[cpuid];
881 KTR_COND_LOG(usched_bsd4_setrunqueue_not_found,
882 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
884 lp->lwp_thread->td_gd->gd_cpuid,
890 spin_unlock(&bsd4_spin);
891 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
892 if (dd->uschedcp == NULL) {
893 wakeup_mycpu(&dd->helper_thread);
899 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
900 spin_unlock(&bsd4_spin);
901 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
902 lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL);
904 wakeup(&dd->helper_thread);
908 * Request a reschedule if appropriate.
910 spin_unlock(&bsd4_spin);
911 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
919 * This routine is called from a systimer IPI. It MUST be MP-safe and
920 * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on
923 * This routine is called on every sched tick. If the currently running
924 * thread belongs to this scheduler it will be called with a non-NULL lp,
925 * otherwise it will be called with a NULL lp.
931 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
933 globaldata_t gd = mycpu;
934 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
937 * No impl if no lp running.
943 * Do we need to round-robin? We round-robin 10 times a second.
944 * This should only occur for cpu-bound batch processes.
946 if (++dd->rrcount >= usched_bsd4_rrinterval) {
952 * Adjust estcpu upward using a real time equivalent calculation.
954 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);
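/*
 * Illustrative arithmetic for the bump above (assuming the stock ESTCPUPPQ
 * of 512, so ESTCPUMAX == 512 * 32 == 16384, and an assumed ESTCPUFREQ of
 * 50): each tick adds 16384 / 50 + 1 == 328, so a thread that stays on the
 * cpu saturates estcpu at ESTCPUMAX after roughly one second of solid cpu
 * time (ESTCPULIM clamps it there).
 */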
957 * Spinlocks also hold a critical section so there should not be
960 KKASSERT(gd->gd_spinlocks_wr == 0);
962 bsd4_resetpriority(lp);
966 * Called from acquire and from kern_synch's one-second timer (one of the
967 * callout helper threads) with a critical section held.
969 * Decay p_estcpu based on the number of ticks we haven't been running
970 * and our p_nice. As the load increases each process observes a larger
971 * number of idle ticks (because other processes are running in them).
972 * This observation leads to a larger correction which tends to make the
973 * system more 'batchy'.
975 * Note that no recalculation occurs for a process which sleeps and wakes
976 * up in the same tick. That is, a system doing thousands of context
977 * switches per second will still only do serious estcpu calculations
978 * ESTCPUFREQ times per second.
984 bsd4_recalculate_estcpu(struct lwp *lp)
986 globaldata_t gd = mycpu;
993 * We have to subtract periodic to get the last schedclock
994 * timeout time, otherwise we would get the upcoming timeout.
995 * Keep in mind that a process can migrate between cpus and
996 * while the scheduler clock should be very close, boundary
997 * conditions could lead to a small negative delta.
999 cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
1001 if (lp->lwp_slptime > 1) {
1003 * Too much time has passed, do a coarse correction.
1005 lp->lwp_estcpu = lp->lwp_estcpu >> 1;
1006 bsd4_resetpriority(lp);
1007 lp->lwp_cpbase = cpbase;
1008 lp->lwp_cpticks = 0;
1009 lp->lwp_batch -= ESTCPUFREQ;
1010 if (lp->lwp_batch < 0)
1012 } else if (lp->lwp_cpbase != cpbase) {
1014 * Adjust estcpu if we are in a different tick. Don't waste
1015 * time if we are in the same tick.
1017 * First calculate the number of ticks in the measurement
1018 * interval. The ttlticks calculation can wind up 0 due to
1019 * a bug in the handling of lwp_slptime (as yet not found),
1020 * so make sure we do not get a divide by 0 panic.
1022 ttlticks = (cpbase - lp->lwp_cpbase) /
1023 gd->gd_schedclock.periodic;
1026 lp->lwp_cpbase = cpbase;
1030 updatepcpu(lp, lp->lwp_cpticks, ttlticks);
1033 * Calculate the percentage of one cpu used factoring in ncpus
1034 * and the load and adjust estcpu. Handle degenerate cases
1035 * by adding 1 to bsd4_runqcount.
1037 * estcpu is scaled by ESTCPUMAX.
1039 * bsd4_runqcount is the excess number of user processes
1040 * that cannot be immediately scheduled to cpus. We want
1041 * to count these as running to avoid range compression
1042 * in the base calculation (which is the actual percentage
1045 estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
1046 (bsd4_runqcount + ncpus) / (ncpus * ttlticks);
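/*
 * Worked example of the formula above (illustrative): with 4 cpus, an empty
 * backlog (bsd4_runqcount == 0) and an lwp that ran on every tick of the
 * interval (lwp_cpticks == ttlticks), estcpu works out to
 * (ttlticks * ESTCPUMAX) * 4 / (4 * ttlticks) == ESTCPUMAX, i.e. 100% of
 * one cpu. Running half the ticks yields ESTCPUMAX / 2, and a backlog
 * inflates the result to account for queued load.
 */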
1049 * If estcpu is > 50% we become more batch-like
1050 * If estcpu is <= 50% we become less batch-like
1052 * It takes 30 cpu seconds to traverse the entire range.
1054 if (estcpu > ESTCPUMAX / 2) {
1055 lp->lwp_batch += ttlticks;
1056 if (lp->lwp_batch > BATCHMAX)
1057 lp->lwp_batch = BATCHMAX;
1059 lp->lwp_batch -= ttlticks;
1060 if (lp->lwp_batch < 0)
1064 if (usched_bsd4_debug == lp->lwp_proc->p_pid) {
1065 kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
1066 lp->lwp_proc->p_pid, lp,
1067 estcpu, lp->lwp_estcpu,
1069 lp->lwp_cpticks, ttlticks);
1073 * Adjust lp->lwp_estcpu. The decay factor determines how
1074 * quickly lwp_estcpu collapses to its realtime calculation.
1075 * A slower collapse gives us a more accurate number but
1076 * can cause a cpu hog to eat too much cpu before the
1077 * scheduler decides to downgrade it.
1079 * NOTE: p_nice is accounted for in bsd4_resetpriority(),
1080 * and not here, but we must still ensure that a
1081 * cpu-bound nice -20 process does not completely
1082 * override a cpu-bound nice +20 process.
1084 * NOTE: We must use ESTCPULIM() here to deal with any
1087 decay_factor = usched_bsd4_decay;
1088 if (decay_factor < 1)
1090 if (decay_factor > 1024)
1091 decay_factor = 1024;
1093 lp->lwp_estcpu = ESTCPULIM(
1094 (lp->lwp_estcpu * decay_factor + estcpu) /
1095 (decay_factor + 1));
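/*
 * Illustrative numbers for the blend above: with the default decay_factor
 * of 8 the new estcpu is (old * 8 + instantaneous) / 9, an exponential
 * moving average that moves about 1/9th of the way toward the instantaneous
 * value on each recalculation. A freshly woken lwp (old == 0) that measures
 * estcpu == ESTCPUMAX therefore lands at roughly ESTCPUMAX / 9 in one pass.
 */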
1097 if (usched_bsd4_debug == lp->lwp_proc->p_pid)
1098 kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
1099 bsd4_resetpriority(lp);
1100 lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
1101 lp->lwp_cpticks = 0;
1106 * Compute the priority of a process when running in user mode.
1107 * Arrange to reschedule if the resulting priority is better
1108 * than that of the current process.
1110 * This routine may be called with any process.
1112 * This routine is called by fork1() for initial setup with the process
1113 * off the run queue, and also may be called normally with the process on or
1114 * off the run queue.
1119 bsd4_resetpriority(struct lwp *lp)
1129 * Calculate the new priority and queue type
1132 spin_lock(&bsd4_spin);
1134 newrqtype = lp->lwp_rtprio.type;
1137 case RTP_PRIO_REALTIME:
1139 newpriority = PRIBASE_REALTIME +
1140 (lp->lwp_rtprio.prio & PRIMASK);
1142 case RTP_PRIO_NORMAL:
1144 * Detune estcpu based on batchiness. lwp_batch ranges
1145 * from 0 to BATCHMAX. Limit estcpu for the sake of
1146 * the priority calculation to between 50% and 100%.
1148 estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
1152 * p_nice piece Adds (0-40) * 2 0-80
1153 * estcpu Adds 16384 * 4 / 512 0-128
1155 newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
1156 newpriority += estcpu * PPQ / ESTCPUPPQ;
1157 newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
1158 NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
1159 newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
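/*
 * Worked example (a sketch; assumes MAXPRI == 128, the standard PRIO_MIN
 * of -20, and the NICEPPQ == 2 / ESTCPUPPQ == 512 implied by the table
 * above): for a nice 0 lwp whose detuned estcpu is ESTCPUMAX / 2 (8192),
 *	nice piece:   (0 - (-20)) * 4 / 2 == 40
 *	estcpu piece: 8192 * 4 / 512      == 64
 * giving 104, scaled by 128 / (82 + 128): 104 * 128 / 210 == 63, which
 * lands in normal-class queue index 63 / PPQ == 15.
 */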
1162 newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
1164 case RTP_PRIO_THREAD:
1165 newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
1168 panic("Bad RTP_PRIO %d", newrqtype);
1173 * The newpriority incorporates the queue type so do a simple masked
1174 * check to determine if the process has moved to another queue. If
1175 * it has, and it is currently on a run queue, then move it.
1177 if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
1178 lp->lwp_priority = newpriority;
1179 if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
1180 bsd4_remrunqueue_locked(lp);
1181 lp->lwp_rqtype = newrqtype;
1182 lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1183 bsd4_setrunqueue_locked(lp);
1186 lp->lwp_rqtype = newrqtype;
1187 lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1190 reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
1192 lp->lwp_priority = newpriority;
1198 * Determine if we need to reschedule the target cpu. This only
1199 * occurs if the LWP is already on a scheduler queue, which means
1200 * that idle cpu notification has already occurred. At most we
1201 * need only issue a need_user_resched() on the appropriate cpu.
1203 * The LWP may be owned by a CPU different from the current one,
1204 * in which case dd->uschedcp may be modified without an MP lock
1205 * or a spinlock held. The worst that happens is that the code
1206 * below causes a spurious need_user_resched() on the target CPU
1207 * and dd->pri to be wrong for a short period of time, both of
1208 * which are harmless.
1210 * If checkpri is 0 we are adjusting the priority of the current
1211 * process, possibly higher (less desirable), so ignore the upri
1212 * check which will fail in that case.
1214 if (reschedcpu >= 0) {
1215 dd = &bsd4_pcpu[reschedcpu];
1216 if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
1218 (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
1220 if (reschedcpu == mycpu->gd_cpuid) {
1221 spin_unlock(&bsd4_spin);
1222 need_user_resched();
1224 spin_unlock(&bsd4_spin);
1225 atomic_clear_cpumask(&bsd4_rdyprocmask,
1226 CPUMASK(reschedcpu));
1227 lwkt_send_ipiq(lp->lwp_thread->td_gd,
1228 bsd4_need_user_resched_remote,
1232 spin_unlock(&bsd4_spin);
1233 need_user_resched();
1236 spin_unlock(&bsd4_spin);
1239 spin_unlock(&bsd4_spin);
1249 bsd4_yield(struct lwp *lp)
1252 /* FUTURE (or something similar) */
1253 switch(lp->lwp_rqtype) {
1254 case RTP_PRIO_NORMAL:
1255 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
1261 need_user_resched();
1265 * Called from fork1() when a new child process is being created.
1267 * Give the child process an initial estcpu that is more batchy than
1268 * its parent and dock the parent for the fork (but do not
1269 * reschedule the parent). This comprises the main part of our batch
1270 * detection heuristic for both parallel forking and sequential execs.
1272 * XXX lwp should be "spawning" instead of "forking"
1277 bsd4_forking(struct lwp *plp, struct lwp *lp)
1280 * Put the child 4 queue slots (out of 32) higher than the parent
1281 * (less desirable than the parent).
1283 lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);
1286 * The batch status of children always starts out centerline
1287 * and will inch-up or inch-down as appropriate. It takes roughly
1288 * ~15 seconds of >50% cpu to hit the limit.
1290 lp->lwp_batch = BATCHMAX / 2;
1293 * Dock the parent a cost for the fork, protecting us from fork
1294 * bombs. If the parent is forking quickly make the child more
1297 plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
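/*
 * Illustrative arithmetic for the two adjustments above (assuming the
 * ESTCPUPPQ of 512 implied by the estcpu table in bsd4_resetpriority()):
 * the child starts with ESTCPUPPQ * 4 == 2048 extra estcpu units, the
 * "4 queue slots" mentioned above, while each fork charges the parent
 * ESTCPUPPQ / 16 == 32 units, so roughly 16 rapid forks cost the parent
 * about one queue's worth of estcpu before the normal decay catches up.
 */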
1301 * Called when a lwp is being removed from this scheduler, typically
1302 * during lwp_exit().
1305 bsd4_exiting(struct lwp *lp, struct proc *child_proc)
1310 bsd4_uload_update(struct lwp *lp)
1315 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
1316 * it selects a user process and returns it. If chklp is non-NULL and chklp
1317 * has a better or equal priority than the process that would otherwise be
1318 * chosen, NULL is returned.
1320 * Until we fix the RUNQ code the chklp test has to be strict or we may
1321 * bounce between processes trying to acquire the current process designation.
1323 * MPSAFE - must be called with bsd4_spin exclusive held. The spinlock is
1324 * left intact through the entire routine.
1328 bsd4_chooseproc_locked(struct lwp *chklp)
1332 u_int32_t *which, *which2;
1339 rtqbits = bsd4_rtqueuebits;
1340 tsqbits = bsd4_queuebits;
1341 idqbits = bsd4_idqueuebits;
1342 cpumask = mycpu->gd_cpumask;
1349 pri = bsfl(rtqbits);
1350 q = &bsd4_rtqueues[pri];
1351 which = &bsd4_rtqueuebits;
1353 } else if (tsqbits) {
1354 pri = bsfl(tsqbits);
1355 q = &bsd4_queues[pri];
1356 which = &bsd4_queuebits;
1358 } else if (idqbits) {
1359 pri = bsfl(idqbits);
1360 q = &bsd4_idqueues[pri];
1361 which = &bsd4_idqueuebits;
1366 lp = TAILQ_FIRST(q);
1367 KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1370 while ((lp->lwp_cpumask & cpumask) == 0) {
1371 lp = TAILQ_NEXT(lp, lwp_procq);
1373 *which2 &= ~(1 << pri);
1380 * If the passed lwp <chklp> is reasonably close to the selected
1381 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1383 * Note that we must error on the side of <chklp> to avoid bouncing
1384 * between threads in the acquire code.
1387 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1393 * If the chosen lwp does not reside on this cpu spend a few
1394 * cycles looking for a better candidate at the same priority level.
1395 * This is a fallback check; setrunqueue() tries to wake up the
1396 * correct cpu and is our front-line affinity.
1398 if (lp->lwp_thread->td_gd != mycpu &&
1399 (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
1401 if (chklp->lwp_thread->td_gd == mycpu) {
1407 KTR_COND_LOG(usched_chooseproc,
1408 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1409 lp->lwp_proc->p_pid,
1410 lp->lwp_thread->td_gd->gd_cpuid,
1413 TAILQ_REMOVE(q, lp, lwp_procq);
1416 *which &= ~(1 << pri);
1417 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1418 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1425 * chooseproc() - with a cache coherence heuristic. Try to pull a process that
1426 * has its home on the current CPU. If the process doesn't have its home here
1427 * and is a batchy one (see bsd4_batchy_looser_pri_test), we can wait for a
1428 * sched_tick; maybe its home cpu will become free and pull it in. Anyway,
1429 * we can't wait more than one tick. If that tick has expired, we pull in the
1430 * process, no matter what.
1434 bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp)
1438 u_int32_t *which, *which2;
1446 struct lwp * min_level_lwp = NULL;
1447 struct rq *min_q = NULL;
1449 cpu_node_t* cpunode = NULL;
1450 u_int32_t min_level = MAXCPU; /* number of levels < MAXCPU */
1451 u_int32_t *min_which = NULL;
1452 u_int32_t min_pri = 0;
1453 u_int32_t level = 0;
1455 rtqbits = bsd4_rtqueuebits;
1456 tsqbits = bsd4_queuebits;
1457 idqbits = bsd4_idqueuebits;
1458 cpumask = mycpu->gd_cpumask;
1460 /* Get the mask corresponding to the sysctl-configured level */
1461 cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode;
1462 level = usched_bsd4_stick_to_level;
1464 cpunode = cpunode->parent_node;
1467 /* The cpus which can elect a process */
1468 siblings = cpunode->members;
1473 pri = bsfl(rtqbits);
1474 q = &bsd4_rtqueues[pri];
1475 which = &bsd4_rtqueuebits;
1477 } else if (tsqbits) {
1478 pri = bsfl(tsqbits);
1479 q = &bsd4_queues[pri];
1480 which = &bsd4_queuebits;
1482 } else if (idqbits) {
1483 pri = bsfl(idqbits);
1484 q = &bsd4_idqueues[pri];
1485 which = &bsd4_idqueuebits;
1489 * No more left and we didn't reach the checks limit.
1491 bsd4_kick_helper(min_level_lwp);
1494 lp = TAILQ_FIRST(q);
1495 KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1498 * Limit the number of checks/queue to a configurable value to
1499 * minimize the contention (we are in a locked region
1501 while (checks < usched_bsd4_queue_checks) {
1502 if ((lp->lwp_cpumask & cpumask) == 0 ||
1503 ((siblings & lp->lwp_thread->td_gd->gd_cpumask) == 0 &&
1504 (lp->lwp_rebal_ticks == sched_ticks ||
1505 lp->lwp_rebal_ticks == (int)(sched_ticks - 1)) &&
1506 bsd4_batchy_looser_pri_test(lp))) {
1508 KTR_COND_LOG(usched_chooseproc_cc_not_good,
1509 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1510 lp->lwp_proc->p_pid,
1511 (unsigned long)lp->lwp_thread->td_gd->gd_cpumask,
1512 (unsigned long)siblings,
1513 (unsigned long)cpumask);
1515 cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode;
1518 if (cpunode->members & cpumask)
1520 cpunode = cpunode->parent_node;
1523 if (level < min_level ||
1524 (level == min_level && min_level_lwp &&
1525 lp->lwp_priority < min_level_lwp->lwp_priority)) {
1526 bsd4_kick_helper(min_level_lwp);
1533 bsd4_kick_helper(lp);
1535 lp = TAILQ_NEXT(lp, lwp_procq);
1537 *which2 &= ~(1 << pri);
1541 KTR_COND_LOG(usched_chooseproc_cc_elected,
1542 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1543 lp->lwp_proc->p_pid,
1544 (unsigned long)lp->lwp_thread->td_gd->gd_cpumask,
1545 (unsigned long)siblings,
1546 (unsigned long)cpumask);
1554 * Checks exhausted; we tried to defer too many threads, so schedule
1555 * the best of the worst.
1561 KASSERT(lp, ("chooseproc: at least the first lp was good"));
1566 * If the passed lwp <chklp> is reasonably close to the selected
1567 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1569 * Note that we must error on the side of <chklp> to avoid bouncing
1570 * between threads in the acquire code.
1573 if (chklp->lwp_priority < lp->lwp_priority + PPQ) {
1574 bsd4_kick_helper(lp);
1579 KTR_COND_LOG(usched_chooseproc_cc,
1580 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1581 lp->lwp_proc->p_pid,
1582 lp->lwp_thread->td_gd->gd_cpuid,
1585 TAILQ_REMOVE(q, lp, lwp_procq);
1588 *which &= ~(1 << pri);
1589 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1590 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1596 * If we aren't willing to schedule a ready process on our cpu, give its
1597 * target cpu a kick rather than wait for the next tick.
1599 * Called with bsd4_spin held.
1603 bsd4_kick_helper(struct lwp *lp)
1610 gd = lp->lwp_thread->td_gd;
1611 dd = &bsd4_pcpu[gd->gd_cpuid];
1612 if ((smp_active_mask & usched_global_cpumask &
1613 bsd4_rdyprocmask & gd->gd_cpumask) == 0) {
1616 ++usched_bsd4_kicks;
1617 atomic_clear_cpumask(&bsd4_rdyprocmask, gd->gd_cpumask);
1618 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
1619 lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL);
1621 wakeup(&dd->helper_thread);
1627 bsd4_need_user_resched_remote(void *dummy)
1629 globaldata_t gd = mycpu;
1630 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
1632 need_user_resched();
1634 /* Call wakeup_mycpu to avoid sending IPIs to other CPUs */
1635 wakeup_mycpu(&dd->helper_thread);
1641 * bsd4_remrunqueue_locked() removes a given process from the run queue
1642 * that it is on, clearing the queue busy bit if it becomes empty.
1644 * Note that the user process scheduler is different from the LWKT scheduler.
1645 * The user process scheduler only manages user processes but it uses LWKT
1646 * underneath, and a user process operating in the kernel will often be
1647 * 'released' from our management.
1649 * MPSAFE - bsd4_spin must be held exclusively on call
1652 bsd4_remrunqueue_locked(struct lwp *lp)
1658 KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
1659 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1661 KKASSERT(bsd4_runqcount >= 0);
1663 pri = lp->lwp_rqindex;
1664 switch(lp->lwp_rqtype) {
1665 case RTP_PRIO_NORMAL:
1666 q = &bsd4_queues[pri];
1667 which = &bsd4_queuebits;
1669 case RTP_PRIO_REALTIME:
1671 q = &bsd4_rtqueues[pri];
1672 which = &bsd4_rtqueuebits;
1675 q = &bsd4_idqueues[pri];
1676 which = &bsd4_idqueuebits;
1679 panic("remrunqueue: invalid rtprio type");
1682 TAILQ_REMOVE(q, lp, lwp_procq);
1683 if (TAILQ_EMPTY(q)) {
1684 KASSERT((*which & (1 << pri)) != 0,
1685 ("remrunqueue: remove from empty queue"));
1686 *which &= ~(1 << pri);
1691 * bsd4_setrunqueue_locked()
1693 * Add a process whose rqtype and rqindex have previously been calculated
1694 * onto the appropriate run queue. Determine if the addition requires
1695 * a reschedule on a cpu and return the cpuid or -1.
1697 * NOTE: Lower priorities are better priorities.
1699 * MPSAFE - bsd4_spin must be held exclusively on call
1702 bsd4_setrunqueue_locked(struct lwp *lp)
1708 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
1709 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1712 pri = lp->lwp_rqindex;
1714 switch(lp->lwp_rqtype) {
1715 case RTP_PRIO_NORMAL:
1716 q = &bsd4_queues[pri];
1717 which = &bsd4_queuebits;
1719 case RTP_PRIO_REALTIME:
1721 q = &bsd4_rtqueues[pri];
1722 which = &bsd4_rtqueuebits;
1725 q = &bsd4_idqueues[pri];
1726 which = &bsd4_idqueuebits;
1729 panic("remrunqueue: invalid rtprio type");
1734 * Add to the correct queue and set the appropriate bit. If no
1735 * lower priority (i.e. better) processes are in the queue then
1736 * we want a reschedule; calculate the best cpu for the job.
1738 * Always run reschedules on the LWP's original cpu.
1740 TAILQ_INSERT_TAIL(q, lp, lwp_procq);
1747 * For SMP systems a user scheduler helper thread is created for each
1748 * cpu and is used to allow one cpu to wake up another for the purposes of
1749 * scheduling userland threads from setrunqueue().
1751 * UP systems do not need the helper since there is only one cpu.
1753 * We can't use the idle thread for this because we might block.
1754 * Additionally, doing things this way allows us to HLT idle cpus
1760 sched_thread(void *dummy)
1772 cpuid = gd->gd_cpuid; /* doesn't change */
1773 mask = gd->gd_cpumask; /* doesn't change */
1774 dd = &bsd4_pcpu[cpuid];
1777 * Since we are woken up only when no user processes are scheduled
1778 * on a cpu, we can run at an ultra low priority.
1780 lwkt_setpri_self(TDPRI_USER_SCHEDULER);
1782 tsleep(&dd->helper_thread, 0, "sched_thread_sleep", 0);
1786 * We use the LWKT deschedule-interlock trick to avoid racing
1787 * bsd4_rdyprocmask. This means we cannot block through to the
1788 * manual lwkt_switch() call we make below.
1791 tsleep_interlock(&dd->helper_thread, 0);
1792 spin_lock(&bsd4_spin);
1793 atomic_set_cpumask(&bsd4_rdyprocmask, mask);
1795 clear_user_resched(); /* This satisfies the reschedule request */
1796 dd->rrcount = 0; /* Reset the round-robin counter */
1798 if ((bsd4_curprocmask & mask) == 0) {
1800 * No thread is currently scheduled.
1802 KKASSERT(dd->uschedcp == NULL);
1803 if ((nlp = bsd4_chooseproc_locked(NULL)) != NULL) {
1804 KTR_COND_LOG(usched_sched_thread_no_process,
1805 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1807 nlp->lwp_proc->p_pid,
1808 nlp->lwp_thread->td_gd->gd_cpuid);
1810 atomic_set_cpumask(&bsd4_curprocmask, mask);
1811 dd->upri = nlp->lwp_priority;
1813 dd->rrcount = 0; /* reset round robin */
1814 spin_unlock(&bsd4_spin);
1815 lwkt_acquire(nlp->lwp_thread);
1816 lwkt_schedule(nlp->lwp_thread);
1818 spin_unlock(&bsd4_spin);
1820 } else if (bsd4_runqcount) {
1821 if ((nlp = bsd4_chooseproc_locked(dd->uschedcp)) != NULL) {
1822 KTR_COND_LOG(usched_sched_thread_process,
1823 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1825 nlp->lwp_proc->p_pid,
1826 nlp->lwp_thread->td_gd->gd_cpuid);
1828 dd->upri = nlp->lwp_priority;
1830 dd->rrcount = 0; /* reset round robin */
1831 spin_unlock(&bsd4_spin);
1832 lwkt_acquire(nlp->lwp_thread);
1833 lwkt_schedule(nlp->lwp_thread);
1836 * CHAINING CONDITION TRAIN
1838 * We could not deal with the scheduler wakeup
1839 * request on this cpu, locate a ready scheduler
1840 * with no current lp assignment and chain to it.
1842 * This ensures that a wakeup race which fails due
1843 * to priority test does not leave other unscheduled
1844 * cpus idle when the runqueue is not empty.
1846 tmpmask = ~bsd4_curprocmask &
1847 bsd4_rdyprocmask & smp_active_mask;
1849 tmpid = BSFCPUMASK(tmpmask);
1850 tmpdd = &bsd4_pcpu[tmpid];
1851 atomic_clear_cpumask(&bsd4_rdyprocmask,
1853 spin_unlock(&bsd4_spin);
1854 wakeup(&tmpdd->helper_thread);
1856 spin_unlock(&bsd4_spin);
1859 KTR_LOG(usched_sched_thread_no_process_found,
1860 gd->gd_cpuid, (unsigned long)tmpmask);
1864 * The runq is empty.
1866 spin_unlock(&bsd4_spin);
1870 * We're descheduled unless someone scheduled us. Switch away.
1871 * Exiting the critical section will cause splz() to be called
1872 * for us if interrupts and such are pending.
1875 tsleep(&dd->helper_thread, PINTERLOCKED, "schslp", 0);
1879 /* sysctl stick_to_level parameter */
1881 sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS)
1885 new_val = usched_bsd4_stick_to_level;
1887 error = sysctl_handle_int(oidp, &new_val, 0, req);
1888 if (error != 0 || req->newptr == NULL)
1890 if (new_val > cpu_topology_levels_number - 1 || new_val < 0)
1892 usched_bsd4_stick_to_level = new_val;
1897 * Set up our scheduler helpers. Note that curprocmask bit 0 has already
1898 * been cleared by rqinit() and we should not mess with it further.
1901 sched_thread_cpu_init(void)
1905 int smt_not_supported = 0;
1906 int cache_coherent_not_supported = 0;
1909 kprintf("Start scheduler helpers on cpus:\n");
1911 sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
1912 usched_bsd4_sysctl_tree =
1913 SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
1914 SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
1915 "usched_bsd4", CTLFLAG_RD, 0, "");
1917 for (i = 0; i < ncpus; ++i) {
1918 bsd4_pcpu_t dd = &bsd4_pcpu[i];
1919 cpumask_t mask = CPUMASK(i);
1921 if ((mask & smp_active_mask) == 0)
1924 dd->cpunode = get_cpu_node_by_cpuid(i);
1926 if (dd->cpunode == NULL) {
1927 smt_not_supported = 1;
1928 cache_coherent_not_supported = 1;
1930 kprintf ("\tcpu%d - WARNING: No CPU NODE "
1931 "found for cpu\n", i);
1933 switch (dd->cpunode->type) {
1936 kprintf ("\tcpu%d - HyperThreading "
1937 "available. Core siblings: ",
1941 smt_not_supported = 1;
1944 kprintf ("\tcpu%d - No HT available, "
1945 "multi-core/physical "
1946 "cpu. Physical siblings: ",
1950 smt_not_supported = 1;
1953 kprintf ("\tcpu%d - No HT available, "
1954 "single-core/physical cpu. "
1955 "Package Siblings: ",
1959 /* Let's go for safe defaults here */
1960 smt_not_supported = 1;
1961 cache_coherent_not_supported = 1;
1963 kprintf ("\tcpu%d - Unknown cpunode->"
1964 "type=%u. Siblings: ",
1966 (u_int)dd->cpunode->type);
1971 if (dd->cpunode->parent_node != NULL) {
1972 CPUSET_FOREACH(cpuid, dd->cpunode->parent_node->members)
1973 kprintf("cpu%d ", cpuid);
1976 kprintf(" no siblings\n");
1981 lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
1982 0, i, "usched %d", i);
1985 * Allow user scheduling on the target cpu. cpu #0 has already
1986 * been enabled in rqinit().
1989 atomic_clear_cpumask(&bsd4_curprocmask, mask);
1990 atomic_set_cpumask(&bsd4_rdyprocmask, mask);
1991 dd->upri = PRIBASE_NULL;
1995 /* usched_bsd4 sysctl configurable parameters */
1997 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1998 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1999 OID_AUTO, "rrinterval", CTLFLAG_RW,
2000 &usched_bsd4_rrinterval, 0, "");
2001 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2002 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2003 OID_AUTO, "decay", CTLFLAG_RW,
2004 &usched_bsd4_decay, 0, "Extra decay when not running");
2005 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2006 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2007 OID_AUTO, "batch_time", CTLFLAG_RW,
2008 &usched_bsd4_batch_time, 0, "Min batch counter value");
2009 SYSCTL_ADD_LONG(&usched_bsd4_sysctl_ctx,
2010 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2011 OID_AUTO, "kicks", CTLFLAG_RW,
2012 &usched_bsd4_kicks, "Number of kickstarts");
2014 /* Add enable/disable option for SMT scheduling if supported */
2015 if (smt_not_supported) {
2016 usched_bsd4_smt = 0;
2017 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
2018 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2019 OID_AUTO, "smt", CTLFLAG_RD,
2020 "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
2022 usched_bsd4_smt = 1;
2023 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2024 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2025 OID_AUTO, "smt", CTLFLAG_RW,
2026 &usched_bsd4_smt, 0, "Enable SMT scheduling");
2030 * Add enable/disable option for cache coherent scheduling
2033 if (cache_coherent_not_supported) {
2034 usched_bsd4_cache_coherent = 0;
2035 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
2036 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2037 OID_AUTO, "cache_coherent", CTLFLAG_RD,
2039 "Cache coherence NOT SUPPORTED");
2041 usched_bsd4_cache_coherent = 1;
2042 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2043 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2044 OID_AUTO, "cache_coherent", CTLFLAG_RW,
2045 &usched_bsd4_cache_coherent, 0,
2046 "Enable/Disable cache coherent scheduling");
2048 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2049 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2050 OID_AUTO, "upri_affinity", CTLFLAG_RW,
2051 &usched_bsd4_upri_affinity, 1,
2052 "Number of PPQs in user priority check");
2054 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2055 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2056 OID_AUTO, "queue_checks", CTLFLAG_RW,
2057 &usched_bsd4_queue_checks, 5,
2058 "LWPs to check from a queue before giving up");
2060 SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx,
2061 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2062 OID_AUTO, "stick_to_level",
2063 CTLTYPE_INT | CTLFLAG_RW,
2064 NULL, sizeof usched_bsd4_stick_to_level,
2065 sysctl_usched_bsd4_stick_to_level, "I",
2066 "Stick a process to this level. See sysctl"
2067 "paremter hw.cpu_topology.level_description");
2070 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2071 sched_thread_cpu_init, NULL)
2073 #else /* No SMP options - just add the configurable parameters to sysctl */
2076 sched_sysctl_tree_init(void)
2078 sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
2079 usched_bsd4_sysctl_tree =
2080 SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
2081 SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
2082 "usched_bsd4", CTLFLAG_RD, 0, "");
2084 /* usched_bsd4 sysctl configurable parameters */
2085 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2086 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2087 OID_AUTO, "rrinterval", CTLFLAG_RW,
2088 &usched_bsd4_rrinterval, 0, "");
2089 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2090 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2091 OID_AUTO, "decay", CTLFLAG_RW,
2092 &usched_bsd4_decay, 0, "Extra decay when not running");
2093 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2094 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2095 OID_AUTO, "batch_time", CTLFLAG_RW,
2096 &usched_bsd4_batch_time, 0, "Min batch counter value");
2098 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2099 sched_sysctl_tree_init, NULL)