usched_bsd4 - Topology-aware scheduling
[dragonfly.git] / sys / kern / usched_bsd4.c
1/*
2 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/param.h>
28#include <sys/systm.h>
29#include <sys/kernel.h>
30#include <sys/lock.h>
31#include <sys/queue.h>
32#include <sys/proc.h>
33#include <sys/rtprio.h>
34#include <sys/uio.h>
35#include <sys/sysctl.h>
36#include <sys/resourcevar.h>
52eedfb5 37#include <sys/spinlock.h>
d6d39bc7 38#include <sys/cpu_topology.h>
39#include <sys/thread2.h>
40#include <sys/spinlock2.h>
684a93c4 41#include <sys/mplock2.h>
52eedfb5 42
43#include <sys/ktr.h>
44
45#include <machine/cpu.h>
46#include <machine/smp.h>
47
48/*
49 * Priorities. Note that with 32 run queues per scheduler each queue
50 * represents four priority levels.
51 */
52
53#define MAXPRI 128
54#define PRIMASK (MAXPRI - 1)
55#define PRIBASE_REALTIME 0
56#define PRIBASE_NORMAL MAXPRI
57#define PRIBASE_IDLE (MAXPRI * 2)
58#define PRIBASE_THREAD (MAXPRI * 3)
59#define PRIBASE_NULL (MAXPRI * 4)
60
61#define NQS 32 /* 32 run queues. */
62#define PPQ (MAXPRI / NQS) /* priorities per queue */
52eedfb5 63#define PPQMASK (PPQ - 1)
64
65/*
66 * NICEPPQ - number of nice units per priority queue
67 *
68 * ESTCPUPPQ - number of estcpu units per priority queue
69 * ESTCPUMAX - number of estcpu units
70 */
71#define NICEPPQ 2
72#define ESTCPUPPQ 512
73#define ESTCPUMAX (ESTCPUPPQ * NQS)
52cac9fb 74#define BATCHMAX (ESTCPUFREQ * 30)
75#define PRIO_RANGE (PRIO_MAX - PRIO_MIN + 1)
76
77#define ESTCPULIM(v) min((v), ESTCPUMAX)
78
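/*
 * Illustrative note (not part of the original file): with MAXPRI = 128 and
 * NQS = 32, PPQ works out to 4, so four adjacent priority values share one
 * run queue.  A minimal sketch of the mapping used throughout this file:
 *
 *	int rqindex = (priority & PRIMASK) / PPQ;
 *
 * e.g. normal-class priorities 52 through 55 all land on queue index 13.
 * ESTCPULIM() simply clamps an estcpu value to ESTCPUMAX (512 * 32 = 16384).
 */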
553ea3c8 79TAILQ_HEAD(rq, lwp);
38b25931 80
81#define lwp_priority lwp_usdata.bsd4.priority
82#define lwp_rqindex lwp_usdata.bsd4.rqindex
553ea3c8 83#define lwp_estcpu lwp_usdata.bsd4.estcpu
52cac9fb 84#define lwp_batch lwp_usdata.bsd4.batch
52eedfb5 85#define lwp_rqtype lwp_usdata.bsd4.rqtype
38b25931 86
87static void bsd4_acquire_curproc(struct lwp *lp);
88static void bsd4_release_curproc(struct lwp *lp);
38b25931 89static void bsd4_select_curproc(globaldata_t gd);
553ea3c8 90static void bsd4_setrunqueue(struct lwp *lp);
553ea3c8 91static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
38b25931 92 sysclock_t cpstamp);
52eedfb5 93static void bsd4_recalculate_estcpu(struct lwp *lp);
94static void bsd4_resetpriority(struct lwp *lp);
95static void bsd4_forking(struct lwp *plp, struct lwp *lp);
52cac9fb 96static void bsd4_exiting(struct lwp *lp, struct proc *);
c3149361 97static void bsd4_yield(struct lwp *lp);
38b25931 98
99#ifdef SMP
100static void need_user_resched_remote(void *dummy);
101static int batchy_looser_pri_test(struct lwp* lp);
102static struct lwp *chooseproc_locked_cache_coherent(struct lwp *chklp);
103#endif
104static struct lwp *chooseproc_locked(struct lwp *chklp);
105static void bsd4_remrunqueue_locked(struct lwp *lp);
106static void bsd4_setrunqueue_locked(struct lwp *lp);
107
108struct usched usched_bsd4 = {
109 { NULL },
110 "bsd4", "Original DragonFly Scheduler",
111 NULL, /* default registration */
112 NULL, /* default deregistration */
113 bsd4_acquire_curproc,
114 bsd4_release_curproc,
38b25931 115 bsd4_setrunqueue,
116 bsd4_schedulerclock,
117 bsd4_recalculate_estcpu,
118 bsd4_resetpriority,
119 bsd4_forking,
cb7f4ab1 120 bsd4_exiting,
121 NULL, /* setcpumask not supported */
122 bsd4_yield
123};
124
52eedfb5 125struct usched_bsd4_pcpu {
126 struct thread helper_thread;
127 short rrcount;
128 short upri;
129 struct lwp *uschedcp;
130 struct lwp *old_uschedcp;
131#ifdef SMP
132 cpu_node_t *cpunode;
133#endif
134};
135
136typedef struct usched_bsd4_pcpu *bsd4_pcpu_t;
137
138/*
139 * We have NQS (32) run queues per scheduling class. For the normal
140 * class, there are 128 priorities scaled onto these 32 queues. New
141 * processes are added to the last entry in each queue, and processes
142 * are selected for running by taking them from the head and maintaining
143 * a simple FIFO arrangement. Realtime and Idle priority processes have
144 * an explicit 0-31 priority which maps directly onto their class queue
145 * index. When a queue has something in it, the corresponding bit is
146 * set in the queuebits variable, allowing a single read to determine
147 * the state of all 32 queues and then a ffs() to find the first busy
148 * queue.
149 */
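/*
 * Illustrative sketch (simplified, not part of the original file): the
 * bit-per-non-empty-queue arrangement described above lets the best busy
 * queue be found with a single find-first-set operation, roughly:
 *
 *	u_int32_t bits = bsd4_queuebits;	(one bit per non-empty queue)
 *	if (bits) {
 *		int pri = bsfl(bits);		(index of the best busy queue)
 *		struct lwp *lp = TAILQ_FIRST(&bsd4_queues[pri]);
 *		... dequeue lp and clear the bit if the queue became empty ...
 *	}
 *
 * chooseproc_locked() below is the real implementation of this idea.
 */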
150static struct rq bsd4_queues[NQS];
151static struct rq bsd4_rtqueues[NQS];
152static struct rq bsd4_idqueues[NQS];
153static u_int32_t bsd4_queuebits;
154static u_int32_t bsd4_rtqueuebits;
155static u_int32_t bsd4_idqueuebits;
156static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */
157static cpumask_t bsd4_rdyprocmask; /* ready to accept a user process */
158static int bsd4_runqcount;
38b25931 159#ifdef SMP
52eedfb5 160static volatile int bsd4_scancpu;
38b25931 161#endif
162static struct spinlock bsd4_spin;
163static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];
164static struct sysctl_ctx_list usched_bsd4_sysctl_ctx;
165static struct sysctl_oid *usched_bsd4_sysctl_tree;
166
167/* Debug info exposed through debug.* sysctl */
38b25931 168
169SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0,
170 "Number of run queues");
171#ifdef INVARIANTS
172static int usched_nonoptimal;
173SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
174 &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
175static int usched_optimal;
176SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
177 &usched_optimal, 0, "acquire_curproc() was optimal");
178#endif
179
180static int usched_bsd4_debug = -1;
181SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_bsd4_debug, 0,
0c52fa62 182 "Print debug information for this pid");
183static int usched_bsd4_pid_debug = -1;
184SYSCTL_INT(_debug, OID_AUTO, pid_debug, CTLFLAG_RW, &usched_bsd4_pid_debug, 0,
185 "Print KTR debug information for this pid");
186
38b25931 187#ifdef SMP
188static int remote_resched_nonaffinity;
189static int remote_resched_affinity;
190static int choose_affinity;
191SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
192 &remote_resched_nonaffinity, 0, "Number of remote rescheds");
193SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
194 &remote_resched_affinity, 0, "Number of remote rescheds");
195SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
196 &choose_affinity, 0, "chooseproc() was smart");
197#endif
198
199
200/* Tuning usched_bsd4 - configurable through kern.usched_bsd4.* */
201#ifdef SMP
202static int usched_bsd4_smt = 0;
203static int usched_bsd4_cache_coherent = 0;
204static int usched_bsd4_upri_affinity = 16; /* 32 queues - half-way */
205static int usched_bsd4_queue_checks = 5;
206static int usched_bsd4_stick_to_level = 0;
207#endif
38b25931 208static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
52cac9fb 209static int usched_bsd4_decay = 8;
52cac9fb 210static int usched_bsd4_batch_time = 10;
211
212/* KTR debug printings */
213
214KTR_INFO_MASTER(usched);
215
216#if !defined(KTR_USCHED_BSD4)
217#define KTR_USCHED_BSD4 KTR_ALL
218#endif
219
220KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0,
221 "USCHED_BSD4(bsd4_acquire_curproc in user_reseched_wanted "
222 "after release: pid %d, cpuid %d, curr_cpuid %d)",
223 pid_t pid, int cpuid, int curr);
224KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0,
225 "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, "
226 "curr_cpuid %d)",
227 pid_t pid, int cpuid, int curr);
228KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0,
229 "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after "
230 "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)",
231 pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid);
232KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0,
233 "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, "
234 "cpuid %d, curr_cpuid %d)",
235 pid_t pid, int cpuid, int curr);
236
237KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0,
238 "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
239 "cpuid %d, curr_cpuid %d)",
240 pid_t pid, int cpuid, int curr);
241
242KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0,
243 "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
244 "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)",
245 pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr);
246
247#ifdef SMP
248KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0,
249 "USCHED_BSD4(batchy_looser_pri_test false: pid %d, "
250 "cpuid %d, verify_mask %lu)",
251 pid_t pid, int cpuid, cpumask_t mask);
252KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0,
253 "USCHED_BSD4(batchy_looser_pri_test true: pid %d, "
254 "cpuid %d, verify_mask %lu)",
255 pid_t pid, int cpuid, cpumask_t mask);
256
257KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0,
258 "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, "
259 "mask %lu, curr_cpuid %d)",
260 pid_t pid, int cpuid, cpumask_t mask, int curr);
261KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0,
262 "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, "
263 "cpuid %d, mask %lu, curr_cpuid %d)",
264 pid_t pid, int cpuid, cpumask_t mask, int curr);
265KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0,
266 "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, "
267 "cpuid %d, mask %lu, curr_cpuid %d)",
268 pid_t pid, int cpuid, cpumask_t mask, int curr);
269KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0,
270 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, "
271 "mask %lu, found_cpuid %d, curr_cpuid %d)",
272 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
273KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0,
274 "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, "
275 "try_cpuid %d, curr_cpuid %d)",
276 pid_t pid, int cpuid, int try_cpuid, int curr);
277KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0,
278 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, "
279 "mask %lu, found_cpuid %d, curr_cpuid %d)",
280 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
281#endif
282
283KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc, 0,
284 "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
285 pid_t pid, int old_cpuid, int curr);
286#ifdef SMP
287KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0,
288 "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)",
289 pid_t pid, int old_cpuid, int curr);
290KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0,
291 "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, "
292 "sibling_mask %lu, curr_cpumask %lu)",
293 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
294KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0,
295 "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, "
296 "sibling_mask %lu, curr_cpumask: %lu)",
297 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
298
299KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0,
300 "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)",
301 int id, pid_t pid, int cpuid);
302KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0,
303 "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)",
304 int id, pid_t pid, int cpuid);
305KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0,
306 "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)",
307 int id, cpumask_t tmpmask);
308#endif
38b25931
MD
309
310/*
311 * Initialize the run queues at boot time.
312 */
313static void
314rqinit(void *dummy)
315{
316 int i;
317
52eedfb5 318 spin_init(&bsd4_spin);
38b25931 319 for (i = 0; i < NQS; i++) {
52eedfb5
MD
320 TAILQ_INIT(&bsd4_queues[i]);
321 TAILQ_INIT(&bsd4_rtqueues[i]);
322 TAILQ_INIT(&bsd4_idqueues[i]);
38b25931 323 }
da23a592 324 atomic_clear_cpumask(&bsd4_curprocmask, 1);
38b25931 325}
ba39e2e0 326SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL)
38b25931
MD
327
328/*
52eedfb5 329 * BSD4_ACQUIRE_CURPROC
38b25931 330 *
52eedfb5
MD
331 * This function is called when the kernel intends to return to userland.
332 * It is responsible for making the thread the current designated userland
333 * thread for this cpu, blocking if necessary.
334 *
b9eb1c19
MD
335 * The kernel has already depressed our LWKT priority so we must not switch
336 * until we have either assigned or disposed of the thread.
52eedfb5
MD
337 *
338 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
339 * TO ANOTHER CPU! Because most of the kernel assumes that no migration will
340 * occur, this function is called only under very controlled circumstances.
341 *
52eedfb5 342 * MPSAFE
38b25931 343 */
52eedfb5
MD
344static void
345bsd4_acquire_curproc(struct lwp *lp)
38b25931 346{
b9eb1c19
MD
347 globaldata_t gd;
348 bsd4_pcpu_t dd;
4643740a 349 thread_t td;
85946b6c 350#if 0
b9eb1c19 351 struct lwp *olp;
85946b6c 352#endif
38b25931 353
4643740a
MD
354 /*
355 * Make sure we aren't sitting on a tsleep queue.
356 */
357 td = lp->lwp_thread;
358 crit_enter_quick(td);
359 if (td->td_flags & TDF_TSLEEPQ)
360 tsleep_remove(td);
b9eb1c19 361 bsd4_recalculate_estcpu(lp);
38b25931 362
38b25931 363 /*
b9eb1c19
MD
364 * If a reschedule was requested give another thread the
365 * driver's seat.
38b25931 366 */
b9eb1c19
MD
367 if (user_resched_wanted()) {
368 clear_user_resched();
369 bsd4_release_curproc(lp);
d6d39bc7
MC
370
371 KTR_COND_LOG(usched_bsd4_acquire_curproc_urw,
372 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
373 lp->lwp_proc->p_pid,
374 lp->lwp_thread->td_gd->gd_cpuid,
375 mycpu->gd_cpuid);
38b25931 376 }
38b25931 377
52eedfb5 378 /*
b9eb1c19 379 * Loop until we are the current user thread
52eedfb5 380 */
85946b6c
MD
381 gd = mycpu;
382 dd = &bsd4_pcpu[gd->gd_cpuid];
383
d6d39bc7
MC
384 KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop,
385 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
386 lp->lwp_proc->p_pid,
387 lp->lwp_thread->td_gd->gd_cpuid,
388 gd->gd_cpuid);
389
52eedfb5 390 do {
b9eb1c19 391 /*
85946b6c 392 * Process any pending events and higher priority threads.
b9eb1c19 393 */
85946b6c 394 lwkt_yield();
b9eb1c19
MD
395
396 /*
397 * Become the currently scheduled user thread for this cpu
398 * if we can do so trivially.
399 *
400 * We can steal another thread's current thread designation
401 * on this cpu since if we are running that other thread
402 * must not be, so we can safely deschedule it.
403 */
404 if (dd->uschedcp == lp) {
eb501f47
MD
405 /*
406 * We are already the current lwp (hot path).
407 */
b9eb1c19
MD
408 dd->upri = lp->lwp_priority;
409 } else if (dd->uschedcp == NULL) {
eb501f47
MD
410 /*
411 * We can trivially become the current lwp.
412 */
da23a592 413 atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
b9eb1c19
MD
414 dd->uschedcp = lp;
415 dd->upri = lp->lwp_priority;
416 } else if (dd->upri > lp->lwp_priority) {
eb501f47 417 /*
85946b6c
MD
418 * We can steal the current cpu's lwp designation
419 * away simply by replacing it. The other thread
420 * will stall when it tries to return to userland.
eb501f47 421 */
b9eb1c19
MD
422 dd->uschedcp = lp;
423 dd->upri = lp->lwp_priority;
85946b6c 424 /*
b9eb1c19
MD
425 lwkt_deschedule(olp->lwp_thread);
426 bsd4_setrunqueue(olp);
85946b6c 427 */
b9eb1c19 428 } else {
eb501f47
MD
429 /*
430 * We cannot become the current lwp, place the lp
431 * on the bsd4 run-queue and deschedule ourselves.
85946b6c
MD
432 *
433 * When we are reactivated we will have another
434 * chance.
eb501f47 435 */
b9eb1c19 436 lwkt_deschedule(lp->lwp_thread);
d6d39bc7 437
b9eb1c19 438 bsd4_setrunqueue(lp);
d6d39bc7
MC
439
440 KTR_COND_LOG(usched_bsd4_acquire_curproc_not,
441 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
442 lp->lwp_proc->p_pid,
443 lp->lwp_thread->td_gd->gd_cpuid,
444 dd->uschedcp->lwp_proc->p_pid,
445 gd->gd_cpuid);
446
447
b9eb1c19 448 lwkt_switch();
d6d39bc7 449
85946b6c
MD
450 /*
451 * Reload after a switch or setrunqueue/switch possibly
452 * moved us to another cpu.
453 */
454 gd = mycpu;
455 dd = &bsd4_pcpu[gd->gd_cpuid];
d6d39bc7
MC
456
457 KTR_COND_LOG(usched_bsd4_acquire_curproc_switch,
458 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
459 lp->lwp_proc->p_pid,
460 lp->lwp_thread->td_gd->gd_cpuid,
461 gd->gd_cpuid);
b9eb1c19 462 }
52eedfb5 463 } while (dd->uschedcp != lp);
b9eb1c19 464
4643740a
MD
465 crit_exit_quick(td);
466 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
52eedfb5
MD
467}
468
469/*
470 * BSD4_RELEASE_CURPROC
471 *
472 * This routine detaches the current thread from the userland scheduler,
b9eb1c19
MD
473 * usually because the thread needs to run or block in the kernel (at
474 * kernel priority) for a while.
52eedfb5
MD
475 *
476 * This routine is also responsible for selecting a new thread to
477 * make the current thread.
478 *
479 * NOTE: This implementation differs from the dummy example in that
480 * bsd4_select_curproc() is able to select the current process, whereas
481 * dummy_select_curproc() is not able to select the current process.
482 * This means we have to NULL out uschedcp.
483 *
484 * Additionally, note that we may already be on a run queue if releasing
485 * via the lwkt_switch() in bsd4_setrunqueue().
486 *
52eedfb5
MD
487 * MPSAFE
488 */
d6d39bc7 489
52eedfb5
MD
490static void
491bsd4_release_curproc(struct lwp *lp)
492{
493 globaldata_t gd = mycpu;
494 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
495
496 if (dd->uschedcp == lp) {
b9eb1c19 497 crit_enter();
4643740a 498 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
d6d39bc7
MC
499
500 KTR_COND_LOG(usched_bsd4_release_curproc,
501 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
502 lp->lwp_proc->p_pid,
503 lp->lwp_thread->td_gd->gd_cpuid,
504 gd->gd_cpuid);
505
52eedfb5 506 dd->uschedcp = NULL; /* don't let lp be selected */
b9eb1c19 507 dd->upri = PRIBASE_NULL;
da23a592 508 atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
d6d39bc7 509 dd->old_uschedcp = lp; /* used only for KTR debug prints */
52eedfb5 510 bsd4_select_curproc(gd);
b9eb1c19 511 crit_exit();
52eedfb5 512 }
38b25931
MD
513}
514
38b25931 515/*
52eedfb5
MD
516 * BSD4_SELECT_CURPROC
517 *
b9eb1c19
MD
518 * Select a new current process for this cpu and clear any pending user
519 * reschedule request. The cpu currently has no current process.
52eedfb5
MD
520 *
521 * This routine is also responsible for equal-priority round-robining,
522 * typically triggered from bsd4_schedulerclock(). In our dummy example
523 * all the 'user' threads are LWKT scheduled all at once and we just
524 * call lwkt_switch().
525 *
b9eb1c19
MD
526 * The calling process is not on the queue and cannot be selected.
527 *
52eedfb5 528 * MPSAFE
38b25931
MD
529 */
530static
531void
52eedfb5 532bsd4_select_curproc(globaldata_t gd)
38b25931 533{
52eedfb5
MD
534 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
535 struct lwp *nlp;
536 int cpuid = gd->gd_cpuid;
38b25931 537
52eedfb5 538 crit_enter_gd(gd);
52eedfb5 539
287a8577 540 spin_lock(&bsd4_spin);
d6d39bc7
MC
541#ifdef SMP
542 if(usched_bsd4_cache_coherent)
543 nlp = chooseproc_locked_cache_coherent(dd->uschedcp);
544 else
545#endif
546 nlp = chooseproc_locked(dd->uschedcp);
547
548 if (nlp) {
549
550 KTR_COND_LOG(usched_bsd4_select_curproc,
551 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
552 nlp->lwp_proc->p_pid,
553 nlp->lwp_thread->td_gd->gd_cpuid,
554 dd->old_uschedcp->lwp_proc->p_pid,
555 dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid,
556 gd->gd_cpuid);
557
da23a592 558 atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid));
52eedfb5
MD
559 dd->upri = nlp->lwp_priority;
560 dd->uschedcp = nlp;
287a8577 561 spin_unlock(&bsd4_spin);
52eedfb5
MD
562#ifdef SMP
563 lwkt_acquire(nlp->lwp_thread);
38b25931 564#endif
52eedfb5 565 lwkt_schedule(nlp->lwp_thread);
eb501f47
MD
566 } else {
567 spin_unlock(&bsd4_spin);
568 }
d6d39bc7 569
eb501f47 570#if 0
da23a592
MD
571 } else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) {
572 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
287a8577 573 spin_unlock(&bsd4_spin);
52eedfb5
MD
574 lwkt_schedule(&dd->helper_thread);
575 } else {
287a8577 576 spin_unlock(&bsd4_spin);
52eedfb5 577 }
eb501f47 578#endif
52eedfb5
MD
579 crit_exit_gd(gd);
580}
d6d39bc7
MC
581#ifdef SMP
582
583/*
584 * batchy_looser_pri_test() - determine if a process is batchy or not
585 * relative to the other processes running in the system
586 */
587static int
588batchy_looser_pri_test(struct lwp* lp)
589{
590 cpumask_t mask;
591 bsd4_pcpu_t other_dd;
592 int cpu;
593
594 /* Current running processes */
595 mask = bsd4_curprocmask & smp_active_mask
596 & usched_global_cpumask;
597
598 while(mask) {
599 cpu = BSFCPUMASK(mask);
600 other_dd = &bsd4_pcpu[cpu];
601 if (other_dd->upri - lp->lwp_priority > usched_bsd4_upri_affinity * PPQ) {
602
603 KTR_COND_LOG(usched_batchy_test_false,
604 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
605 lp->lwp_proc->p_pid,
606 lp->lwp_thread->td_gd->gd_cpuid,
607 mask);
608
609 return 0;
610 }
611 mask &= ~CPUMASK(cpu);
612 }
613
614 KTR_COND_LOG(usched_batchy_test_true,
615 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
616 lp->lwp_proc->p_pid,
617 lp->lwp_thread->td_gd->gd_cpuid,
618 mask);
619
620 return 1;
621}
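/*
 * Illustrative note (assuming the defaults above: usched_bsd4_upri_affinity
 * = 16 and PPQ = 4): the test above reports a process as batchy (returns 1)
 * unless its priority is more than 16 * 4 = 64 priority units (16 queues)
 * better than that of some user thread currently running on a cpu.
 */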
38b25931 622
d6d39bc7 623#endif
38b25931 624/*
d6d39bc7 625 *
52eedfb5
MD
626 * BSD4_SETRUNQUEUE
627 *
b9eb1c19
MD
628 * Place the specified lwp on the user scheduler's run queue. This routine
629 * must be called with the thread descheduled. The lwp must be runnable.
38b25931 630 *
b9eb1c19 631 * The thread may be the current thread as a special case.
52eedfb5
MD
632 *
633 * MPSAFE
38b25931
MD
634 */
635static void
553ea3c8 636bsd4_setrunqueue(struct lwp *lp)
38b25931 637{
52eedfb5
MD
638 globaldata_t gd;
639 bsd4_pcpu_t dd;
38b25931 640#ifdef SMP
b9eb1c19 641 int cpuid;
38b25931 642 cpumask_t mask;
52eedfb5 643 cpumask_t tmpmask;
38b25931
MD
644#endif
645
52eedfb5
MD
646 /*
647 * First validate the process state relative to the current cpu.
648 * We don't need the spinlock for this, just a critical section.
649 * We are in control of the process.
650 */
38b25931 651 crit_enter();
164b8401 652 KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
4643740a 653 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
164b8401 654 ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
4643740a 655 lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
553ea3c8 656 KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
38b25931
MD
657
658 /*
52eedfb5
MD
659 * Note: gd and dd are relative to the target thread's last cpu,
660 * NOT our current cpu.
38b25931 661 */
553ea3c8 662 gd = lp->lwp_thread->td_gd;
52eedfb5 663 dd = &bsd4_pcpu[gd->gd_cpuid];
38b25931
MD
664
665 /*
52eedfb5
MD
666 * This process is not supposed to be scheduled anywhere or assigned
667 * as the current process anywhere. Assert the condition.
38b25931 668 */
52eedfb5 669 KKASSERT(dd->uschedcp != lp);
38b25931 670
b9eb1c19 671#ifndef SMP
38b25931 672 /*
b9eb1c19
MD
673 * If we are not SMP we do not have a scheduler helper to kick
674 * and must directly activate the process if none are scheduled.
38b25931 675 *
b9eb1c19
MD
676 * This is really only an issue when bootstrapping init since
677 * the caller in all other cases will be a user process, and
678 * even if released (dd->uschedcp == NULL), that process will
679 * kickstart the scheduler when it returns to user mode from
680 * the kernel.
38b25931 681 */
b9eb1c19 682 if (dd->uschedcp == NULL) {
da23a592 683 atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
52eedfb5
MD
684 dd->uschedcp = lp;
685 dd->upri = lp->lwp_priority;
553ea3c8 686 lwkt_schedule(lp->lwp_thread);
38b25931 687 crit_exit();
38b25931
MD
688 return;
689 }
b9eb1c19 690#endif
38b25931 691
38b25931
MD
692#ifdef SMP
693 /*
52eedfb5
MD
694 * XXX fixme. Could be part of a remrunqueue/setrunqueue
695 * operation when the priority is recalculated, so TDF_MIGRATING
696 * may already be set.
38b25931 697 */
52eedfb5
MD
698 if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
699 lwkt_giveaway(lp->lwp_thread);
700#endif
50017724
MD
701
702 /*
703 * We lose control of lp the moment we release the spinlock after
704 * having placed lp on the queue. i.e. another cpu could pick it
705 * up and it could exit, or its priority could be further adjusted,
706 * or something like that.
707 */
287a8577 708 spin_lock(&bsd4_spin);
52eedfb5 709 bsd4_setrunqueue_locked(lp);
d6d39bc7 710 lp->lwp_setrunqueue_ticks = sched_ticks;
38b25931 711
b9eb1c19 712#ifdef SMP
38b25931 713 /*
b9eb1c19
MD
714 * Kick the scheduler helper on one of the other cpu's
715 * and request a reschedule if appropriate.
eb501f47
MD
716 *
717 * NOTE: We check all cpus whos rdyprocmask is set. First we
718 * look for cpus without designated lps, then we look for
719 * cpus with designated lps with a worse priority than our
720 * process.
38b25931 721 */
b9eb1c19 722 ++bsd4_scancpu;
38b25931 723
d6d39bc7
MC
724 if(usched_bsd4_smt) {
725
726 /*
727 * SMT heuristic - Try to schedule on a free physical core. If no free
728 * physical core is found, choose the core whose sibling is running the most interactive thread.
729 */
730
731 int best_cpuid = -1;
732 int min_prio = MAXPRI * MAXPRI;
733 int sibling;
734
735 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
736 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
737 smp_active_mask & usched_global_cpumask;
738
739 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt,
740 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
741 lp->lwp_proc->p_pid,
742 lp->lwp_thread->td_gd->gd_cpuid,
743 mask,
744 mycpu->gd_cpuid);
745
746 while (mask) {
747 tmpmask = ~(CPUMASK(cpuid) - 1);
748 if (mask & tmpmask)
749 cpuid = BSFCPUMASK(mask & tmpmask);
750 else
751 cpuid = BSFCPUMASK(mask);
752 gd = globaldata_find(cpuid);
753 dd = &bsd4_pcpu[cpuid];
754
755 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) {
756 if (dd->cpunode->parent_node->members & ~dd->cpunode->members & mask) {
757
758 KTR_COND_LOG(usched_bsd4_setrunqueue_found,
759 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
760 lp->lwp_proc->p_pid,
761 lp->lwp_thread->td_gd->gd_cpuid,
762 mask,
763 cpuid,
764 mycpu->gd_cpuid);
765
766 goto found;
767 } else {
768 sibling = BSFCPUMASK(dd->cpunode->parent_node->members &
769 ~dd->cpunode->members);
770 if (min_prio > bsd4_pcpu[sibling].upri) {
771 min_prio = bsd4_pcpu[sibling].upri;
772 best_cpuid = cpuid;
773 }
774 }
775 }
776 mask &= ~CPUMASK(cpuid);
777 }
778
779 if (best_cpuid != -1) {
780 cpuid = best_cpuid;
781 gd = globaldata_find(cpuid);
782 dd = &bsd4_pcpu[cpuid];
783
784 KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid,
785 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
786 lp->lwp_proc->p_pid,
787 lp->lwp_thread->td_gd->gd_cpuid,
788 mask,
789 cpuid,
790 mycpu->gd_cpuid);
b9eb1c19 791
eb501f47 792 goto found;
d6d39bc7
MC
793 }
794 } else {
795 /* Fallback to the original heuristic */
796 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
797 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
798 smp_active_mask & usched_global_cpumask;
799
800 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt,
801 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
802 lp->lwp_proc->p_pid,
803 lp->lwp_thread->td_gd->gd_cpuid,
804 mask,
805 mycpu->gd_cpuid);
806
807 while (mask) {
808 tmpmask = ~(CPUMASK(cpuid) - 1);
809 if (mask & tmpmask)
810 cpuid = BSFCPUMASK(mask & tmpmask);
811 else
812 cpuid = BSFCPUMASK(mask);
813 gd = globaldata_find(cpuid);
814 dd = &bsd4_pcpu[cpuid];
815
816 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) {
817
818 KTR_COND_LOG(usched_bsd4_setrunqueue_found,
819 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
820 lp->lwp_proc->p_pid,
821 lp->lwp_thread->td_gd->gd_cpuid,
822 mask,
823 cpuid,
824 mycpu->gd_cpuid);
825
826 goto found;
827 }
828 mask &= ~CPUMASK(cpuid);
829 }
eb501f47
MD
830 }
831
832 /*
833 * Then cpus which might have a currently running lp
834 */
835 mask = bsd4_curprocmask & bsd4_rdyprocmask &
916e604f 836 lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;
eb501f47 837
d6d39bc7
MC
838 KTR_COND_LOG(usched_bsd4_setrunqueue_rc,
839 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
840 lp->lwp_proc->p_pid,
841 lp->lwp_thread->td_gd->gd_cpuid,
842 mask,
843 mycpu->gd_cpuid);
844
eb501f47
MD
845 while (mask) {
846 tmpmask = ~(CPUMASK(cpuid) - 1);
847 if (mask & tmpmask)
848 cpuid = BSFCPUMASK(mask & tmpmask);
849 else
850 cpuid = BSFCPUMASK(mask);
851 gd = globaldata_find(cpuid);
852 dd = &bsd4_pcpu[cpuid];
853
d6d39bc7
MC
854 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
855
856 KTR_COND_LOG(usched_bsd4_setrunqueue_found,
857 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
858 lp->lwp_proc->p_pid,
859 lp->lwp_thread->td_gd->gd_cpuid,
860 mask,
861 cpuid,
862 mycpu->gd_cpuid);
863
eb501f47 864 goto found;
d6d39bc7 865 }
da23a592 866 mask &= ~CPUMASK(cpuid);
b9eb1c19 867 }
eb501f47
MD
868
869 /*
870 * If we cannot find a suitable cpu we reload from bsd4_scancpu
871 * and round-robin. Other cpus will pick up as they release their
872 * current lwps or become ready.
873 *
916e604f
MD
874 * Avoid a degenerate system lockup case if usched_global_cpumask
875 * is set to 0 or otherwise does not cover lwp_cpumask.
876 *
eb501f47
MD
877 * We only kick the target helper thread in this case; we do not
878 * set the user resched flag.
879 */
880 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
916e604f
MD
881 if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) {
882 cpuid = 0;
883 }
eb501f47
MD
884 gd = globaldata_find(cpuid);
885 dd = &bsd4_pcpu[cpuid];
d6d39bc7
MC
886
887 KTR_COND_LOG(usched_bsd4_setrunqueue_not_found,
888 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
889 lp->lwp_proc->p_pid,
890 lp->lwp_thread->td_gd->gd_cpuid,
891 cpuid,
892 mycpu->gd_cpuid);
893
eb501f47
MD
894found:
895 if (gd == mycpu) {
896 spin_unlock(&bsd4_spin);
58bb3381
MD
897 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
898 if (dd->uschedcp == NULL) {
d6d39bc7 899 wakeup(&dd->helper_thread);
58bb3381
MD
900 } else {
901 need_user_resched();
902 }
903 }
eb501f47
MD
904 } else {
905 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
906 spin_unlock(&bsd4_spin);
907 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
908 lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
909 else
d6d39bc7 910 wakeup(&dd->helper_thread);
eb501f47 911 }
b9eb1c19
MD
912#else
913 /*
914 * Request a reschedule if appropriate.
915 */
287a8577 916 spin_unlock(&bsd4_spin);
b9eb1c19
MD
917 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
918 need_user_resched();
38b25931
MD
919 }
920#endif
921 crit_exit();
922}
923
924/*
38b25931 925 * This routine is called from a systimer IPI. It MUST be MP-safe and
52eedfb5
MD
926 * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on
927 * each cpu.
928 *
270ac911 929 * MPSAFE
38b25931
MD
930 */
931static
932void
553ea3c8 933bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
38b25931
MD
934{
935 globaldata_t gd = mycpu;
52eedfb5 936 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
38b25931
MD
937
938 /*
939 * Do we need to round-robin? We round-robin 10 times a second.
940 * This should only occur for cpu-bound batch processes.
941 */
52eedfb5
MD
942 if (++dd->rrcount >= usched_bsd4_rrinterval) {
943 dd->rrcount = 0;
38b25931
MD
944 need_user_resched();
945 }
946
947 /*
52cac9fb 948 * Adjust estcpu upward using a real time equivalent calculation.
38b25931 949 */
52cac9fb 950 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);
50017724
MD
951
952 /*
77912481
MD
953 * Spinlocks also hold a critical section so there should not be
954 * any active.
50017724 955 */
77912481
MD
956 KKASSERT(gd->gd_spinlocks_wr == 0);
957
958 bsd4_resetpriority(lp);
959#if 0
960 /*
961 * if we can't call bsd4_resetpriority for some reason we must call
962 * need user_resched().
963 */
964 need_user_resched();
965#endif
38b25931
MD
966}
967
968/*
52eedfb5 969 * Called from acquire and from kern_synch's one-second timer (one of the
d6d39bc7 970 * callout helper threads) with a critical section held.
38b25931 971 *
52eedfb5
MD
972 * Decay p_estcpu based on the number of ticks we haven't been running
973 * and our p_nice. As the load increases each process observes a larger
974 * number of idle ticks (because other processes are running in them).
975 * This observation leads to a larger correction which tends to make the
976 * system more 'batchy'.
38b25931 977 *
52eedfb5
MD
978 * Note that no recalculation occurs for a process which sleeps and wakes
979 * up in the same tick. That is, a system doing thousands of context
980 * switches per second will still only do serious estcpu calculations
981 * ESTCPUFREQ times per second.
38b25931 982 *
52eedfb5 983 * MPSAFE
38b25931
MD
984 */
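/*
 * Worked example (illustrative, assuming the default usched_bsd4_decay of 8):
 * the code below first computes an instantaneous estcpu scaled to ESTCPUMAX,
 *
 *	estcpu = (lwp_cpticks * ESTCPUMAX) * (bsd4_runqcount + ncpus) /
 *		 (ncpus * ttlticks);
 *
 * and then folds it into the running estimate with
 *
 *	lwp_estcpu = (lwp_estcpu * 8 + estcpu) / 9;
 *
 * so a thread that suddenly becomes cpu-bound closes roughly 1/9 of the gap
 * to the measured value on each recalculation, while a long sleep is handled
 * by the coarse ">> 1" correction instead.
 */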
985static
d6d39bc7 986void
52eedfb5 987bsd4_recalculate_estcpu(struct lwp *lp)
38b25931 988{
52eedfb5
MD
989 globaldata_t gd = mycpu;
990 sysclock_t cpbase;
52cac9fb
MD
991 sysclock_t ttlticks;
992 int estcpu;
993 int decay_factor;
38b25931
MD
994
995 /*
52eedfb5
MD
996 * We have to subtract periodic to get the last schedclock
997 * timeout time, otherwise we would get the upcoming timeout.
998 * Keep in mind that a process can migrate between cpus and
999 * while the scheduler clock should be very close, boundary
1000 * conditions could lead to a small negative delta.
38b25931 1001 */
52eedfb5 1002 cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
38b25931 1003
52eedfb5
MD
1004 if (lp->lwp_slptime > 1) {
1005 /*
1006 * Too much time has passed, do a coarse correction.
1007 */
1008 lp->lwp_estcpu = lp->lwp_estcpu >> 1;
1009 bsd4_resetpriority(lp);
1010 lp->lwp_cpbase = cpbase;
1011 lp->lwp_cpticks = 0;
52cac9fb
MD
1012 lp->lwp_batch -= ESTCPUFREQ;
1013 if (lp->lwp_batch < 0)
1014 lp->lwp_batch = 0;
52eedfb5
MD
1015 } else if (lp->lwp_cpbase != cpbase) {
1016 /*
1017 * Adjust estcpu if we are in a different tick. Don't waste
d6d39bc7
MC
1018 * time if we are in the same tick.
1019 *
52eedfb5 1020 * First calculate the number of ticks in the measurement
52cac9fb 1021 * interval. The ttlticks calculation can wind up 0 due to
52eedfb5
MD
1022 * a bug in the handling of lwp_slptime (as yet not found),
1023 * so make sure we do not get a divide by 0 panic.
1024 */
52cac9fb
MD
1025 ttlticks = (cpbase - lp->lwp_cpbase) /
1026 gd->gd_schedclock.periodic;
1027 if (ttlticks < 0) {
1028 ttlticks = 0;
1029 lp->lwp_cpbase = cpbase;
52eedfb5 1030 }
52cac9fb
MD
1031 if (ttlticks == 0)
1032 return;
1033 updatepcpu(lp, lp->lwp_cpticks, ttlticks);
38b25931 1034
52eedfb5 1035 /*
52cac9fb
MD
1036 * Calculate the percentage of one cpu used factoring in ncpus
1037 * and the load and adjust estcpu. Handle degenerate cases
1038 * by adding 1 to bsd4_runqcount.
1039 *
1040 * estcpu is scaled by ESTCPUMAX.
1041 *
1042 * bsd4_runqcount is the excess number of user processes
1043 * that cannot be immediately scheduled to cpus. We want
1044 * to count these as running to avoid range compression
1045 * in the base calculation (which is the actual percentage
1046 * of one cpu used).
52eedfb5 1047 */
52cac9fb
MD
1048 estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
1049 (bsd4_runqcount + ncpus) / (ncpus * ttlticks);
38b25931 1050
52eedfb5 1051 /*
52cac9fb
MD
1052 * If estcpu is > 50% we become more batch-like
1053 * If estcpu is <= 50% we become less batch-like
5c559233 1054 *
52cac9fb 1055 * It takes 30 cpu seconds to traverse the entire range.
52eedfb5 1056 */
52cac9fb
MD
1057 if (estcpu > ESTCPUMAX / 2) {
1058 lp->lwp_batch += ttlticks;
1059 if (lp->lwp_batch > BATCHMAX)
1060 lp->lwp_batch = BATCHMAX;
1061 } else {
1062 lp->lwp_batch -= ttlticks;
1063 if (lp->lwp_batch < 0)
1064 lp->lwp_batch = 0;
5c559233 1065 }
344ad853 1066
d6d39bc7 1067 if (usched_bsd4_debug == lp->lwp_proc->p_pid) {
52cac9fb
MD
1068 kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
1069 lp->lwp_proc->p_pid, lp,
1070 estcpu, lp->lwp_estcpu,
1071 lp->lwp_batch,
1072 lp->lwp_cpticks, ttlticks);
5c559233 1073 }
52cac9fb
MD
1074
1075 /*
1076 * Adjust lp->lwp_esetcpu. The decay factor determines how
1077 * quickly lwp_estcpu collapses to its realtime calculation.
1078 * A slower collapse gives us a more accurate number but
1079 * can cause a cpu hog to eat too much cpu before the
1080 * scheduler decides to downgrade it.
1081 *
1082 * NOTE: p_nice is accounted for in bsd4_resetpriority(),
1083 * and not here, but we must still ensure that a
1084 * cpu-bound nice -20 process does not completely
1085 * override a cpu-bound nice +20 process.
1086 *
1087 * NOTE: We must use ESTCPULIM() here to deal with any
1088 * overshoot.
1089 */
1090 decay_factor = usched_bsd4_decay;
1091 if (decay_factor < 1)
1092 decay_factor = 1;
1093 if (decay_factor > 1024)
1094 decay_factor = 1024;
1095
1096 lp->lwp_estcpu = ESTCPULIM(
1097 (lp->lwp_estcpu * decay_factor + estcpu) /
1098 (decay_factor + 1));
1099
d6d39bc7 1100 if (usched_bsd4_debug == lp->lwp_proc->p_pid)
52cac9fb 1101 kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
52eedfb5 1102 bsd4_resetpriority(lp);
52cac9fb 1103 lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
52eedfb5
MD
1104 lp->lwp_cpticks = 0;
1105 }
38b25931
MD
1106}
1107
1108/*
1109 * Compute the priority of a process when running in user mode.
1110 * Arrange to reschedule if the resulting priority is better
1111 * than that of the current process.
52eedfb5
MD
1112 *
1113 * This routine may be called with any process.
1114 *
1115 * This routine is called by fork1() for initial setup with the process
1116 * off the run queue, and also may be called normally with the process on or
1117 * off the run queue.
1118 *
1119 * MPSAFE
38b25931
MD
1120 */
1121static void
553ea3c8 1122bsd4_resetpriority(struct lwp *lp)
38b25931 1123{
52eedfb5 1124 bsd4_pcpu_t dd;
38b25931 1125 int newpriority;
52eedfb5
MD
1126 u_short newrqtype;
1127 int reschedcpu;
52cac9fb
MD
1128 int checkpri;
1129 int estcpu;
270ac911 1130
38b25931 1131 /*
52eedfb5 1132 * Calculate the new priority and queue type
38b25931 1133 */
52eedfb5 1134 crit_enter();
287a8577 1135 spin_lock(&bsd4_spin);
52eedfb5
MD
1136
1137 newrqtype = lp->lwp_rtprio.type;
1138
1139 switch(newrqtype) {
38b25931 1140 case RTP_PRIO_REALTIME:
f64250e0 1141 case RTP_PRIO_FIFO:
52eedfb5
MD
1142 newpriority = PRIBASE_REALTIME +
1143 (lp->lwp_rtprio.prio & PRIMASK);
1144 break;
38b25931 1145 case RTP_PRIO_NORMAL:
52cac9fb
MD
1146 /*
1147 * Detune estcpu based on batchiness. lwp_batch ranges
1148 * from 0 to BATCHMAX. Limit estcpu for the sake of
1149 * the priority calculation to between 50% and 100%.
1150 */
1151 estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
1152 (BATCHMAX * 2);
1153
1154 /*
1155 * p_nice piece Adds (0-40) * 2 0-80
1156 * estcpu Adds 16384 * 4 / 512 0-128
1157 */
52eedfb5 1158 newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
52cac9fb 1159 newpriority += estcpu * PPQ / ESTCPUPPQ;
52eedfb5
MD
1160 newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
1161 NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
1162 newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
38b25931
MD
1163 break;
1164 case RTP_PRIO_IDLE:
52eedfb5
MD
1165 newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
1166 break;
38b25931 1167 case RTP_PRIO_THREAD:
52eedfb5
MD
1168 newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
1169 break;
1170 default:
1171 panic("Bad RTP_PRIO %d", newrqtype);
1172 /* NOT REACHED */
38b25931
MD
1173 }
1174
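/*
 * Worked example (illustrative, assuming the usual PRIO_MIN/PRIO_MAX of
 * -20/20 and the constants defined above: PPQ = 4, NICEPPQ = 2,
 * ESTCPUPPQ = 512, ESTCPUMAX = 16384): for a nice 0 thread whose detuned
 * estcpu comes out at ESTCPUMAX / 2 = 8192, the RTP_PRIO_NORMAL case above
 * computes
 *
 *	(0 - (-20)) * 4 / 2		=  40	(nice piece)
 *	8192 * 4 / 512			=  64	(estcpu piece)
 *	(40 + 64) * 128 / (82 + 128)	=  63	(scaled to MAXPRI)
 *
 * giving lwp_priority = PRIBASE_NORMAL + 63 = 191 and run queue index
 * 63 / PPQ = 15.
 */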
1175 /*
52eedfb5
MD
1176 * The newpriority incorporates the queue type so do a simple masked
1177 * check to determine if the process has moved to another queue. If
1178 * it has, and it is currently on a run queue, then move it.
38b25931 1179 */
52eedfb5
MD
1180 if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
1181 lp->lwp_priority = newpriority;
4643740a 1182 if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
52eedfb5
MD
1183 bsd4_remrunqueue_locked(lp);
1184 lp->lwp_rqtype = newrqtype;
1185 lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1186 bsd4_setrunqueue_locked(lp);
52cac9fb 1187 checkpri = 1;
52eedfb5
MD
1188 } else {
1189 lp->lwp_rqtype = newrqtype;
1190 lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
52cac9fb 1191 checkpri = 0;
52eedfb5 1192 }
52cac9fb 1193 reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
38b25931 1194 } else {
52eedfb5
MD
1195 lp->lwp_priority = newpriority;
1196 reschedcpu = -1;
52cac9fb 1197 checkpri = 1;
52eedfb5 1198 }
52eedfb5
MD
1199
1200 /*
50017724
MD
1201 * Determine if we need to reschedule the target cpu. This only
1202 * occurs if the LWP is already on a scheduler queue, which means
1203 * that idle cpu notification has already occurred. At most we
1204 * need only issue a need_user_resched() on the appropriate cpu.
281b4fa8
YT
1205 *
1206 * The LWP may be owned by a CPU different from the current one,
1207 * in which case dd->uschedcp may be modified without an MP lock
1208 * or a spinlock held. The worst that happens is that the code
1209 * below causes a spurious need_user_resched() on the target CPU
1210 * and dd->pri to be wrong for a short period of time, both of
1211 * which are harmless.
52cac9fb
MD
1212 *
1213 * If checkpri is 0 we are adjusting the priority of the current
1214 * process, possibly higher (less desirable), so ignore the upri
1215 * check which will fail in that case.
52eedfb5
MD
1216 */
1217 if (reschedcpu >= 0) {
1218 dd = &bsd4_pcpu[reschedcpu];
eb501f47 1219 if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
52cac9fb
MD
1220 (checkpri == 0 ||
1221 (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
52eedfb5
MD
1222#ifdef SMP
1223 if (reschedcpu == mycpu->gd_cpuid) {
eb501f47 1224 spin_unlock(&bsd4_spin);
52eedfb5
MD
1225 need_user_resched();
1226 } else {
eb501f47
MD
1227 spin_unlock(&bsd4_spin);
1228 atomic_clear_cpumask(&bsd4_rdyprocmask,
1229 CPUMASK(reschedcpu));
52eedfb5
MD
1230 lwkt_send_ipiq(lp->lwp_thread->td_gd,
1231 need_user_resched_remote, NULL);
1232 }
1233#else
eb501f47 1234 spin_unlock(&bsd4_spin);
52eedfb5
MD
1235 need_user_resched();
1236#endif
eb501f47
MD
1237 } else {
1238 spin_unlock(&bsd4_spin);
52eedfb5 1239 }
eb501f47
MD
1240 } else {
1241 spin_unlock(&bsd4_spin);
38b25931
MD
1242 }
1243 crit_exit();
1244}
1245
3919ced0
MD
1246/*
1247 * MPSAFE
1248 */
c3149361
MD
1249static
1250void
d6d39bc7 1251bsd4_yield(struct lwp *lp)
c3149361
MD
1252{
1253#if 0
1254 /* FUTURE (or something similar) */
1255 switch(lp->lwp_rqtype) {
1256 case RTP_PRIO_NORMAL:
1257 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
c3149361
MD
1258 break;
1259 default:
1260 break;
1261 }
1262#endif
1263 need_user_resched();
1264}
1265
38b25931
MD
1266/*
1267 * Called from fork1() when a new child process is being created.
1268 *
1269 * Give the child process an initial estcpu that is more batchy than
1270 * its parent and dock the parent for the fork (but do not
1271 * reschedule the parent). This comprises the main part of our batch
1272 * detection heuristic for both parallel forking and sequential execs.
1273 *
553ea3c8 1274 * XXX lwp should be "spawning" instead of "forking"
270ac911
MD
1275 *
1276 * MPSAFE
38b25931
MD
1277 */
1278static void
553ea3c8 1279bsd4_forking(struct lwp *plp, struct lwp *lp)
38b25931 1280{
52cac9fb
MD
1281 /*
1282 * Put the child 4 queue slots (out of 32) higher than the parent
1283 * (less desirable than the parent).
1284 */
1285 lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);
1286
1287 /*
1288 * The batch status of children always starts out centerline
1289 * and will inch-up or inch-down as appropriate. It takes roughly
1290 * ~15 seconds of >50% cpu to hit the limit.
1291 */
1292 lp->lwp_batch = BATCHMAX / 2;
1293
1294 /*
1295 * Dock the parent a cost for the fork, protecting us from fork
1296 * bombs. If the parent is forking quickly make the child more
1297 * batchy.
1298 */
1299 plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
38b25931
MD
1300}
1301
1302/*
52cac9fb 1303 * Called when a parent waits for a child.
270ac911
MD
1304 *
1305 * MPSAFE
38b25931
MD
1306 */
1307static void
52cac9fb 1308bsd4_exiting(struct lwp *lp, struct proc *child_proc)
38b25931 1309{
38b25931
MD
1310}
1311
1312/*
52eedfb5
MD
1313 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
1314 * it selects a user process and returns it. If chklp is non-NULL and chklp
1315 * has a better or equal priority than the process that would otherwise be
1316 * chosen, NULL is returned.
38b25931 1317 *
52eedfb5
MD
1318 * Until we fix the RUNQ code the chklp test has to be strict or we may
1319 * bounce between processes trying to acquire the current process designation.
38b25931 1320 *
52eedfb5
MD
1321 * MPSAFE - must be called with bsd4_spin exclusive held. The spinlock is
1322 * left intact through the entire routine.
38b25931
MD
1323 */
1324static
52eedfb5
MD
1325struct lwp *
1326chooseproc_locked(struct lwp *chklp)
38b25931 1327{
52eedfb5
MD
1328 struct lwp *lp;
1329 struct rq *q;
a60ccb85 1330 u_int32_t *which, *which2;
52eedfb5 1331 u_int32_t pri;
a60ccb85
DX
1332 u_int32_t rtqbits;
1333 u_int32_t tsqbits;
1334 u_int32_t idqbits;
1335 cpumask_t cpumask;
38b25931 1336
a60ccb85
DX
1337 rtqbits = bsd4_rtqueuebits;
1338 tsqbits = bsd4_queuebits;
1339 idqbits = bsd4_idqueuebits;
1340 cpumask = mycpu->gd_cpumask;
1341
d6d39bc7 1342
a60ccb85
DX
1343#ifdef SMP
1344again:
1345#endif
1346 if (rtqbits) {
1347 pri = bsfl(rtqbits);
52eedfb5
MD
1348 q = &bsd4_rtqueues[pri];
1349 which = &bsd4_rtqueuebits;
a60ccb85
DX
1350 which2 = &rtqbits;
1351 } else if (tsqbits) {
1352 pri = bsfl(tsqbits);
52eedfb5
MD
1353 q = &bsd4_queues[pri];
1354 which = &bsd4_queuebits;
a60ccb85
DX
1355 which2 = &tsqbits;
1356 } else if (idqbits) {
1357 pri = bsfl(idqbits);
52eedfb5
MD
1358 q = &bsd4_idqueues[pri];
1359 which = &bsd4_idqueuebits;
a60ccb85 1360 which2 = &idqbits;
52eedfb5
MD
1361 } else {
1362 return NULL;
1363 }
1364 lp = TAILQ_FIRST(q);
1365 KASSERT(lp, ("chooseproc: no lwp on busy queue"));
270ac911 1366
a60ccb85
DX
1367#ifdef SMP
1368 while ((lp->lwp_cpumask & cpumask) == 0) {
1369 lp = TAILQ_NEXT(lp, lwp_procq);
1370 if (lp == NULL) {
1371 *which2 &= ~(1 << pri);
1372 goto again;
1373 }
1374 }
1375#endif
1376
38b25931 1377 /*
52eedfb5
MD
1378 * If the passed lwp <chklp> is reasonably close to the selected
1379 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
d6d39bc7 1380 *
52eedfb5
MD
1381 * Note that we must error on the side of <chklp> to avoid bouncing
1382 * between threads in the acquire code.
38b25931 1383 */
52eedfb5
MD
1384 if (chklp) {
1385 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1386 return(NULL);
1387 }
38b25931 1388
52eedfb5
MD
1389#ifdef SMP
1390 /*
1391 * If the chosen lwp does not reside on this cpu spend a few
1392 * cycles looking for a better candidate at the same priority level.
1393 * This is a fallback check, setrunqueue() tries to wakeup the
1394 * correct cpu and is our front-line affinity.
1395 */
1396 if (lp->lwp_thread->td_gd != mycpu &&
1397 (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
1398 ) {
1399 if (chklp->lwp_thread->td_gd == mycpu) {
1400 ++choose_affinity;
1401 lp = chklp;
38b25931 1402 }
52eedfb5
MD
1403 }
1404#endif
38b25931 1405
d6d39bc7
MC
1406 KTR_COND_LOG(usched_chooseproc,
1407 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1408 lp->lwp_proc->p_pid,
1409 lp->lwp_thread->td_gd->gd_cpuid,
1410 mycpu->gd_cpuid);
1411
52eedfb5
MD
1412 TAILQ_REMOVE(q, lp, lwp_procq);
1413 --bsd4_runqcount;
1414 if (TAILQ_EMPTY(q))
1415 *which &= ~(1 << pri);
4643740a
MD
1416 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1417 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
52eedfb5
MD
1418 return lp;
1419}
38b25931 1420
52eedfb5 1421#ifdef SMP
d6d39bc7
MC
1422/*
1423 * chooseproc() - with a cache coherence heuristic. Try to pull a process that
1424 * has its home on the current CPU. If the process doesn't have its home here
1425 * and is a batchy one (see batchy_looser_pri_test), we can wait for a
1426 * sched_tick; maybe its home cpu will become free and pull it in. Anyway,
1427 * we can't wait more than one tick. Once that tick has expired, we pull in
1428 * the process, no matter what.
1429 */
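/*
 * Illustrative note (assuming a hypothetical topology of 2 packages, 2 cores
 * per package and 2 SMT threads per core): the level computed below by
 * walking cpu_node_t->parent_node is 0 when the candidate's last cpu is the
 * current cpu, 1 when it is the SMT sibling, 2 when it is another core in
 * the same package, and so on.  The candidate with the smallest level is
 * remembered in min_level_lwp and used as a fallback if no better choice is
 * found within usched_bsd4_queue_checks checks.
 */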
1430static
1431struct lwp *
1432chooseproc_locked_cache_coherent(struct lwp *chklp)
1433{
1434 struct lwp *lp;
1435 struct rq *q;
1436 u_int32_t *which, *which2;
1437 u_int32_t pri;
1438 u_int32_t checks;
1439 u_int32_t rtqbits;
1440 u_int32_t tsqbits;
1441 u_int32_t idqbits;
1442 cpumask_t cpumask;
1443
1444 struct lwp * min_level_lwp = NULL;
1445 struct rq *min_q = NULL;
1446 cpumask_t siblings;
1447 cpu_node_t* cpunode = NULL;
1448 u_int32_t min_level = MAXCPU; /* number of levels < MAXCPU */
1449 u_int32_t *min_which = NULL;
1450 u_int32_t min_pri = 0;
1451 u_int32_t level = 0;
1452
1453 rtqbits = bsd4_rtqueuebits;
1454 tsqbits = bsd4_queuebits;
1455 idqbits = bsd4_idqueuebits;
1456 cpumask = mycpu->gd_cpumask;
1457
1458 /* Get the mask corresponding to the sysctl configured level */
1459 cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode;
1460 level = usched_bsd4_stick_to_level;
1461 while (level) {
1462 cpunode = cpunode->parent_node;
1463 level--;
1464 }
1465 /* The cpus which can elect a process */
1466 siblings = cpunode->members;
1467
1468again:
1469 if (rtqbits) {
1470 pri = bsfl(rtqbits);
1471 q = &bsd4_rtqueues[pri];
1472 which = &bsd4_rtqueuebits;
1473 which2 = &rtqbits;
1474 } else if (tsqbits) {
1475 pri = bsfl(tsqbits);
1476 q = &bsd4_queues[pri];
1477 which = &bsd4_queuebits;
1478 which2 = &tsqbits;
1479 } else if (idqbits) {
1480 pri = bsfl(idqbits);
1481 q = &bsd4_idqueues[pri];
1482 which = &bsd4_idqueuebits;
1483 which2 = &idqbits;
1484 } else {
1485 return NULL;
1486 }
1487 lp = TAILQ_FIRST(q);
1488 KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1489
1490 /* Limit the number of checks per queue to a configurable value to
1491 * minimize contention (we are in a locked region).
1492 */
1493 for (checks = 0; checks < usched_bsd4_queue_checks; checks++) {
1494
1495 if ((lp->lwp_cpumask & cpumask) == 0 ||
1496 ((siblings & lp->lwp_thread->td_gd->gd_cpumask) == 0 &&
1497 batchy_looser_pri_test(lp) &&
1498 (lp->lwp_setrunqueue_ticks == sched_ticks ||
1499 lp->lwp_setrunqueue_ticks == (int)(sched_ticks - 1)))) {
1500
1501 KTR_COND_LOG(usched_chooseproc_cc_not_good,
1502 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1503 lp->lwp_proc->p_pid,
1504 lp->lwp_thread->td_gd->gd_cpumask,
1505 siblings,
1506 cpumask);
1507
1508 cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode;
1509 level = 0;
1510 while (cpunode) {
1511 if (cpunode->members & cpumask) {
1512 break;
1513 }
1514 cpunode = cpunode->parent_node;
1515 level++;
1516 }
1517 if (level < min_level) {
1518 min_level_lwp = lp;
1519 min_level = level;
1520 min_q = q;
1521 min_which = which;
1522 min_pri = pri;
1523 }
1524
1525 lp = TAILQ_NEXT(lp, lwp_procq);
1526 if (lp == NULL) {
1527 *which2 &= ~(1 << pri);
1528 goto again;
1529 }
1530 } else {
1531 KTR_COND_LOG(usched_chooseproc_cc_elected,
1532 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1533 lp->lwp_proc->p_pid,
1534 lp->lwp_thread->td_gd->gd_cpumask,
1535 siblings,
1536 cpumask);
1537
1538 goto found;
1539 }
1540 }
1541 lp = min_level_lwp;
1542 q = min_q;
1543 which = min_which;
1544 pri = min_pri;
1545 KASSERT(lp, ("chooseproc: at least the first lp was good"));
1546
1547found:
1548
1549 /*
1550 * If the passed lwp <chklp> is reasonably close to the selected
1551 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1552 *
1553 * Note that we must error on the side of <chklp> to avoid bouncing
1554 * between threads in the acquire code.
1555 */
1556 if (chklp) {
1557 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1558 return(NULL);
1559 }
1560
1561 KTR_COND_LOG(usched_chooseproc_cc,
1562 lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1563 lp->lwp_proc->p_pid,
1564 lp->lwp_thread->td_gd->gd_cpuid,
1565 mycpu->gd_cpuid);
1566
1567 TAILQ_REMOVE(q, lp, lwp_procq);
1568 --bsd4_runqcount;
1569 if (TAILQ_EMPTY(q))
1570 *which &= ~(1 << pri);
1571 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1572 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1573 return lp;
1574}
1575
b9eb1c19 1576
52eedfb5
MD
1577static
1578void
1579need_user_resched_remote(void *dummy)
1580{
b9eb1c19
MD
1581 globaldata_t gd = mycpu;
1582 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
1583
eb501f47 1584 need_user_resched();
d6d39bc7 1585 wakeup(&dd->helper_thread);
52eedfb5 1586}
38b25931 1587
52eedfb5 1588#endif
38b25931 1589
52eedfb5
MD
1590/*
1591 * bsd4_remrunqueue_locked() removes a given process from the run queue
1592 * that it is on, clearing the queue busy bit if it becomes empty.
1593 *
1594 * Note that the user process scheduler is different from the LWKT scheduler.
1595 * The user process scheduler only manages user processes but it uses LWKT
1596 * underneath, and a user process operating in the kernel will often be
1597 * 'released' from our management.
1598 *
1599 * MPSAFE - bsd4_spin must be held exclusively on call
1600 */
1601static void
1602bsd4_remrunqueue_locked(struct lwp *lp)
1603{
1604 struct rq *q;
1605 u_int32_t *which;
1606 u_int8_t pri;
1607
4643740a
MD
1608 KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
1609 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
52eedfb5
MD
1610 --bsd4_runqcount;
1611 KKASSERT(bsd4_runqcount >= 0);
1612
1613 pri = lp->lwp_rqindex;
1614 switch(lp->lwp_rqtype) {
1615 case RTP_PRIO_NORMAL:
1616 q = &bsd4_queues[pri];
1617 which = &bsd4_queuebits;
1618 break;
1619 case RTP_PRIO_REALTIME:
1620 case RTP_PRIO_FIFO:
1621 q = &bsd4_rtqueues[pri];
1622 which = &bsd4_rtqueuebits;
1623 break;
1624 case RTP_PRIO_IDLE:
1625 q = &bsd4_idqueues[pri];
1626 which = &bsd4_idqueuebits;
1627 break;
1628 default:
1629 panic("remrunqueue: invalid rtprio type");
1630 /* NOT REACHED */
1631 }
1632 TAILQ_REMOVE(q, lp, lwp_procq);
1633 if (TAILQ_EMPTY(q)) {
1634 KASSERT((*which & (1 << pri)) != 0,
1635 ("remrunqueue: remove from empty queue"));
1636 *which &= ~(1 << pri);
38b25931
MD
1637 }
1638}
1639
52eedfb5
MD
1640/*
1641 * bsd4_setrunqueue_locked()
1642 *
1643 * Add a process whose rqtype and rqindex had previously been calculated
1644 * onto the appropriate run queue. Determine if the addition requires
1645 * a reschedule on a cpu and return the cpuid or -1.
1646 *
1647 * NOTE: Lower priorities are better priorities.
1648 *
1649 * MPSAFE - bsd4_spin must be held exclusively on call
1650 */
1651static void
1652bsd4_setrunqueue_locked(struct lwp *lp)
1653{
1654 struct rq *q;
1655 u_int32_t *which;
1656 int pri;
1657
4643740a
MD
1658 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
1659 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
52eedfb5
MD
1660 ++bsd4_runqcount;
1661
1662 pri = lp->lwp_rqindex;
1663
1664 switch(lp->lwp_rqtype) {
1665 case RTP_PRIO_NORMAL:
1666 q = &bsd4_queues[pri];
1667 which = &bsd4_queuebits;
1668 break;
1669 case RTP_PRIO_REALTIME:
1670 case RTP_PRIO_FIFO:
1671 q = &bsd4_rtqueues[pri];
1672 which = &bsd4_rtqueuebits;
1673 break;
1674 case RTP_PRIO_IDLE:
1675 q = &bsd4_idqueues[pri];
1676 which = &bsd4_idqueuebits;
1677 break;
1678 default:
1679 panic("remrunqueue: invalid rtprio type");
1680 /* NOT REACHED */
1681 }
1682
1683 /*
1684 * Add to the correct queue and set the appropriate bit. If the new
1685 * process is a better (lower) priority than what is currently running,
1686 * the caller requests a reschedule and selects the best cpu for it.
1687 *
1688 * Reschedules are always run on the LWP's original cpu.
1689 */
1690 TAILQ_INSERT_TAIL(q, lp, lwp_procq);
1691 *which |= 1 << pri;
1692}
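/*
 * Illustrative sketch (not part of the scheduler): how the bitmap
 * maintained above is consumed.  Each set bit in *which marks a
 * non-empty queue and the lowest set bit is the best priority, so a
 * chooser can find work with a single bit scan.  bsfl() is assumed
 * here to return the index of the lowest set bit.
 */
#if 0
static struct lwp *
example_pick_best(struct rq *queues, u_int32_t *which)
{
	struct lwp *lp;
	int pri;

	if (*which == 0)
		return (NULL);			/* every queue is empty */
	pri = bsfl(*which);			/* lowest set bit == best queue */
	lp = TAILQ_FIRST(&queues[pri]);
	TAILQ_REMOVE(&queues[pri], lp, lwp_procq);
	if (TAILQ_EMPTY(&queues[pri]))
		*which &= ~(1 << pri);		/* queue drained, clear its bit */
	return (lp);
}
#endif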
1693
38b25931
MD
1694#ifdef SMP
1695
1696/*
1697 * For SMP systems a user scheduler helper thread is created for each
1698 * cpu and is used to allow one cpu to wakeup another for the purposes of
c9e9fb21
MD
1699 * scheduling userland threads from setrunqueue().
1700 *
1701 * UP systems do not need the helper since there is only one cpu.
1702 *
1703 * We can't use the idle thread for this because we might block.
1704 * Additionally, doing things this way allows us to HLT idle cpus
1705 * on MP systems.
52eedfb5
MD
1706 *
1707 * MPSAFE
38b25931
MD
1708 */
1709static void
1710sched_thread(void *dummy)
1711{
52eedfb5
MD
1712 globaldata_t gd;
1713 bsd4_pcpu_t dd;
85946b6c 1714 bsd4_pcpu_t tmpdd;
52eedfb5 1715 struct lwp *nlp;
eb501f47 1716 cpumask_t mask;
52eedfb5 1717 int cpuid;
eb501f47 1718#ifdef SMP
418f19aa 1719 cpumask_t tmpmask;
52eedfb5 1720 int tmpid;
418f19aa 1721#endif
52eedfb5
MD
1722
1723 gd = mycpu;
1724 cpuid = gd->gd_cpuid; /* doesn't change */
eb501f47 1725 mask = gd->gd_cpumask; /* doesn't change */
52eedfb5
MD
1726 dd = &bsd4_pcpu[cpuid];
1727
1728 /*
c9e9fb21
MD
1729 * Since we are woken up only when no user processes are scheduled
1730 * on a cpu, we can run at an ultra low priority.
52eedfb5 1731 */
50017724 1732 lwkt_setpri_self(TDPRI_USER_SCHEDULER);
38b25931 1733
d6d39bc7
MC
1734 tsleep(&dd->helper_thread, PINTERLOCKED, "sched_thread_sleep", 0);
1735
38b25931 1736 for (;;) {
d6d39bc7 1737//again:
50017724
MD
1738 /*
1739 * We use the tsleep interlock to avoid racing
1740 * bsd4_rdyprocmask. This means we cannot block between the
1741 * tsleep_interlock() and the tsleep() call we make below.
1742 */
52eedfb5 1743 crit_enter_gd(gd);
d6d39bc7
MC
1744 //lwkt_deschedule_self(gd->gd_curthread);
1745 tsleep_interlock(&dd->helper_thread, 0);
287a8577 1746 spin_lock(&bsd4_spin);
eb501f47 1747 atomic_set_cpumask(&bsd4_rdyprocmask, mask);
b9eb1c19
MD
1748
1749 clear_user_resched(); /* This satisfies the reschedule request */
1750 dd->rrcount = 0; /* Reset the round-robin counter */
1751
eb501f47 1752 if ((bsd4_curprocmask & mask) == 0) {
b9eb1c19
MD
1753 /*
1754 * No thread is currently scheduled.
1755 */
1756 KKASSERT(dd->uschedcp == NULL);
52eedfb5 1757 if ((nlp = chooseproc_locked(NULL)) != NULL) {
d6d39bc7
MC
1758
1759 KTR_COND_LOG(usched_sched_thread_no_process,
1760 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1761 gd->gd_cpuid,
1762 nlp->lwp_proc->p_pid,
1763 nlp->lwp_thread->td_gd->gd_cpuid);
1764
eb501f47 1765 atomic_set_cpumask(&bsd4_curprocmask, mask);
52eedfb5
MD
1766 dd->upri = nlp->lwp_priority;
1767 dd->uschedcp = nlp;
287a8577 1768 spin_unlock(&bsd4_spin);
cc9b6223 1769#ifdef SMP
52eedfb5 1770 lwkt_acquire(nlp->lwp_thread);
cc9b6223 1771#endif
52eedfb5
MD
1772 lwkt_schedule(nlp->lwp_thread);
1773 } else {
287a8577 1774 spin_unlock(&bsd4_spin);
52eedfb5 1775 }
b9eb1c19 1776 } else if (bsd4_runqcount) {
eb501f47 1777 if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
d6d39bc7
MC
1778
1779 KTR_COND_LOG(usched_sched_thread_process,
1780 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1781 gd->gd_cpuid,
1782 nlp->lwp_proc->p_pid,
1783 nlp->lwp_thread->td_gd->gd_cpuid);
1784
eb501f47
MD
1785 dd->upri = nlp->lwp_priority;
1786 dd->uschedcp = nlp;
1787 spin_unlock(&bsd4_spin);
cc9b6223 1788#ifdef SMP
eb501f47 1789 lwkt_acquire(nlp->lwp_thread);
cc9b6223 1790#endif
eb501f47 1791 lwkt_schedule(nlp->lwp_thread);
52eedfb5 1792 } else {
eb501f47
MD
1793 /*
1794 * CHAINING CONDITION TRAIN
1795 *
1796 * We could not deal with the scheduler wakeup
1797 * request on this cpu, so locate a ready scheduler
1798 * with no current lp assignment and chain to it.
1799 *
1800 * This ensures that a wakeup race which fails due
1801 * to the priority test does not leave other unscheduled
1802 * cpus idle when the runqueue is not empty.
1803 */
d6d39bc7
MC
1804 tmpmask = ~bsd4_curprocmask &
1805 bsd4_rdyprocmask & smp_active_mask;
eb501f47
MD
1806 if (tmpmask) {
1807 tmpid = BSFCPUMASK(tmpmask);
85946b6c 1808 tmpdd = &bsd4_pcpu[tmpid];
eb501f47 1809 atomic_clear_cpumask(&bsd4_rdyprocmask,
d6d39bc7 1810 CPUMASK(tmpid));
eb501f47 1811 spin_unlock(&bsd4_spin);
d6d39bc7 1812 wakeup(&tmpdd->helper_thread);
eb501f47
MD
1813 } else {
1814 spin_unlock(&bsd4_spin);
1815 }
d6d39bc7
MC
1816
1817 KTR_LOG(usched_sched_thread_no_process_found,
1818 gd->gd_cpuid,
1819 tmpmask);
52eedfb5 1820 }
b9eb1c19
MD
1821 } else {
1822 /*
1823 * The runq is empty.
1824 */
287a8577 1825 spin_unlock(&bsd4_spin);
38b25931 1826 }
85946b6c
MD
1827
1828 /*
1829 * We're descheduled unless someone scheduled us. Switch away.
1830 * Exiting the critical section will cause splz() to be called
1831 * for us if interrupts and such are pending.
1832 */
52eedfb5 1833 crit_exit_gd(gd);
d6d39bc7
MC
1834 tsleep(&dd->helper_thread, PINTERLOCKED, "sched_thread_sleep", 0);
1835// lwkt_switch();
38b25931
MD
1836 }
1837}
1838
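/*
 * Illustrative sketch (not part of the scheduler): the interlocked
 * sleep pattern sched_thread() relies on.  tsleep_interlock() registers
 * the sleep address before the ready bit is published, so a wakeup()
 * issued by another cpu after the spin_unlock() but before the final
 * tsleep(..., PINTERLOCKED, ...) is not lost.  Field names mirror those
 * used above; this is a pattern sketch, not a new API.
 */
#if 0
static void
example_helper_wait(bsd4_pcpu_t dd, cpumask_t mask)
{
	tsleep_interlock(&dd->helper_thread, 0);	/* register sleep address */
	spin_lock(&bsd4_spin);
	atomic_set_cpumask(&bsd4_rdyprocmask, mask);	/* advertise readiness */
	spin_unlock(&bsd4_spin);
	/* a wakeup(&dd->helper_thread) racing here is remembered */
	tsleep(&dd->helper_thread, PINTERLOCKED, "schslp", 0);
}
#endif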
d6d39bc7
MC
1839 /* sysctl handler for the kern.usched_bsd4.stick_to_level parameter */
1840static int
1841sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS)
1842{
1843 int error, new_val;
1844
1845 new_val = usched_bsd4_stick_to_level;
1846
1847 error = sysctl_handle_int(oidp, &new_val, 0, req);
1848 if (error != 0 || req->newptr == NULL)
1849 return (error);
1850 if (new_val > cpu_topology_levels_number - 1 ||
1851 new_val < 0)
1852 return (EINVAL);
1853 usched_bsd4_stick_to_level = new_val;
1854 return (0);
1855}
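/*
 * Example usage from userland (the node is attached below as
 * kern.usched_bsd4):
 *
 *	sysctl kern.usched_bsd4.stick_to_level=1
 *
 * Values outside the range [0, cpu_topology_levels_number - 1] are
 * rejected with EINVAL by the handler above.
 */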
1856
38b25931
MD
1857/*
1858 * Set up our scheduler helpers. Note that curprocmask bit 0 has already
1859 * been cleared by rqinit() and we should not mess with it further.
1860 */
1861static void
1862sched_thread_cpu_init(void)
1863{
d6d39bc7
MC
1864 int i;
1865 int cpuid;
1866 int smt_not_supported = 0;
1867 int cache_coherent_not_supported = 0;
1868 if (bootverbose)
1869 kprintf("Start scheduler helpers on cpus:\n");
38b25931 1870
d6d39bc7
MC
1871 sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
1872 usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
1873 SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
1874 "usched_bsd4", CTLFLAG_RD, 0, "");
38b25931 1875
d6d39bc7
MC
1876 for (i = 0; i < ncpus; ++i) {
1877 bsd4_pcpu_t dd = &bsd4_pcpu[i];
1878 cpumask_t mask = CPUMASK(i);
38b25931 1879
d6d39bc7
MC
1880 if ((mask & smp_active_mask) == 0)
1881 continue;
38b25931 1882
d6d39bc7 1883 dd->cpunode = get_cpu_node_by_cpuid(i);
38b25931 1884
d6d39bc7
MC
1885 if (dd->cpunode == NULL) {
1886 smt_not_supported = 1;
1887 cache_coherent_not_supported = 1;
1888 if (bootverbose)
1889 kprintf ("\tcpu%d - WARNING: No CPU NODE found for cpu\n", i);
38b25931 1890
d6d39bc7
MC
1891 } else {
1892
1893 switch (dd->cpunode->type) {
1894 case THREAD_LEVEL:
1895 if (bootverbose)
1896 kprintf ("\tcpu%d - HyperThreading available. "
1897 "Core siblings: ", i);
1898 break;
1899 case CORE_LEVEL:
1900 smt_not_supported = 1;
1901
1902 if (bootverbose)
1903 kprintf ("\tcpu%d - No HT available, multi-core/physical "
1904 "cpu. Physical siblings: ", i);
1905 break;
1906 case CHIP_LEVEL:
1907 smt_not_supported = 1;
1908
1909 if (bootverbose)
1910 kprintf ("\tcpu%d - No HT available, single-core/physical cpu. "
1911 "Package Siblings: ", i);
1912 break;
1913 default:
1914 if (bootverbose)
1915 kprintf ("\tcpu%d - Unknown cpunode->type. Siblings: ", i);
1916 break;
1917 }
1918
1919 if (bootverbose) {
1920 if (dd->cpunode->parent_node != NULL) {
1921 CPUSET_FOREACH(cpuid, dd->cpunode->parent_node->members)
1922 kprintf("cpu%d ", cpuid);
1923 kprintf("\n");
1924 } else {
1925 kprintf(" no siblings\n");
1926 }
1927 }
1928 }
1929
1930 lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
1931 0, i, "usched %d", i);
1932
1933 /*
1934 * Allow user scheduling on the target cpu. cpu #0 has already
1935 * been enabled in rqinit().
1936 */
1937 if (i)
1938 atomic_clear_cpumask(&bsd4_curprocmask, mask);
1939 atomic_set_cpumask(&bsd4_rdyprocmask, mask);
1940 dd->upri = PRIBASE_NULL;
1941
1942 }
1943
1944 /* usched_bsd4 sysctl configurable parameters */
1945
1946 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1947 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1948 OID_AUTO, "rrinterval", CTLFLAG_RW,
1949 &usched_bsd4_rrinterval, 0, "");
1950 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1951 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1952 OID_AUTO, "decay", CTLFLAG_RW,
1953 &usched_bsd4_decay, 0, "Extra decay when not running");
1954 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1955 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1956 OID_AUTO, "batch_time", CTLFLAG_RW,
1957 &usched_bsd4_batch_time, 0, "Minimum batch counter value");
1958
1959 /* Add enable/disable option for SMT scheduling if supported */
1960 if (smt_not_supported) {
1961 usched_bsd4_smt = 0;
1962 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
1963 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1964 OID_AUTO, "smt", CTLFLAG_RD,
1965 "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
1966 } else {
1967 usched_bsd4_smt = 1;
1968 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1969 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1970 OID_AUTO, "smt", CTLFLAG_RW,
1971 &usched_bsd4_smt, 0, "Enable/Disable SMT scheduling");
1972
1973 }
1974
1975 /* Add enable/disable option for cache coherent scheduling if supported */
1976 if (cache_coherent_not_supported) {
1977#ifdef SMP
1978 usched_bsd4_cache_coherent = 0;
1979 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
1980 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1981 OID_AUTO, "cache_coherent", CTLFLAG_RD,
1982 "NOT SUPPORTED", 0, "Cache coherence NOT SUPPORTED");
1983#endif
1984 } else {
1985#ifdef SMP
1986 usched_bsd4_cache_coherent = 1;
1987 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1988 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1989 OID_AUTO, "cache_coherent", CTLFLAG_RW,
1990 &usched_bsd4_cache_coherent, 0,
1991 "Enable/Disable cache coherent scheduling");
1992#endif
1993
1994 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1995 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1996 OID_AUTO, "upri_affinity", CTLFLAG_RW,
1997 &usched_bsd4_upri_affinity, 1,
1998 "Number of PPQs in user priority check");
1999
2000 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2001 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2002 OID_AUTO, "queue_checks", CTLFLAG_RW,
2003 &usched_bsd4_queue_checks, 5,
2004 "Number of LWP to check from a queue before giving up");
2005
2006 SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx,
2007 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2008 OID_AUTO, "stick_to_level", CTLTYPE_INT | CTLFLAG_RW,
2009 NULL, sizeof usched_bsd4_stick_to_level,
2010 sysctl_usched_bsd4_stick_to_level, "I",
2011 "Stick a process to this level. See sysctl"
2012 "paremter hw.cpu_topology.level_description");
2013 }
38b25931 2014}
ba39e2e0
MD
2015SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2016 sched_thread_cpu_init, NULL)
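/*
 * Illustrative sketch (not part of the scheduler): climbing the topology
 * tree recorded in dd->cpunode, e.g. toward the node corresponding to a
 * "stick_to_level" setting.  Only the parent_node member already used
 * above is assumed; the helper itself is hypothetical.
 */
#if 0
static cpu_node_t *
example_node_at_level(cpu_node_t *node, int levels_up)
{
	/* walk toward the chip/system level; stop at the root */
	while (levels_up-- > 0 && node->parent_node != NULL)
		node = node->parent_node;
	return (node);
}
#endif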
d6d39bc7 2017#else /* No SMP options - just add the configurable parameters to sysctl */
38b25931 2018
d6d39bc7
MC
2019static void
2020sched_sysctl_tree_init(void)
2021{
2022 sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
2023 usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
2024 SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
2025 "usched_bsd4", CTLFLAG_RD, 0, "");
2026
2027 /* usched_bsd4 sysctl configurable parameters */
2028 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2029 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2030 OID_AUTO, "rrinterval", CTLFLAG_RW,
2031 &usched_bsd4_rrinterval, 0, "");
2032 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2033 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2034 OID_AUTO, "decay", CTLFLAG_RW,
2035 &usched_bsd4_decay, 0, "Extra decay when not running");
2036 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2037 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2038 OID_AUTO, "batch_time", CTLFLAG_RW,
2039 &usched_bsd4_batch_time, 0, "Minimum batch counter value");
2040}
2041SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2042 sched_sysctl_tree_init, NULL)
38b25931
MD
2043#endif
2044