/* sys/kern/usched_bsd4.c */
/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <machine/cpu.h>
#include <machine/smp.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI                  128
#define PRIMASK                 (MAXPRI - 1)
#define PRIBASE_REALTIME        0
#define PRIBASE_NORMAL          MAXPRI
#define PRIBASE_IDLE            (MAXPRI * 2)
#define PRIBASE_THREAD          (MAXPRI * 3)
#define PRIBASE_NULL            (MAXPRI * 4)

#define NQS     32                      /* 32 run queues. */
#define PPQ     (MAXPRI / NQS)          /* priorities per queue */
#define PPQMASK (PPQ - 1)

/*
 * NICEPPQ      - number of nice units per priority queue
 *
 * ESTCPUPPQ    - number of estcpu units per priority queue
 * ESTCPUMAX    - number of estcpu units
 */
#define NICEPPQ         2
#define ESTCPUPPQ       512
#define ESTCPUMAX       (ESTCPUPPQ * NQS)
#define BATCHMAX        (ESTCPUFREQ * 30)
#define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)    min((v), ESTCPUMAX)

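/*
 * Illustrative values derived from the constants above (not used by the
 * code itself): PPQ works out to MAXPRI / NQS = 4 priority levels per
 * queue, ESTCPUMAX to 512 * 32 = 16384 estcpu units, and BATCHMAX to
 * 30 seconds worth of scheduler ticks at the ESTCPUFREQ rate.
 */
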
TAILQ_HEAD(rq, lwp);

#define lwp_priority    lwp_usdata.bsd4.priority
#define lwp_rqindex     lwp_usdata.bsd4.rqindex
#define lwp_estcpu      lwp_usdata.bsd4.estcpu
#define lwp_batch       lwp_usdata.bsd4.batch
#define lwp_rqtype      lwp_usdata.bsd4.rqtype

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
                                sysclock_t cpstamp);
static void bsd4_recalculate_estcpu(struct lwp *lp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *lp, struct proc *);
static void bsd4_yield(struct lwp *lp);

#ifdef SMP
static void need_user_resched_remote(void *dummy);
#endif
static struct lwp *chooseproc_locked(struct lwp *chklp);
static void bsd4_remrunqueue_locked(struct lwp *lp);
static void bsd4_setrunqueue_locked(struct lwp *lp);

struct usched usched_bsd4 = {
        { NULL },
        "bsd4", "Original DragonFly Scheduler",
        NULL,                   /* default registration */
        NULL,                   /* default deregistration */
        bsd4_acquire_curproc,
        bsd4_release_curproc,
        bsd4_setrunqueue,
        bsd4_schedulerclock,
        bsd4_recalculate_estcpu,
        bsd4_resetpriority,
        bsd4_forking,
        bsd4_exiting,
        NULL,                   /* setcpumask not supported */
        bsd4_yield
};

struct usched_bsd4_pcpu {
        struct thread   helper_thread;
        short           rrcount;
        short           upri;
        struct lwp      *uschedcp;
};

typedef struct usched_bsd4_pcpu *bsd4_pcpu_t;

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
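/*
 * Worked example of the queuebits encoding described above: if
 * bsd4_queuebits == 0x00000104 then queues 2 and 8 are non-empty and
 * bsfl(0x00000104) returns 2, so the next thread is dequeued from
 * bsd4_queues[2], the best-priority busy queue.  See chooseproc_locked()
 * for the actual scan.
 */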
static struct rq bsd4_queues[NQS];
static struct rq bsd4_rtqueues[NQS];
static struct rq bsd4_idqueues[NQS];
static u_int32_t bsd4_queuebits;
static u_int32_t bsd4_rtqueuebits;
static u_int32_t bsd4_idqueuebits;
static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */
static cpumask_t bsd4_rdyprocmask;      /* ready to accept a user process */
static int bsd4_runqcount;
#ifdef SMP
static volatile int bsd4_scancpu;
#endif
static struct spinlock bsd4_spin;
static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];

SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0,
    "Number of run queues");
#ifdef INVARIANTS
static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
        &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
        &usched_optimal, 0, "acquire_curproc() was optimal");
#endif
static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0,
    "Print debug information for this pid");
#ifdef SMP
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
        &remote_resched_nonaffinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
        &remote_resched_affinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
        &choose_affinity, 0, "chooseproc() was smart");
#endif

static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
        &usched_bsd4_rrinterval, 0, "");
static int usched_bsd4_decay = 8;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
        &usched_bsd4_decay, 0, "Extra decay when not running");
static int usched_bsd4_batch_time = 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_batch_time, CTLFLAG_RW,
        &usched_bsd4_batch_time, 0, "Minimum batch counter value");

/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
        int i;

        spin_init(&bsd4_spin);
        for (i = 0; i < NQS; i++) {
                TAILQ_INIT(&bsd4_queues[i]);
                TAILQ_INIT(&bsd4_rtqueues[i]);
                TAILQ_INIT(&bsd4_idqueues[i]);
        }
        atomic_clear_cpumask(&bsd4_curprocmask, 1);
}
SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL)

/*
 * BSD4_ACQUIRE_CURPROC
 *
 * This function is called when the kernel intends to return to userland.
 * It is responsible for making the thread the current designated userland
 * thread for this cpu, blocking if necessary.
 *
 * The kernel has already depressed our LWKT priority so we must not switch
 * until we have either assigned or disposed of the thread.
 *
 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 *
 * MPSAFE
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
        globaldata_t gd;
        bsd4_pcpu_t dd;
        thread_t td;
#if 0
        struct lwp *olp;
#endif

        /*
         * Make sure we aren't sitting on a tsleep queue.
         */
        td = lp->lwp_thread;
        crit_enter_quick(td);
        if (td->td_flags & TDF_TSLEEPQ)
                tsleep_remove(td);
        bsd4_recalculate_estcpu(lp);

        /*
         * If a reschedule was requested give another thread the
         * driver's seat.
         */
        if (user_resched_wanted()) {
                clear_user_resched();
                bsd4_release_curproc(lp);
        }

        /*
         * Loop until we are the current user thread
         */
        gd = mycpu;
        dd = &bsd4_pcpu[gd->gd_cpuid];

        do {
                /*
                 * Process any pending events and higher priority threads.
                 */
                lwkt_yield();

                /*
                 * Become the currently scheduled user thread for this cpu
                 * if we can do so trivially.
                 *
                 * We can steal another thread's current thread designation
                 * on this cpu since if we are running that other thread
                 * must not be, so we can safely deschedule it.
                 */
                if (dd->uschedcp == lp) {
                        /*
                         * We are already the current lwp (hot path).
                         */
                        dd->upri = lp->lwp_priority;
                } else if (dd->uschedcp == NULL) {
                        /*
                         * We can trivially become the current lwp.
                         */
                        atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
                        dd->uschedcp = lp;
                        dd->upri = lp->lwp_priority;
                } else if (dd->upri > lp->lwp_priority) {
                        /*
                         * We can steal the current cpu's lwp designation
                         * away simply by replacing it.  The other thread
                         * will stall when it tries to return to userland.
                         */
                        dd->uschedcp = lp;
                        dd->upri = lp->lwp_priority;
                        /*
                        lwkt_deschedule(olp->lwp_thread);
                        bsd4_setrunqueue(olp);
                        */
                } else {
                        /*
                         * We cannot become the current lwp, place the lp
                         * on the bsd4 run-queue and deschedule ourselves.
                         *
                         * When we are reactivated we will have another
                         * chance.
                         */
                        lwkt_deschedule(lp->lwp_thread);
                        bsd4_setrunqueue(lp);
                        lwkt_switch();
                        /*
                         * Reload after a switch or setrunqueue/switch possibly
                         * moved us to another cpu.
                         */
                        gd = mycpu;
                        dd = &bsd4_pcpu[gd->gd_cpuid];
                }
        } while (dd->uschedcp != lp);

        crit_exit_quick(td);
        KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
}

/*
 * BSD4_RELEASE_CURPROC
 *
 * This routine detaches the current thread from the userland scheduler,
 * usually because the thread needs to run or block in the kernel (at
 * kernel priority) for a while.
 *
 * This routine is also responsible for selecting a new thread to
 * make the current thread.
 *
 * NOTE: This implementation differs from the dummy example in that
 * bsd4_select_curproc() is able to select the current process, whereas
 * dummy_select_curproc() is not able to select the current process.
 * This means we have to NULL out uschedcp.
 *
 * Additionally, note that we may already be on a run queue if releasing
 * via the lwkt_switch() in bsd4_setrunqueue().
 *
 * MPSAFE
 */
static void
bsd4_release_curproc(struct lwp *lp)
{
        globaldata_t gd = mycpu;
        bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

        if (dd->uschedcp == lp) {
                crit_enter();
                KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
                dd->uschedcp = NULL;    /* don't let lp be selected */
                dd->upri = PRIBASE_NULL;
                atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
                bsd4_select_curproc(gd);
                crit_exit();
        }
}

/*
 * BSD4_SELECT_CURPROC
 *
 * Select a new current process for this cpu and clear any pending user
 * reschedule request.  The cpu currently has no current process.
 *
 * This routine is also responsible for equal-priority round-robining,
 * typically triggered from bsd4_schedulerclock().  In our dummy example
 * all the 'user' threads are LWKT scheduled all at once and we just
 * call lwkt_switch().
 *
 * The calling process is not on the queue and cannot be selected.
 *
 * MPSAFE
 */
static
void
bsd4_select_curproc(globaldata_t gd)
{
        bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
        struct lwp *nlp;
        int cpuid = gd->gd_cpuid;

        crit_enter_gd(gd);

        spin_lock(&bsd4_spin);
        if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
                atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid));
                dd->upri = nlp->lwp_priority;
                dd->uschedcp = nlp;
                spin_unlock(&bsd4_spin);
#ifdef SMP
                lwkt_acquire(nlp->lwp_thread);
#endif
                lwkt_schedule(nlp->lwp_thread);
        } else {
                spin_unlock(&bsd4_spin);
        }
#if 0
        } else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) {
                atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
                spin_unlock(&bsd4_spin);
                lwkt_schedule(&dd->helper_thread);
        } else {
                spin_unlock(&bsd4_spin);
        }
#endif
        crit_exit_gd(gd);
}

/*
 * BSD4_SETRUNQUEUE
 *
 * Place the specified lwp on the user scheduler's run queue.  This routine
 * must be called with the thread descheduled.  The lwp must be runnable.
 *
 * The thread may be the current thread as a special case.
 *
 * MPSAFE
 */
static void
bsd4_setrunqueue(struct lwp *lp)
{
        globaldata_t gd;
        bsd4_pcpu_t dd;
#ifdef SMP
        int cpuid;
        cpumask_t mask;
        cpumask_t tmpmask;
#endif

        /*
         * First validate the process state relative to the current cpu.
         * We don't need the spinlock for this, just a critical section.
         * We are in control of the process.
         */
        crit_enter();
        KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
        KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
            ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
             lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
        KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

        /*
         * Note: gd and dd are relative to the target thread's last cpu,
         * NOT our current cpu.
         */
        gd = lp->lwp_thread->td_gd;
        dd = &bsd4_pcpu[gd->gd_cpuid];

        /*
         * This process is not supposed to be scheduled anywhere or assigned
         * as the current process anywhere.  Assert the condition.
         */
        KKASSERT(dd->uschedcp != lp);

#ifndef SMP
        /*
         * If we are not SMP we do not have a scheduler helper to kick
         * and must directly activate the process if none are scheduled.
         *
         * This is really only an issue when bootstrapping init since
         * the caller in all other cases will be a user process, and
         * even if released (dd->uschedcp == NULL), that process will
         * kickstart the scheduler when it returns to user mode from
         * the kernel.
         */
        if (dd->uschedcp == NULL) {
                atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
                dd->uschedcp = lp;
                dd->upri = lp->lwp_priority;
                lwkt_schedule(lp->lwp_thread);
                crit_exit();
                return;
        }
#endif

#ifdef SMP
        /*
         * XXX fixme.  Could be part of a remrunqueue/setrunqueue
         * operation when the priority is recalculated, so TDF_MIGRATING
         * may already be set.
         */
        if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
                lwkt_giveaway(lp->lwp_thread);
#endif

        /*
         * We lose control of lp the moment we release the spinlock after
         * having placed lp on the queue.  i.e. another cpu could pick it
         * up and it could exit, or its priority could be further adjusted,
         * or something like that.
         */
        spin_lock(&bsd4_spin);
        bsd4_setrunqueue_locked(lp);

#ifdef SMP
        /*
         * Kick the scheduler helper on one of the other cpus
         * and request a reschedule if appropriate.
         *
         * NOTE: We check all cpus whose rdyprocmask bit is set.  First we
         *       look for cpus without designated lps, then we look for
         *       cpus with designated lps with a worse priority than our
         *       process.
         */
        ++bsd4_scancpu;
        cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
        mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
               smp_active_mask & usched_global_cpumask;

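        /*
         * Note on the scans below: masking both priorities with ~PPQMASK
         * compares them at run-queue granularity (PPQ = 4 priority levels
         * per queue), so priority differences within the same queue are
         * ignored.  The scan starts at the rotating bsd4_scancpu index so
         * wakeups are spread across cpus.
         */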
        while (mask) {
                tmpmask = ~(CPUMASK(cpuid) - 1);
                if (mask & tmpmask)
                        cpuid = BSFCPUMASK(mask & tmpmask);
                else
                        cpuid = BSFCPUMASK(mask);
                gd = globaldata_find(cpuid);
                dd = &bsd4_pcpu[cpuid];

                if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK))
                        goto found;
                mask &= ~CPUMASK(cpuid);
        }

        /*
         * Then cpus which might have a currently running lp
         */
        mask = bsd4_curprocmask & bsd4_rdyprocmask &
               lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;

        while (mask) {
                tmpmask = ~(CPUMASK(cpuid) - 1);
                if (mask & tmpmask)
                        cpuid = BSFCPUMASK(mask & tmpmask);
                else
                        cpuid = BSFCPUMASK(mask);
                gd = globaldata_find(cpuid);
                dd = &bsd4_pcpu[cpuid];

                if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
                        goto found;
                mask &= ~CPUMASK(cpuid);
        }

        /*
         * If we cannot find a suitable cpu we reload from bsd4_scancpu
         * and round-robin.  Other cpus will pickup as they release their
         * current lwps or become ready.
         *
         * Avoid a degenerate system lockup case if usched_global_cpumask
         * is set to 0 or otherwise does not cover lwp_cpumask.
         *
         * We only kick the target helper thread in this case; we do not
         * set the user resched flag.
         */
        cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
        if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) {
                cpuid = 0;
        }
        gd = globaldata_find(cpuid);
        dd = &bsd4_pcpu[cpuid];
found:
        if (gd == mycpu) {
                spin_unlock(&bsd4_spin);
                if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
                        if (dd->uschedcp == NULL) {
                                lwkt_schedule(&dd->helper_thread);
                        } else {
                                need_user_resched();
                        }
                }
        } else {
                atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
                spin_unlock(&bsd4_spin);
                if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
                        lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
                else
                        lwkt_schedule(&dd->helper_thread);
        }
#else
        /*
         * Request a reschedule if appropriate.
         */
        spin_unlock(&bsd4_spin);
        if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
                need_user_resched();
        }
#endif
        crit_exit();
}

/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
 * each cpu.
 *
 * MPSAFE
 */
static
void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
        globaldata_t gd = mycpu;
        bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

        /*
         * Do we need to round-robin?  We round-robin 10 times a second.
         * This should only occur for cpu-bound batch processes.
         */
        if (++dd->rrcount >= usched_bsd4_rrinterval) {
                dd->rrcount = 0;
                need_user_resched();
        }

        /*
         * Adjust estcpu upward using a real time equivalent calculation.
         */
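        /*
         * (Each call adds ESTCPUMAX / ESTCPUFREQ + 1; since this hook runs
         * ESTCPUFREQ times per second, a continuously running thread
         * accumulates roughly ESTCPUMAX estcpu per second here before
         * bsd4_recalculate_estcpu() decays it back down.)
         */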
        lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);

        /*
         * Spinlocks also hold a critical section so there should not be
         * any active.
         */
        KKASSERT(gd->gd_spinlocks_wr == 0);

        bsd4_resetpriority(lp);
#if 0
        /*
         * if we can't call bsd4_resetpriority for some reason we must call
         * need user_resched().
         */
        need_user_resched();
#endif
}

/*
 * Called from acquire and from kern_synch's one-second timer (one of the
 * callout helper threads) with a critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 *
 * MPSAFE
 */
static
void
bsd4_recalculate_estcpu(struct lwp *lp)
{
        globaldata_t gd = mycpu;
        sysclock_t cpbase;
        sysclock_t ttlticks;
        int estcpu;
        int decay_factor;

        /*
         * We have to subtract periodic to get the last schedclock
         * timeout time, otherwise we would get the upcoming timeout.
         * Keep in mind that a process can migrate between cpus and
         * while the scheduler clock should be very close, boundary
         * conditions could lead to a small negative delta.
         */
        cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

        if (lp->lwp_slptime > 1) {
                /*
                 * Too much time has passed, do a coarse correction.
                 */
                lp->lwp_estcpu = lp->lwp_estcpu >> 1;
                bsd4_resetpriority(lp);
                lp->lwp_cpbase = cpbase;
                lp->lwp_cpticks = 0;
                lp->lwp_batch -= ESTCPUFREQ;
                if (lp->lwp_batch < 0)
                        lp->lwp_batch = 0;
        } else if (lp->lwp_cpbase != cpbase) {
                /*
                 * Adjust estcpu if we are in a different tick.  Don't waste
                 * time if we are in the same tick.
                 *
                 * First calculate the number of ticks in the measurement
                 * interval.  The ttlticks calculation can wind up 0 due to
                 * a bug in the handling of lwp_slptime (as yet not found),
                 * so make sure we do not get a divide by 0 panic.
                 */
                ttlticks = (cpbase - lp->lwp_cpbase) /
                           gd->gd_schedclock.periodic;
                if (ttlticks < 0) {
                        ttlticks = 0;
                        lp->lwp_cpbase = cpbase;
                }
                if (ttlticks == 0)
                        return;
                updatepcpu(lp, lp->lwp_cpticks, ttlticks);

                /*
                 * Calculate the percentage of one cpu used factoring in ncpus
                 * and the load and adjust estcpu.  Handle degenerate cases
                 * by adding 1 to bsd4_runqcount.
                 *
                 * estcpu is scaled by ESTCPUMAX.
                 *
                 * bsd4_runqcount is the excess number of user processes
                 * that cannot be immediately scheduled to cpus.  We want
                 * to count these as running to avoid range compression
                 * in the base calculation (which is the actual percentage
                 * of one cpu used).
                 */
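                /*
                 * Worked example of the calculation below: a thread that
                 * used every tick (lwp_cpticks == ttlticks) on a one-cpu
                 * system with an empty run queue gets
                 * estcpu = (ttlticks * ESTCPUMAX) * (0 + 1) / (1 * ttlticks)
                 *        = ESTCPUMAX, i.e. 100% of one cpu.
                 */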
                estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
                         (bsd4_runqcount + ncpus) / (ncpus * ttlticks);

                /*
                 * If estcpu is > 50% we become more batch-like
                 * If estcpu is <= 50% we become less batch-like
                 *
                 * It takes 30 cpu seconds to traverse the entire range.
                 */
                if (estcpu > ESTCPUMAX / 2) {
                        lp->lwp_batch += ttlticks;
                        if (lp->lwp_batch > BATCHMAX)
                                lp->lwp_batch = BATCHMAX;
                } else {
                        lp->lwp_batch -= ttlticks;
                        if (lp->lwp_batch < 0)
                                lp->lwp_batch = 0;
                }

                if (usched_debug == lp->lwp_proc->p_pid) {
                        kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
                                lp->lwp_proc->p_pid, lp,
                                estcpu, lp->lwp_estcpu,
                                lp->lwp_batch,
                                lp->lwp_cpticks, ttlticks);
                }

                /*
                 * Adjust lp->lwp_estcpu.  The decay factor determines how
                 * quickly lwp_estcpu collapses to its realtime calculation.
                 * A slower collapse gives us a more accurate number but
                 * can cause a cpu hog to eat too much cpu before the
                 * scheduler decides to downgrade it.
                 *
                 * NOTE: p_nice is accounted for in bsd4_resetpriority(),
                 *       and not here, but we must still ensure that a
                 *       cpu-bound nice -20 process does not completely
                 *       override a cpu-bound nice +20 process.
                 *
                 * NOTE: We must use ESTCPULIM() here to deal with any
                 *       overshoot.
                 */
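                /*
                 * With the default usched_bsd4_decay of 8 the blend below
                 * is (old * 8 + new) / 9, i.e. each recalculation moves
                 * lwp_estcpu roughly 11% of the way toward the
                 * instantaneous value computed above.
                 */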
                decay_factor = usched_bsd4_decay;
                if (decay_factor < 1)
                        decay_factor = 1;
                if (decay_factor > 1024)
                        decay_factor = 1024;

                lp->lwp_estcpu = ESTCPULIM(
                        (lp->lwp_estcpu * decay_factor + estcpu) /
                        (decay_factor + 1));

                if (usched_debug == lp->lwp_proc->p_pid)
                        kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
                bsd4_resetpriority(lp);
                lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
                lp->lwp_cpticks = 0;
        }
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 *
 * This routine may be called with any process.
 *
 * This routine is called by fork1() for initial setup with the process
 * of the run queue, and also may be called normally with the process on or
 * off the run queue.
 *
 * MPSAFE
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
        bsd4_pcpu_t dd;
        int newpriority;
        u_short newrqtype;
        int reschedcpu;
        int checkpri;
        int estcpu;

        /*
         * Calculate the new priority and queue type
         */
        crit_enter();
        spin_lock(&bsd4_spin);

        newrqtype = lp->lwp_rtprio.type;

        switch(newrqtype) {
        case RTP_PRIO_REALTIME:
        case RTP_PRIO_FIFO:
                newpriority = PRIBASE_REALTIME +
                              (lp->lwp_rtprio.prio & PRIMASK);
                break;
        case RTP_PRIO_NORMAL:
                /*
                 * Detune estcpu based on batchiness.  lwp_batch ranges
                 * from 0 to BATCHMAX.  Limit estcpu for the sake of
                 * the priority calculation to between 50% and 100%.
                 */
                estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
                         (BATCHMAX * 2);

                /*
                 * p_nice piece         Adds (0-40) * 2         0-80
                 * estcpu               Adds 16384  * 4 / 512   0-128
                 */
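                /*
                 * Worked example of the mapping below: nice +20 with a
                 * fully cpu-bound, fully batchy lwp (estcpu == ESTCPUMAX)
                 * gives 80 + 128 = 208, scaled by 128 / 210 to about 126,
                 * near the bottom of the normal range; nice -20 with
                 * estcpu 0 maps to PRIBASE_NORMAL + 0, the best normal
                 * priority.
                 */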
                newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
                newpriority += estcpu * PPQ / ESTCPUPPQ;
                newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
                              NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
                newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
                break;
        case RTP_PRIO_IDLE:
                newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
                break;
        case RTP_PRIO_THREAD:
                newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
                break;
        default:
                panic("Bad RTP_PRIO %d", newrqtype);
                /* NOT REACHED */
        }

        /*
         * The newpriority incorporates the queue type so do a simple masked
         * check to determine if the process has moved to another queue.  If
         * it has, and it is currently on a run queue, then move it.
         */
        if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
                lp->lwp_priority = newpriority;
                if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
                        bsd4_remrunqueue_locked(lp);
                        lp->lwp_rqtype = newrqtype;
                        lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
                        bsd4_setrunqueue_locked(lp);
                        checkpri = 1;
                } else {
                        lp->lwp_rqtype = newrqtype;
                        lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
                        checkpri = 0;
                }
                reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
        } else {
                lp->lwp_priority = newpriority;
                reschedcpu = -1;
                checkpri = 1;
        }

        /*
         * Determine if we need to reschedule the target cpu.  This only
         * occurs if the LWP is already on a scheduler queue, which means
         * that idle cpu notification has already occurred.  At most we
         * need only issue a need_user_resched() on the appropriate cpu.
         *
         * The LWP may be owned by a CPU different from the current one,
         * in which case dd->uschedcp may be modified without an MP lock
         * or a spinlock held.  The worst that happens is that the code
         * below causes a spurious need_user_resched() on the target CPU
         * and dd->upri to be wrong for a short period of time, both of
         * which are harmless.
         *
         * If checkpri is 0 we are adjusting the priority of the current
         * process, possibly higher (less desirable), so ignore the upri
         * check which will fail in that case.
         */
        if (reschedcpu >= 0) {
                dd = &bsd4_pcpu[reschedcpu];
                if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
                    (checkpri == 0 ||
                     (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
#ifdef SMP
                        if (reschedcpu == mycpu->gd_cpuid) {
                                spin_unlock(&bsd4_spin);
                                need_user_resched();
                        } else {
                                spin_unlock(&bsd4_spin);
                                atomic_clear_cpumask(&bsd4_rdyprocmask,
                                                     CPUMASK(reschedcpu));
                                lwkt_send_ipiq(lp->lwp_thread->td_gd,
                                               need_user_resched_remote, NULL);
                        }
#else
                        spin_unlock(&bsd4_spin);
                        need_user_resched();
#endif
                } else {
                        spin_unlock(&bsd4_spin);
                }
        } else {
                spin_unlock(&bsd4_spin);
        }
        crit_exit();
}

/*
 * MPSAFE
 */
static
void
bsd4_yield(struct lwp *lp)
{
#if 0
        /* FUTURE (or something similar) */
        switch(lp->lwp_rqtype) {
        case RTP_PRIO_NORMAL:
                lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
                break;
        default:
                break;
        }
#endif
        need_user_resched();
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batchy than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * XXX lwp should be "spawning" instead of "forking"
 *
 * MPSAFE
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
        /*
         * Put the child 4 queue slots (out of 32) higher than the parent
         * (less desirable than the parent).
         */
        lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);
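        /*
         * (In concrete units: ESTCPUPPQ * 4 == 2048 estcpu, i.e. roughly
         * four run queues worth; the parent dock below of ESTCPUPPQ / 16
         * == 32 means roughly 16 rapid forks push the parent down by
         * about one queue slot by the same accounting.)
         */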

        /*
         * The batch status of children always starts out centerline
         * and will inch-up or inch-down as appropriate.  It takes roughly
         * ~15 seconds of >50% cpu to hit the limit.
         */
        lp->lwp_batch = BATCHMAX / 2;

        /*
         * Dock the parent a cost for the fork, protecting us from fork
         * bombs.  If the parent is forking quickly make the child more
         * batchy.
         */
        plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
}

/*
 * Called when a parent waits for a child.
 *
 * MPSAFE
 */
static void
bsd4_exiting(struct lwp *lp, struct proc *child_proc)
{
}

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 *
 * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
 *          left intact through the entire routine.
 */
static
struct lwp *
chooseproc_locked(struct lwp *chklp)
{
        struct lwp *lp;
        struct rq *q;
        u_int32_t *which, *which2;
        u_int32_t pri;
        u_int32_t rtqbits;
        u_int32_t tsqbits;
        u_int32_t idqbits;
        cpumask_t cpumask;

        rtqbits = bsd4_rtqueuebits;
        tsqbits = bsd4_queuebits;
        idqbits = bsd4_idqueuebits;
        cpumask = mycpu->gd_cpumask;

#ifdef SMP
again:
#endif
        if (rtqbits) {
                pri = bsfl(rtqbits);
                q = &bsd4_rtqueues[pri];
                which = &bsd4_rtqueuebits;
                which2 = &rtqbits;
        } else if (tsqbits) {
                pri = bsfl(tsqbits);
                q = &bsd4_queues[pri];
                which = &bsd4_queuebits;
                which2 = &tsqbits;
        } else if (idqbits) {
                pri = bsfl(idqbits);
                q = &bsd4_idqueues[pri];
                which = &bsd4_idqueuebits;
                which2 = &idqbits;
        } else {
                return NULL;
        }
        lp = TAILQ_FIRST(q);
        KASSERT(lp, ("chooseproc: no lwp on busy queue"));

#ifdef SMP
        while ((lp->lwp_cpumask & cpumask) == 0) {
                lp = TAILQ_NEXT(lp, lwp_procq);
                if (lp == NULL) {
                        *which2 &= ~(1 << pri);
                        goto again;
                }
        }
#endif

        /*
         * If the passed lwp <chklp> is reasonably close to the selected
         * lwp <lp>, return NULL (indicating that <chklp> should be kept).
         *
         * Note that we must error on the side of <chklp> to avoid bouncing
         * between threads in the acquire code.
         */
        if (chklp) {
                if (chklp->lwp_priority < lp->lwp_priority + PPQ)
                        return(NULL);
        }

#ifdef SMP
        /*
         * If the chosen lwp does not reside on this cpu spend a few
         * cycles looking for a better candidate at the same priority level.
         * This is a fallback check, setrunqueue() tries to wakeup the
         * correct cpu and is our front-line affinity.
         */
        if (lp->lwp_thread->td_gd != mycpu &&
            (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
        ) {
                if (chklp->lwp_thread->td_gd == mycpu) {
                        ++choose_affinity;
                        lp = chklp;
                }
        }
#endif

        TAILQ_REMOVE(q, lp, lwp_procq);
        --bsd4_runqcount;
        if (TAILQ_EMPTY(q))
                *which &= ~(1 << pri);
        KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
        atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
        return lp;
}

#ifdef SMP

static
void
need_user_resched_remote(void *dummy)
{
        globaldata_t gd = mycpu;
        bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

        need_user_resched();
        lwkt_schedule(&dd->helper_thread);
}

#endif

/*
 * bsd4_remrunqueue_locked() removes a given process from the run queue
 * that it is on, clearing the queue busy bit if it becomes empty.
 *
 * Note that the user process scheduler is different from the LWKT scheduler.
 * The user process scheduler only manages user processes but it uses LWKT
 * underneath, and a user process operating in the kernel will often be
 * 'released' from our management.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_remrunqueue_locked(struct lwp *lp)
{
        struct rq *q;
        u_int32_t *which;
        u_int8_t pri;

        KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
        atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
        --bsd4_runqcount;
        KKASSERT(bsd4_runqcount >= 0);

        pri = lp->lwp_rqindex;
        switch(lp->lwp_rqtype) {
        case RTP_PRIO_NORMAL:
                q = &bsd4_queues[pri];
                which = &bsd4_queuebits;
                break;
        case RTP_PRIO_REALTIME:
        case RTP_PRIO_FIFO:
                q = &bsd4_rtqueues[pri];
                which = &bsd4_rtqueuebits;
                break;
        case RTP_PRIO_IDLE:
                q = &bsd4_idqueues[pri];
                which = &bsd4_idqueuebits;
                break;
        default:
                panic("remrunqueue: invalid rtprio type");
                /* NOT REACHED */
        }
        TAILQ_REMOVE(q, lp, lwp_procq);
        if (TAILQ_EMPTY(q)) {
                KASSERT((*which & (1 << pri)) != 0,
                        ("remrunqueue: remove from empty queue"));
                *which &= ~(1 << pri);
        }
}

/*
 * bsd4_setrunqueue_locked()
 *
 * Add a process whose rqtype and rqindex have previously been calculated
 * onto the appropriate run queue and set the queue's busy bit.  Any
 * reschedule decision is made by the callers.
 *
 * NOTE: Lower priorities are better priorities.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_setrunqueue_locked(struct lwp *lp)
{
        struct rq *q;
        u_int32_t *which;
        int pri;

        KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
        atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
        ++bsd4_runqcount;

        pri = lp->lwp_rqindex;

        switch(lp->lwp_rqtype) {
        case RTP_PRIO_NORMAL:
                q = &bsd4_queues[pri];
                which = &bsd4_queuebits;
                break;
        case RTP_PRIO_REALTIME:
        case RTP_PRIO_FIFO:
                q = &bsd4_rtqueues[pri];
                which = &bsd4_rtqueuebits;
                break;
        case RTP_PRIO_IDLE:
                q = &bsd4_idqueues[pri];
                which = &bsd4_idqueuebits;
                break;
        default:
                panic("setrunqueue: invalid rtprio type");
                /* NOT REACHED */
        }

        /*
         * Add to the correct queue and set the appropriate bit.  If no
         * lower priority (i.e. better) processes are in the queue then
         * we want a reschedule, calculate the best cpu for the job.
         *
         * Always run reschedules on the LWPs original cpu.
         */
        TAILQ_INSERT_TAIL(q, lp, lwp_procq);
        *which |= 1 << pri;
}

#ifdef SMP

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wake up another for the purposes of
 * scheduling userland threads from setrunqueue().
 *
 * UP systems do not need the helper since there is only one cpu.
 *
 * We can't use the idle thread for this because we might block.
 * Additionally, doing things this way allows us to HLT idle cpus
 * on MP systems.
 *
 * MPSAFE
 */
static void
sched_thread(void *dummy)
{
        globaldata_t gd;
        bsd4_pcpu_t dd;
        bsd4_pcpu_t tmpdd;
        struct lwp *nlp;
        cpumask_t mask;
        int cpuid;
#ifdef SMP
        cpumask_t tmpmask;
        int tmpid;
#endif

        gd = mycpu;
        cpuid = gd->gd_cpuid;           /* doesn't change */
        mask = gd->gd_cpumask;          /* doesn't change */
        dd = &bsd4_pcpu[cpuid];

        /*
         * Since we are woken up only when no user processes are scheduled
         * on a cpu, we can run at an ultra low priority.
         */
        lwkt_setpri_self(TDPRI_USER_SCHEDULER);

        for (;;) {
                /*
                 * We use the LWKT deschedule-interlock trick to avoid racing
                 * bsd4_rdyprocmask.  This means we cannot block through to the
                 * manual lwkt_switch() call we make below.
                 */
                crit_enter_gd(gd);
                lwkt_deschedule_self(gd->gd_curthread);
                spin_lock(&bsd4_spin);
                atomic_set_cpumask(&bsd4_rdyprocmask, mask);

                clear_user_resched();   /* This satisfies the reschedule request */
                dd->rrcount = 0;        /* Reset the round-robin counter */

                if ((bsd4_curprocmask & mask) == 0) {
                        /*
                         * No thread is currently scheduled.
                         */
                        KKASSERT(dd->uschedcp == NULL);
                        if ((nlp = chooseproc_locked(NULL)) != NULL) {
                                atomic_set_cpumask(&bsd4_curprocmask, mask);
                                dd->upri = nlp->lwp_priority;
                                dd->uschedcp = nlp;
                                spin_unlock(&bsd4_spin);
#ifdef SMP
                                lwkt_acquire(nlp->lwp_thread);
#endif
                                lwkt_schedule(nlp->lwp_thread);
                        } else {
                                spin_unlock(&bsd4_spin);
                        }
                } else if (bsd4_runqcount) {
                        if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
                                dd->upri = nlp->lwp_priority;
                                dd->uschedcp = nlp;
                                spin_unlock(&bsd4_spin);
#ifdef SMP
                                lwkt_acquire(nlp->lwp_thread);
#endif
                                lwkt_schedule(nlp->lwp_thread);
                        } else {
                                /*
                                 * CHAINING CONDITION TRAIN
                                 *
                                 * We could not deal with the scheduler wakeup
                                 * request on this cpu, locate a ready scheduler
                                 * with no current lp assignment and chain to it.
                                 *
                                 * This ensures that a wakeup race which fails
                                 * due to the priority test does not leave other
                                 * unscheduled cpus idle when the runqueue is
                                 * not empty.
                                 */
                                tmpmask = ~bsd4_curprocmask & bsd4_rdyprocmask &
                                          smp_active_mask;
                                if (tmpmask) {
                                        tmpid = BSFCPUMASK(tmpmask);
                                        tmpdd = &bsd4_pcpu[tmpid];
                                        atomic_clear_cpumask(&bsd4_rdyprocmask,
                                                             CPUMASK(tmpid));
                                        spin_unlock(&bsd4_spin);
                                        lwkt_schedule(&tmpdd->helper_thread);
                                } else {
                                        spin_unlock(&bsd4_spin);
                                }
                        }
                } else {
                        /*
                         * The runq is empty.
                         */
                        spin_unlock(&bsd4_spin);
                }

                /*
                 * We're descheduled unless someone scheduled us.  Switch away.
                 * Exiting the critical section will cause splz() to be called
                 * for us if interrupts and such are pending.
                 */
                crit_exit_gd(gd);
                lwkt_switch();
        }
}

/*
 * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
        int i;

        if (bootverbose)
                kprintf("start scheduler helpers on cpus:");

        for (i = 0; i < ncpus; ++i) {
                bsd4_pcpu_t dd = &bsd4_pcpu[i];
                cpumask_t mask = CPUMASK(i);

                if ((mask & smp_active_mask) == 0)
                        continue;

                if (bootverbose)
                        kprintf(" %d", i);

                lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
                            TDF_NOSTART, i, "usched %d", i);

                /*
                 * Allow user scheduling on the target cpu.  cpu #0 has already
                 * been enabled in rqinit().
                 */
                if (i)
                        atomic_clear_cpumask(&bsd4_curprocmask, mask);
                atomic_set_cpumask(&bsd4_rdyprocmask, mask);
                dd->upri = PRIBASE_NULL;
        }
        if (bootverbose)
                kprintf("\n");
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
        sched_thread_cpu_init, NULL)

#endif