| Commit | Line | Data |
|---|---|---|
| 38b25931 MD |
1 | /* |
| 2 | * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org> | |
| 3 | * All rights reserved. | |
| 4 | * | |
| 5 | * Redistribution and use in source and binary forms, with or without | |
| 6 | * modification, are permitted provided that the following conditions | |
| 7 | * are met: | |
| 8 | * 1. Redistributions of source code must retain the above copyright | |
| 9 | * notice, this list of conditions and the following disclaimer. | |
| 10 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 11 | * notice, this list of conditions and the following disclaimer in the | |
| 12 | * documentation and/or other materials provided with the distribution. | |
| 13 | * | |
| 14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
| 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
| 18 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 19 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 20 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 21 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 22 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 23 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 24 | * SUCH DAMAGE. | |
| 25 | * | |
| 3925aa71 | 26 | * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.26 2008/11/01 23:31:19 dillon Exp $ |
| 38b25931 MD |
27 | */ |
| 28 | ||
| 29 | #include <sys/param.h> | |
| 30 | #include <sys/systm.h> | |
| 31 | #include <sys/kernel.h> | |
| 32 | #include <sys/lock.h> | |
| 33 | #include <sys/queue.h> | |
| 34 | #include <sys/proc.h> | |
| 35 | #include <sys/rtprio.h> | |
| 38b25931 MD |
36 | #include <sys/uio.h> |
| 37 | #include <sys/sysctl.h> | |
| 38 | #include <sys/resourcevar.h> | |
| 52eedfb5 | 39 | #include <sys/spinlock.h> |
| 38b25931 MD |
40 | #include <machine/cpu.h> |
| 41 | #include <machine/smp.h> | |
| 42 | ||
| 52eedfb5 MD |
43 | #include <sys/thread2.h> |
| 44 | #include <sys/spinlock2.h> | |
| 684a93c4 | 45 | #include <sys/mplock2.h> |
| 52eedfb5 | 46 | |
| 38b25931 MD |
47 | /* |
| 48 | * Priorities. Note that with 32 run queues per scheduler each queue | |
| 49 | * represents four priority levels. | |
| 50 | */ | |
| 51 | ||
| 52 | #define MAXPRI 128 | |
| 53 | #define PRIMASK (MAXPRI - 1) | |
| 54 | #define PRIBASE_REALTIME 0 | |
| 55 | #define PRIBASE_NORMAL MAXPRI | |
| 56 | #define PRIBASE_IDLE (MAXPRI * 2) | |
| 57 | #define PRIBASE_THREAD (MAXPRI * 3) | |
| 58 | #define PRIBASE_NULL (MAXPRI * 4) | |
| 59 | ||
| 60 | #define NQS 32 /* 32 run queues. */ | |
| 61 | #define PPQ (MAXPRI / NQS) /* priorities per queue */ | |
| 52eedfb5 | 62 | #define PPQMASK (PPQ - 1) |
| 38b25931 MD |
63 | |
| 64 | /* | |
| 65 | * NICEPPQ - number of nice units per priority queue | |
| 66 | * ESTCPURAMP - number of scheduler ticks for estcpu to switch queues | |
| 67 | * | |
| 68 | * ESTCPUPPQ - number of estcpu units per priority queue | |
| 69 | * ESTCPUMAX - number of estcpu units | |
| 70 | * ESTCPUINCR - amount we have to increment p_estcpu per scheduling tick at | |
| 71 | * 100% cpu. | |
| 72 | */ | |
| 73 | #define NICEPPQ 2 | |
| 74 | #define ESTCPURAMP 4 | |
| 75 | #define ESTCPUPPQ 512 | |
| 76 | #define ESTCPUMAX (ESTCPUPPQ * NQS) | |
| 77 | #define ESTCPUINCR (ESTCPUPPQ / ESTCPURAMP) | |
| 78 | #define PRIO_RANGE (PRIO_MAX - PRIO_MIN + 1) | |
| 79 | ||
| 80 | #define ESTCPULIM(v) min((v), ESTCPUMAX) | |
| 81 | ||
| 553ea3c8 | 82 | TAILQ_HEAD(rq, lwp); |
| 38b25931 | 83 | |
| 553ea3c8 SS |
84 | #define lwp_priority lwp_usdata.bsd4.priority |
| 85 | #define lwp_rqindex lwp_usdata.bsd4.rqindex | |
| 86 | #define lwp_origcpu lwp_usdata.bsd4.origcpu | |
| 87 | #define lwp_estcpu lwp_usdata.bsd4.estcpu | |
| 52eedfb5 | 88 | #define lwp_rqtype lwp_usdata.bsd4.rqtype |
| 38b25931 | 89 | |
| 553ea3c8 SS |
90 | static void bsd4_acquire_curproc(struct lwp *lp); |
| 91 | static void bsd4_release_curproc(struct lwp *lp); | |
| 38b25931 | 92 | static void bsd4_select_curproc(globaldata_t gd); |
| 553ea3c8 | 93 | static void bsd4_setrunqueue(struct lwp *lp); |
| 553ea3c8 | 94 | static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period, |
| 38b25931 | 95 | sysclock_t cpstamp); |
| 52eedfb5 | 96 | static void bsd4_recalculate_estcpu(struct lwp *lp); |
| 553ea3c8 SS |
97 | static void bsd4_resetpriority(struct lwp *lp); |
| 98 | static void bsd4_forking(struct lwp *plp, struct lwp *lp); | |
| 99 | static void bsd4_exiting(struct lwp *plp, struct lwp *lp); | |
| c3149361 | 100 | static void bsd4_yield(struct lwp *lp); |
| 38b25931 | 101 | |
| 52eedfb5 MD |
102 | #ifdef SMP |
| 103 | static void need_user_resched_remote(void *dummy); | |
| 104 | #endif | |
| 105 | static struct lwp *chooseproc_locked(struct lwp *chklp); | |
| 106 | static void bsd4_remrunqueue_locked(struct lwp *lp); | |
| 107 | static void bsd4_setrunqueue_locked(struct lwp *lp); | |
| 38b25931 MD |
108 | |
| 109 | struct usched usched_bsd4 = { | |
| 110 | { NULL }, | |
| 111 | "bsd4", "Original DragonFly Scheduler", | |
| cb7f4ab1 MD |
112 | NULL, /* default registration */ |
| 113 | NULL, /* default deregistration */ | |
| 38b25931 MD |
114 | bsd4_acquire_curproc, |
| 115 | bsd4_release_curproc, | |
| 38b25931 | 116 | bsd4_setrunqueue, |
| 38b25931 MD |
117 | bsd4_schedulerclock, |
| 118 | bsd4_recalculate_estcpu, | |
| 119 | bsd4_resetpriority, | |
| 120 | bsd4_forking, | |
| cb7f4ab1 | 121 | bsd4_exiting, |
| c3149361 MD |
122 | NULL, /* setcpumask not supported */ |
| 123 | bsd4_yield | |
| 38b25931 MD |
124 | }; |
| 125 | ||
| 52eedfb5 MD |
126 | struct usched_bsd4_pcpu { |
| 127 | struct thread helper_thread; | |
| 128 | short rrcount; | |
| 129 | short upri; | |
| 130 | struct lwp *uschedcp; | |
| 131 | }; | |
| 132 | ||
| 133 | typedef struct usched_bsd4_pcpu *bsd4_pcpu_t; | |
| 134 | ||
| 38b25931 MD |
135 | /* |
| 136 | * We have NQS (32) run queues per scheduling class. For the normal | |
| 137 | * class, there are 128 priorities scaled onto these 32 queues. New | |
| 138 | * processes are added to the last entry in each queue, and processes | |
| 139 | * are selected for running by taking them from the head and maintaining | |
| 140 | * a simple FIFO arrangement. Realtime and Idle priority processes have | |
| 141 | * and explicit 0-31 priority which maps directly onto their class queue | |
| 142 | * index. When a queue has something in it, the corresponding bit is | |
| 143 | * set in the queuebits variable, allowing a single read to determine | |
| 144 | * the state of all 32 queues and then a ffs() to find the first busy | |
| 145 | * queue. | |
| 146 | */ | |
| 52eedfb5 MD |
147 | static struct rq bsd4_queues[NQS]; |
| 148 | static struct rq bsd4_rtqueues[NQS]; | |
| 149 | static struct rq bsd4_idqueues[NQS]; | |
| 150 | static u_int32_t bsd4_queuebits; | |
| 151 | static u_int32_t bsd4_rtqueuebits; | |
| 152 | static u_int32_t bsd4_idqueuebits; | |
| 153 | static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */ | |
| 154 | static cpumask_t bsd4_rdyprocmask; /* ready to accept a user process */ | |
| 155 | static int bsd4_runqcount; | |
| 38b25931 | 156 | #ifdef SMP |
| 52eedfb5 | 157 | static volatile int bsd4_scancpu; |
| 38b25931 | 158 | #endif |
| 52eedfb5 MD |
159 | static struct spinlock bsd4_spin; |
| 160 | static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU]; | |
| 38b25931 | 161 | |
| 0c52fa62 SG |
162 | SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0, |
| 163 | "Number of run queues"); | |
| 38b25931 MD |
164 | #ifdef INVARIANTS |
| 165 | static int usched_nonoptimal; | |
| 166 | SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW, | |
| 167 | &usched_nonoptimal, 0, "acquire_curproc() was not optimal"); | |
| 168 | static int usched_optimal; | |
| 169 | SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW, | |
| 170 | &usched_optimal, 0, "acquire_curproc() was optimal"); | |
| 171 | #endif | |
| 172 | static int usched_debug = -1; | |
| 0c52fa62 SG |
173 | SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, |
| 174 | "Print debug information for this pid"); | |
| 38b25931 | 175 | #ifdef SMP |
| 38b25931 MD |
176 | static int remote_resched_nonaffinity; |
| 177 | static int remote_resched_affinity; | |
| 178 | static int choose_affinity; | |
| 38b25931 MD |
179 | SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD, |
| 180 | &remote_resched_nonaffinity, 0, "Number of remote rescheds"); | |
| 181 | SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD, | |
| 182 | &remote_resched_affinity, 0, "Number of remote rescheds"); | |
| 183 | SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD, | |
| 184 | &choose_affinity, 0, "chooseproc() was smart"); | |
| 185 | #endif | |
| 186 | ||
| 187 | static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10; | |
| 188 | SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW, | |
| 189 | &usched_bsd4_rrinterval, 0, ""); | |
| 190 | static int usched_bsd4_decay = ESTCPUINCR / 2; | |
| 191 | SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW, | |
| 192 | &usched_bsd4_decay, 0, ""); | |
| 193 | ||
| 194 | /* | |
| 195 | * Initialize the run queues at boot time. | |
| 196 | */ | |
| 197 | static void | |
| 198 | rqinit(void *dummy) | |
| 199 | { | |
| 200 | int i; | |
| 201 | ||
| 52eedfb5 | 202 | spin_init(&bsd4_spin); |
| 38b25931 | 203 | for (i = 0; i < NQS; i++) { |
| 52eedfb5 MD |
204 | TAILQ_INIT(&bsd4_queues[i]); |
| 205 | TAILQ_INIT(&bsd4_rtqueues[i]); | |
| 206 | TAILQ_INIT(&bsd4_idqueues[i]); | |
| 38b25931 | 207 | } |
| 52eedfb5 | 208 | atomic_clear_int(&bsd4_curprocmask, 1); |
| 38b25931 | 209 | } |
| ba39e2e0 | 210 | SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL) |
| 38b25931 MD |
211 | |
| 212 | /* | |
| 52eedfb5 | 213 | * BSD4_ACQUIRE_CURPROC |
| 38b25931 | 214 | * |
| 52eedfb5 MD |
215 | * This function is called when the kernel intends to return to userland. |
| 216 | * It is responsible for making the thread the current designated userland | |
| 217 | * thread for this cpu, blocking if necessary. | |
| 218 | * | |
| b9eb1c19 MD |
219 | * The kernel has already depressed our LWKT priority so we must not switch |
| 220 | * until we have either assigned or disposed of the thread. | |
| 52eedfb5 MD |
221 | * |
| 222 | * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE | |
| 223 | * TO ANOTHER CPU! Because most of the kernel assumes that no migration will | |
| 224 | * occur, this function is called only under very controlled circumstances. | |
| 225 | * | |
| 52eedfb5 | 226 | * MPSAFE |
| 38b25931 | 227 | */ |
| 52eedfb5 MD |
228 | static void |
| 229 | bsd4_acquire_curproc(struct lwp *lp) | |
| 38b25931 | 230 | { |
| b9eb1c19 MD |
231 | globaldata_t gd; |
| 232 | bsd4_pcpu_t dd; | |
| 233 | struct lwp *olp; | |
| 38b25931 | 234 | |
| b9eb1c19 MD |
235 | crit_enter(); |
| 236 | bsd4_recalculate_estcpu(lp); | |
| 38b25931 | 237 | |
| 38b25931 | 238 | /* |
| b9eb1c19 MD |
239 | * If a reschedule was requested give another thread the |
| 240 | * driver's seat. | |
| 38b25931 | 241 | */ |
| b9eb1c19 MD |
242 | if (user_resched_wanted()) { |
| 243 | clear_user_resched(); | |
| 244 | bsd4_release_curproc(lp); | |
| 38b25931 | 245 | } |
| 38b25931 | 246 | |
| 52eedfb5 | 247 | /* |
| b9eb1c19 | 248 | * Loop until we are the current user thread |
| 52eedfb5 MD |
249 | */ |
| 250 | do { | |
| b9eb1c19 MD |
251 | /* |
| 252 | * Reload after a switch or setrunqueue/switch possibly | |
| 253 | * moved us to another cpu. | |
| 254 | */ | |
| 255 | clear_lwkt_resched(); | |
| 52eedfb5 MD |
256 | gd = mycpu; |
| 257 | dd = &bsd4_pcpu[gd->gd_cpuid]; | |
| b9eb1c19 MD |
258 | |
| 259 | /* | |
| 260 | * Become the currently scheduled user thread for this cpu | |
| 261 | * if we can do so trivially. | |
| 262 | * | |
| 263 | * We can steal another thread's current thread designation | |
| 264 | * on this cpu since if we are running that other thread | |
| 265 | * must not be, so we can safely deschedule it. | |
| 266 | */ | |
| 267 | if (dd->uschedcp == lp) { | |
| 268 | dd->upri = lp->lwp_priority; | |
| 269 | } else if (dd->uschedcp == NULL) { | |
| 270 | atomic_set_int(&bsd4_curprocmask, gd->gd_cpumask); | |
| 271 | dd->uschedcp = lp; | |
| 272 | dd->upri = lp->lwp_priority; | |
| 273 | } else if (dd->upri > lp->lwp_priority) { | |
| 274 | olp = dd->uschedcp; | |
| 275 | dd->uschedcp = lp; | |
| 276 | dd->upri = lp->lwp_priority; | |
| 277 | lwkt_deschedule(olp->lwp_thread); | |
| 278 | bsd4_setrunqueue(olp); | |
| 279 | } else { | |
| 280 | lwkt_deschedule(lp->lwp_thread); | |
| 281 | bsd4_setrunqueue(lp); | |
| 282 | lwkt_switch(); | |
| 283 | } | |
| 284 | ||
| 285 | /* | |
| 286 | * Other threads at our current user priority have already | |
| 287 | * put in their bids, but we must run any kernel threads | |
| 288 | * at higher priorities, and we could lose our bid to | |
| 289 | * another thread trying to return to user mode in the | |
| 290 | * process. | |
| 291 | * | |
| 292 | * If we lose our bid we will be descheduled and put on | |
| 293 | * the run queue. When we are reactivated we will have | |
| 294 | * another chance. | |
| 295 | */ | |
| 77912481 | 296 | lwkt_switch(); |
| 52eedfb5 | 297 | } while (dd->uschedcp != lp); |
| b9eb1c19 MD |
298 | |
| 299 | crit_exit(); | |
| 9388413d | 300 | KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0); |
| 52eedfb5 MD |
301 | } |
| 302 | ||
| 303 | /* | |
| 304 | * BSD4_RELEASE_CURPROC | |
| 305 | * | |
| 306 | * This routine detaches the current thread from the userland scheduler, | |
| b9eb1c19 MD |
307 | * usually because the thread needs to run or block in the kernel (at |
| 308 | * kernel priority) for a while. | |
| 52eedfb5 MD |
309 | * |
| 310 | * This routine is also responsible for selecting a new thread to | |
| 311 | * make the current thread. | |
| 312 | * | |
| 313 | * NOTE: This implementation differs from the dummy example in that | |
| 314 | * bsd4_select_curproc() is able to select the current process, whereas | |
| 315 | * dummy_select_curproc() is not able to select the current process. | |
| 316 | * This means we have to NULL out uschedcp. | |
| 317 | * | |
| 318 | * Additionally, note that we may already be on a run queue if releasing | |
| 319 | * via the lwkt_switch() in bsd4_setrunqueue(). | |
| 320 | * | |
| 321 | * WARNING! The MP lock may be in an unsynchronized state due to the | |
| 322 | * way get_mplock() works and the fact that this function may be called | |
| 323 | * from a passive release during a lwkt_switch(). try_mplock() will deal | |
| 324 | * with this for us but you should be aware that td_mpcount may not be | |
| 325 | * useable. | |
| 326 | * | |
| 327 | * MPSAFE | |
| 328 | */ | |
| 329 | static void | |
| 330 | bsd4_release_curproc(struct lwp *lp) | |
| 331 | { | |
| 332 | globaldata_t gd = mycpu; | |
| 333 | bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; | |
| 334 | ||
| 335 | if (dd->uschedcp == lp) { | |
| b9eb1c19 | 336 | crit_enter(); |
| 9388413d | 337 | KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0); |
| 52eedfb5 | 338 | dd->uschedcp = NULL; /* don't let lp be selected */ |
| b9eb1c19 MD |
339 | dd->upri = PRIBASE_NULL; |
| 340 | atomic_clear_int(&bsd4_curprocmask, gd->gd_cpumask); | |
| 52eedfb5 | 341 | bsd4_select_curproc(gd); |
| b9eb1c19 | 342 | crit_exit(); |
| 52eedfb5 | 343 | } |
| 38b25931 MD |
344 | } |
| 345 | ||
| 38b25931 | 346 | /* |
| 52eedfb5 MD |
347 | * BSD4_SELECT_CURPROC |
| 348 | * | |
| b9eb1c19 MD |
349 | * Select a new current process for this cpu and clear any pending user |
| 350 | * reschedule request. The cpu currently has no current process. | |
| 52eedfb5 MD |
351 | * |
| 352 | * This routine is also responsible for equal-priority round-robining, | |
| 353 | * typically triggered from bsd4_schedulerclock(). In our dummy example | |
| 354 | * all the 'user' threads are LWKT scheduled all at once and we just | |
| 355 | * call lwkt_switch(). | |
| 356 | * | |
| b9eb1c19 MD |
357 | * The calling process is not on the queue and cannot be selected. |
| 358 | * | |
| 52eedfb5 | 359 | * MPSAFE |
| 38b25931 MD |
360 | */ |
| 361 | static | |
| 362 | void | |
| 52eedfb5 | 363 | bsd4_select_curproc(globaldata_t gd) |
| 38b25931 | 364 | { |
| 52eedfb5 MD |
365 | bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; |
| 366 | struct lwp *nlp; | |
| 367 | int cpuid = gd->gd_cpuid; | |
| 38b25931 | 368 | |
| 52eedfb5 | 369 | crit_enter_gd(gd); |
| 52eedfb5 | 370 | |
| 287a8577 | 371 | spin_lock(&bsd4_spin); |
| 52eedfb5 MD |
372 | if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) { |
| 373 | atomic_set_int(&bsd4_curprocmask, 1 << cpuid); | |
| 374 | dd->upri = nlp->lwp_priority; | |
| 375 | dd->uschedcp = nlp; | |
| 287a8577 | 376 | spin_unlock(&bsd4_spin); |
| 52eedfb5 MD |
377 | #ifdef SMP |
| 378 | lwkt_acquire(nlp->lwp_thread); | |
| 38b25931 | 379 | #endif |
| 52eedfb5 | 380 | lwkt_schedule(nlp->lwp_thread); |
| 52eedfb5 | 381 | } else if (bsd4_runqcount && (bsd4_rdyprocmask & (1 << cpuid))) { |
| 52eedfb5 | 382 | atomic_clear_int(&bsd4_rdyprocmask, 1 << cpuid); |
| 287a8577 | 383 | spin_unlock(&bsd4_spin); |
| 52eedfb5 MD |
384 | lwkt_schedule(&dd->helper_thread); |
| 385 | } else { | |
| 287a8577 | 386 | spin_unlock(&bsd4_spin); |
| 52eedfb5 MD |
387 | } |
| 388 | crit_exit_gd(gd); | |
| 389 | } | |
| 38b25931 MD |
390 | |
| 391 | /* | |
| 52eedfb5 MD |
392 | * BSD4_SETRUNQUEUE |
| 393 | * | |
| b9eb1c19 MD |
394 | * Place the specified lwp on the user scheduler's run queue. This routine |
| 395 | * must be called with the thread descheduled. The lwp must be runnable. | |
| 38b25931 | 396 | * |
| b9eb1c19 | 397 | * The thread may be the current thread as a special case. |
| 52eedfb5 MD |
398 | * |
| 399 | * MPSAFE | |
| 38b25931 MD |
400 | */ |
| 401 | static void | |
| 553ea3c8 | 402 | bsd4_setrunqueue(struct lwp *lp) |
| 38b25931 | 403 | { |
| 52eedfb5 MD |
404 | globaldata_t gd; |
| 405 | bsd4_pcpu_t dd; | |
| 38b25931 | 406 | #ifdef SMP |
| b9eb1c19 | 407 | int cpuid; |
| 38b25931 | 408 | cpumask_t mask; |
| 52eedfb5 | 409 | cpumask_t tmpmask; |
| 38b25931 MD |
410 | #endif |
| 411 | ||
| 52eedfb5 MD |
412 | /* |
| 413 | * First validate the process state relative to the current cpu. | |
| 414 | * We don't need the spinlock for this, just a critical section. | |
| 415 | * We are in control of the process. | |
| 416 | */ | |
| 38b25931 | 417 | crit_enter(); |
| 164b8401 | 418 | KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN")); |
| 9388413d | 419 | KASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0, |
| 164b8401 SS |
420 | ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid, |
| 421 | lp->lwp_tid, lp->lwp_proc->p_flag, lp->lwp_flag)); | |
| 553ea3c8 | 422 | KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0); |
| 38b25931 MD |
423 | |
| 424 | /* | |
| 52eedfb5 MD |
425 | * Note: gd and dd are relative to the target thread's last cpu, |
| 426 | * NOT our current cpu. | |
| 38b25931 | 427 | */ |
| 553ea3c8 | 428 | gd = lp->lwp_thread->td_gd; |
| 52eedfb5 | 429 | dd = &bsd4_pcpu[gd->gd_cpuid]; |
| 38b25931 MD |
430 | |
| 431 | /* | |
| 52eedfb5 MD |
432 | * This process is not supposed to be scheduled anywhere or assigned |
| 433 | * as the current process anywhere. Assert the condition. | |
| 38b25931 | 434 | */ |
| 52eedfb5 | 435 | KKASSERT(dd->uschedcp != lp); |
| 38b25931 | 436 | |
| b9eb1c19 | 437 | #ifndef SMP |
| 38b25931 | 438 | /* |
| b9eb1c19 MD |
439 | * If we are not SMP we do not have a scheduler helper to kick |
| 440 | * and must directly activate the process if none are scheduled. | |
| 38b25931 | 441 | * |
| b9eb1c19 MD |
442 | * This is really only an issue when bootstrapping init since |
| 443 | * the caller in all other cases will be a user process, and | |
| 444 | * even if released (dd->uschedcp == NULL), that process will | |
| 445 | * kickstart the scheduler when it returns to user mode from | |
| 446 | * the kernel. | |
| 38b25931 | 447 | */ |
| b9eb1c19 MD |
448 | if (dd->uschedcp == NULL) { |
| 449 | atomic_set_int(&bsd4_curprocmask, gd->gd_cpumask); | |
| 52eedfb5 MD |
450 | dd->uschedcp = lp; |
| 451 | dd->upri = lp->lwp_priority; | |
| 553ea3c8 | 452 | lwkt_schedule(lp->lwp_thread); |
| 38b25931 | 453 | crit_exit(); |
| 38b25931 MD |
454 | return; |
| 455 | } | |
| b9eb1c19 | 456 | #endif |
| 38b25931 | 457 | |
| 38b25931 MD |
458 | #ifdef SMP |
| 459 | /* | |
| 52eedfb5 MD |
460 | * XXX fixme. Could be part of a remrunqueue/setrunqueue |
| 461 | * operation when the priority is recalculated, so TDF_MIGRATING | |
| 462 | * may already be set. | |
| 38b25931 | 463 | */ |
| 52eedfb5 MD |
464 | if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0) |
| 465 | lwkt_giveaway(lp->lwp_thread); | |
| 466 | #endif | |
| 50017724 MD |
467 | |
| 468 | /* | |
| 469 | * We lose control of lp the moment we release the spinlock after | |
| 470 | * having placed lp on the queue. i.e. another cpu could pick it | |
| 471 | * up and it could exit, or its priority could be further adjusted, | |
| 472 | * or something like that. | |
| 473 | */ | |
| 287a8577 | 474 | spin_lock(&bsd4_spin); |
| 52eedfb5 | 475 | bsd4_setrunqueue_locked(lp); |
| 38b25931 | 476 | |
| b9eb1c19 | 477 | #ifdef SMP |
| 38b25931 | 478 | /* |
| b9eb1c19 MD |
479 | * Kick the scheduler helper on one of the other cpu's |
| 480 | * and request a reschedule if appropriate. | |
| 38b25931 | 481 | */ |
| b9eb1c19 MD |
482 | cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; |
| 483 | ++bsd4_scancpu; | |
| 484 | mask = ~bsd4_curprocmask & bsd4_rdyprocmask & | |
| 485 | lp->lwp_cpumask & smp_active_mask; | |
| 287a8577 | 486 | spin_unlock(&bsd4_spin); |
| 38b25931 | 487 | |
| b9eb1c19 | 488 | while (mask) { |
| 52eedfb5 MD |
489 | tmpmask = ~((1 << cpuid) - 1); |
| 490 | if (mask & tmpmask) | |
| 491 | cpuid = bsfl(mask & tmpmask); | |
| 492 | else | |
| 493 | cpuid = bsfl(mask); | |
| b9eb1c19 MD |
494 | gd = globaldata_find(cpuid); |
| 495 | dd = &bsd4_pcpu[cpuid]; | |
| 496 | ||
| 497 | if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { | |
| 498 | if (gd == mycpu) | |
| 499 | need_user_resched_remote(NULL); | |
| 500 | else | |
| 501 | lwkt_send_ipiq(gd, need_user_resched_remote, NULL); | |
| 502 | break; | |
| 503 | } | |
| 504 | mask &= ~(1 << cpuid); | |
| 505 | } | |
| 506 | #else | |
| 507 | /* | |
| 508 | * Request a reschedule if appropriate. | |
| 509 | */ | |
| 287a8577 | 510 | spin_unlock(&bsd4_spin); |
| b9eb1c19 MD |
511 | if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { |
| 512 | need_user_resched(); | |
| 38b25931 MD |
513 | } |
| 514 | #endif | |
| 515 | crit_exit(); | |
| 516 | } | |
| 517 | ||
| 518 | /* | |
| 38b25931 | 519 | * This routine is called from a systimer IPI. It MUST be MP-safe and |
| 52eedfb5 MD |
520 | * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on |
| 521 | * each cpu. | |
| 522 | * | |
| 270ac911 | 523 | * MPSAFE |
| 38b25931 MD |
524 | */ |
| 525 | static | |
| 526 | void | |
| 553ea3c8 | 527 | bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) |
| 38b25931 MD |
528 | { |
| 529 | globaldata_t gd = mycpu; | |
| 52eedfb5 | 530 | bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; |
| 38b25931 MD |
531 | |
| 532 | /* | |
| 533 | * Do we need to round-robin? We round-robin 10 times a second. | |
| 534 | * This should only occur for cpu-bound batch processes. | |
| 535 | */ | |
| 52eedfb5 MD |
536 | if (++dd->rrcount >= usched_bsd4_rrinterval) { |
| 537 | dd->rrcount = 0; | |
| 38b25931 MD |
538 | need_user_resched(); |
| 539 | } | |
| 540 | ||
| 541 | /* | |
| 542 | * As the process accumulates cpu time p_estcpu is bumped and may | |
| 543 | * push the process into another scheduling queue. It typically | |
| 544 | * takes 4 ticks to bump the queue. | |
| 545 | */ | |
| 553ea3c8 | 546 | lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR); |
| 38b25931 MD |
547 | |
| 548 | /* | |
| 549 | * Reducing p_origcpu over time causes more of our estcpu to be | |
| 550 | * returned to the parent when we exit. This is a small tweak | |
| 551 | * for the batch detection heuristic. | |
| 552 | */ | |
| 553ea3c8 SS |
553 | if (lp->lwp_origcpu) |
| 554 | --lp->lwp_origcpu; | |
| 50017724 MD |
555 | |
| 556 | /* | |
| 77912481 MD |
557 | * Spinlocks also hold a critical section so there should not be |
| 558 | * any active. | |
| 50017724 | 559 | */ |
| 77912481 MD |
560 | KKASSERT(gd->gd_spinlocks_wr == 0); |
| 561 | ||
| 562 | bsd4_resetpriority(lp); | |
| 563 | #if 0 | |
| 564 | /* | |
| 565 | * if we can't call bsd4_resetpriority for some reason we must call | |
| 566 | * need user_resched(). | |
| 567 | */ | |
| 568 | need_user_resched(); | |
| 569 | #endif | |
| 38b25931 MD |
570 | } |
| 571 | ||
| 572 | /* | |
| 52eedfb5 MD |
573 | * Called from acquire and from kern_synch's one-second timer (one of the |
| 574 | * callout helper threads) with a critical section held. | |
| 38b25931 | 575 | * |
| 52eedfb5 MD |
576 | * Decay p_estcpu based on the number of ticks we haven't been running |
| 577 | * and our p_nice. As the load increases each process observes a larger | |
| 578 | * number of idle ticks (because other processes are running in them). | |
| 579 | * This observation leads to a larger correction which tends to make the | |
| 580 | * system more 'batchy'. | |
| 38b25931 | 581 | * |
| 52eedfb5 MD |
582 | * Note that no recalculation occurs for a process which sleeps and wakes |
| 583 | * up in the same tick. That is, a system doing thousands of context | |
| 584 | * switches per second will still only do serious estcpu calculations | |
| 585 | * ESTCPUFREQ times per second. | |
| 38b25931 | 586 | * |
| 52eedfb5 | 587 | * MPSAFE |
| 38b25931 MD |
588 | */ |
| 589 | static | |
| 52eedfb5 MD |
590 | void |
| 591 | bsd4_recalculate_estcpu(struct lwp *lp) | |
| 38b25931 | 592 | { |
| 52eedfb5 MD |
593 | globaldata_t gd = mycpu; |
| 594 | sysclock_t cpbase; | |
| 595 | int loadfac; | |
| 596 | int ndecay; | |
| 597 | int nticks; | |
| 598 | int nleft; | |
| 38b25931 MD |
599 | |
| 600 | /* | |
| 52eedfb5 MD |
601 | * We have to subtract periodic to get the last schedclock |
| 602 | * timeout time, otherwise we would get the upcoming timeout. | |
| 603 | * Keep in mind that a process can migrate between cpus and | |
| 604 | * while the scheduler clock should be very close, boundary | |
| 605 | * conditions could lead to a small negative delta. | |
| 38b25931 | 606 | */ |
| 52eedfb5 | 607 | cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic; |
| 38b25931 | 608 | |
| 52eedfb5 MD |
609 | if (lp->lwp_slptime > 1) { |
| 610 | /* | |
| 611 | * Too much time has passed, do a coarse correction. | |
| 612 | */ | |
| 613 | lp->lwp_estcpu = lp->lwp_estcpu >> 1; | |
| 614 | bsd4_resetpriority(lp); | |
| 615 | lp->lwp_cpbase = cpbase; | |
| 616 | lp->lwp_cpticks = 0; | |
| 617 | } else if (lp->lwp_cpbase != cpbase) { | |
| 618 | /* | |
| 619 | * Adjust estcpu if we are in a different tick. Don't waste | |
| 620 | * time if we are in the same tick. | |
| 621 | * | |
| 622 | * First calculate the number of ticks in the measurement | |
| 623 | * interval. The nticks calculation can wind up 0 due to | |
| 624 | * a bug in the handling of lwp_slptime (as yet not found), | |
| 625 | * so make sure we do not get a divide by 0 panic. | |
| 626 | */ | |
| 627 | nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic; | |
| 628 | if (nticks <= 0) | |
| 629 | nticks = 1; | |
| 630 | updatepcpu(lp, lp->lwp_cpticks, nticks); | |
| 38b25931 | 631 | |
| 52eedfb5 MD |
632 | if ((nleft = nticks - lp->lwp_cpticks) < 0) |
| 633 | nleft = 0; | |
| 634 | if (usched_debug == lp->lwp_proc->p_pid) { | |
| 6ea70f76 | 635 | kprintf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d", |
| 52eedfb5 MD |
636 | lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu, |
| 637 | lp->lwp_cpticks, nticks, nleft); | |
| 638 | } | |
| 38b25931 | 639 | |
| 52eedfb5 MD |
640 | /* |
| 641 | * Calculate a decay value based on ticks remaining scaled | |
| 642 | * down by the instantanious load and p_nice. | |
| 643 | */ | |
| 644 | if ((loadfac = bsd4_runqcount) < 2) | |
| 645 | loadfac = 2; | |
| 646 | ndecay = nleft * usched_bsd4_decay * 2 * | |
| 647 | (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2); | |
| 38b25931 | 648 | |
| 52eedfb5 MD |
649 | /* |
| 650 | * Adjust p_estcpu. Handle a border case where batch jobs | |
| 651 | * can get stalled long enough to decay to zero when they | |
| 652 | * shouldn't. | |
| 653 | */ | |
| 654 | if (lp->lwp_estcpu > ndecay * 2) | |
| 655 | lp->lwp_estcpu -= ndecay; | |
| 656 | else | |
| 657 | lp->lwp_estcpu >>= 1; | |
| 344ad853 | 658 | |
| 52eedfb5 | 659 | if (usched_debug == lp->lwp_proc->p_pid) |
| 6ea70f76 | 660 | kprintf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu); |
| 52eedfb5 MD |
661 | bsd4_resetpriority(lp); |
| 662 | lp->lwp_cpbase = cpbase; | |
| 663 | lp->lwp_cpticks = 0; | |
| 664 | } | |
| 38b25931 MD |
665 | } |
| 666 | ||
| 667 | /* | |
| 668 | * Compute the priority of a process when running in user mode. | |
| 669 | * Arrange to reschedule if the resulting priority is better | |
| 670 | * than that of the current process. | |
| 52eedfb5 MD |
671 | * |
| 672 | * This routine may be called with any process. | |
| 673 | * | |
| 674 | * This routine is called by fork1() for initial setup with the process | |
| 675 | * of the run queue, and also may be called normally with the process on or | |
| 676 | * off the run queue. | |
| 677 | * | |
| 678 | * MPSAFE | |
| 38b25931 MD |
679 | */ |
| 680 | static void | |
| 553ea3c8 | 681 | bsd4_resetpriority(struct lwp *lp) |
| 38b25931 | 682 | { |
| 52eedfb5 | 683 | bsd4_pcpu_t dd; |
| 38b25931 | 684 | int newpriority; |
| 52eedfb5 MD |
685 | u_short newrqtype; |
| 686 | int reschedcpu; | |
| 270ac911 | 687 | |
| 38b25931 | 688 | /* |
| 52eedfb5 | 689 | * Calculate the new priority and queue type |
| 38b25931 | 690 | */ |
| 52eedfb5 | 691 | crit_enter(); |
| 287a8577 | 692 | spin_lock(&bsd4_spin); |
| 52eedfb5 MD |
693 | |
| 694 | newrqtype = lp->lwp_rtprio.type; | |
| 695 | ||
| 696 | switch(newrqtype) { | |
| 38b25931 | 697 | case RTP_PRIO_REALTIME: |
| f64250e0 | 698 | case RTP_PRIO_FIFO: |
| 52eedfb5 MD |
699 | newpriority = PRIBASE_REALTIME + |
| 700 | (lp->lwp_rtprio.prio & PRIMASK); | |
| 701 | break; | |
| 38b25931 | 702 | case RTP_PRIO_NORMAL: |
| 52eedfb5 MD |
703 | newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ; |
| 704 | newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ; | |
| 705 | newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ / | |
| 706 | NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ); | |
| 707 | newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK); | |
| 38b25931 MD |
708 | break; |
| 709 | case RTP_PRIO_IDLE: | |
| 52eedfb5 MD |
710 | newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK); |
| 711 | break; | |
| 38b25931 | 712 | case RTP_PRIO_THREAD: |
| 52eedfb5 MD |
713 | newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK); |
| 714 | break; | |
| 715 | default: | |
| 716 | panic("Bad RTP_PRIO %d", newrqtype); | |
| 717 | /* NOT REACHED */ | |
| 38b25931 MD |
718 | } |
| 719 | ||
| 720 | /* | |
| 52eedfb5 MD |
721 | * The newpriority incorporates the queue type so do a simple masked |
| 722 | * check to determine if the process has moved to another queue. If | |
| 723 | * it has, and it is currently on a run queue, then move it. | |
| 38b25931 | 724 | */ |
| 52eedfb5 MD |
725 | if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) { |
| 726 | lp->lwp_priority = newpriority; | |
| 9388413d | 727 | if (lp->lwp_flag & LWP_ONRUNQ) { |
| 52eedfb5 MD |
728 | bsd4_remrunqueue_locked(lp); |
| 729 | lp->lwp_rqtype = newrqtype; | |
| 730 | lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; | |
| 731 | bsd4_setrunqueue_locked(lp); | |
| 732 | reschedcpu = lp->lwp_thread->td_gd->gd_cpuid; | |
| 733 | } else { | |
| 734 | lp->lwp_rqtype = newrqtype; | |
| 735 | lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; | |
| 736 | reschedcpu = -1; | |
| 737 | } | |
| 38b25931 | 738 | } else { |
| 52eedfb5 MD |
739 | lp->lwp_priority = newpriority; |
| 740 | reschedcpu = -1; | |
| 741 | } | |
| 287a8577 | 742 | spin_unlock(&bsd4_spin); |
| 52eedfb5 MD |
743 | |
| 744 | /* | |
| 50017724 MD |
745 | * Determine if we need to reschedule the target cpu. This only |
| 746 | * occurs if the LWP is already on a scheduler queue, which means | |
| 747 | * that idle cpu notification has already occured. At most we | |
| 748 | * need only issue a need_user_resched() on the appropriate cpu. | |
| 281b4fa8 YT |
749 | * |
| 750 | * The LWP may be owned by a CPU different from the current one, | |
| 751 | * in which case dd->uschedcp may be modified without an MP lock | |
| 752 | * or a spinlock held. The worst that happens is that the code | |
| 753 | * below causes a spurious need_user_resched() on the target CPU | |
| 754 | * and dd->pri to be wrong for a short period of time, both of | |
| 755 | * which are harmless. | |
| 52eedfb5 MD |
756 | */ |
| 757 | if (reschedcpu >= 0) { | |
| 758 | dd = &bsd4_pcpu[reschedcpu]; | |
| 50017724 | 759 | if ((dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK)) { |
| 52eedfb5 | 760 | dd->upri = lp->lwp_priority; |
| 52eedfb5 MD |
761 | #ifdef SMP |
| 762 | if (reschedcpu == mycpu->gd_cpuid) { | |
| 763 | need_user_resched(); | |
| 764 | } else { | |
| 765 | lwkt_send_ipiq(lp->lwp_thread->td_gd, | |
| 766 | need_user_resched_remote, NULL); | |
| 767 | } | |
| 768 | #else | |
| 769 | need_user_resched(); | |
| 770 | #endif | |
| 771 | } | |
| 38b25931 MD |
772 | } |
| 773 | crit_exit(); | |
| 774 | } | |
| 775 | ||
| 3919ced0 MD |
776 | /* |
| 777 | * MPSAFE | |
| 778 | */ | |
| c3149361 MD |
779 | static |
| 780 | void | |
| 781 | bsd4_yield(struct lwp *lp) | |
| 782 | { | |
| 783 | #if 0 | |
| 784 | /* FUTURE (or something similar) */ | |
| 785 | switch(lp->lwp_rqtype) { | |
| 786 | case RTP_PRIO_NORMAL: | |
| 787 | lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR); | |
| c3149361 MD |
788 | break; |
| 789 | default: | |
| 790 | break; | |
| 791 | } | |
| 792 | #endif | |
| 793 | need_user_resched(); | |
| 794 | } | |
| 795 | ||
| 38b25931 MD |
796 | /* |
| 797 | * Called from fork1() when a new child process is being created. | |
| 798 | * | |
| 799 | * Give the child process an initial estcpu that is more batch then | |
| 800 | * its parent and dock the parent for the fork (but do not | |
| 801 | * reschedule the parent). This comprises the main part of our batch | |
| 802 | * detection heuristic for both parallel forking and sequential execs. | |
| 803 | * | |
| 804 | * Interactive processes will decay the boosted estcpu quickly while batch | |
| 805 | * processes will tend to compound it. | |
| 553ea3c8 | 806 | * XXX lwp should be "spawning" instead of "forking" |
| 270ac911 MD |
807 | * |
| 808 | * MPSAFE | |
| 38b25931 MD |
809 | */ |
| 810 | static void | |
| 553ea3c8 | 811 | bsd4_forking(struct lwp *plp, struct lwp *lp) |
| 38b25931 | 812 | { |
| 553ea3c8 SS |
813 | lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ); |
| 814 | lp->lwp_origcpu = lp->lwp_estcpu; | |
| 815 | plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ); | |
| 38b25931 MD |
816 | } |
| 817 | ||
| 818 | /* | |
| 819 | * Called when the parent reaps a child. Propogate cpu use by the child | |
| 820 | * back to the parent. | |
| 270ac911 MD |
821 | * |
| 822 | * MPSAFE | |
| 38b25931 MD |
823 | */ |
| 824 | static void | |
| 553ea3c8 | 825 | bsd4_exiting(struct lwp *plp, struct lwp *lp) |
| 38b25931 MD |
826 | { |
| 827 | int delta; | |
| 828 | ||
| 553ea3c8 SS |
829 | if (plp->lwp_proc->p_pid != 1) { |
| 830 | delta = lp->lwp_estcpu - lp->lwp_origcpu; | |
| 38b25931 | 831 | if (delta > 0) |
| 553ea3c8 | 832 | plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta); |
| 38b25931 MD |
833 | } |
| 834 | } | |
| 835 | ||
| 52eedfb5 | 836 | |
| 38b25931 | 837 | /* |
| 52eedfb5 MD |
838 | * chooseproc() is called when a cpu needs a user process to LWKT schedule, |
| 839 | * it selects a user process and returns it. If chklp is non-NULL and chklp | |
| 840 | * has a better or equal priority then the process that would otherwise be | |
| 841 | * chosen, NULL is returned. | |
| 38b25931 | 842 | * |
| 52eedfb5 MD |
843 | * Until we fix the RUNQ code the chklp test has to be strict or we may |
| 844 | * bounce between processes trying to acquire the current process designation. | |
| 38b25931 | 845 | * |
| 52eedfb5 MD |
846 | * MPSAFE - must be called with bsd4_spin exclusive held. The spinlock is |
| 847 | * left intact through the entire routine. | |
| 38b25931 MD |
848 | */ |
| 849 | static | |
| 52eedfb5 MD |
850 | struct lwp * |
| 851 | chooseproc_locked(struct lwp *chklp) | |
| 38b25931 | 852 | { |
| 52eedfb5 MD |
853 | struct lwp *lp; |
| 854 | struct rq *q; | |
| a60ccb85 | 855 | u_int32_t *which, *which2; |
| 52eedfb5 | 856 | u_int32_t pri; |
| a60ccb85 DX |
857 | u_int32_t rtqbits; |
| 858 | u_int32_t tsqbits; | |
| 859 | u_int32_t idqbits; | |
| 860 | cpumask_t cpumask; | |
| 38b25931 | 861 | |
| a60ccb85 DX |
862 | rtqbits = bsd4_rtqueuebits; |
| 863 | tsqbits = bsd4_queuebits; | |
| 864 | idqbits = bsd4_idqueuebits; | |
| 865 | cpumask = mycpu->gd_cpumask; | |
| 866 | ||
| 867 | #ifdef SMP | |
| 868 | again: | |
| 869 | #endif | |
| 870 | if (rtqbits) { | |
| 871 | pri = bsfl(rtqbits); | |
| 52eedfb5 MD |
872 | q = &bsd4_rtqueues[pri]; |
| 873 | which = &bsd4_rtqueuebits; | |
| a60ccb85 DX |
874 | which2 = &rtqbits; |
| 875 | } else if (tsqbits) { | |
| 876 | pri = bsfl(tsqbits); | |
| 52eedfb5 MD |
877 | q = &bsd4_queues[pri]; |
| 878 | which = &bsd4_queuebits; | |
| a60ccb85 DX |
879 | which2 = &tsqbits; |
| 880 | } else if (idqbits) { | |
| 881 | pri = bsfl(idqbits); | |
| 52eedfb5 MD |
882 | q = &bsd4_idqueues[pri]; |
| 883 | which = &bsd4_idqueuebits; | |
| a60ccb85 | 884 | which2 = &idqbits; |
| 52eedfb5 MD |
885 | } else { |
| 886 | return NULL; | |
| 887 | } | |
| 888 | lp = TAILQ_FIRST(q); | |
| 889 | KASSERT(lp, ("chooseproc: no lwp on busy queue")); | |
| 270ac911 | 890 | |
| a60ccb85 DX |
891 | #ifdef SMP |
| 892 | while ((lp->lwp_cpumask & cpumask) == 0) { | |
| 893 | lp = TAILQ_NEXT(lp, lwp_procq); | |
| 894 | if (lp == NULL) { | |
| 895 | *which2 &= ~(1 << pri); | |
| 896 | goto again; | |
| 897 | } | |
| 898 | } | |
| 899 | #endif | |
| 900 | ||
| 38b25931 | 901 | /* |
| 52eedfb5 MD |
902 | * If the passed lwp <chklp> is reasonably close to the selected |
| 903 | * lwp <lp>, return NULL (indicating that <chklp> should be kept). | |
| 904 | * | |
| 905 | * Note that we must error on the side of <chklp> to avoid bouncing | |
| 906 | * between threads in the acquire code. | |
| 38b25931 | 907 | */ |
| 52eedfb5 MD |
908 | if (chklp) { |
| 909 | if (chklp->lwp_priority < lp->lwp_priority + PPQ) | |
| 910 | return(NULL); | |
| 911 | } | |
| 38b25931 | 912 | |
| 52eedfb5 MD |
913 | #ifdef SMP |
| 914 | /* | |
| 915 | * If the chosen lwp does not reside on this cpu spend a few | |
| 916 | * cycles looking for a better candidate at the same priority level. | |
| 917 | * This is a fallback check, setrunqueue() tries to wakeup the | |
| 918 | * correct cpu and is our front-line affinity. | |
| 919 | */ | |
| 920 | if (lp->lwp_thread->td_gd != mycpu && | |
| 921 | (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL | |
| 922 | ) { | |
| 923 | if (chklp->lwp_thread->td_gd == mycpu) { | |
| 924 | ++choose_affinity; | |
| 925 | lp = chklp; | |
| 38b25931 | 926 | } |
| 52eedfb5 MD |
927 | } |
| 928 | #endif | |
| 38b25931 | 929 | |
| 52eedfb5 MD |
930 | TAILQ_REMOVE(q, lp, lwp_procq); |
| 931 | --bsd4_runqcount; | |
| 932 | if (TAILQ_EMPTY(q)) | |
| 933 | *which &= ~(1 << pri); | |
| 9388413d SS |
934 | KASSERT((lp->lwp_flag & LWP_ONRUNQ) != 0, ("not on runq6!")); |
| 935 | lp->lwp_flag &= ~LWP_ONRUNQ; | |
| 52eedfb5 MD |
936 | return lp; |
| 937 | } | |
| 38b25931 | 938 | |
| 52eedfb5 | 939 | #ifdef SMP |
| b9eb1c19 | 940 | |
| 52eedfb5 | 941 | /* |
| b9eb1c19 MD |
942 | * Called via an ipi message to reschedule on another cpu. If no |
| 943 | * user thread is active on the target cpu we wake the scheduler | |
| 944 | * helper thread up to help schedule one. | |
| 52eedfb5 MD |
945 | * |
| 946 | * MPSAFE | |
| 947 | */ | |
| 948 | static | |
| 949 | void | |
| 950 | need_user_resched_remote(void *dummy) | |
| 951 | { | |
| b9eb1c19 MD |
952 | globaldata_t gd = mycpu; |
| 953 | bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; | |
| 954 | ||
| 955 | if (dd->uschedcp == NULL && (bsd4_rdyprocmask & gd->gd_cpumask)) { | |
| 956 | atomic_clear_int(&bsd4_rdyprocmask, gd->gd_cpumask); | |
| 957 | lwkt_schedule(&dd->helper_thread); | |
| 958 | } else { | |
| 959 | need_user_resched(); | |
| 960 | } | |
| 52eedfb5 | 961 | } |
| 38b25931 | 962 | |
| 52eedfb5 | 963 | #endif |
| 38b25931 | 964 | |
| 52eedfb5 MD |
965 | /* |
| 966 | * bsd4_remrunqueue_locked() removes a given process from the run queue | |
| 967 | * that it is on, clearing the queue busy bit if it becomes empty. | |
| 968 | * | |
| 969 | * Note that user process scheduler is different from the LWKT schedule. | |
| 970 | * The user process scheduler only manages user processes but it uses LWKT | |
| 971 | * underneath, and a user process operating in the kernel will often be | |
| 972 | * 'released' from our management. | |
| 973 | * | |
| 974 | * MPSAFE - bsd4_spin must be held exclusively on call | |
| 975 | */ | |
| 976 | static void | |
| 977 | bsd4_remrunqueue_locked(struct lwp *lp) | |
| 978 | { | |
| 979 | struct rq *q; | |
| 980 | u_int32_t *which; | |
| 981 | u_int8_t pri; | |
| 982 | ||
| 9388413d SS |
983 | KKASSERT(lp->lwp_flag & LWP_ONRUNQ); |
| 984 | lp->lwp_flag &= ~LWP_ONRUNQ; | |
| 52eedfb5 MD |
985 | --bsd4_runqcount; |
| 986 | KKASSERT(bsd4_runqcount >= 0); | |
| 987 | ||
| 988 | pri = lp->lwp_rqindex; | |
| 989 | switch(lp->lwp_rqtype) { | |
| 990 | case RTP_PRIO_NORMAL: | |
| 991 | q = &bsd4_queues[pri]; | |
| 992 | which = &bsd4_queuebits; | |
| 993 | break; | |
| 994 | case RTP_PRIO_REALTIME: | |
| 995 | case RTP_PRIO_FIFO: | |
| 996 | q = &bsd4_rtqueues[pri]; | |
| 997 | which = &bsd4_rtqueuebits; | |
| 998 | break; | |
| 999 | case RTP_PRIO_IDLE: | |
| 1000 | q = &bsd4_idqueues[pri]; | |
| 1001 | which = &bsd4_idqueuebits; | |
| 1002 | break; | |
| 1003 | default: | |
| 1004 | panic("remrunqueue: invalid rtprio type"); | |
| 1005 | /* NOT REACHED */ | |
| 1006 | } | |
| 1007 | TAILQ_REMOVE(q, lp, lwp_procq); | |
| 1008 | if (TAILQ_EMPTY(q)) { | |
| 1009 | KASSERT((*which & (1 << pri)) != 0, | |
| 1010 | ("remrunqueue: remove from empty queue")); | |
| 1011 | *which &= ~(1 << pri); | |
| 38b25931 MD |
1012 | } |
| 1013 | } | |
| 1014 | ||
| 52eedfb5 MD |
1015 | /* |
| 1016 | * bsd4_setrunqueue_locked() | |
| 1017 | * | |
| 1018 | * Add a process whos rqtype and rqindex had previously been calculated | |
| 1019 | * onto the appropriate run queue. Determine if the addition requires | |
| 1020 | * a reschedule on a cpu and return the cpuid or -1. | |
| 1021 | * | |
| 1022 | * NOTE: Lower priorities are better priorities. | |
| 1023 | * | |
| 1024 | * MPSAFE - bsd4_spin must be held exclusively on call | |
| 1025 | */ | |
| 1026 | static void | |
| 1027 | bsd4_setrunqueue_locked(struct lwp *lp) | |
| 1028 | { | |
| 1029 | struct rq *q; | |
| 1030 | u_int32_t *which; | |
| 1031 | int pri; | |
| 1032 | ||
| 9388413d SS |
1033 | KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0); |
| 1034 | lp->lwp_flag |= LWP_ONRUNQ; | |
| 52eedfb5 MD |
1035 | ++bsd4_runqcount; |
| 1036 | ||
| 1037 | pri = lp->lwp_rqindex; | |
| 1038 | ||
| 1039 | switch(lp->lwp_rqtype) { | |
| 1040 | case RTP_PRIO_NORMAL: | |
| 1041 | q = &bsd4_queues[pri]; | |
| 1042 | which = &bsd4_queuebits; | |
| 1043 | break; | |
| 1044 | case RTP_PRIO_REALTIME: | |
| 1045 | case RTP_PRIO_FIFO: | |
| 1046 | q = &bsd4_rtqueues[pri]; | |
| 1047 | which = &bsd4_rtqueuebits; | |
| 1048 | break; | |
| 1049 | case RTP_PRIO_IDLE: | |
| 1050 | q = &bsd4_idqueues[pri]; | |
| 1051 | which = &bsd4_idqueuebits; | |
| 1052 | break; | |
| 1053 | default: | |
| 1054 | panic("remrunqueue: invalid rtprio type"); | |
| 1055 | /* NOT REACHED */ | |
| 1056 | } | |
| 1057 | ||
| 1058 | /* | |
| 1059 | * Add to the correct queue and set the appropriate bit. If no | |
| 1060 | * lower priority (i.e. better) processes are in the queue then | |
| 1061 | * we want a reschedule, calculate the best cpu for the job. | |
| 1062 | * | |
| 1063 | * Always run reschedules on the LWPs original cpu. | |
| 1064 | */ | |
| 1065 | TAILQ_INSERT_TAIL(q, lp, lwp_procq); | |
| 1066 | *which |= 1 << pri; | |
| 1067 | } | |
| 1068 | ||
| 38b25931 MD |
1069 | #ifdef SMP |
| 1070 | ||
| 1071 | /* | |
| 1072 | * For SMP systems a user scheduler helper thread is created for each | |
| 1073 | * cpu and is used to allow one cpu to wakeup another for the purposes of | |
| c9e9fb21 MD |
1074 | * scheduling userland threads from setrunqueue(). |
| 1075 | * | |
| 1076 | * UP systems do not need the helper since there is only one cpu. | |
| 1077 | * | |
| 1078 | * We can't use the idle thread for this because we might block. | |
| 1079 | * Additionally, doing things this way allows us to HLT idle cpus | |
| 1080 | * on MP systems. | |
| 52eedfb5 MD |
1081 | * |
| 1082 | * MPSAFE | |
| 38b25931 MD |
1083 | */ |
| 1084 | static void | |
| 1085 | sched_thread(void *dummy) | |
| 1086 | { | |
| 52eedfb5 MD |
1087 | globaldata_t gd; |
| 1088 | bsd4_pcpu_t dd; | |
| 1089 | struct lwp *nlp; | |
| 1090 | cpumask_t cpumask; | |
| 52eedfb5 | 1091 | int cpuid; |
| 418f19aa SW |
1092 | #if 0 |
| 1093 | cpumask_t tmpmask; | |
| 52eedfb5 | 1094 | int tmpid; |
| 418f19aa | 1095 | #endif |
| 52eedfb5 MD |
1096 | |
| 1097 | gd = mycpu; | |
| 1098 | cpuid = gd->gd_cpuid; /* doesn't change */ | |
| b9eb1c19 | 1099 | cpumask = gd->gd_cpumask; /* doesn't change */ |
| 52eedfb5 MD |
1100 | dd = &bsd4_pcpu[cpuid]; |
| 1101 | ||
| 1102 | /* | |
| c9e9fb21 MD |
1103 | * Since we are woken up only when no user processes are scheduled |
| 1104 | * on a cpu, we can run at an ultra low priority. | |
| 52eedfb5 | 1105 | */ |
| 50017724 | 1106 | lwkt_setpri_self(TDPRI_USER_SCHEDULER); |
| 38b25931 | 1107 | |
| 38b25931 | 1108 | for (;;) { |
| 50017724 MD |
1109 | /* |
| 1110 | * We use the LWKT deschedule-interlock trick to avoid racing | |
| 1111 | * bsd4_rdyprocmask. This means we cannot block through to the | |
| 1112 | * manual lwkt_switch() call we make below. | |
| 1113 | */ | |
| 52eedfb5 | 1114 | crit_enter_gd(gd); |
| 50017724 | 1115 | lwkt_deschedule_self(gd->gd_curthread); |
| 287a8577 | 1116 | spin_lock(&bsd4_spin); |
| 52eedfb5 | 1117 | atomic_set_int(&bsd4_rdyprocmask, cpumask); |
| b9eb1c19 MD |
1118 | |
| 1119 | clear_user_resched(); /* This satisfied the reschedule request */ | |
| 1120 | dd->rrcount = 0; /* Reset the round-robin counter */ | |
| 1121 | ||
| 52eedfb5 | 1122 | if ((bsd4_curprocmask & cpumask) == 0) { |
| b9eb1c19 MD |
1123 | /* |
| 1124 | * No thread is currently scheduled. | |
| 1125 | */ | |
| 1126 | KKASSERT(dd->uschedcp == NULL); | |
| 52eedfb5 MD |
1127 | if ((nlp = chooseproc_locked(NULL)) != NULL) { |
| 1128 | atomic_set_int(&bsd4_curprocmask, cpumask); | |
| 1129 | dd->upri = nlp->lwp_priority; | |
| 1130 | dd->uschedcp = nlp; | |
| 287a8577 | 1131 | spin_unlock(&bsd4_spin); |
| 52eedfb5 MD |
1132 | lwkt_acquire(nlp->lwp_thread); |
| 1133 | lwkt_schedule(nlp->lwp_thread); | |
| 1134 | } else { | |
| 287a8577 | 1135 | spin_unlock(&bsd4_spin); |
| 52eedfb5 | 1136 | } |
| b9eb1c19 MD |
1137 | #if 0 |
| 1138 | /* | |
| 1139 | * Disabled for now, this can create an infinite loop. | |
| 1140 | */ | |
| 1141 | } else if (bsd4_runqcount) { | |
| 52eedfb5 MD |
1142 | /* |
| 1143 | * Someone scheduled us but raced. In order to not lose | |
| 1144 | * track of the fact that there may be a LWP ready to go, | |
| 1145 | * forward the request to another cpu if available. | |
| 1146 | * | |
| 1147 | * Rotate through cpus starting with cpuid + 1. Since cpuid | |
| 1148 | * is already masked out by gd_other_cpus, just use ~cpumask. | |
| 1149 | */ | |
| b9eb1c19 MD |
1150 | tmpmask = bsd4_rdyprocmask & mycpu->gd_other_cpus & |
| 1151 | ~bsd4_curprocmask; | |
| 52eedfb5 MD |
1152 | if (tmpmask) { |
| 1153 | if (tmpmask & ~(cpumask - 1)) | |
| 1154 | tmpid = bsfl(tmpmask & ~(cpumask - 1)); | |
| 1155 | else | |
| 1156 | tmpid = bsfl(tmpmask); | |
| 1157 | bsd4_scancpu = tmpid; | |
| 1158 | atomic_clear_int(&bsd4_rdyprocmask, 1 << tmpid); | |
| 1159 | spin_unlock_wr(&bsd4_spin); | |
| 1160 | lwkt_schedule(&bsd4_pcpu[tmpid].helper_thread); | |
| 1161 | } else { | |
| 1162 | spin_unlock_wr(&bsd4_spin); | |
| 1163 | } | |
| b9eb1c19 MD |
1164 | #endif |
| 1165 | } else { | |
| 1166 | /* | |
| 1167 | * The runq is empty. | |
| 1168 | */ | |
| 287a8577 | 1169 | spin_unlock(&bsd4_spin); |
| 38b25931 | 1170 | } |
| 52eedfb5 | 1171 | crit_exit_gd(gd); |
| 38b25931 MD |
1172 | lwkt_switch(); |
| 1173 | } | |
| 1174 | } | |
| 1175 | ||
| 1176 | /* | |
| 1177 | * Setup our scheduler helpers. Note that curprocmask bit 0 has already | |
| 1178 | * been cleared by rqinit() and we should not mess with it further. | |
| 1179 | */ | |
| 1180 | static void | |
| 1181 | sched_thread_cpu_init(void) | |
| 1182 | { | |
| 1183 | int i; | |
| 1184 | ||
| 1185 | if (bootverbose) | |
| 6ea70f76 | 1186 | kprintf("start scheduler helpers on cpus:"); |
| 38b25931 MD |
1187 | |
| 1188 | for (i = 0; i < ncpus; ++i) { | |
| 52eedfb5 | 1189 | bsd4_pcpu_t dd = &bsd4_pcpu[i]; |
| 38b25931 MD |
1190 | cpumask_t mask = 1 << i; |
| 1191 | ||
| 1192 | if ((mask & smp_active_mask) == 0) | |
| 1193 | continue; | |
| 1194 | ||
| 1195 | if (bootverbose) | |
| 6ea70f76 | 1196 | kprintf(" %d", i); |
| 38b25931 | 1197 | |
| 52eedfb5 | 1198 | lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread, |
| fdce8919 | 1199 | TDF_STOPREQ, i, "usched %d", i); |
| 38b25931 MD |
1200 | |
| 1201 | /* | |
| 1202 | * Allow user scheduling on the target cpu. cpu #0 has already | |
| 1203 | * been enabled in rqinit(). | |
| 1204 | */ | |
| 1205 | if (i) | |
| 52eedfb5 MD |
1206 | atomic_clear_int(&bsd4_curprocmask, mask); |
| 1207 | atomic_set_int(&bsd4_rdyprocmask, mask); | |
| b9eb1c19 | 1208 | dd->upri = PRIBASE_NULL; |
| 38b25931 MD |
1209 | } |
| 1210 | if (bootverbose) | |
| 6ea70f76 | 1211 | kprintf("\n"); |
| 38b25931 | 1212 | } |
| ba39e2e0 MD |
1213 | SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND, |
| 1214 | sched_thread_cpu_init, NULL) | |
| 38b25931 MD |
1215 | |
| 1216 | #endif | |
| 1217 |