| Commit | Line | Data |
|---|---|---|
| 38b25931 MD |
1 | /* |
| 2 | * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org> | |
| 3 | * All rights reserved. | |
| 4 | * | |
| 5 | * Redistribution and use in source and binary forms, with or without | |
| 6 | * modification, are permitted provided that the following conditions | |
| 7 | * are met: | |
| 8 | * 1. Redistributions of source code must retain the above copyright | |
| 9 | * notice, this list of conditions and the following disclaimer. | |
| 10 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 11 | * notice, this list of conditions and the following disclaimer in the | |
| 12 | * documentation and/or other materials provided with the distribution. | |
| 13 | * | |
| 14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
| 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
| 18 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 19 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 20 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 21 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 22 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 23 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 24 | * SUCH DAMAGE. | |
| 25 | * | |
| 3925aa71 | 26 | * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.26 2008/11/01 23:31:19 dillon Exp $ |
| 38b25931 MD |
27 | */ |
| 28 | ||
| 29 | #include <sys/param.h> | |
| 30 | #include <sys/systm.h> | |
| 31 | #include <sys/kernel.h> | |
| 32 | #include <sys/lock.h> | |
| 33 | #include <sys/queue.h> | |
| 34 | #include <sys/proc.h> | |
| 35 | #include <sys/rtprio.h> | |
| 38b25931 MD |
36 | #include <sys/uio.h> |
| 37 | #include <sys/sysctl.h> | |
| 38 | #include <sys/resourcevar.h> | |
| 52eedfb5 | 39 | #include <sys/spinlock.h> |
| 38b25931 MD |
40 | #include <machine/cpu.h> |
| 41 | #include <machine/smp.h> | |
| 42 | ||
| 52eedfb5 MD |
43 | #include <sys/thread2.h> |
| 44 | #include <sys/spinlock2.h> | |
| 684a93c4 | 45 | #include <sys/mplock2.h> |
| 52eedfb5 | 46 | |
| 38b25931 MD |
47 | /* |
| 48 | * Priorities. Note that with 32 run queues per scheduler each queue | |
| 49 | * represents four priority levels. | |
| 50 | */ | |
| 51 | ||
| 52 | #define MAXPRI 128 | |
| 53 | #define PRIMASK (MAXPRI - 1) | |
| 54 | #define PRIBASE_REALTIME 0 | |
| 55 | #define PRIBASE_NORMAL MAXPRI | |
| 56 | #define PRIBASE_IDLE (MAXPRI * 2) | |
| 57 | #define PRIBASE_THREAD (MAXPRI * 3) | |
| 58 | #define PRIBASE_NULL (MAXPRI * 4) | |
| 59 | ||
| 60 | #define NQS 32 /* 32 run queues. */ | |
| 61 | #define PPQ (MAXPRI / NQS) /* priorities per queue */ | |
| 52eedfb5 | 62 | #define PPQMASK (PPQ - 1) |
| 38b25931 MD |
63 | |
| 64 | /* | |
| 65 | * NICEPPQ - number of nice units per priority queue | |
| 66 | * ESTCPURAMP - number of scheduler ticks for estcpu to switch queues | |
| 67 | * | |
| 68 | * ESTCPUPPQ - number of estcpu units per priority queue | |
| 69 | * ESTCPUMAX - number of estcpu units | |
| 70 | * ESTCPUINCR - amount we have to increment p_estcpu per scheduling tick at | |
| 71 | * 100% cpu. | |
| 72 | */ | |
| 73 | #define NICEPPQ 2 | |
| 74 | #define ESTCPURAMP 4 | |
| 75 | #define ESTCPUPPQ 512 | |
| 76 | #define ESTCPUMAX (ESTCPUPPQ * NQS) | |
| 77 | #define ESTCPUINCR (ESTCPUPPQ / ESTCPURAMP) | |
| 78 | #define PRIO_RANGE (PRIO_MAX - PRIO_MIN + 1) | |
| 79 | ||
| 80 | #define ESTCPULIM(v) min((v), ESTCPUMAX) | |
| 81 | ||
| 553ea3c8 | 82 | TAILQ_HEAD(rq, lwp); |
| 38b25931 | 83 | |
| 553ea3c8 SS |
84 | #define lwp_priority lwp_usdata.bsd4.priority |
| 85 | #define lwp_rqindex lwp_usdata.bsd4.rqindex | |
| 86 | #define lwp_origcpu lwp_usdata.bsd4.origcpu | |
| 87 | #define lwp_estcpu lwp_usdata.bsd4.estcpu | |
| 52eedfb5 | 88 | #define lwp_rqtype lwp_usdata.bsd4.rqtype |
| 38b25931 | 89 | |
| 553ea3c8 SS |
90 | static void bsd4_acquire_curproc(struct lwp *lp); |
| 91 | static void bsd4_release_curproc(struct lwp *lp); | |
| 38b25931 | 92 | static void bsd4_select_curproc(globaldata_t gd); |
| 553ea3c8 | 93 | static void bsd4_setrunqueue(struct lwp *lp); |
| 553ea3c8 | 94 | static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period, |
| 38b25931 | 95 | sysclock_t cpstamp); |
| 52eedfb5 | 96 | static void bsd4_recalculate_estcpu(struct lwp *lp); |
| 553ea3c8 SS |
97 | static void bsd4_resetpriority(struct lwp *lp); |
| 98 | static void bsd4_forking(struct lwp *plp, struct lwp *lp); | |
| 99 | static void bsd4_exiting(struct lwp *plp, struct lwp *lp); | |
| c3149361 | 100 | static void bsd4_yield(struct lwp *lp); |
| 38b25931 | 101 | |
| 52eedfb5 MD |
102 | #ifdef SMP |
| 103 | static void need_user_resched_remote(void *dummy); | |
| 104 | #endif | |
| 105 | static struct lwp *chooseproc_locked(struct lwp *chklp); | |
| 106 | static void bsd4_remrunqueue_locked(struct lwp *lp); | |
| 107 | static void bsd4_setrunqueue_locked(struct lwp *lp); | |
| 38b25931 MD |
108 | |
| 109 | struct usched usched_bsd4 = { | |
| 110 | { NULL }, | |
| 111 | "bsd4", "Original DragonFly Scheduler", | |
| cb7f4ab1 MD |
112 | NULL, /* default registration */ |
| 113 | NULL, /* default deregistration */ | |
| 38b25931 MD |
114 | bsd4_acquire_curproc, |
| 115 | bsd4_release_curproc, | |
| 38b25931 | 116 | bsd4_setrunqueue, |
| 38b25931 MD |
117 | bsd4_schedulerclock, |
| 118 | bsd4_recalculate_estcpu, | |
| 119 | bsd4_resetpriority, | |
| 120 | bsd4_forking, | |
| cb7f4ab1 | 121 | bsd4_exiting, |
| c3149361 MD |
122 | NULL, /* setcpumask not supported */ |
| 123 | bsd4_yield | |
| 38b25931 MD |
124 | }; |
| 125 | ||
| 52eedfb5 MD |
126 | struct usched_bsd4_pcpu { |
| 127 | struct thread helper_thread; | |
| 128 | short rrcount; | |
| 129 | short upri; | |
| 130 | struct lwp *uschedcp; | |
| 131 | }; | |
| 132 | ||
| 133 | typedef struct usched_bsd4_pcpu *bsd4_pcpu_t; | |
| 134 | ||
| 38b25931 MD |
135 | /* |
| 136 | * We have NQS (32) run queues per scheduling class. For the normal | |
| 137 | * class, there are 128 priorities scaled onto these 32 queues. New | |
| 138 | * processes are added to the last entry in each queue, and processes | |
| 139 | * are selected for running by taking them from the head and maintaining | |
| 140 | * a simple FIFO arrangement. Realtime and Idle priority processes have | |
| 141 | * an explicit 0-31 priority which maps directly onto their class queue | |
| 142 | * index. When a queue has something in it, the corresponding bit is | |
| 143 | * set in the queuebits variable, allowing a single read to determine | |
| 144 | * the state of all 32 queues and then a ffs() to find the first busy | |
| 145 | * queue. | |
| 146 | */ | |
| 52eedfb5 MD |
147 | static struct rq bsd4_queues[NQS]; |
| 148 | static struct rq bsd4_rtqueues[NQS]; | |
| 149 | static struct rq bsd4_idqueues[NQS]; | |
| 150 | static u_int32_t bsd4_queuebits; | |
| 151 | static u_int32_t bsd4_rtqueuebits; | |
| 152 | static u_int32_t bsd4_idqueuebits; | |
| 153 | static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */ | |
| 154 | static cpumask_t bsd4_rdyprocmask; /* ready to accept a user process */ | |
| 155 | static int bsd4_runqcount; | |
| 38b25931 | 156 | #ifdef SMP |
| 52eedfb5 | 157 | static volatile int bsd4_scancpu; |
| 38b25931 | 158 | #endif |
| 52eedfb5 MD |
159 | static struct spinlock bsd4_spin; |
| 160 | static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU]; | |
| 38b25931 | 161 | |
| 52eedfb5 | 162 | SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0, ""); |
| 38b25931 MD |
163 | #ifdef INVARIANTS |
| 164 | static int usched_nonoptimal; | |
| 165 | SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW, | |
| 166 | &usched_nonoptimal, 0, "acquire_curproc() was not optimal"); | |
| 167 | static int usched_optimal; | |
| 168 | SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW, | |
| 169 | &usched_optimal, 0, "acquire_curproc() was optimal"); | |
| 170 | #endif | |
| 171 | static int usched_debug = -1; | |
| 172 | SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, ""); | |
| 173 | #ifdef SMP | |
| 38b25931 MD |
174 | static int remote_resched_nonaffinity; |
| 175 | static int remote_resched_affinity; | |
| 176 | static int choose_affinity; | |
| 38b25931 MD |
177 | SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD, |
| 178 | &remote_resched_nonaffinity, 0, "Number of remote rescheds"); | |
| 179 | SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD, | |
| 180 | &remote_resched_affinity, 0, "Number of remote rescheds"); | |
| 181 | SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD, | |
| 182 | &choose_affinity, 0, "chooseproc() was smart"); | |
| 183 | #endif | |
| 184 | ||
| 185 | static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10; | |
| 186 | SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW, | |
| 187 | &usched_bsd4_rrinterval, 0, ""); | |
| 188 | static int usched_bsd4_decay = ESTCPUINCR / 2; | |
| 189 | SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW, | |
| 190 | &usched_bsd4_decay, 0, ""); | |
| 191 | ||
| 192 | /* | |
| 193 | * Initialize the run queues at boot time. | |
| 194 | */ | |
| 195 | static void | |
| 196 | rqinit(void *dummy) | |
| 197 | { | |
| 198 | int i; | |
| 199 | ||
| 52eedfb5 | 200 | spin_init(&bsd4_spin); |
| 38b25931 | 201 | for (i = 0; i < NQS; i++) { |
| 52eedfb5 MD |
202 | TAILQ_INIT(&bsd4_queues[i]); |
| 203 | TAILQ_INIT(&bsd4_rtqueues[i]); | |
| 204 | TAILQ_INIT(&bsd4_idqueues[i]); | |
| 38b25931 | 205 | } |
| 52eedfb5 | 206 | atomic_clear_int(&bsd4_curprocmask, 1); |
| 38b25931 | 207 | } |
| ba39e2e0 | 208 | SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL) |
| 38b25931 MD |
209 | |
| 210 | /* | |
| 52eedfb5 | 211 | * BSD4_ACQUIRE_CURPROC |
| 38b25931 | 212 | * |
| 52eedfb5 MD |
213 | * This function is called when the kernel intends to return to userland. |
| 214 | * It is responsible for making the thread the current designated userland | |
| 215 | * thread for this cpu, blocking if necessary. | |
| 216 | * | |
| b9eb1c19 MD |
217 | * The kernel has already depressed our LWKT priority so we must not switch |
| 218 | * until we have either assigned or disposed of the thread. | |
| 52eedfb5 MD |
219 | * |
| 220 | * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE | |
| 221 | * TO ANOTHER CPU! Because most of the kernel assumes that no migration will | |
| 222 | * occur, this function is called only under very controlled circumstances. | |
| 223 | * | |
| 52eedfb5 | 224 | * MPSAFE |
| 38b25931 | 225 | */ |
| 52eedfb5 MD |
226 | static void |
| 227 | bsd4_acquire_curproc(struct lwp *lp) | |
| 38b25931 | 228 | { |
| b9eb1c19 MD |
229 | globaldata_t gd; |
| 230 | bsd4_pcpu_t dd; | |
| 231 | struct lwp *olp; | |
| 38b25931 | 232 | |
| b9eb1c19 MD |
233 | crit_enter(); |
| 234 | bsd4_recalculate_estcpu(lp); | |
| 38b25931 | 235 | |
| 38b25931 | 236 | /* |
| b9eb1c19 MD |
237 | * If a reschedule was requested give another thread the |
| 238 | * driver's seat. | |
| 38b25931 | 239 | */ |
| b9eb1c19 MD |
240 | if (user_resched_wanted()) { |
| 241 | clear_user_resched(); | |
| 242 | bsd4_release_curproc(lp); | |
| 38b25931 | 243 | } |
| 38b25931 | 244 | |
| 52eedfb5 | 245 | /* |
| b9eb1c19 | 246 | * Loop until we are the current user thread |
| 52eedfb5 MD |
247 | */ |
| 248 | do { | |
| b9eb1c19 MD |
249 | /* |
| 250 | * Reload after a switch or setrunqueue/switch possibly | |
| 251 | * moved us to another cpu. | |
| 252 | */ | |
| 253 | clear_lwkt_resched(); | |
| 52eedfb5 MD |
254 | gd = mycpu; |
| 255 | dd = &bsd4_pcpu[gd->gd_cpuid]; | |
| b9eb1c19 MD |
256 | |
| 257 | /* | |
| 258 | * Become the currently scheduled user thread for this cpu | |
| 259 | * if we can do so trivially. | |
| 260 | * | |
| 261 | * We can steal another thread's current thread designation | |
| 262 | * on this cpu since if we are running that other thread | |
| 263 | * must not be, so we can safely deschedule it. | |
| 264 | */ | |
| 265 | if (dd->uschedcp == lp) { | |
| 266 | dd->upri = lp->lwp_priority; | |
| 267 | } else if (dd->uschedcp == NULL) { | |
| 268 | atomic_set_int(&bsd4_curprocmask, gd->gd_cpumask); | |
| 269 | dd->uschedcp = lp; | |
| 270 | dd->upri = lp->lwp_priority; | |
| 271 | } else if (dd->upri > lp->lwp_priority) { | |
| 272 | olp = dd->uschedcp; | |
| 273 | dd->uschedcp = lp; | |
| 274 | dd->upri = lp->lwp_priority; | |
| 275 | lwkt_deschedule(olp->lwp_thread); | |
| 276 | bsd4_setrunqueue(olp); | |
| 277 | } else { | |
| 278 | lwkt_deschedule(lp->lwp_thread); | |
| 279 | bsd4_setrunqueue(lp); | |
| 280 | lwkt_switch(); | |
| 281 | } | |
| 282 | ||
| 283 | /* | |
| 284 | * Other threads at our current user priority have already | |
| 285 | * put in their bids, but we must run any kernel threads | |
| 286 | * at higher priorities, and we could lose our bid to | |
| 287 | * another thread trying to return to user mode in the | |
| 288 | * process. | |
| 289 | * | |
| 290 | * If we lose our bid we will be descheduled and put on | |
| 291 | * the run queue. When we are reactivated we will have | |
| 292 | * another chance. | |
| 293 | */ | |
| 77912481 | 294 | lwkt_switch(); |
| 52eedfb5 | 295 | } while (dd->uschedcp != lp); |
| b9eb1c19 MD |
296 | |
| 297 | crit_exit(); | |
| 9388413d | 298 | KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0); |
| 52eedfb5 MD |
299 | } |
| 300 | ||
| 301 | /* | |
| 302 | * BSD4_RELEASE_CURPROC | |
| 303 | * | |
| 304 | * This routine detaches the current thread from the userland scheduler, | |
| b9eb1c19 MD |
305 | * usually because the thread needs to run or block in the kernel (at |
| 306 | * kernel priority) for a while. | |
| 52eedfb5 MD |
307 | * |
| 308 | * This routine is also responsible for selecting a new thread to | |
| 309 | * make the current thread. | |
| 310 | * | |
| 311 | * NOTE: This implementation differs from the dummy example in that | |
| 312 | * bsd4_select_curproc() is able to select the current process, whereas | |
| 313 | * dummy_select_curproc() is not able to select the current process. | |
| 314 | * This means we have to NULL out uschedcp. | |
| 315 | * | |
| 316 | * Additionally, note that we may already be on a run queue if releasing | |
| 317 | * via the lwkt_switch() in bsd4_setrunqueue(). | |
| 318 | * | |
| 319 | * WARNING! The MP lock may be in an unsynchronized state due to the | |
| 320 | * way get_mplock() works and the fact that this function may be called | |
| 321 | * from a passive release during a lwkt_switch(). try_mplock() will deal | |
| 322 | * with this for us but you should be aware that td_mpcount may not be | |
| 323 | * useable. | |
| 324 | * | |
| 325 | * MPSAFE | |
| 326 | */ | |
| 327 | static void | |
| 328 | bsd4_release_curproc(struct lwp *lp) | |
| 329 | { | |
| 330 | globaldata_t gd = mycpu; | |
| 331 | bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; | |
| 332 | ||
| 333 | if (dd->uschedcp == lp) { | |
| b9eb1c19 | 334 | crit_enter(); |
| 9388413d | 335 | KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0); |
| 52eedfb5 | 336 | dd->uschedcp = NULL; /* don't let lp be selected */ |
| b9eb1c19 MD |
337 | dd->upri = PRIBASE_NULL; |
| 338 | atomic_clear_int(&bsd4_curprocmask, gd->gd_cpumask); | |
| 52eedfb5 | 339 | bsd4_select_curproc(gd); |
| b9eb1c19 | 340 | crit_exit(); |
| 52eedfb5 | 341 | } |
| 38b25931 MD |
342 | } |
| 343 | ||
| 38b25931 | 344 | /* |
| 52eedfb5 MD |
345 | * BSD4_SELECT_CURPROC |
| 346 | * | |
| b9eb1c19 MD |
347 | * Select a new current process for this cpu and clear any pending user |
| 348 | * reschedule request. The cpu currently has no current process. | |
| 52eedfb5 MD |
349 | * |
| 350 | * This routine is also responsible for equal-priority round-robining, | |
| 351 | * typically triggered from bsd4_schedulerclock(). In our dummy example | |
| 352 | * all the 'user' threads are LWKT scheduled all at once and we just | |
| 353 | * call lwkt_switch(). | |
| 354 | * | |
| b9eb1c19 MD |
355 | * The calling process is not on the queue and cannot be selected. |
| 356 | * | |
| 52eedfb5 | 357 | * MPSAFE |
| 38b25931 MD |
358 | */ |
| 359 | static | |
| 360 | void | |
| 52eedfb5 | 361 | bsd4_select_curproc(globaldata_t gd) |
| 38b25931 | 362 | { |
| 52eedfb5 MD |
363 | bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; |
| 364 | struct lwp *nlp; | |
| 365 | int cpuid = gd->gd_cpuid; | |
| 38b25931 | 366 | |
| 52eedfb5 | 367 | crit_enter_gd(gd); |
| 52eedfb5 MD |
368 | |
| 369 | spin_lock_wr(&bsd4_spin); | |
| 370 | if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) { | |
| 371 | atomic_set_int(&bsd4_curprocmask, 1 << cpuid); | |
| 372 | dd->upri = nlp->lwp_priority; | |
| 373 | dd->uschedcp = nlp; | |
| 374 | spin_unlock_wr(&bsd4_spin); | |
| 375 | #ifdef SMP | |
| 376 | lwkt_acquire(nlp->lwp_thread); | |
| 38b25931 | 377 | #endif |
| 52eedfb5 | 378 | lwkt_schedule(nlp->lwp_thread); |
| 52eedfb5 | 379 | } else if (bsd4_runqcount && (bsd4_rdyprocmask & (1 << cpuid))) { |
| 52eedfb5 | 380 | atomic_clear_int(&bsd4_rdyprocmask, 1 << cpuid); |
| 52eedfb5 MD |
381 | spin_unlock_wr(&bsd4_spin); |
| 382 | lwkt_schedule(&dd->helper_thread); | |
| 383 | } else { | |
| 52eedfb5 MD |
384 | spin_unlock_wr(&bsd4_spin); |
| 385 | } | |
| 386 | crit_exit_gd(gd); | |
| 387 | } | |
| 38b25931 MD |
388 | |
| 389 | /* | |
| 52eedfb5 MD |
390 | * BSD4_SETRUNQUEUE |
| 391 | * | |
| b9eb1c19 MD |
392 | * Place the specified lwp on the user scheduler's run queue. This routine |
| 393 | * must be called with the thread descheduled. The lwp must be runnable. | |
| 38b25931 | 394 | * |
| b9eb1c19 | 395 | * The thread may be the current thread as a special case. |
| 52eedfb5 MD |
396 | * |
| 397 | * MPSAFE | |
| 38b25931 MD |
398 | */ |
| 399 | static void | |
| 553ea3c8 | 400 | bsd4_setrunqueue(struct lwp *lp) |
| 38b25931 | 401 | { |
| 52eedfb5 MD |
402 | globaldata_t gd; |
| 403 | bsd4_pcpu_t dd; | |
| 38b25931 | 404 | #ifdef SMP |
| b9eb1c19 | 405 | int cpuid; |
| 38b25931 | 406 | cpumask_t mask; |
| 52eedfb5 | 407 | cpumask_t tmpmask; |
| 38b25931 MD |
408 | #endif |
| 409 | ||
| 52eedfb5 MD |
410 | /* |
| 411 | * First validate the process state relative to the current cpu. | |
| 412 | * We don't need the spinlock for this, just a critical section. | |
| 413 | * We are in control of the process. | |
| 414 | */ | |
| 38b25931 | 415 | crit_enter(); |
| 164b8401 | 416 | KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN")); |
| 9388413d | 417 | KASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0, |
| 164b8401 SS |
418 | ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid, |
| 419 | lp->lwp_tid, lp->lwp_proc->p_flag, lp->lwp_flag)); | |
| 553ea3c8 | 420 | KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0); |
| 38b25931 MD |
421 | |
| 422 | /* | |
| 52eedfb5 MD |
423 | * Note: gd and dd are relative to the target thread's last cpu, |
| 424 | * NOT our current cpu. | |
| 38b25931 | 425 | */ |
| 553ea3c8 | 426 | gd = lp->lwp_thread->td_gd; |
| 52eedfb5 | 427 | dd = &bsd4_pcpu[gd->gd_cpuid]; |
| 38b25931 MD |
428 | |
| 429 | /* | |
| 52eedfb5 MD |
430 | * This process is not supposed to be scheduled anywhere or assigned |
| 431 | * as the current process anywhere. Assert the condition. | |
| 38b25931 | 432 | */ |
| 52eedfb5 | 433 | KKASSERT(dd->uschedcp != lp); |
| 38b25931 | 434 | |
| b9eb1c19 | 435 | #ifndef SMP |
| 38b25931 | 436 | /* |
| b9eb1c19 MD |
437 | * If we are not SMP we do not have a scheduler helper to kick |
| 438 | * and must directly activate the process if none are scheduled. | |
| 38b25931 | 439 | * |
| b9eb1c19 MD |
440 | * This is really only an issue when bootstrapping init since |
| 441 | * the caller in all other cases will be a user process, and | |
| 442 | * even if released (dd->uschedcp == NULL), that process will | |
| 443 | * kickstart the scheduler when it returns to user mode from | |
| 444 | * the kernel. | |
| 38b25931 | 445 | */ |
| b9eb1c19 MD |
446 | if (dd->uschedcp == NULL) { |
| 447 | atomic_set_int(&bsd4_curprocmask, gd->gd_cpumask); | |
| 52eedfb5 MD |
448 | dd->uschedcp = lp; |
| 449 | dd->upri = lp->lwp_priority; | |
| 553ea3c8 | 450 | lwkt_schedule(lp->lwp_thread); |
| 38b25931 | 451 | crit_exit(); |
| 38b25931 MD |
452 | return; |
| 453 | } | |
| b9eb1c19 | 454 | #endif |
| 38b25931 | 455 | |
| 38b25931 MD |
456 | #ifdef SMP |
| 457 | /* | |
| 52eedfb5 MD |
458 | * XXX fixme. Could be part of a remrunqueue/setrunqueue |
| 459 | * operation when the priority is recalculated, so TDF_MIGRATING | |
| 460 | * may already be set. | |
| 38b25931 | 461 | */ |
| 52eedfb5 MD |
462 | if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0) |
| 463 | lwkt_giveaway(lp->lwp_thread); | |
| 464 | #endif | |
| 50017724 MD |
465 | |
| 466 | /* | |
| 467 | * We lose control of lp the moment we release the spinlock after | |
| 468 | * having placed lp on the queue. i.e. another cpu could pick it | |
| 469 | * up and it could exit, or its priority could be further adjusted, | |
| 470 | * or something like that. | |
| 471 | */ | |
| 52eedfb5 MD |
472 | spin_lock_wr(&bsd4_spin); |
| 473 | bsd4_setrunqueue_locked(lp); | |
| 38b25931 | 474 | |
| b9eb1c19 | 475 | #ifdef SMP |
| 38b25931 | 476 | /* |
| b9eb1c19 MD |
477 | * Kick the scheduler helper on one of the other cpu's |
| 478 | * and request a reschedule if appropriate. | |
| 38b25931 | 479 | */ |
| b9eb1c19 MD |
480 | cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; |
| 481 | ++bsd4_scancpu; | |
| 482 | mask = ~bsd4_curprocmask & bsd4_rdyprocmask & | |
| 483 | lp->lwp_cpumask & smp_active_mask; | |
| 50017724 | 484 | spin_unlock_wr(&bsd4_spin); |
| 38b25931 | 485 | |
| b9eb1c19 | 486 | while (mask) { |
| 52eedfb5 MD |
487 | tmpmask = ~((1 << cpuid) - 1); |
| 488 | if (mask & tmpmask) | |
| 489 | cpuid = bsfl(mask & tmpmask); | |
| 490 | else | |
| 491 | cpuid = bsfl(mask); | |
| b9eb1c19 MD |
492 | gd = globaldata_find(cpuid); |
| 493 | dd = &bsd4_pcpu[cpuid]; | |
| 494 | ||
| 495 | if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { | |
| 496 | if (gd == mycpu) | |
| 497 | need_user_resched_remote(NULL); | |
| 498 | else | |
| 499 | lwkt_send_ipiq(gd, need_user_resched_remote, NULL); | |
| 500 | break; | |
| 501 | } | |
| 502 | mask &= ~(1 << cpuid); | |
| 503 | } | |
| 504 | #else | |
| 505 | /* | |
| 506 | * Request a reschedule if appropriate. | |
| 507 | */ | |
| 508 | spin_unlock_wr(&bsd4_spin); | |
| 509 | if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { | |
| 510 | need_user_resched(); | |
| 38b25931 MD |
511 | } |
| 512 | #endif | |
| 513 | crit_exit(); | |
| 514 | } | |
| 515 | ||
| 516 | /* | |
| 38b25931 | 517 | * This routine is called from a systimer IPI. It MUST be MP-safe and |
| 52eedfb5 MD |
518 | * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on |
| 519 | * each cpu. | |
| 520 | * | |
| 270ac911 | 521 | * MPSAFE |
| 38b25931 MD |
522 | */ |
| 523 | static | |
| 524 | void | |
| 553ea3c8 | 525 | bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) |
| 38b25931 MD |
526 | { |
| 527 | globaldata_t gd = mycpu; | |
| 52eedfb5 | 528 | bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; |
| 38b25931 MD |
529 | |
| 530 | /* | |
| 531 | * Do we need to round-robin? We round-robin 10 times a second. | |
| 532 | * This should only occur for cpu-bound batch processes. | |
| 533 | */ | |
| 52eedfb5 MD |
534 | if (++dd->rrcount >= usched_bsd4_rrinterval) { |
| 535 | dd->rrcount = 0; | |
| 38b25931 MD |
536 | need_user_resched(); |
| 537 | } | |
| 538 | ||
| 539 | /* | |
| 540 | * As the process accumulates cpu time p_estcpu is bumped and may | |
| 541 | * push the process into another scheduling queue. It typically | |
| 542 | * takes 4 ticks to bump the queue. | |
| 543 | */ | |
| 553ea3c8 | 544 | lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR); |
| 38b25931 MD |
545 | |
| 546 | /* | |
| 547 | * Reducing p_origcpu over time causes more of our estcpu to be | |
| 548 | * returned to the parent when we exit. This is a small tweak | |
| 549 | * for the batch detection heuristic. | |
| 550 | */ | |
| 553ea3c8 SS |
551 | if (lp->lwp_origcpu) |
| 552 | --lp->lwp_origcpu; | |
| 50017724 MD |
553 | |
| 554 | /* | |
| 77912481 MD |
555 | * Spinlocks also hold a critical section so there should not be |
| 556 | * any active. | |
| 50017724 | 557 | */ |
| 77912481 MD |
558 | KKASSERT(gd->gd_spinlock_rd == NULL); |
| 559 | KKASSERT(gd->gd_spinlocks_wr == 0); | |
| 560 | ||
| 561 | bsd4_resetpriority(lp); | |
| 562 | #if 0 | |
| 563 | /* | |
| 564 | * if we can't call bsd4_resetpriority for some reason we must call | |
| 565 | * need user_resched(). | |
| 566 | */ | |
| 567 | need_user_resched(); | |
| 568 | #endif | |
| 38b25931 MD |
569 | } |
| 570 | ||
| 571 | /* | |
| 52eedfb5 MD |
572 | * Called from acquire and from kern_synch's one-second timer (one of the |
| 573 | * callout helper threads) with a critical section held. | |
| 38b25931 | 574 | * |
| 52eedfb5 MD |
575 | * Decay p_estcpu based on the number of ticks we haven't been running |
| 576 | * and our p_nice. As the load increases each process observes a larger | |
| 577 | * number of idle ticks (because other processes are running in them). | |
| 578 | * This observation leads to a larger correction which tends to make the | |
| 579 | * system more 'batchy'. | |
| 38b25931 | 580 | * |
| 52eedfb5 MD |
581 | * Note that no recalculation occurs for a process which sleeps and wakes |
| 582 | * up in the same tick. That is, a system doing thousands of context | |
| 583 | * switches per second will still only do serious estcpu calculations | |
| 584 | * ESTCPUFREQ times per second. | |
| 38b25931 | 585 | * |
| 52eedfb5 | 586 | * MPSAFE |
| 38b25931 MD |
587 | */ |
| 588 | static | |
| 52eedfb5 MD |
589 | void |
| 590 | bsd4_recalculate_estcpu(struct lwp *lp) | |
| 38b25931 | 591 | { |
| 52eedfb5 MD |
592 | globaldata_t gd = mycpu; |
| 593 | sysclock_t cpbase; | |
| 594 | int loadfac; | |
| 595 | int ndecay; | |
| 596 | int nticks; | |
| 597 | int nleft; | |
| 38b25931 MD |
598 | |
| 599 | /* | |
| 52eedfb5 MD |
600 | * We have to subtract periodic to get the last schedclock |
| 601 | * timeout time, otherwise we would get the upcoming timeout. | |
| 602 | * Keep in mind that a process can migrate between cpus and | |
| 603 | * while the scheduler clock should be very close, boundary | |
| 604 | * conditions could lead to a small negative delta. | |
| 38b25931 | 605 | */ |
| 52eedfb5 | 606 | cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic; |
| 38b25931 | 607 | |
| 52eedfb5 MD |
608 | if (lp->lwp_slptime > 1) { |
| 609 | /* | |
| 610 | * Too much time has passed, do a coarse correction. | |
| 611 | */ | |
| 612 | lp->lwp_estcpu = lp->lwp_estcpu >> 1; | |
| 613 | bsd4_resetpriority(lp); | |
| 614 | lp->lwp_cpbase = cpbase; | |
| 615 | lp->lwp_cpticks = 0; | |
| 616 | } else if (lp->lwp_cpbase != cpbase) { | |
| 617 | /* | |
| 618 | * Adjust estcpu if we are in a different tick. Don't waste | |
| 619 | * time if we are in the same tick. | |
| 620 | * | |
| 621 | * First calculate the number of ticks in the measurement | |
| 622 | * interval. The nticks calculation can wind up 0 due to | |
| 623 | * a bug in the handling of lwp_slptime (as yet not found), | |
| 624 | * so make sure we do not get a divide by 0 panic. | |
| 625 | */ | |
| 626 | nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic; | |
| 627 | if (nticks <= 0) | |
| 628 | nticks = 1; | |
| 629 | updatepcpu(lp, lp->lwp_cpticks, nticks); | |
| 38b25931 | 630 | |
| 52eedfb5 MD |
631 | if ((nleft = nticks - lp->lwp_cpticks) < 0) |
| 632 | nleft = 0; | |
| 633 | if (usched_debug == lp->lwp_proc->p_pid) { | |
| 6ea70f76 | 634 | kprintf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d", |
| 52eedfb5 MD |
635 | lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu, |
| 636 | lp->lwp_cpticks, nticks, nleft); | |
| 637 | } | |
| 38b25931 | 638 | |
| 52eedfb5 MD |
639 | /* |
| 640 | * Calculate a decay value based on ticks remaining scaled | |
| 641 | * down by the instantanious load and p_nice. | |
| 642 | */ | |
| 643 | if ((loadfac = bsd4_runqcount) < 2) | |
| 644 | loadfac = 2; | |
| 645 | ndecay = nleft * usched_bsd4_decay * 2 * | |
| 646 | (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2); | |
| 38b25931 | 647 | |
| 52eedfb5 MD |
648 | /* |
| 649 | * Adjust p_estcpu. Handle a border case where batch jobs | |
| 650 | * can get stalled long enough to decay to zero when they | |
| 651 | * shouldn't. | |
| 652 | */ | |
| 653 | if (lp->lwp_estcpu > ndecay * 2) | |
| 654 | lp->lwp_estcpu -= ndecay; | |
| 655 | else | |
| 656 | lp->lwp_estcpu >>= 1; | |
| 344ad853 | 657 | |
| 52eedfb5 | 658 | if (usched_debug == lp->lwp_proc->p_pid) |
| 6ea70f76 | 659 | kprintf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu); |
| 52eedfb5 MD |
660 | bsd4_resetpriority(lp); |
| 661 | lp->lwp_cpbase = cpbase; | |
| 662 | lp->lwp_cpticks = 0; | |
| 663 | } | |
| 38b25931 MD |
664 | } |
| 665 | ||
| 666 | /* | |
| 667 | * Compute the priority of a process when running in user mode. | |
| 668 | * Arrange to reschedule if the resulting priority is better | |
| 669 | * than that of the current process. | |
| 52eedfb5 MD |
670 | * |
| 671 | * This routine may be called with any process. | |
| 672 | * | |
| 673 | * This routine is called by fork1() for initial setup with the process | |
| 674 | * of the run queue, and also may be called normally with the process on or | |
| 675 | * off the run queue. | |
| 676 | * | |
| 677 | * MPSAFE | |
| 38b25931 MD |
678 | */ |
| 679 | static void | |
| 553ea3c8 | 680 | bsd4_resetpriority(struct lwp *lp) |
| 38b25931 | 681 | { |
| 52eedfb5 | 682 | bsd4_pcpu_t dd; |
| 38b25931 | 683 | int newpriority; |
| 52eedfb5 MD |
684 | u_short newrqtype; |
| 685 | int reschedcpu; | |
| 270ac911 | 686 | |
| 38b25931 | 687 | /* |
| 52eedfb5 | 688 | * Calculate the new priority and queue type |
| 38b25931 | 689 | */ |
| 52eedfb5 MD |
690 | crit_enter(); |
| 691 | spin_lock_wr(&bsd4_spin); | |
| 692 | ||
| 693 | newrqtype = lp->lwp_rtprio.type; | |
| 694 | ||
| 695 | switch(newrqtype) { | |
| 38b25931 | 696 | case RTP_PRIO_REALTIME: |
| f64250e0 | 697 | case RTP_PRIO_FIFO: |
| 52eedfb5 MD |
698 | newpriority = PRIBASE_REALTIME + |
| 699 | (lp->lwp_rtprio.prio & PRIMASK); | |
| 700 | break; | |
| 38b25931 | 701 | case RTP_PRIO_NORMAL: |
| 52eedfb5 MD |
702 | newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ; |
| 703 | newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ; | |
| 704 | newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ / | |
| 705 | NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ); | |
| 706 | newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK); | |
| 38b25931 MD |
707 | break; |
| 708 | case RTP_PRIO_IDLE: | |
| 52eedfb5 MD |
709 | newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK); |
| 710 | break; | |
| 38b25931 | 711 | case RTP_PRIO_THREAD: |
| 52eedfb5 MD |
712 | newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK); |
| 713 | break; | |
| 714 | default: | |
| 715 | panic("Bad RTP_PRIO %d", newrqtype); | |
| 716 | /* NOT REACHED */ | |
| 38b25931 MD |
717 | } |
| 718 | ||
| 719 | /* | |
| 52eedfb5 MD |
720 | * The newpriority incorporates the queue type so do a simple masked |
| 721 | * check to determine if the process has moved to another queue. If | |
| 722 | * it has, and it is currently on a run queue, then move it. | |
| 38b25931 | 723 | */ |
| 52eedfb5 MD |
724 | if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) { |
| 725 | lp->lwp_priority = newpriority; | |
| 9388413d | 726 | if (lp->lwp_flag & LWP_ONRUNQ) { |
| 52eedfb5 MD |
727 | bsd4_remrunqueue_locked(lp); |
| 728 | lp->lwp_rqtype = newrqtype; | |
| 729 | lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; | |
| 730 | bsd4_setrunqueue_locked(lp); | |
| 731 | reschedcpu = lp->lwp_thread->td_gd->gd_cpuid; | |
| 732 | } else { | |
| 733 | lp->lwp_rqtype = newrqtype; | |
| 734 | lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; | |
| 735 | reschedcpu = -1; | |
| 736 | } | |
| 38b25931 | 737 | } else { |
| 52eedfb5 MD |
738 | lp->lwp_priority = newpriority; |
| 739 | reschedcpu = -1; | |
| 740 | } | |
| 741 | spin_unlock_wr(&bsd4_spin); | |
| 742 | ||
| 743 | /* | |
| 50017724 MD |
744 | * Determine if we need to reschedule the target cpu. This only |
| 745 | * occurs if the LWP is already on a scheduler queue, which means | |
| 746 | * that idle cpu notification has already occured. At most we | |
| 747 | * need only issue a need_user_resched() on the appropriate cpu. | |
| 281b4fa8 YT |
748 | * |
| 749 | * The LWP may be owned by a CPU different from the current one, | |
| 750 | * in which case dd->uschedcp may be modified without an MP lock | |
| 751 | * or a spinlock held. The worst that happens is that the code | |
| 752 | * below causes a spurious need_user_resched() on the target CPU | |
| 753 | * and dd->pri to be wrong for a short period of time, both of | |
| 754 | * which are harmless. | |
| 52eedfb5 MD |
755 | */ |
| 756 | if (reschedcpu >= 0) { | |
| 757 | dd = &bsd4_pcpu[reschedcpu]; | |
| 50017724 | 758 | if ((dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK)) { |
| 52eedfb5 | 759 | dd->upri = lp->lwp_priority; |
| 52eedfb5 MD |
760 | #ifdef SMP |
| 761 | if (reschedcpu == mycpu->gd_cpuid) { | |
| 762 | need_user_resched(); | |
| 763 | } else { | |
| 764 | lwkt_send_ipiq(lp->lwp_thread->td_gd, | |
| 765 | need_user_resched_remote, NULL); | |
| 766 | } | |
| 767 | #else | |
| 768 | need_user_resched(); | |
| 769 | #endif | |
| 770 | } | |
| 38b25931 MD |
771 | } |
| 772 | crit_exit(); | |
| 773 | } | |
| 774 | ||
/*
 * bsd4_yield()
 *
 * Yield hook for this scheduler.  The only statement that is compiled in
 * is the call to need_user_resched(); <lp> is not examined.
 *
 * MPSAFE
 */
static void
bsd4_yield(struct lwp *lp)
{
#if 0
	/*
	 * FUTURE (or something similar): charge a yielding normal-class
	 * lwp one extra estcpu increment.
	 */
	if (lp->lwp_rqtype == RTP_PRIO_NORMAL)
		lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
#endif
	need_user_resched();
}
| 794 | ||
| 38b25931 MD |
795 | /* |
| 796 | * Called from fork1() when a new child process is being created. | |
| 797 | * | |
| 798 | * Give the child process an initial estcpu that is more batch then | |
| 799 | * its parent and dock the parent for the fork (but do not | |
| 800 | * reschedule the parent). This comprises the main part of our batch | |
| 801 | * detection heuristic for both parallel forking and sequential execs. | |
| 802 | * | |
| 803 | * Interactive processes will decay the boosted estcpu quickly while batch | |
| 804 | * processes will tend to compound it. | |
| 553ea3c8 | 805 | * XXX lwp should be "spawning" instead of "forking" |
| 270ac911 MD |
806 | * |
| 807 | * MPSAFE | |
| 38b25931 MD |
808 | */ |
| 809 | static void | |
| 553ea3c8 | 810 | bsd4_forking(struct lwp *plp, struct lwp *lp) |
| 38b25931 | 811 | { |
| 553ea3c8 SS |
812 | lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ); |
| 813 | lp->lwp_origcpu = lp->lwp_estcpu; | |
| 814 | plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ); | |
| 38b25931 MD |
815 | } |
| 816 | ||
| 817 | /* | |
| 818 | * Called when the parent reaps a child. Propogate cpu use by the child | |
| 819 | * back to the parent. | |
| 270ac911 MD |
820 | * |
| 821 | * MPSAFE | |
| 38b25931 MD |
822 | */ |
| 823 | static void | |
| 553ea3c8 | 824 | bsd4_exiting(struct lwp *plp, struct lwp *lp) |
| 38b25931 MD |
825 | { |
| 826 | int delta; | |
| 827 | ||
| 553ea3c8 SS |
828 | if (plp->lwp_proc->p_pid != 1) { |
| 829 | delta = lp->lwp_estcpu - lp->lwp_origcpu; | |
| 38b25931 | 830 | if (delta > 0) |
| 553ea3c8 | 831 | plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta); |
| 38b25931 MD |
832 | } |
| 833 | } | |
| 834 | ||
| 52eedfb5 | 835 | |
| 38b25931 | 836 | /* |
| 52eedfb5 MD |
837 | * chooseproc() is called when a cpu needs a user process to LWKT schedule, |
| 838 | * it selects a user process and returns it. If chklp is non-NULL and chklp | |
| 839 | * has a better or equal priority then the process that would otherwise be | |
| 840 | * chosen, NULL is returned. | |
| 38b25931 | 841 | * |
| 52eedfb5 MD |
842 | * Until we fix the RUNQ code the chklp test has to be strict or we may |
| 843 | * bounce between processes trying to acquire the current process designation. | |
| 38b25931 | 844 | * |
| 52eedfb5 MD |
845 | * MPSAFE - must be called with bsd4_spin exclusive held. The spinlock is |
| 846 | * left intact through the entire routine. | |
| 38b25931 MD |
847 | */ |
| 848 | static | |
| 52eedfb5 MD |
849 | struct lwp * |
| 850 | chooseproc_locked(struct lwp *chklp) | |
| 38b25931 | 851 | { |
| 52eedfb5 MD |
852 | struct lwp *lp; |
| 853 | struct rq *q; | |
| a60ccb85 | 854 | u_int32_t *which, *which2; |
| 52eedfb5 | 855 | u_int32_t pri; |
| a60ccb85 DX |
856 | u_int32_t rtqbits; |
| 857 | u_int32_t tsqbits; | |
| 858 | u_int32_t idqbits; | |
| 859 | cpumask_t cpumask; | |
| 38b25931 | 860 | |
| a60ccb85 DX |
861 | rtqbits = bsd4_rtqueuebits; |
| 862 | tsqbits = bsd4_queuebits; | |
| 863 | idqbits = bsd4_idqueuebits; | |
| 864 | cpumask = mycpu->gd_cpumask; | |
| 865 | ||
| 866 | #ifdef SMP | |
| 867 | again: | |
| 868 | #endif | |
| 869 | if (rtqbits) { | |
| 870 | pri = bsfl(rtqbits); | |
| 52eedfb5 MD |
871 | q = &bsd4_rtqueues[pri]; |
| 872 | which = &bsd4_rtqueuebits; | |
| a60ccb85 DX |
873 | which2 = &rtqbits; |
| 874 | } else if (tsqbits) { | |
| 875 | pri = bsfl(tsqbits); | |
| 52eedfb5 MD |
876 | q = &bsd4_queues[pri]; |
| 877 | which = &bsd4_queuebits; | |
| a60ccb85 DX |
878 | which2 = &tsqbits; |
| 879 | } else if (idqbits) { | |
| 880 | pri = bsfl(idqbits); | |
| 52eedfb5 MD |
881 | q = &bsd4_idqueues[pri]; |
| 882 | which = &bsd4_idqueuebits; | |
| a60ccb85 | 883 | which2 = &idqbits; |
| 52eedfb5 MD |
884 | } else { |
| 885 | return NULL; | |
| 886 | } | |
| 887 | lp = TAILQ_FIRST(q); | |
| 888 | KASSERT(lp, ("chooseproc: no lwp on busy queue")); | |
| 270ac911 | 889 | |
| a60ccb85 DX |
890 | #ifdef SMP |
| 891 | while ((lp->lwp_cpumask & cpumask) == 0) { | |
| 892 | lp = TAILQ_NEXT(lp, lwp_procq); | |
| 893 | if (lp == NULL) { | |
| 894 | *which2 &= ~(1 << pri); | |
| 895 | goto again; | |
| 896 | } | |
| 897 | } | |
| 898 | #endif | |
| 899 | ||
| 38b25931 | 900 | /* |
| 52eedfb5 MD |
901 | * If the passed lwp <chklp> is reasonably close to the selected |
| 902 | * lwp <lp>, return NULL (indicating that <chklp> should be kept). | |
| 903 | * | |
| 904 | * Note that we must error on the side of <chklp> to avoid bouncing | |
| 905 | * between threads in the acquire code. | |
| 38b25931 | 906 | */ |
| 52eedfb5 MD |
907 | if (chklp) { |
| 908 | if (chklp->lwp_priority < lp->lwp_priority + PPQ) | |
| 909 | return(NULL); | |
| 910 | } | |
| 38b25931 | 911 | |
| 52eedfb5 MD |
912 | #ifdef SMP |
| 913 | /* | |
| 914 | * If the chosen lwp does not reside on this cpu spend a few | |
| 915 | * cycles looking for a better candidate at the same priority level. | |
| 916 | * This is a fallback check, setrunqueue() tries to wakeup the | |
| 917 | * correct cpu and is our front-line affinity. | |
| 918 | */ | |
| 919 | if (lp->lwp_thread->td_gd != mycpu && | |
| 920 | (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL | |
| 921 | ) { | |
| 922 | if (chklp->lwp_thread->td_gd == mycpu) { | |
| 923 | ++choose_affinity; | |
| 924 | lp = chklp; | |
| 38b25931 | 925 | } |
| 52eedfb5 MD |
926 | } |
| 927 | #endif | |
| 38b25931 | 928 | |
| 52eedfb5 MD |
929 | TAILQ_REMOVE(q, lp, lwp_procq); |
| 930 | --bsd4_runqcount; | |
| 931 | if (TAILQ_EMPTY(q)) | |
| 932 | *which &= ~(1 << pri); | |
| 9388413d SS |
933 | KASSERT((lp->lwp_flag & LWP_ONRUNQ) != 0, ("not on runq6!")); |
| 934 | lp->lwp_flag &= ~LWP_ONRUNQ; | |
| 52eedfb5 MD |
935 | return lp; |
| 936 | } | |
| 38b25931 | 937 | |
#ifdef SMP

/*
 * need_user_resched_remote()
 *
 * Called via an ipi message to reschedule on another cpu.  If no user
 * thread is active on the target cpu (dd->uschedcp is NULL) and the
 * cpu's bit is set in bsd4_rdyprocmask, that bit is cleared and the
 * scheduler helper thread is woken up to help schedule one.  In every
 * other case need_user_resched() is called instead.
 *
 * <dummy> is unused.
 *
 * MPSAFE
 */
static
void
need_user_resched_remote(void *dummy)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	if (dd->uschedcp != NULL ||
	    (bsd4_rdyprocmask & gd->gd_cpumask) == 0) {
		need_user_resched();
		return;
	}

	atomic_clear_int(&bsd4_rdyprocmask, gd->gd_cpumask);
	lwkt_schedule(&dd->helper_thread);
}

#endif
| 38b25931 | 963 | |
| 52eedfb5 MD |
964 | /* |
| 965 | * bsd4_remrunqueue_locked() removes a given process from the run queue | |
| 966 | * that it is on, clearing the queue busy bit if it becomes empty. | |
| 967 | * | |
| 968 | * Note that user process scheduler is different from the LWKT schedule. | |
| 969 | * The user process scheduler only manages user processes but it uses LWKT | |
| 970 | * underneath, and a user process operating in the kernel will often be | |
| 971 | * 'released' from our management. | |
| 972 | * | |
| 973 | * MPSAFE - bsd4_spin must be held exclusively on call | |
| 974 | */ | |
| 975 | static void | |
| 976 | bsd4_remrunqueue_locked(struct lwp *lp) | |
| 977 | { | |
| 978 | struct rq *q; | |
| 979 | u_int32_t *which; | |
| 980 | u_int8_t pri; | |
| 981 | ||
| 9388413d SS |
982 | KKASSERT(lp->lwp_flag & LWP_ONRUNQ); |
| 983 | lp->lwp_flag &= ~LWP_ONRUNQ; | |
| 52eedfb5 MD |
984 | --bsd4_runqcount; |
| 985 | KKASSERT(bsd4_runqcount >= 0); | |
| 986 | ||
| 987 | pri = lp->lwp_rqindex; | |
| 988 | switch(lp->lwp_rqtype) { | |
| 989 | case RTP_PRIO_NORMAL: | |
| 990 | q = &bsd4_queues[pri]; | |
| 991 | which = &bsd4_queuebits; | |
| 992 | break; | |
| 993 | case RTP_PRIO_REALTIME: | |
| 994 | case RTP_PRIO_FIFO: | |
| 995 | q = &bsd4_rtqueues[pri]; | |
| 996 | which = &bsd4_rtqueuebits; | |
| 997 | break; | |
| 998 | case RTP_PRIO_IDLE: | |
| 999 | q = &bsd4_idqueues[pri]; | |
| 1000 | which = &bsd4_idqueuebits; | |
| 1001 | break; | |
| 1002 | default: | |
| 1003 | panic("remrunqueue: invalid rtprio type"); | |
| 1004 | /* NOT REACHED */ | |
| 1005 | } | |
| 1006 | TAILQ_REMOVE(q, lp, lwp_procq); | |
| 1007 | if (TAILQ_EMPTY(q)) { | |
| 1008 | KASSERT((*which & (1 << pri)) != 0, | |
| 1009 | ("remrunqueue: remove from empty queue")); | |
| 1010 | *which &= ~(1 << pri); | |
| 38b25931 MD |
1011 | } |
| 1012 | } | |
| 1013 | ||
| 52eedfb5 MD |
1014 | /* |
| 1015 | * bsd4_setrunqueue_locked() | |
| 1016 | * | |
| 1017 | * Add a process whos rqtype and rqindex had previously been calculated | |
| 1018 | * onto the appropriate run queue. Determine if the addition requires | |
| 1019 | * a reschedule on a cpu and return the cpuid or -1. | |
| 1020 | * | |
| 1021 | * NOTE: Lower priorities are better priorities. | |
| 1022 | * | |
| 1023 | * MPSAFE - bsd4_spin must be held exclusively on call | |
| 1024 | */ | |
| 1025 | static void | |
| 1026 | bsd4_setrunqueue_locked(struct lwp *lp) | |
| 1027 | { | |
| 1028 | struct rq *q; | |
| 1029 | u_int32_t *which; | |
| 1030 | int pri; | |
| 1031 | ||
| 9388413d SS |
1032 | KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0); |
| 1033 | lp->lwp_flag |= LWP_ONRUNQ; | |
| 52eedfb5 MD |
1034 | ++bsd4_runqcount; |
| 1035 | ||
| 1036 | pri = lp->lwp_rqindex; | |
| 1037 | ||
| 1038 | switch(lp->lwp_rqtype) { | |
| 1039 | case RTP_PRIO_NORMAL: | |
| 1040 | q = &bsd4_queues[pri]; | |
| 1041 | which = &bsd4_queuebits; | |
| 1042 | break; | |
| 1043 | case RTP_PRIO_REALTIME: | |
| 1044 | case RTP_PRIO_FIFO: | |
| 1045 | q = &bsd4_rtqueues[pri]; | |
| 1046 | which = &bsd4_rtqueuebits; | |
| 1047 | break; | |
| 1048 | case RTP_PRIO_IDLE: | |
| 1049 | q = &bsd4_idqueues[pri]; | |
| 1050 | which = &bsd4_idqueuebits; | |
| 1051 | break; | |
| 1052 | default: | |
| 1053 | panic("remrunqueue: invalid rtprio type"); | |
| 1054 | /* NOT REACHED */ | |
| 1055 | } | |
| 1056 | ||
| 1057 | /* | |
| 1058 | * Add to the correct queue and set the appropriate bit. If no | |
| 1059 | * lower priority (i.e. better) processes are in the queue then | |
| 1060 | * we want a reschedule, calculate the best cpu for the job. | |
| 1061 | * | |
| 1062 | * Always run reschedules on the LWPs original cpu. | |
| 1063 | */ | |
| 1064 | TAILQ_INSERT_TAIL(q, lp, lwp_procq); | |
| 1065 | *which |= 1 << pri; | |
| 1066 | } | |
| 1067 | ||
| 38b25931 MD |
1068 | #ifdef SMP |
| 1069 | ||
| 1070 | /* | |
| 1071 | * For SMP systems a user scheduler helper thread is created for each | |
| 1072 | * cpu and is used to allow one cpu to wakeup another for the purposes of | |
| c9e9fb21 MD |
1073 | * scheduling userland threads from setrunqueue(). |
| 1074 | * | |
| 1075 | * UP systems do not need the helper since there is only one cpu. | |
| 1076 | * | |
| 1077 | * We can't use the idle thread for this because we might block. | |
| 1078 | * Additionally, doing things this way allows us to HLT idle cpus | |
| 1079 | * on MP systems. | |
| 52eedfb5 MD |
1080 | * |
| 1081 | * MPSAFE | |
| 38b25931 MD |
1082 | */ |
| 1083 | static void | |
| 1084 | sched_thread(void *dummy) | |
| 1085 | { | |
| 52eedfb5 MD |
1086 | globaldata_t gd; |
| 1087 | bsd4_pcpu_t dd; | |
| 1088 | struct lwp *nlp; | |
| 1089 | cpumask_t cpumask; | |
| 52eedfb5 | 1090 | int cpuid; |
| 418f19aa SW |
1091 | #if 0 |
| 1092 | cpumask_t tmpmask; | |
| 52eedfb5 | 1093 | int tmpid; |
| 418f19aa | 1094 | #endif |
| 52eedfb5 MD |
1095 | |
| 1096 | gd = mycpu; | |
| 1097 | cpuid = gd->gd_cpuid; /* doesn't change */ | |
| b9eb1c19 | 1098 | cpumask = gd->gd_cpumask; /* doesn't change */ |
| 52eedfb5 MD |
1099 | dd = &bsd4_pcpu[cpuid]; |
| 1100 | ||
| 1101 | /* | |
| c9e9fb21 MD |
1102 | * Since we are woken up only when no user processes are scheduled |
| 1103 | * on a cpu, we can run at an ultra low priority. | |
| 52eedfb5 | 1104 | */ |
| 50017724 | 1105 | lwkt_setpri_self(TDPRI_USER_SCHEDULER); |
| 38b25931 | 1106 | |
| 38b25931 | 1107 | for (;;) { |
| 50017724 MD |
1108 | /* |
| 1109 | * We use the LWKT deschedule-interlock trick to avoid racing | |
| 1110 | * bsd4_rdyprocmask. This means we cannot block through to the | |
| 1111 | * manual lwkt_switch() call we make below. | |
| 1112 | */ | |
| 52eedfb5 | 1113 | crit_enter_gd(gd); |
| 50017724 | 1114 | lwkt_deschedule_self(gd->gd_curthread); |
| 52eedfb5 MD |
1115 | spin_lock_wr(&bsd4_spin); |
| 1116 | atomic_set_int(&bsd4_rdyprocmask, cpumask); | |
| b9eb1c19 MD |
1117 | |
| 1118 | clear_user_resched(); /* This satisfied the reschedule request */ | |
| 1119 | dd->rrcount = 0; /* Reset the round-robin counter */ | |
| 1120 | ||
| 52eedfb5 | 1121 | if ((bsd4_curprocmask & cpumask) == 0) { |
| b9eb1c19 MD |
1122 | /* |
| 1123 | * No thread is currently scheduled. | |
| 1124 | */ | |
| 1125 | KKASSERT(dd->uschedcp == NULL); | |
| 52eedfb5 MD |
1126 | if ((nlp = chooseproc_locked(NULL)) != NULL) { |
| 1127 | atomic_set_int(&bsd4_curprocmask, cpumask); | |
| 1128 | dd->upri = nlp->lwp_priority; | |
| 1129 | dd->uschedcp = nlp; | |
| 1130 | spin_unlock_wr(&bsd4_spin); | |
| 1131 | lwkt_acquire(nlp->lwp_thread); | |
| 1132 | lwkt_schedule(nlp->lwp_thread); | |
| 1133 | } else { | |
| 1134 | spin_unlock_wr(&bsd4_spin); | |
| 1135 | } | |
| b9eb1c19 MD |
1136 | #if 0 |
| 1137 | /* | |
| 1138 | * Disabled for now, this can create an infinite loop. | |
| 1139 | */ | |
| 1140 | } else if (bsd4_runqcount) { | |
| 52eedfb5 MD |
1141 | /* |
| 1142 | * Someone scheduled us but raced. In order to not lose | |
| 1143 | * track of the fact that there may be a LWP ready to go, | |
| 1144 | * forward the request to another cpu if available. | |
| 1145 | * | |
| 1146 | * Rotate through cpus starting with cpuid + 1. Since cpuid | |
| 1147 | * is already masked out by gd_other_cpus, just use ~cpumask. | |
| 1148 | */ | |
| b9eb1c19 MD |
1149 | tmpmask = bsd4_rdyprocmask & mycpu->gd_other_cpus & |
| 1150 | ~bsd4_curprocmask; | |
| 52eedfb5 MD |
1151 | if (tmpmask) { |
| 1152 | if (tmpmask & ~(cpumask - 1)) | |
| 1153 | tmpid = bsfl(tmpmask & ~(cpumask - 1)); | |
| 1154 | else | |
| 1155 | tmpid = bsfl(tmpmask); | |
| 1156 | bsd4_scancpu = tmpid; | |
| 1157 | atomic_clear_int(&bsd4_rdyprocmask, 1 << tmpid); | |
| 1158 | spin_unlock_wr(&bsd4_spin); | |
| 1159 | lwkt_schedule(&bsd4_pcpu[tmpid].helper_thread); | |
| 1160 | } else { | |
| 1161 | spin_unlock_wr(&bsd4_spin); | |
| 1162 | } | |
| b9eb1c19 MD |
1163 | #endif |
| 1164 | } else { | |
| 1165 | /* | |
| 1166 | * The runq is empty. | |
| 1167 | */ | |
| 1168 | spin_unlock_wr(&bsd4_spin); | |
| 38b25931 | 1169 | } |
| 52eedfb5 | 1170 | crit_exit_gd(gd); |
| 38b25931 MD |
1171 | lwkt_switch(); |
| 1172 | } | |
| 1173 | } | |
| 1174 | ||
| 1175 | /* | |
| 1176 | * Setup our scheduler helpers. Note that curprocmask bit 0 has already | |
| 1177 | * been cleared by rqinit() and we should not mess with it further. | |
| 1178 | */ | |
| 1179 | static void | |
| 1180 | sched_thread_cpu_init(void) | |
| 1181 | { | |
| 1182 | int i; | |
| 1183 | ||
| 1184 | if (bootverbose) | |
| 6ea70f76 | 1185 | kprintf("start scheduler helpers on cpus:"); |
| 38b25931 MD |
1186 | |
| 1187 | for (i = 0; i < ncpus; ++i) { | |
| 52eedfb5 | 1188 | bsd4_pcpu_t dd = &bsd4_pcpu[i]; |
| 38b25931 MD |
1189 | cpumask_t mask = 1 << i; |
| 1190 | ||
| 1191 | if ((mask & smp_active_mask) == 0) | |
| 1192 | continue; | |
| 1193 | ||
| 1194 | if (bootverbose) | |
| 6ea70f76 | 1195 | kprintf(" %d", i); |
| 38b25931 | 1196 | |
| 52eedfb5 | 1197 | lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread, |
| c9e9fb21 | 1198 | TDF_STOPREQ | TDF_MPSAFE, i, "usched %d", i); |
| 38b25931 MD |
1199 | |
| 1200 | /* | |
| 1201 | * Allow user scheduling on the target cpu. cpu #0 has already | |
| 1202 | * been enabled in rqinit(). | |
| 1203 | */ | |
| 1204 | if (i) | |
| 52eedfb5 MD |
1205 | atomic_clear_int(&bsd4_curprocmask, mask); |
| 1206 | atomic_set_int(&bsd4_rdyprocmask, mask); | |
| b9eb1c19 | 1207 | dd->upri = PRIBASE_NULL; |
| 38b25931 MD |
1208 | } |
| 1209 | if (bootverbose) | |
| 6ea70f76 | 1210 | kprintf("\n"); |
| 38b25931 | 1211 | } |
| ba39e2e0 MD |
1212 | SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND, |
| 1213 | sched_thread_cpu_init, NULL) | |
| 38b25931 MD |
1214 | |
| 1215 | #endif | |
| 1216 |