sys/kern/usched_bsd4.c

   1 /*
   2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  *
  26  * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.3 2005/10/11 09:59:56 corecode Exp $
  27  */
  28
  29 #include <sys/param.h>
  30 #include <sys/systm.h>
  31 #include <sys/kernel.h>
  32 #include <sys/lock.h>
  33 #include <sys/queue.h>
  34 #include <sys/proc.h>
  35 #include <sys/rtprio.h>
  36 #include <sys/thread2.h>
  37 #include <sys/uio.h>
  38 #include <sys/sysctl.h>
  39 #include <sys/resourcevar.h>
  40 #include <machine/ipl.h>
  41 #include <machine/cpu.h>
  42 #include <machine/smp.h>
  43
  44 /*
  45  * Priorities.  Note that with 32 run queues per scheduler each queue
  46  * represents four priority levels.
  47  */
  48
  49 #define MAXPRI                  128
  50 #define PRIMASK                 (MAXPRI - 1)
  51 #define PRIBASE_REALTIME        0
  52 #define PRIBASE_NORMAL          MAXPRI
  53 #define PRIBASE_IDLE            (MAXPRI * 2)
  54 #define PRIBASE_THREAD          (MAXPRI * 3)
  55 #define PRIBASE_NULL            (MAXPRI * 4)
  56
  57 #define NQS     32                      /* 32 run queues. */
  58 #define PPQ     (MAXPRI / NQS)          /* priorities per queue */
  59
  60 /*
  61  * NICEPPQ      - number of nice units per priority queue
  62  * ESTCPURAMP   - number of scheduler ticks for estcpu to switch queues
  63  *
  64  * ESTCPUPPQ    - number of estcpu units per priority queue
  65  * ESTCPUMAX    - number of estcpu units
  66  * ESTCPUINCR   - amount we have to increment p_estcpu per scheduling tick at
  67  *                100% cpu.
  68  */
  69 #define NICEPPQ         2
  70 #define ESTCPURAMP      4
  71 #define ESTCPUPPQ       512
  72 #define ESTCPUMAX       (ESTCPUPPQ * NQS)
  73 #define ESTCPUINCR      (ESTCPUPPQ / ESTCPURAMP)
  74 #define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)
  75
  76 #define ESTCPULIM(v)    min((v), ESTCPUMAX)
  77
  78 TAILQ_HEAD(rq, lwp);
  79
  80 #define lwp_priority    lwp_usdata.bsd4.priority
  81 #define lwp_rqindex     lwp_usdata.bsd4.rqindex
  82 #define lwp_origcpu     lwp_usdata.bsd4.origcpu
  83 #define lwp_estcpu      lwp_usdata.bsd4.estcpu
  84
  85 static void bsd4_acquire_curproc(struct lwp *lp);
  86 static void bsd4_release_curproc(struct lwp *lp);
  87 static void bsd4_select_curproc(globaldata_t gd);
  88 static void bsd4_setrunqueue(struct lwp *lp);
  89 static void bsd4_remrunqueue(struct lwp *lp);
  90 static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
  91                                 sysclock_t cpstamp);
  92 static void bsd4_resetpriority(struct lwp *lp);
  93 static void bsd4_forking(struct lwp *plp, struct lwp *lp);
  94 static void bsd4_exiting(struct lwp *plp, struct lwp *lp);
  95
  96 static void bsd4_recalculate_estcpu(struct lwp *lp);
  97
  98 struct usched usched_bsd4 = {
  99         { NULL },
 100         "bsd4", "Original DragonFly Scheduler",
 101         bsd4_acquire_curproc,
 102         bsd4_release_curproc,
 103         bsd4_select_curproc,
 104         bsd4_setrunqueue,
 105         bsd4_remrunqueue,
 106         bsd4_schedulerclock,
 107         bsd4_recalculate_estcpu,
 108         bsd4_resetpriority,
 109         bsd4_forking,
 110         bsd4_exiting
 111 };
 112
 113 /*
 114  * We have NQS (32) run queues per scheduling class.  For the normal
 115  * class, there are 128 priorities scaled onto these 32 queues.  New
 116  * processes are added to the last entry in each queue, and processes
 117  * are selected for running by taking them from the head and maintaining
 118  * a simple FIFO arrangement.  Realtime and Idle priority processes have
 119  * and explicit 0-31 priority which maps directly onto their class queue
 120  * index.  When a queue has something in it, the corresponding bit is
 121  * set in the queuebits variable, allowing a single read to determine
 122  * the state of all 32 queues and then a ffs() to find the first busy
 123  * queue.
 124  */
 125 static struct rq queues[NQS];
 126 static struct rq rtqueues[NQS];
 127 static struct rq idqueues[NQS];
 128 static u_int32_t queuebits;
 129 static u_int32_t rtqueuebits;
 130 static u_int32_t idqueuebits;
 131 static cpumask_t curprocmask = -1;      /* currently running a user process */
 132 static cpumask_t rdyprocmask;           /* ready to accept a user process */
 133 static int       runqcount;
 134 #ifdef SMP
 135 static int       scancpu;
 136 #endif
 137
 138 SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
 139 #ifdef INVARIANTS
 140 static int usched_nonoptimal;
 141 SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
 142         &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
 143 static int usched_optimal;
 144 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
 145         &usched_optimal, 0, "acquire_curproc() was optimal");
 146 #endif
 147 static int usched_debug = -1;
 148 SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
 149 #ifdef SMP
 150 static int remote_resched = 1;
 151 static int remote_resched_nonaffinity;
 152 static int remote_resched_affinity;
 153 static int choose_affinity;
 154 SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
 155         &remote_resched, 0, "Resched to another cpu");
 156 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
 157         &remote_resched_nonaffinity, 0, "Number of remote rescheds");
 158 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
 159         &remote_resched_affinity, 0, "Number of remote rescheds");
 160 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
 161         &choose_affinity, 0, "chooseproc() was smart");
 162 #endif
 163
 164 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
 165 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
 166         &usched_bsd4_rrinterval, 0, "");
 167 static int usched_bsd4_decay = ESTCPUINCR / 2;
 168 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
 169         &usched_bsd4_decay, 0, "");
 170
 171 /*
 172  * Initialize the run queues at boot time.
 173  */
 174 static void
 175 rqinit(void *dummy)
 176 {
 177         int i;
 178
 179         for (i = 0; i < NQS; i++) {
 180                 TAILQ_INIT(&queues[i]);
 181                 TAILQ_INIT(&rtqueues[i]);
 182                 TAILQ_INIT(&idqueues[i]);
 183         }
 184         curprocmask &= ~1;
 185 }
 186 SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
 187
 188 /*
 189  * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 190  * it selects a user process and returns it.  If chkp is non-NULL and chkp
 191  * has a better or equal then the process that would otherwise be
 192  * chosen, NULL is returned.
 193  *
 194  * Until we fix the RUNQ code the chkp test has to be strict or we may
 195  * bounce between processes trying to acquire the current process designation.
 196  */
 197 static
 198 struct lwp *
 199 chooseproc(struct lwp *chklp)
 200 {
 201         struct lwp *lp;
 202         struct rq *q;
 203         u_int32_t *which;
 204         u_int32_t pri;
 205
 206         if (rtqueuebits) {
 207                 pri = bsfl(rtqueuebits);
 208                 q = &rtqueues[pri];
 209                 which = &rtqueuebits;
 210         } else if (queuebits) {
 211                 pri = bsfl(queuebits);
 212                 q = &queues[pri];
 213                 which = &queuebits;
 214         } else if (idqueuebits) {
 215                 pri = bsfl(idqueuebits);
 216                 q = &idqueues[pri];
 217                 which = &idqueuebits;
 218         } else {
 219                 return NULL;
 220         }
 221         lp = TAILQ_FIRST(q);
 222         KASSERT(lp, ("chooseproc: no lwp on busy queue"));
 223
 224         /*
 225          * If the passed lwp <chklp> is reasonably close to the selected
 226          * lwp <lp>, return NULL (indicating that <chklp> should be kept).
 227          *
 228          * Note that we must error on the side of <chklp> to avoid bouncing
 229          * between threads in the acquire code.
 230          */
 231         if (chklp) {
 232                 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
 233                         return(NULL);
 234         }
 235
 236 #ifdef SMP
 237         /*
 238          * If the chosen lwp does not reside on this cpu spend a few
 239          * cycles looking for a better candidate at the same priority level.
 240          * This is a fallback check, setrunqueue() tries to wakeup the
 241          * correct cpu and is our front-line affinity.
 242          */
 243         if (lp->lwp_thread->td_gd != mycpu &&
 244             (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
 245         ) {
 246                 if (chklp->lwp_thread->td_gd == mycpu) {
 247                         ++choose_affinity;
 248                         lp = chklp;
 249                 }
 250         }
 251 #endif
 252
 253         TAILQ_REMOVE(q, lp, lwp_procq);
 254         --runqcount;
 255         if (TAILQ_EMPTY(q))
 256                 *which &= ~(1 << pri);
 257         KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
 258         lp->lwp_proc->p_flag &= ~P_ONRUNQ;
 259         return lp;
 260 }
 261
 262 #ifdef SMP
 263 /*
 264  * called via an ipi message to reschedule on another cpu.
 265  */
 266 static
 267 void
 268 need_user_resched_remote(void *dummy)
 269 {
 270         need_user_resched();
 271 }
 272
 273 #endif
 274
 275 /*
 276  * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
 277  * user process may represent any user process, including the current
 278  * process.
 279  *
 280  * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 281  * cpus in an attempt to keep the process on the current cpu at least for
 282  * a little while to take advantage of locality of reference (e.g. fork/exec
 283  * or short fork/exit, and uio_yield()).
 284  *
 285  * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 286  * or (user level) preempt on the same cpu that a process was previously
 287  * scheduled to.  If we cannot do this but we are at enough of a higher
 288  * priority then the processes running on other cpus, we will allow the
 289  * process to be stolen by another cpu.
 290  *
 291  * WARNING! a thread can be acquired by another cpu the moment it is put
 292  * on the user scheduler's run queue AND we release the MP lock.  Since we
 293  * release the MP lock before switching out another cpu may begin stealing
 294  * our current thread before we are completely switched out!  The
 295  * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 296  * thread before stealing it.
 297  *
 298  * NOTE on need_user_resched() calls: we have to call need_user_resched()
 299  * if the new process is more important then the current process, or if
 300  * the new process is the current process and is now less important then
 301  * other processes.
 302  *
 303  * The associated thread must NOT be scheduled.
 304  * The process must be runnable.
 305  * This must be called at splhigh().
 306  */
 307 static void
 308 bsd4_setrunqueue(struct lwp *lp)
 309 {
 310         struct rq *q;
 311         struct globaldata *gd;
 312         int pri;
 313         int cpuid;
 314         u_int32_t needresched;
 315 #ifdef SMP
 316         int count;
 317         cpumask_t mask;
 318 #endif
 319
 320         crit_enter();
 321         KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
 322         KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0,
 323             ("lwp %d/%d already on runq! flag %08x", lp->lwp_proc->p_pid,
 324              lp->lwp_tid, lp->lwp_proc->p_flag));
 325         KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
 326
 327         /*
 328          * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
 329          */
 330         gd = lp->lwp_thread->td_gd;
 331
 332         /*
 333          * Because recalculate is only called once or twice for long sleeps,
 334          * not every second forever while the process is sleeping, we have
 335          * to manually call it to resynchronize p_cpbase on wakeup or it
 336          * will wrap if the process was sleeping long enough (e.g. ~10 min
 337          * with the ACPI timer) and really mess up the nticks calculation.
 338          */
 339         if (lp->lwp_slptime) {
 340             bsd4_recalculate_estcpu(lp);
 341             lp->lwp_slptime = 0;
 342         }
 343         /*
 344          * We have not been released, make sure that we are not the currently
 345          * designated process.
 346          */
 347         KKASSERT(gd->gd_uschedcp != lp);
 348
 349         /*
 350          * Check cpu affinity.  The associated thread is stable at the
 351          * moment.  Note that we may be checking another cpu here so we
 352          * have to be careful.  We are currently protected by the BGL.
 353          *
 354          * This allows us to avoid actually queueing the process.
 355          * acquire_curproc() will handle any threads we mistakenly schedule.
 356          */
 357         cpuid = gd->gd_cpuid;
 358
 359         if ((curprocmask & (1 << cpuid)) == 0) {
 360                 curprocmask |= 1 << cpuid;
 361                 gd->gd_uschedcp = lp;
 362                 gd->gd_upri = lp->lwp_priority;
 363                 lwkt_schedule(lp->lwp_thread);
 364                 /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
 365                 crit_exit();
 366 #ifdef SMP
 367                 if (gd != mycpu)
 368                         ++remote_resched_affinity;
 369 #endif
 370                 return;
 371         }
 372
 373         /*
 374          * gd and cpuid may still 'hint' at another cpu.  Even so we have
 375          * to place this process on the userland scheduler's run queue for
 376          * action by the target cpu.
 377          */
 378         ++runqcount;
 379         lp->lwp_proc->p_flag |= P_ONRUNQ;
 380         if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
 381                 pri = (lp->lwp_priority & PRIMASK) / PPQ;
 382                 q = &queues[pri];
 383                 queuebits |= 1 << pri;
 384                 needresched = (queuebits & ((1 << pri) - 1));
 385         } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
 386                    lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
 387                 pri = (u_int8_t)lp->lwp_rtprio.prio;
 388                 q = &rtqueues[pri];
 389                 rtqueuebits |= 1 << pri;
 390                 needresched = (rtqueuebits & ((1 << pri) - 1));
 391         } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
 392                 pri = (u_int8_t)lp->lwp_rtprio.prio;
 393                 q = &idqueues[pri];
 394                 idqueuebits |= 1 << pri;
 395                 needresched = (idqueuebits & ((1 << pri) - 1));
 396         } else {
 397                 needresched = 0;
 398                 panic("setrunqueue: invalid rtprio type");
 399         }
 400         KKASSERT(pri < 32);
 401         lp->lwp_rqindex = pri;          /* remember the queue index */
 402         TAILQ_INSERT_TAIL(q, lp, lwp_procq);
 403
 404 #ifdef SMP
 405         /*
 406          * Either wakeup other cpus user thread scheduler or request
 407          * preemption on other cpus (which will also wakeup a HLT).
 408          *
 409          * NOTE!  gd and cpuid may still be our 'hint', not our current
 410          * cpu info.
 411          */
 412
 413         count = runqcount;
 414
 415         /*
 416          * Check cpu affinity for user preemption (when the curprocmask bit
 417          * is set).  Note that gd_upri is a speculative field (we modify
 418          * another cpu's gd_upri to avoid sending ipiq storms).
 419          */
 420         if (gd == mycpu) {
 421                 if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
 422                         if (lp->lwp_priority < gd->gd_upri - PPQ) {
 423                                 gd->gd_upri = lp->lwp_priority;
 424                                 gd->gd_rrcount = 0;
 425                                 need_user_resched();
 426                                 --count;
 427                         } else if (gd->gd_uschedcp == lp && needresched) {
 428                                 gd->gd_rrcount = 0;
 429                                 need_user_resched();
 430                                 --count;
 431                         }
 432                 }
 433         } else if (remote_resched) {
 434                 if (lp->lwp_priority < gd->gd_upri - PPQ) {
 435                         gd->gd_upri = lp->lwp_priority;
 436                         lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
 437                         --count;
 438                         ++remote_resched_affinity;
 439                 }
 440         }
 441
 442         /*
 443          * No affinity, first schedule to any cpus that do not have a current
 444          * process.  If there is a free cpu we always schedule to it.
 445          */
 446         if (count &&
 447             (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
 448             (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) {
 449                 if (!mask)
 450                         printf("lwp %d/%d nocpu to schedule it on\n",
 451                                lp->lwp_proc->p_pid, lp->lwp_tid);
 452                 while (mask && count) {
 453                         cpuid = bsfl(mask);
 454                         KKASSERT((curprocmask & (1 << cpuid)) == 0);
 455                         rdyprocmask &= ~(1 << cpuid);
 456                         lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
 457                         --count;
 458                         mask &= ~(1 << cpuid);
 459                 }
 460         }
 461
 462         /*
 463          * If there are still runnable processes try to wakeup a random
 464          * cpu that is running a much lower priority process in order to
 465          * preempt on it.  Note that gd_upri is only a hint, so we can
 466          * overwrite it from the wrong cpu.   If we can't find one, we
 467          * are SOL.
 468          *
 469          * We depress the priority check so multiple cpu bound programs
 470          * do not bounce between cpus.  Remember that the clock interrupt
 471          * will also cause all cpus to reschedule.
 472          *
 473          * We must mask against rdyprocmask or we will race in the boot
 474          * code (before all cpus have working scheduler helpers), plus
 475          * some cpus might not be operational and/or not configured to
 476          * handle user processes.
 477          */
 478         if (count && remote_resched && ncpus > 1) {
 479                 cpuid = scancpu;
 480                 do {
 481                         if (++cpuid == ncpus)
 482                                 cpuid = 0;
 483                 } while (cpuid == mycpu->gd_cpuid);
 484                 scancpu = cpuid;
 485
 486                 if (rdyprocmask & (1 << cpuid)) {
 487                         gd = globaldata_find(cpuid);
 488
 489                         if (lp->lwp_priority < gd->gd_upri - PPQ) {
 490                                 gd->gd_upri = lp->lwp_priority;
 491                                 lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
 492                                 ++remote_resched_nonaffinity;
 493                         }
 494                 }
 495         }
 496 #else
 497         if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
 498                 if (lp->lwp_priority < gd->gd_upri - PPQ) {
 499                         gd->gd_upri = lp->lwp_priority;
 500                         gd->gd_rrcount = 0;
 501                         need_user_resched();
 502                 } else if (gd->gd_uschedcp == lp && needresched) {
 503                         gd->gd_rrcount = 0;
 504                         need_user_resched();
 505                 }
 506         }
 507 #endif
 508         crit_exit();
 509 }
 510
 511 /*
 512  * remrunqueue() removes a given process from the run queue that it is on,
 513  * clearing the queue busy bit if it becomes empty.  This function is called
 514  * when a userland process is selected for LWKT scheduling.  Note that
 515  * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 516  * several userland processes whos threads are scheduled or otherwise in
 517  * a special state, and such processes are NOT on the userland scheduler's
 518  * run queue.
 519  *
 520  * This must be called at splhigh().
 521  */
 522 static void
 523 bsd4_remrunqueue(struct lwp *lp)
 524 {
 525         struct rq *q;
 526         u_int32_t *which;
 527         u_int8_t pri;
 528
 529         crit_enter();
 530         KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
 531         lp->lwp_proc->p_flag &= ~P_ONRUNQ;
 532         --runqcount;
 533         KKASSERT(runqcount >= 0);
 534         pri = lp->lwp_rqindex;
 535         if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
 536                 q = &queues[pri];
 537                 which = &queuebits;
 538         } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
 539                    lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
 540                 q = &rtqueues[pri];
 541                 which = &rtqueuebits;
 542         } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
 543                 q = &idqueues[pri];
 544                 which = &idqueuebits;
 545         } else {
 546                 panic("remrunqueue: invalid rtprio type");
 547         }
 548         TAILQ_REMOVE(q, lp, lwp_procq);
 549         if (TAILQ_EMPTY(q)) {
 550                 KASSERT((*which & (1 << pri)) != 0,
 551                         ("remrunqueue: remove from empty queue"));
 552                 *which &= ~(1 << pri);
 553         }
 554         crit_exit();
 555 }
 556
 557 /*
 558  * This routine is called from a systimer IPI.  It MUST be MP-safe and
 559  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
 560  */
 561 static
 562 void
 563 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
 564 {
 565         globaldata_t gd = mycpu;
 566
 567         /*
 568          * Do we need to round-robin?  We round-robin 10 times a second.
 569          * This should only occur for cpu-bound batch processes.
 570          */
 571         if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
 572                 gd->gd_rrcount = 0;
 573                 need_user_resched();
 574         }
 575
 576         /*
 577          * As the process accumulates cpu time p_estcpu is bumped and may
 578          * push the process into another scheduling queue.  It typically
 579          * takes 4 ticks to bump the queue.
 580          */
 581         lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
 582
 583         /*
 584          * Reducing p_origcpu over time causes more of our estcpu to be
 585          * returned to the parent when we exit.  This is a small tweak
 586          * for the batch detection heuristic.
 587          */
 588         if (lp->lwp_origcpu)
 589                 --lp->lwp_origcpu;
 590
 591         /* XXX optimize, avoid lock if no reset is required */
 592         if (try_mplock()) {
 593                 bsd4_resetpriority(lp);
 594                 rel_mplock();
 595         }
 596 }
 597
 598 /*
 599  * Release the current process designation on p.  P MUST BE CURPROC.
 600  * Attempt to assign a new current process from the run queue.
 601  *
 602  * This function is called from exit1(), tsleep(), and the passive
 603  * release code setup in <arch>/<arch>/trap.c
 604  *
 605  * If we do not have or cannot get the MP lock we just wakeup the userland
 606  * helper scheduler thread for this cpu to do the work for us.
 607  *
 608  * WARNING!  The MP lock may be in an unsynchronized state due to the
 609  * way get_mplock() works and the fact that this function may be called
 610  * from a passive release during a lwkt_switch().   try_mplock() will deal
 611  * with this for us but you should be aware that td_mpcount may not be
 612  * useable.
 613  */
 614 static void
 615 bsd4_release_curproc(struct lwp *lp)
 616 {
 617         int cpuid;
 618         globaldata_t gd = mycpu;
 619
 620         KKASSERT(lp->lwp_thread->td_gd == gd);
 621         crit_enter();
 622         cpuid = gd->gd_cpuid;
 623
 624         if (gd->gd_uschedcp == lp) {
 625                 if (try_mplock()) {
 626                         /*
 627                          * YYY when the MP lock is not assumed (see else) we
 628                          * will have to check that gd_uschedcp is still == lp
 629                          * after acquisition of the MP lock
 630                          */
 631                         gd->gd_uschedcp = NULL;
 632                         gd->gd_upri = PRIBASE_NULL;
 633                         bsd4_select_curproc(gd);
 634                         rel_mplock();
 635                 } else {
 636                         KKASSERT(0);    /* MP LOCK ALWAYS HELD AT THE MOMENT */
 637                         gd->gd_uschedcp = NULL;
 638                         gd->gd_upri = PRIBASE_NULL;
 639                         /* YYY uschedcp and curprocmask */
 640                         if (runqcount && (rdyprocmask & (1 << cpuid))) {
 641                                 rdyprocmask &= ~(1 << cpuid);
 642                                 lwkt_schedule(&mycpu->gd_schedthread);
 643                         }
 644                 }
 645         }
 646         crit_exit();
 647 }
 648
 649 /*
 650  * Select a new current process, potentially retaining gd_uschedcp.  However,
 651  * be sure to round-robin.  This routine is generally only called if a
 652  * reschedule is requested and that typically only occurs if a new process
 653  * has a better priority or when we are round-robining.
 654  *
 655  * NOTE: Must be called with giant held and the current cpu's gd.
 656  * NOTE: The caller must handle the situation where it loses a
 657  *      uschedcp designation that it previously held, typically by
 658  *      calling acquire_curproc() again.
 659  * NOTE: May not block
 660  */
 661 static
 662 void
 663 bsd4_select_curproc(globaldata_t gd)
 664 {
 665         struct lwp *nlp;
 666         int cpuid = gd->gd_cpuid;
 667         void *old;
 668
 669         clear_user_resched();
 670
 671         /*
 672          * Choose the next designated current user process.
 673          * Note that we cannot schedule gd_schedthread
 674          * if runqcount is 0 without creating a scheduling
 675          * loop.
 676          *
 677          * We do not clear the user resched request here,
 678          * we need to test it later when we re-acquire.
 679          *
 680          * NOTE: chooseproc returns NULL if the chosen lwp
 681          * is gd_uschedcp. XXX needs cleanup.
 682          */
 683         old = gd->gd_uschedcp;
 684         if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) {
 685                 curprocmask |= 1 << cpuid;
 686                 gd->gd_upri = nlp->lwp_priority;
 687                 gd->gd_uschedcp = nlp;
 688                 lwkt_acquire(nlp->lwp_thread);
 689                 lwkt_schedule(nlp->lwp_thread);
 690         } else if (gd->gd_uschedcp) {
 691                 gd->gd_upri = gd->gd_uschedcp->lwp_priority;
 692                 KKASSERT(curprocmask & (1 << cpuid));
 693         } else if (runqcount && (rdyprocmask & (1 << cpuid))) {
 694                 /*gd->gd_uschedcp = NULL;*/
 695                 curprocmask &= ~(1 << cpuid);
 696                 rdyprocmask &= ~(1 << cpuid);
 697                 lwkt_schedule(&gd->gd_schedthread);
 698         } else {
 699                 /*gd->gd_uschedcp = NULL;*/
 700                 curprocmask &= ~(1 << cpuid);
 701         }
 702 }
 703
 704 /*
 705  * Acquire the current process designation on the CURRENT process only.
 706  * This function is called at kernel-user priority (not userland priority)
 707  * when curlwp does not match gd_uschedcp.
 708  *
 709  * Basically we recalculate our estcpu to hopefully give us a more
 710  * favorable disposition, setrunqueue, then wait for the curlwp
 711  * designation to be handed to us (if the setrunqueue didn't do it).
 712  */
 713 static void
 714 bsd4_acquire_curproc(struct lwp *lp)
 715 {
 716         globaldata_t gd = mycpu;
 717
 718         crit_enter();
 719         ++lp->lwp_stats->p_ru.ru_nivcsw;
 720
 721         /*
 722          * Loop until we become the current process.
 723          */
 724         do {
 725                 KKASSERT(lp == gd->gd_curthread->td_lwp);
 726                 bsd4_recalculate_estcpu(lp);
 727                 lwkt_deschedule_self(gd->gd_curthread);
 728                 bsd4_setrunqueue(lp);
 729                 lwkt_switch();
 730
 731                 /*
 732                  * WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU, RELOAD GD.
 733                  */
 734                 gd = mycpu;
 735         } while (gd->gd_uschedcp != lp);
 736
 737         crit_exit();
 738
 739         /*
 740          * That's it.  Cleanup, we are done.  The caller can return to
 741          * user mode now.
 742          */
 743         KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
 744 }
 745
 746 /*
 747  * Compute the priority of a process when running in user mode.
 748  * Arrange to reschedule if the resulting priority is better
 749  * than that of the current process.
 750  */
 751 static void
 752 bsd4_resetpriority(struct lwp *lp)
 753 {
 754         int newpriority;
 755         int opq;
 756         int npq;
 757
 758         /*
 759          * Set p_priority for general process comparisons
 760          */
 761         switch(lp->lwp_rtprio.type) {
 762         case RTP_PRIO_REALTIME:
 763                 lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
 764                 return;
 765         case RTP_PRIO_NORMAL:
 766                 break;
 767         case RTP_PRIO_IDLE:
 768                 lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
 769                 return;
 770         case RTP_PRIO_THREAD:
 771                 lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
 772                 return;
 773         }
 774
 775         /*
 776          * NORMAL priorities fall through.  These are based on niceness
 777          * and cpu use.  Lower numbers == higher priorities.
 778          *
 779          * Calculate our priority based on our niceness and estimated cpu.
 780          * Note that the nice value adjusts the baseline, which effects
 781          * cpu bursts but does not effect overall cpu use between cpu-bound
 782          * processes.  The use of the nice field in the decay calculation
 783          * controls the overall cpu use.
 784          *
 785          * This isn't an exact calculation.  We fit the full nice and
 786          * estcpu range into the priority range so the actual PPQ value
 787          * is incorrect, but it's still a reasonable way to think about it.
 788          */
 789         newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
 790         newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ;
 791         newpriority = newpriority * MAXPRI /
 792                     (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
 793         newpriority = MIN(newpriority, MAXPRI - 1);     /* sanity */
 794         newpriority = MAX(newpriority, 0);              /* sanity */
 795         npq = newpriority / PPQ;
 796         crit_enter();
 797         opq = (lp->lwp_priority & PRIMASK) / PPQ;
 798         if (lp->lwp_proc->p_stat == SRUN && (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) {
 799                 /*
 800                  * We have to move the process to another queue
 801                  */
 802                 bsd4_remrunqueue(lp);
 803                 lp->lwp_priority = PRIBASE_NORMAL + newpriority;
 804                 bsd4_setrunqueue(lp);
 805         } else {
 806                 /*
 807                  * We can just adjust the priority and it will be picked
 808                  * up later.
 809                  */
 810                 KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
 811                 lp->lwp_priority = PRIBASE_NORMAL + newpriority;
 812         }
 813         crit_exit();
 814 }
 815
 816 /*
 817  * Called from fork1() when a new child process is being created.
 818  *
 819  * Give the child process an initial estcpu that is more batch then
 820  * its parent and dock the parent for the fork (but do not
 821  * reschedule the parent).   This comprises the main part of our batch
 822  * detection heuristic for both parallel forking and sequential execs.
 823  *
 824  * Interactive processes will decay the boosted estcpu quickly while batch
 825  * processes will tend to compound it.
 826  * XXX lwp should be "spawning" instead of "forking"
 827  */
 828 static void
 829 bsd4_forking(struct lwp *plp, struct lwp *lp)
 830 {
 831         lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
 832         lp->lwp_origcpu = lp->lwp_estcpu;
 833         plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
 834 }
 835
 836 /*
 837  * Called when the parent reaps a child.   Propogate cpu use by the child
 838  * back to the parent.
 839  */
 840 static void
 841 bsd4_exiting(struct lwp *plp, struct lwp *lp)
 842 {
 843         int delta;
 844
 845         if (plp->lwp_proc->p_pid != 1) {
 846                 delta = lp->lwp_estcpu - lp->lwp_origcpu;
 847                 if (delta > 0)
 848                         plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta);
 849         }
 850 }
 851
 852 /*
 853  * Called from acquire and from kern_synch's one-second timer with a
 854  * critical section held.
 855  *
 856  * Decay p_estcpu based on the number of ticks we haven't been running
 857  * and our p_nice.  As the load increases each process observes a larger
 858  * number of idle ticks (because other processes are running in them).
 859  * This observation leads to a larger correction which tends to make the
 860  * system more 'batchy'.
 861  *
 862  * Note that no recalculation occurs for a process which sleeps and wakes
 863  * up in the same tick.  That is, a system doing thousands of context
 864  * switches per second will still only do serious estcpu calculations
 865  * ESTCPUFREQ times per second.
 866  */
 867 static
 868 void
 869 bsd4_recalculate_estcpu(struct lwp *lp)
 870 {
 871         globaldata_t gd = mycpu;
 872         sysclock_t cpbase;
 873         int loadfac;
 874         int ndecay;
 875         int nticks;
 876         int nleft;
 877
 878         /*
 879          * We have to subtract periodic to get the last schedclock
 880          * timeout time, otherwise we would get the upcoming timeout.
 881          * Keep in mind that a process can migrate between cpus and
 882          * while the scheduler clock should be very close, boundary
 883          * conditions could lead to a small negative delta.
 884          */
 885         cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
 886
 887         if (lp->lwp_slptime > 1) {
 888                 /*
 889                  * Too much time has passed, do a coarse correction.
 890                  */
 891                 lp->lwp_estcpu = lp->lwp_estcpu >> 1;
 892                 bsd4_resetpriority(lp);
 893                 lp->lwp_cpbase = cpbase;
 894                 lp->lwp_cpticks = 0;
 895         } else if (lp->lwp_cpbase != cpbase) {
 896                 /*
 897                  * Adjust estcpu if we are in a different tick.  Don't waste
 898                  * time if we are in the same tick.
 899                  *
 900                  * First calculate the number of ticks in the measurement
 901                  * interval.
 902                  */
 903                 nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic;
 904                 updatepcpu(lp, lp->lwp_cpticks, nticks);
 905
 906                 if ((nleft = nticks - lp->lwp_cpticks) < 0)
 907                         nleft = 0;
 908                 if (usched_debug == lp->lwp_proc->p_pid) {
 909                         printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d",
 910                                 lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu,
 911                                 lp->lwp_cpticks, nticks, nleft);
 912                 }
 913
 914                 /*
 915                  * Calculate a decay value based on ticks remaining scaled
 916                  * down by the instantanious load and p_nice.
 917                  */
 918                 if ((loadfac = runqcount) < 2)
 919                         loadfac = 2;
 920                 ndecay = nleft * usched_bsd4_decay * 2 *
 921                         (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2);
 922
 923                 /*
 924                  * Adjust p_estcpu.  Handle a border case where batch jobs
 925                  * can get stalled long enough to decay to zero when they
 926                  * shouldn't.
 927                  */
 928                 if (lp->lwp_estcpu > ndecay * 2)
 929                         lp->lwp_estcpu -= ndecay;
 930                 else
 931                         lp->lwp_estcpu >>= 1;
 932
 933                 if (usched_debug == lp->lwp_proc->p_pid)
 934                         printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu);
 935
 936                 bsd4_resetpriority(lp);
 937                 lp->lwp_cpbase = cpbase;
 938                 lp->lwp_cpticks = 0;
 939         }
 940 }
 941
 942 #ifdef SMP
 943
 944 /*
 945  * For SMP systems a user scheduler helper thread is created for each
 946  * cpu and is used to allow one cpu to wakeup another for the purposes of
 947  * scheduling userland threads from setrunqueue().  UP systems do not
 948  * need the helper since there is only one cpu.  We can't use the idle
 949  * thread for this because we need to hold the MP lock.  Additionally,
 950  * doing things this way allows us to HLT idle cpus on MP systems.
 951  */
 952 static void
 953 sched_thread(void *dummy)
 954 {
 955     globaldata_t gd = mycpu;
 956     int cpuid = gd->gd_cpuid;           /* doesn't change */
 957     u_int32_t cpumask = 1 << cpuid;     /* doesn't change */
 958
 959     get_mplock();                       /* hold the MP lock */
 960     for (;;) {
 961         struct lwp *nlp;
 962
 963         lwkt_deschedule_self(gd->gd_curthread); /* interlock */
 964         rdyprocmask |= cpumask;
 965         crit_enter_quick(gd->gd_curthread);
 966         if ((curprocmask & cpumask) == 0 && (nlp = chooseproc(NULL)) != NULL) {
 967             curprocmask |= cpumask;
 968             gd->gd_upri = nlp->lwp_priority;
 969             gd->gd_uschedcp = nlp;
 970             lwkt_acquire(nlp->lwp_thread);
 971             lwkt_schedule(nlp->lwp_thread);
 972         }
 973         crit_exit_quick(gd->gd_curthread);
 974         lwkt_switch();
 975     }
 976 }
 977
 978 /*
 979  * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 980  * been cleared by rqinit() and we should not mess with it further.
 981  */
 982 static void
 983 sched_thread_cpu_init(void)
 984 {
 985     int i;
 986
 987     if (bootverbose)
 988         printf("start scheduler helpers on cpus:");
 989
 990     for (i = 0; i < ncpus; ++i) {
 991         globaldata_t dgd = globaldata_find(i);
 992         cpumask_t mask = 1 << i;
 993
 994         if ((mask & smp_active_mask) == 0)
 995             continue;
 996
 997         if (bootverbose)
 998             printf(" %d", i);
 999
1000         lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
1001                     TDF_STOPREQ, i, "usched %d", i);
1002
1003         /*
1004          * Allow user scheduling on the target cpu.  cpu #0 has already
1005          * been enabled in rqinit().
1006          */
1007         if (i)
1008             curprocmask &= ~mask;
1009         rdyprocmask |= mask;
1010     }
1011     if (bootverbose)
1012         printf("\n");
1013 }
1014 SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)
1015
1016 #endif
1017