sys/kern/usched_bsd4.c

   1 /*
   2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 #include <sys/param.h>
  28 #include <sys/systm.h>
  29 #include <sys/kernel.h>
  30 #include <sys/lock.h>
  31 #include <sys/queue.h>
  32 #include <sys/proc.h>
  33 #include <sys/rtprio.h>
  34 #include <sys/uio.h>
  35 #include <sys/sysctl.h>
  36 #include <sys/resourcevar.h>
  37 #include <sys/spinlock.h>
  38 #include <machine/cpu.h>
  39 #include <machine/smp.h>
  40
  41 #include <sys/thread2.h>
  42 #include <sys/spinlock2.h>
  43 #include <sys/mplock2.h>
  44
  45 /*
  46  * Priorities.  Note that with 32 run queues per scheduler each queue
  47  * represents four priority levels.
  48  */
  49
   50 #define MAXPRI                  128             /* priorities per class */
   51 #define PRIMASK                 (MAXPRI - 1)    /* priority within a class */
      /* Class bases.  A numerically lower priority value is the better one. */
   52 #define PRIBASE_REALTIME        0
   53 #define PRIBASE_NORMAL          MAXPRI
   54 #define PRIBASE_IDLE            (MAXPRI * 2)
   55 #define PRIBASE_THREAD          (MAXPRI * 3)
      /* Above every class; stored in upri when a cpu has no designated lwp. */
   56 #define PRIBASE_NULL            (MAXPRI * 4)
   57
   58 #define NQS     32                      /* 32 run queues. */
   59 #define PPQ     (MAXPRI / NQS)          /* priorities per queue */
   60 #define PPQMASK (PPQ - 1)               /* masked off to compare by queue */
  61
   62 /*
   63  * NICEPPQ      - number of nice units per priority queue
   64  *
   65  * ESTCPUPPQ    - number of estcpu units per priority queue
   66  * ESTCPUMAX    - number of estcpu units
       *
       * BATCHMAX     - upper bound of lwp_batch (30 seconds' worth of
       *                ESTCPUFREQ ticks)
       * PRIO_RANGE   - number of nice values between PRIO_MIN and PRIO_MAX
       * ESTCPULIM(v) - v limited so that it does not exceed ESTCPUMAX
   67  */
   68 #define NICEPPQ         2
   69 #define ESTCPUPPQ       512
   70 #define ESTCPUMAX       (ESTCPUPPQ * NQS)
   71 #define BATCHMAX        (ESTCPUFREQ * 30)
   72 #define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)
   73
   74 #define ESTCPULIM(v)    min((v), ESTCPUMAX)
  75
   76 TAILQ_HEAD(rq, lwp);    /* run queue head type: a tail queue of lwps */
   77
      /* Shorthand for this scheduler's per-lwp fields kept in lwp_usdata.bsd4. */
   78 #define lwp_priority    lwp_usdata.bsd4.priority
   79 #define lwp_rqindex     lwp_usdata.bsd4.rqindex
   80 #define lwp_estcpu      lwp_usdata.bsd4.estcpu
   81 #define lwp_batch       lwp_usdata.bsd4.batch
   82 #define lwp_rqtype      lwp_usdata.bsd4.rqtype
  83
   84 static void bsd4_acquire_curproc(struct lwp *lp);
   85 static void bsd4_release_curproc(struct lwp *lp);
   86 static void bsd4_select_curproc(globaldata_t gd);
   87 static void bsd4_setrunqueue(struct lwp *lp);
   88 static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
   89                                 sysclock_t cpstamp);
   90 static void bsd4_recalculate_estcpu(struct lwp *lp);
   91 static void bsd4_resetpriority(struct lwp *lp);
   92 static void bsd4_forking(struct lwp *plp, struct lwp *lp);
   93 static void bsd4_exiting(struct lwp *lp, struct proc *);
   94 static void bsd4_yield(struct lwp *lp);
      /*
       * All of the above except bsd4_select_curproc() are installed in the
       * usched_bsd4 descriptor.
       */
   95
      /*
       * Internal helpers.  chooseproc_locked() and bsd4_setrunqueue_locked()
       * are called in this file with bsd4_spin held.
       */
   96 #ifdef SMP
   97 static void need_user_resched_remote(void *dummy);
   98 #endif
   99 static struct lwp *chooseproc_locked(struct lwp *chklp);
  100 static void bsd4_remrunqueue_locked(struct lwp *lp);
  101 static void bsd4_setrunqueue_locked(struct lwp *lp);
 102
  103 struct usched usched_bsd4 = {
              /*
               * Descriptor for the bsd4 user scheduler.  The initializer is
               * positional, so the entries must stay in the order of the
               * members of struct usched (declared outside this file).
               */
  104         { NULL },
  105         "bsd4", "Original DragonFly Scheduler",
  106         NULL,                   /* default registration */
  107         NULL,                   /* default deregistration */
  108         bsd4_acquire_curproc,
  109         bsd4_release_curproc,
  110         bsd4_setrunqueue,
  111         bsd4_schedulerclock,
  112         bsd4_recalculate_estcpu,
  113         bsd4_resetpriority,
  114         bsd4_forking,
  115         bsd4_exiting,
  116         NULL,                   /* setcpumask not supported */
  117         bsd4_yield
  118 };
 119
  120 struct usched_bsd4_pcpu {
              /*
               * Per-cpu scheduler state; one entry per cpu in bsd4_pcpu[],
               * indexed by gd_cpuid.
               */
              /* helper thread kicked with lwkt_schedule() by bsd4_setrunqueue() */
  121         struct thread helper_thread;
              /* scheduler clock ticks counted toward the next round-robin */
  122         short   rrcount;
              /* priority recorded for uschedcp; PRIBASE_NULL once released */
  123         short   upri;
              /* lwp designated as this cpu's current user thread, or NULL */
  124         struct lwp *uschedcp;
  125 };
  126
  127 typedef struct usched_bsd4_pcpu *bsd4_pcpu_t;
 128
  129 /*
  130  * We have NQS (32) run queues per scheduling class.  For the normal
  131  * class, there are 128 priorities scaled onto these 32 queues.  New
  132  * processes are added to the last entry in each queue, and processes
  133  * are selected for running by taking them from the head and maintaining
  134  * a simple FIFO arrangement.  Realtime and Idle priority processes have
  135  * an explicit 0-31 priority which maps directly onto their class queue
  136  * index.  When a queue has something in it, the corresponding bit is
  137  * set in the queuebits variable, allowing a single read to determine
  138  * the state of all 32 queues and then a ffs() to find the first busy
  139  * queue.
  140  */
  141 static struct rq bsd4_queues[NQS];
  142 static struct rq bsd4_rtqueues[NQS];
  143 static struct rq bsd4_idqueues[NQS];
  144 static u_int32_t bsd4_queuebits;
  145 static u_int32_t bsd4_rtqueuebits;
  146 static u_int32_t bsd4_idqueuebits;
  147 static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */
  148 static cpumask_t bsd4_rdyprocmask;      /* ready to accept a user process */
  149 static int       bsd4_runqcount;        /* lwps waiting on the run queues */
  150 #ifdef SMP
      /* Round-robin cursor used by bsd4_setrunqueue() to pick a starting cpu. */
  151 static volatile int bsd4_scancpu;
  152 #endif
      /* Held while the run queues are examined or modified. */
  153 static struct spinlock bsd4_spin;
      /* Per-cpu scheduler state, indexed by gd_cpuid. */
  154 static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];
 155
 156 SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0,
 157     "Number of run queues");
 158 #ifdef INVARIANTS
 159 static int usched_nonoptimal;
 160 SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
 161         &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
 162 static int usched_optimal;
 163 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
 164         &usched_optimal, 0, "acquire_curproc() was optimal");
 165 #endif
 166 static int usched_debug = -1;
 167 SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0,
 168     "Print debug information for this pid");
 169 #ifdef SMP
 170 static int remote_resched_nonaffinity;
 171 static int remote_resched_affinity;
 172 static int choose_affinity;
 173 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
 174         &remote_resched_nonaffinity, 0, "Number of remote rescheds");
 175 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
 176         &remote_resched_affinity, 0, "Number of remote rescheds");
 177 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
 178         &choose_affinity, 0, "chooseproc() was smart");
 179 #endif
 180
 181 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
 182 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
 183         &usched_bsd4_rrinterval, 0, "");
 184 static int usched_bsd4_decay = 8;
 185 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
 186         &usched_bsd4_decay, 0, "Extra decay when not running");
 187 static int usched_bsd4_batch_time = 10;
 188 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_batch_time, CTLFLAG_RW,
 189         &usched_bsd4_batch_time, 0, "Minimum batch counter value");
 190
  191 /*
  192  * Initialize the run queues at boot time.
       *
       * Sets up the scheduler spinlock and the three arrays of NQS run queues,
       * then clears bit 0 of bsd4_curprocmask, which starts out with all bits
       * set.  Registered through SYSINIT at SI_BOOT2_USCHED / SI_ORDER_FIRST;
       * the dummy argument is unused.
  193  */
  194 static void
  195 rqinit(void *dummy)
  196 {
  197         int i;
  198
  199         spin_init(&bsd4_spin);
  200         for (i = 0; i < NQS; i++) {
  201                 TAILQ_INIT(&bsd4_queues[i]);
  202                 TAILQ_INIT(&bsd4_rtqueues[i]);
  203                 TAILQ_INIT(&bsd4_idqueues[i]);
  204         }
              /*
               * NOTE(review): mask value 1 is presumably the boot cpu (cpu 0),
               * marking it as not currently running a user process -- confirm.
               */
  205         atomic_clear_cpumask(&bsd4_curprocmask, 1);
  206 }
  207 SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL)
 208
 209 /*
 210  * BSD4_ACQUIRE_CURPROC
 211  *
 212  * This function is called when the kernel intends to return to userland.
 213  * It is responsible for making the thread the current designated userland
 214  * thread for this cpu, blocking if necessary.
 215  *
 216  * The kernel has already depressed our LWKT priority so we must not switch
 217  * until we have either assigned or disposed of the thread.
 218  *
 219  * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 220  * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 221  * occur, this function is called only under very controlled circumstances.
 222  *
 223  * MPSAFE
 224  */
  225 static void
  226 bsd4_acquire_curproc(struct lwp *lp)
  227 {
  228         globaldata_t gd;
  229         bsd4_pcpu_t dd;
              /* olp is only referenced by the commented-out code further down. */
  230 #if 0
  231         struct lwp *olp;
  232 #endif
  233
  234         crit_enter();
              /*
               * Bring lp's estcpu up to date (this may also reset its
               * priority) before competing for the designation.
               */
  235         bsd4_recalculate_estcpu(lp);
  236
  237         /*
  238          * If a reschedule was requested give another thread the
  239          * driver's seat.
  240          */
  241         if (user_resched_wanted()) {
  242                 clear_user_resched();
  243                 bsd4_release_curproc(lp);
  244         }
  245
  246         /*
  247          * Loop until we are the current user thread
  248          */
  249         gd = mycpu;
  250         dd = &bsd4_pcpu[gd->gd_cpuid];
  251
  252         do {
  253                 /*
  254                  * Process any pending events and higher priority threads.
  255                  */
  256                 lwkt_yield();
  257
  258                 /*
  259                  * Become the currently scheduled user thread for this cpu
  260                  * if we can do so trivially.
  261                  *
  262                  * We can steal another thread's current thread designation
  263                  * on this cpu since if we are running that other thread
  264                  * must not be, so we can safely deschedule it.
  265                  */
  266                 if (dd->uschedcp == lp) {
  267                         /*
  268                          * We are already the current lwp (hot path).
  269                          */
  270                         dd->upri = lp->lwp_priority;
  271                 } else if (dd->uschedcp == NULL) {
  272                         /*
  273                          * We can trivially become the current lwp.
  274                          */
  275                         atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
  276                         dd->uschedcp = lp;
  277                         dd->upri = lp->lwp_priority;
  278                 } else if (dd->upri > lp->lwp_priority) {
  279                         /*
  280                          * We can steal the current cpu's lwp designation
  281                          * away simply by replacing it.  The other thread
  282                          * will stall when it tries to return to userland.
                               *
                               * (A numerically smaller priority value is the
                               * better one, hence the '>' test above.)
  283                          */
  284                         dd->uschedcp = lp;
  285                         dd->upri = lp->lwp_priority;
  286                         /*
  287                         lwkt_deschedule(olp->lwp_thread);
  288                         bsd4_setrunqueue(olp);
  289                         */
  290                 } else {
  291                         /*
  292                          * We cannot become the current lwp, place the lp
  293                          * on the bsd4 run-queue and deschedule ourselves.
  294                          *
  295                          * When we are reactivated we will have another
  296                          * chance.
  297                          */
  298                         lwkt_deschedule(lp->lwp_thread);
  299                         bsd4_setrunqueue(lp);
  300                         lwkt_switch();
  301                         /*
  302                          * Reload after a switch or setrunqueue/switch possibly
  303                          * moved us to another cpu.
  304                          */
  305                         gd = mycpu;
  306                         dd = &bsd4_pcpu[gd->gd_cpuid];
  307                 }
  308         } while (dd->uschedcp != lp);
  309
  310         crit_exit();
              /* Holding the designation, lp must not be on the bsd4 run queue. */
  311         KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
  312 }
 313
 314 /*
 315  * BSD4_RELEASE_CURPROC
 316  *
 317  * This routine detaches the current thread from the userland scheduler,
 318  * usually because the thread needs to run or block in the kernel (at
 319  * kernel priority) for a while.
 320  *
 321  * This routine is also responsible for selecting a new thread to
 322  * make the current thread.
 323  *
 324  * NOTE: This implementation differs from the dummy example in that
 325  * bsd4_select_curproc() is able to select the current process, whereas
 326  * dummy_select_curproc() is not able to select the current process.
 327  * This means we have to NULL out uschedcp.
 328  *
 329  * Additionally, note that we may already be on a run queue if releasing
 330  * via the lwkt_switch() in bsd4_setrunqueue().
 331  *
 332  * MPSAFE
 333  */
  334 static void
  335 bsd4_release_curproc(struct lwp *lp)
  336 {
  337         globaldata_t gd = mycpu;
  338         bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
  339
              /* Nothing to do unless lp holds this cpu's designation. */
  340         if (dd->uschedcp == lp) {
  341                 crit_enter();
  342                 KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
  343                 dd->uschedcp = NULL;    /* don't let lp be selected */
                      /*
                       * PRIBASE_NULL is numerically above every priority class,
                       * so any real lwp priority compares as better than it.
                       */
  344                 dd->upri = PRIBASE_NULL;
  345                 atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
                      /* Hand the designation to the best queued lwp, if any. */
  346                 bsd4_select_curproc(gd);
  347                 crit_exit();
  348         }
  349 }
 350
 351 /*
 352  * BSD4_SELECT_CURPROC
 353  *
 354  * Select a new current process for this cpu and clear any pending user
 355  * reschedule request.  The cpu currently has no current process.
 356  *
 357  * This routine is also responsible for equal-priority round-robining,
 358  * typically triggered from bsd4_schedulerclock().  In our dummy example
 359  * all the 'user' threads are LWKT scheduled all at once and we just
 360  * call lwkt_switch().
 361  *
 362  * The calling process is not on the queue and cannot be selected.
 363  *
 364  * MPSAFE
 365  */
  366 static
  367 void
  368 bsd4_select_curproc(globaldata_t gd)
  369 {
  370         bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
  371         struct lwp *nlp;
  372         int cpuid = gd->gd_cpuid;
  373
  374         crit_enter_gd(gd);
  375
              /* The run queues are only examined with bsd4_spin held. */
  376         spin_lock(&bsd4_spin);
  377         if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
                      /*
                       * Record nlp as this cpu's designated user lwp before
                       * the spinlock is dropped.
                       */
  378                 atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid));
  379                 dd->upri = nlp->lwp_priority;
  380                 dd->uschedcp = nlp;
  381                 spin_unlock(&bsd4_spin);
                      /*
                       * NOTE(review): lwkt_acquire() presumably takes over a
                       * thread that last ran on another cpu -- confirm.
                       */
  382 #ifdef SMP
  383                 lwkt_acquire(nlp->lwp_thread);
  384 #endif
  385                 lwkt_schedule(nlp->lwp_thread);
  386         } else {
  387                 spin_unlock(&bsd4_spin);
  388         }
              /*
               * Disabled alternative: kick this cpu's helper thread when lwps
               * remain queued and the cpu is marked ready.
               */
  389 #if 0
  390         } else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) {
  391                 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
  392                 spin_unlock(&bsd4_spin);
  393                 lwkt_schedule(&dd->helper_thread);
  394         } else {
  395                 spin_unlock(&bsd4_spin);
  396         }
  397 #endif
  398         crit_exit_gd(gd);
  399 }
 400
 401 /*
 402  * BSD4_SETRUNQUEUE
 403  *
 404  * Place the specified lwp on the user scheduler's run queue.  This routine
 405  * must be called with the thread descheduled.  The lwp must be runnable.
 406  *
 407  * The thread may be the current thread as a special case.
 408  *
 409  * MPSAFE
 410  */
  411 static void
  412 bsd4_setrunqueue(struct lwp *lp)
  413 {
  414         globaldata_t gd;
  415         bsd4_pcpu_t dd;
  416 #ifdef SMP
  417         int cpuid;
  418         cpumask_t mask;
  419         cpumask_t tmpmask;
  420 #endif
  421
  422         /*
  423          * First validate the process state relative to the current cpu.
  424          * We don't need the spinlock for this, just a critical section.
  425          * We are in control of the process.
  426          */
  427         crit_enter();
  428         KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
  429         KASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0,
  430             ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
  431              lp->lwp_tid, lp->lwp_proc->p_flag, lp->lwp_flag));
  432         KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
  433
  434         /*
  435          * Note: gd and dd are relative to the target thread's last cpu,
  436          * NOT our current cpu.
  437          */
  438         gd = lp->lwp_thread->td_gd;
  439         dd = &bsd4_pcpu[gd->gd_cpuid];
  440
  441         /*
  442          * This process is not supposed to be scheduled anywhere or assigned
  443          * as the current process anywhere.  Assert the condition.
  444          */
  445         KKASSERT(dd->uschedcp != lp);
  446
  447 #ifndef SMP
  448         /*
  449          * If we are not SMP we do not have a scheduler helper to kick
  450          * and must directly activate the process if none are scheduled.
  451          *
  452          * This is really only an issue when bootstrapping init since
  453          * the caller in all other cases will be a user process, and
  454          * even if released (dd->uschedcp == NULL), that process will
  455          * kickstart the scheduler when it returns to user mode from
  456          * the kernel.
  457          */
  458         if (dd->uschedcp == NULL) {
  459                 atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
  460                 dd->uschedcp = lp;
  461                 dd->upri = lp->lwp_priority;
  462                 lwkt_schedule(lp->lwp_thread);
  463                 crit_exit();
  464                 return;
  465         }
  466 #endif
  467
  468 #ifdef SMP
  469         /*
  470          * XXX fixme.  Could be part of a remrunqueue/setrunqueue
  471          * operation when the priority is recalculated, so TDF_MIGRATING
  472          * may already be set.
  473          */
  474         if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
  475                 lwkt_giveaway(lp->lwp_thread);
  476 #endif
  477
  478         /*
  479          * We lose control of lp the moment we release the spinlock after
  480          * having placed lp on the queue.  i.e. another cpu could pick it
  481          * up and it could exit, or its priority could be further adjusted,
  482          * or something like that.
  483          */
  484         spin_lock(&bsd4_spin);
  485         bsd4_setrunqueue_locked(lp);
  486
  487 #ifdef SMP
  488         /*
  489          * Kick the scheduler helper on one of the other cpu's
  490          * and request a reschedule if appropriate.
  491          *
  492          * NOTE: We check all cpus whose rdyprocmask is set.  First we
  493          *       look for cpus without designated lps, then we look for
  494          *       cpus with designated lps with a worse priority than our
  495          *       process.
                *
                * Priorities are compared with PPQMASK masked off, i.e. at
                * run-queue granularity rather than exact priority.
  496          */
              /* Advance the shared cursor so successive calls start elsewhere. */
  497         ++bsd4_scancpu;
  498         cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
  499         mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
  500                smp_active_mask & usched_global_cpumask;
  501
  502         while (mask) {
                      /*
                       * NOTE(review): assuming CPUMASK(n) is the single-bit
                       * mask for cpu n and BSFCPUMASK() returns the lowest
                       * set bit, this prefers candidates at or above cpuid
                       * and wraps to the lowest candidate otherwise.
                       */
  503                 tmpmask = ~(CPUMASK(cpuid) - 1);
  504                 if (mask & tmpmask)
  505                         cpuid = BSFCPUMASK(mask & tmpmask);
  506                 else
  507                         cpuid = BSFCPUMASK(mask);
  508                 gd = globaldata_find(cpuid);
  509                 dd = &bsd4_pcpu[cpuid];
  510
  511                 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK))
  512                         goto found;
                      /* Not suitable; drop this cpu from the candidates. */
  513                 mask &= ~CPUMASK(cpuid);
  514         }
  515
  516         /*
  517          * Then cpus which might have a currently running lp
                * (this pass requires a strictly worse upri, '>' not '>=')
  518          */
  519         mask = bsd4_curprocmask & bsd4_rdyprocmask &
  520                lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;
  521
  522         while (mask) {
  523                 tmpmask = ~(CPUMASK(cpuid) - 1);
  524                 if (mask & tmpmask)
  525                         cpuid = BSFCPUMASK(mask & tmpmask);
  526                 else
  527                         cpuid = BSFCPUMASK(mask);
  528                 gd = globaldata_find(cpuid);
  529                 dd = &bsd4_pcpu[cpuid];
  530
  531                 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
  532                         goto found;
  533                 mask &= ~CPUMASK(cpuid);
  534         }
  535
  536         /*
  537          * If we cannot find a suitable cpu we reload from bsd4_scancpu
  538          * and round-robin.  Other cpus will pickup as they release their
  539          * current lwps or become ready.
  540          *
  541          * Avoid a degenerate system lockup case if usched_global_cpumask
  542          * is set to 0 or otherwise does not cover lwp_cpumask.
  543          *
  544          * We only kick the target helper thread in this case, we do not
  545          * set the user resched flag.
                *
                * NOTE(review): the original sentence broke off after the word
                * "because", so the rationale is unrecorded.  The code at
                * 'found' still compares priorities before choosing between a
                * helper kick and a reschedule request.
  546          */
  547         cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
  548         if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) {
  549                 cpuid = 0;
  550         }
  551         gd = globaldata_find(cpuid);
  552         dd = &bsd4_pcpu[cpuid];
  553 found:
              /*
               * The target is the current cpu: act directly.  Otherwise clear
               * the target's ready bit and notify it, by IPI when its current
               * priority is worse than lp's, else by scheduling its helper.
               */
  554         if (gd == mycpu) {
  555                 spin_unlock(&bsd4_spin);
  556                 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
  557                         if (dd->uschedcp == NULL) {
  558                                 lwkt_schedule(&dd->helper_thread);
  559                         } else {
  560                                 need_user_resched();
  561                         }
  562                 }
  563         } else {
  564                 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
  565                 spin_unlock(&bsd4_spin);
  566                 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
  567                         lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
  568                 else
  569                         lwkt_schedule(&dd->helper_thread);
  570         }
  571 #else
  572         /*
  573          * Request a reschedule if appropriate.
  574          */
  575         spin_unlock(&bsd4_spin);
  576         if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
  577                 need_user_resched();
  578         }
  579 #endif
  580         crit_exit();
  581 }
 582
 583 /*
 584  * This routine is called from a systimer IPI.  It MUST be MP-safe and
 585  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
 586  * each cpu.
 587  *
 588  * MPSAFE
 589  */
  590 static
  591 void
  592 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
  593 {
  594         globaldata_t gd = mycpu;
  595         bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
  596
              /* The period and cpstamp arguments are not used by this scheduler. */
  597         /*
  598          * Do we need to round-robin?  We round-robin 10 times a second.
  599          * This should only occur for cpu-bound batch processes.
  600          */
  601         if (++dd->rrcount >= usched_bsd4_rrinterval) {
  602                 dd->rrcount = 0;
  603                 need_user_resched();
  604         }
  605
  606         /*
  607          * Adjust estcpu upward using a real time equivalent calculation.
                *
                * Each call adds ESTCPUMAX / ESTCPUFREQ + 1, and the result is
                * limited to ESTCPUMAX by ESTCPULIM().
  608          */
  609         lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);
  610
  611         /*
  612          * Spinlocks also hold a critical section so there should not be
  613          * any active.
  614          */
  615         KKASSERT(gd->gd_spinlocks_wr == 0);
  616
              /* estcpu changed, so recompute lp's priority from it. */
  617         bsd4_resetpriority(lp);
  618 #if 0
  619         /*
  620         * if we can't call bsd4_resetpriority for some reason we must call
  621          * need user_resched().
  622          */
  623         need_user_resched();
  624 #endif
  625 }
 626
 627 /*
 628  * Called from acquire and from kern_synch's one-second timer (one of the
 629  * callout helper threads) with a critical section held.
 630  *
 631  * Decay p_estcpu based on the number of ticks we haven't been running
 632  * and our p_nice.  As the load increases each process observes a larger
 633  * number of idle ticks (because other processes are running in them).
 634  * This observation leads to a larger correction which tends to make the
 635  * system more 'batchy'.
 636  *
 637  * Note that no recalculation occurs for a process which sleeps and wakes
 638  * up in the same tick.  That is, a system doing thousands of context
 639  * switches per second will still only do serious estcpu calculations
 640  * ESTCPUFREQ times per second.
 641  *
 642  * MPSAFE
 643  */
  644 static
  645 void
  646 bsd4_recalculate_estcpu(struct lwp *lp)
  647 {
  648         globaldata_t gd = mycpu;
  649         sysclock_t cpbase;
  650         sysclock_t ttlticks;
  651         int estcpu;
  652         int decay_factor;
  653
  654         /*
  655          * We have to subtract periodic to get the last schedclock
  656          * timeout time, otherwise we would get the upcoming timeout.
  657          * Keep in mind that a process can migrate between cpus and
  658          * while the scheduler clock should be very close, boundary
  659          * conditions could lead to a small negative delta.
  660          */
  661         cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
  662
  663         if (lp->lwp_slptime > 1) {
  664                 /*
  665                  * Too much time has passed, do a coarse correction.
                        *
                        * estcpu is halved, the measurement window restarts at
                        * cpbase, and lwp_batch drops by ESTCPUFREQ but never
                        * goes below zero.
  666                  */
  667                 lp->lwp_estcpu = lp->lwp_estcpu >> 1;
  668                 bsd4_resetpriority(lp);
  669                 lp->lwp_cpbase = cpbase;
  670                 lp->lwp_cpticks = 0;
  671                 lp->lwp_batch -= ESTCPUFREQ;
  672                 if (lp->lwp_batch < 0)
  673                         lp->lwp_batch = 0;
  674         } else if (lp->lwp_cpbase != cpbase) {
  675                 /*
  676                  * Adjust estcpu if we are in a different tick.  Don't waste
  677                  * time if we are in the same tick.
  678                  *
  679                  * First calculate the number of ticks in the measurement
  680                  * interval.  The ttlticks calculation can wind up 0 due to
  681                  * a bug in the handling of lwp_slptime  (as yet not found),
  682                  * so make sure we do not get a divide by 0 panic.
                        *
                        * NOTE(review): ttlticks is a sysclock_t.  If that type
                        * is unsigned the '< 0' test below can never be true and
                        * a negative delta shows up as a very large tick count
                        * instead -- confirm the type's signedness.
  683                  */
  684                 ttlticks = (cpbase - lp->lwp_cpbase) /
  685                            gd->gd_schedclock.periodic;
  686                 if (ttlticks < 0) {
  687                         ttlticks = 0;
  688                         lp->lwp_cpbase = cpbase;
  689                 }
  690                 if (ttlticks == 0)
  691                         return;
  692                 updatepcpu(lp, lp->lwp_cpticks, ttlticks);
  693
  694                 /*
  695                  * Calculate the percentage of one cpu used factoring in ncpus
  696                  * and the load and adjust estcpu.  Handle degenerate cases
  697                  * by adding 1 to bsd4_runqcount.
  698                  *
  699                  * estcpu is scaled by ESTCPUMAX.
  700                  *
  701                  * bsd4_runqcount is the excess number of user processes
  702                  * that cannot be immediately scheduled to cpus.  We want
  703                  * to count these as running to avoid range compression
  704                  * in the base calculation (which is the actual percentage
  705                  * of one cpu used).
  706                  */
  707                 estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
  708                          (bsd4_runqcount + ncpus) / (ncpus * ttlticks);
  709
  710                 /*
  711                  * If estcpu is > 50% we become more batch-like
  712                  * If estcpu is <= 50% we become less batch-like
  713                  *
  714                  * It takes 30 cpu seconds to traverse the entire range.
  715                  */
  716                 if (estcpu > ESTCPUMAX / 2) {
  717                         lp->lwp_batch += ttlticks;
  718                         if (lp->lwp_batch > BATCHMAX)
  719                                 lp->lwp_batch = BATCHMAX;
  720                 } else {
  721                         lp->lwp_batch -= ttlticks;
  722                         if (lp->lwp_batch < 0)
  723                                 lp->lwp_batch = 0;
  724                 }
  725
                      /*
                       * Debug trace for the pid selected by usched_debug.  The
                       * line is finished by the " finalestcpu" kprintf below,
                       * which supplies the newline.
                       */
  726                 if (usched_debug == lp->lwp_proc->p_pid) {
  727                         kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
  728                                 lp->lwp_proc->p_pid, lp,
  729                                 estcpu, lp->lwp_estcpu,
  730                                 lp->lwp_batch,
  731                                 lp->lwp_cpticks, ttlticks);
  732                 }
  733
  734                 /*
  735                  * Adjust lp->lwp_estcpu.  The decay factor determines how
  736                  * quickly lwp_estcpu collapses to its realtime calculation.
  737                  * A slower collapse gives us a more accurate number but
  738                  * can cause a cpu hog to eat too much cpu before the
  739                  * scheduler decides to downgrade it.
  740                  *
  741                  * NOTE: p_nice is accounted for in bsd4_resetpriority(),
  742                  *       and not here, but we must still ensure that a
  743                  *       cpu-bound nice -20 process does not completely
  744                  *       override a cpu-bound nice +20 process.
  745                  *
  746                  * NOTE: We must use ESTCPULIM() here to deal with any
  747                  *       overshoot.
  748                  */
                      /* Keep the sysctl-supplied factor within 1..1024. */
  749                 decay_factor = usched_bsd4_decay;
  750                 if (decay_factor < 1)
  751                         decay_factor = 1;
  752                 if (decay_factor > 1024)
  753                         decay_factor = 1024;
  754
                      /*
                       * Weighted average: the old estcpu counts decay_factor
                       * times, the fresh measurement once.
                       */
  755                 lp->lwp_estcpu = ESTCPULIM(
  756                         (lp->lwp_estcpu * decay_factor + estcpu) /
  757                         (decay_factor + 1));
  758
  759                 if (usched_debug == lp->lwp_proc->p_pid)
  760                         kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
  761                 bsd4_resetpriority(lp);
                      /* Advance the window by the whole ticks just accounted for. */
  762                 lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
  763                 lp->lwp_cpticks = 0;
  764         }
  765 }
 766
 767 /*
 768  * Compute the priority of a process when running in user mode.
 769  * Arrange to reschedule if the resulting priority is better
 770  * than that of the current process.
 771  *
 772  * This routine may be called with any process.
 773  *
  774  * This routine is called by fork1() for initial setup with the process
  775  * off the run queue, and also may be called normally with the process on or
 776  * off the run queue.
 777  *
 778  * MPSAFE
 779  */
 780 static void
 781 bsd4_resetpriority(struct lwp *lp)
 782 {
 783         bsd4_pcpu_t dd;
 784         int newpriority;
 785         u_short newrqtype;
 786         int reschedcpu;
 787         int checkpri;
 788         int estcpu;
 789
 790         /*
 791          * Calculate the new priority and queue type
 792          */
 793         crit_enter();
 794         spin_lock(&bsd4_spin);
 795
 796         newrqtype = lp->lwp_rtprio.type;
 797
 798         switch(newrqtype) {
 799         case RTP_PRIO_REALTIME:
 800         case RTP_PRIO_FIFO:
 801                 newpriority = PRIBASE_REALTIME +
 802                              (lp->lwp_rtprio.prio & PRIMASK);
 803                 break;
 804         case RTP_PRIO_NORMAL:
 805                 /*
 806                  * Detune estcpu based on batchiness.  lwp_batch ranges
 807                  * from 0 to  BATCHMAX.  Limit estcpu for the sake of
 808                  * the priority calculation to between 50% and 100%.
 809                  */
 810                 estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
 811                          (BATCHMAX * 2);
 812
 813                 /*
 814                  * p_nice piece         Adds (0-40) * 2         0-80
 815                  * estcpu               Adds 16384  * 4 / 512   0-128
 816                  */
 817                 newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
 818                 newpriority += estcpu * PPQ / ESTCPUPPQ;
 819                 newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
 820                               NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
 821                 newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
 822                 break;
 823         case RTP_PRIO_IDLE:
 824                 newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
 825                 break;
 826         case RTP_PRIO_THREAD:
 827                 newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
 828                 break;
 829         default:
 830                 panic("Bad RTP_PRIO %d", newrqtype);
 831                 /* NOT REACHED */
 832         }
 833
 834         /*
 835          * The newpriority incorporates the queue type so do a simple masked
 836          * check to determine if the process has moved to another queue.  If
 837          * it has, and it is currently on a run queue, then move it.
 838          */
 839         if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
 840                 lp->lwp_priority = newpriority;
 841                 if (lp->lwp_flag & LWP_ONRUNQ) {
 842                         bsd4_remrunqueue_locked(lp);
 843                         lp->lwp_rqtype = newrqtype;
 844                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
 845                         bsd4_setrunqueue_locked(lp);
 846                         checkpri = 1;
 847                 } else {
 848                         lp->lwp_rqtype = newrqtype;
 849                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
 850                         checkpri = 0;
 851                 }
 852                 reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
 853         } else {
 854                 lp->lwp_priority = newpriority;
 855                 reschedcpu = -1;
 856                 checkpri = 1;
 857         }
 858
 859         /*
 860          * Determine if we need to reschedule the target cpu.  This only
 861          * occurs if the LWP is already on a scheduler queue, which means
 862          * that idle cpu notification has already occured.  At most we
 863          * need only issue a need_user_resched() on the appropriate cpu.
 864          *
 865          * The LWP may be owned by a CPU different from the current one,
 866          * in which case dd->uschedcp may be modified without an MP lock
 867          * or a spinlock held.  The worst that happens is that the code
 868          * below causes a spurious need_user_resched() on the target CPU
 869          * and dd->pri to be wrong for a short period of time, both of
 870          * which are harmless.
 871          *
 872          * If checkpri is 0 we are adjusting the priority of the current
 873          * process, possibly higher (less desireable), so ignore the upri
 874          * check which will fail in that case.
 875          */
 876         if (reschedcpu >= 0) {
 877                 dd = &bsd4_pcpu[reschedcpu];
 878                 if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
 879                     (checkpri == 0 ||
 880                      (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
 881 #ifdef SMP
 882                         if (reschedcpu == mycpu->gd_cpuid) {
 883                                 spin_unlock(&bsd4_spin);
 884                                 need_user_resched();
 885                         } else {
 886                                 spin_unlock(&bsd4_spin);
 887                                 atomic_clear_cpumask(&bsd4_rdyprocmask,
 888                                                      CPUMASK(reschedcpu));
 889                                 lwkt_send_ipiq(lp->lwp_thread->td_gd,
 890                                                need_user_resched_remote, NULL);
 891                         }
 892 #else
 893                         spin_unlock(&bsd4_spin);
 894                         need_user_resched();
 895 #endif
 896                 } else {
 897                         spin_unlock(&bsd4_spin);
 898                 }
 899         } else {
 900                 spin_unlock(&bsd4_spin);
 901         }
 902         crit_exit();
 903 }
 904
 905 /*
 906  * MPSAFE
 907  */
 908 static
 909 void
 910 bsd4_yield(struct lwp *lp)
 911 {
 912 #if 0
 913         /* FUTURE (or something similar) */
 914         switch(lp->lwp_rqtype) {
 915         case RTP_PRIO_NORMAL:
 916                 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
 917                 break;
 918         default:
 919                 break;
 920         }
 921 #endif
 922         need_user_resched();
 923 }
 924
 925 /*
 926  * Called from fork1() when a new child process is being created.
 927  *
 928  * Give the child process an initial estcpu that is more batch then
 929  * its parent and dock the parent for the fork (but do not
 930  * reschedule the parent).   This comprises the main part of our batch
 931  * detection heuristic for both parallel forking and sequential execs.
 932  *
 933  * XXX lwp should be "spawning" instead of "forking"
 934  *
 935  * MPSAFE
 936  */
 937 static void
 938 bsd4_forking(struct lwp *plp, struct lwp *lp)
 939 {
 940         /*
 941          * Put the child 4 queue slots (out of 32) higher than the parent
 942          * (less desireable than the parent).
 943          */
 944         lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);
 945
 946         /*
 947          * The batch status of children always starts out centerline
 948          * and will inch-up or inch-down as appropriate.  It takes roughly
 949          * ~15 seconds of >50% cpu to hit the limit.
 950          */
 951         lp->lwp_batch = BATCHMAX / 2;
 952
 953         /*
 954          * Dock the parent a cost for the fork, protecting us from fork
 955          * bombs.  If the parent is forking quickly make the child more
 956          * batchy.
 957          */
 958         plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
 959 }
 960
 961 /*
 962  * Called when a parent waits for a child.
 963  *
 964  * MPSAFE
 965  */
 966 static void
 967 bsd4_exiting(struct lwp *lp, struct proc *child_proc)
 968 {
 969 }
 970
 971 /*
 972  * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 973  * it selects a user process and returns it.  If chklp is non-NULL and chklp
 974  * has a better or equal priority then the process that would otherwise be
 975  * chosen, NULL is returned.
 976  *
 977  * Until we fix the RUNQ code the chklp test has to be strict or we may
 978  * bounce between processes trying to acquire the current process designation.
 979  *
 980  * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
 981  *          left intact through the entire routine.
 982  */
 983 static
 984 struct lwp *
 985 chooseproc_locked(struct lwp *chklp)
 986 {
 987         struct lwp *lp;
 988         struct rq *q;
 989         u_int32_t *which, *which2;
 990         u_int32_t pri;
 991         u_int32_t rtqbits;
 992         u_int32_t tsqbits;
 993         u_int32_t idqbits;
 994         cpumask_t cpumask;
 995
 996         rtqbits = bsd4_rtqueuebits;
 997         tsqbits = bsd4_queuebits;
 998         idqbits = bsd4_idqueuebits;
 999         cpumask = mycpu->gd_cpumask;
1000
1001 #ifdef SMP
1002 again:
1003 #endif
1004         if (rtqbits) {
1005                 pri = bsfl(rtqbits);
1006                 q = &bsd4_rtqueues[pri];
1007                 which = &bsd4_rtqueuebits;
1008                 which2 = &rtqbits;
1009         } else if (tsqbits) {
1010                 pri = bsfl(tsqbits);
1011                 q = &bsd4_queues[pri];
1012                 which = &bsd4_queuebits;
1013                 which2 = &tsqbits;
1014         } else if (idqbits) {
1015                 pri = bsfl(idqbits);
1016                 q = &bsd4_idqueues[pri];
1017                 which = &bsd4_idqueuebits;
1018                 which2 = &idqbits;
1019         } else {
1020                 return NULL;
1021         }
1022         lp = TAILQ_FIRST(q);
1023         KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1024
1025 #ifdef SMP
1026         while ((lp->lwp_cpumask & cpumask) == 0) {
1027                 lp = TAILQ_NEXT(lp, lwp_procq);
1028                 if (lp == NULL) {
1029                         *which2 &= ~(1 << pri);
1030                         goto again;
1031                 }
1032         }
1033 #endif
1034
1035         /*
1036          * If the passed lwp <chklp> is reasonably close to the selected
1037          * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1038          *
1039          * Note that we must error on the side of <chklp> to avoid bouncing
1040          * between threads in the acquire code.
1041          */
1042         if (chklp) {
1043                 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1044                         return(NULL);
1045         }
1046
1047 #ifdef SMP
1048         /*
1049          * If the chosen lwp does not reside on this cpu spend a few
1050          * cycles looking for a better candidate at the same priority level.
1051          * This is a fallback check, setrunqueue() tries to wakeup the
1052          * correct cpu and is our front-line affinity.
1053          */
1054         if (lp->lwp_thread->td_gd != mycpu &&
1055             (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
1056         ) {
1057                 if (chklp->lwp_thread->td_gd == mycpu) {
1058                         ++choose_affinity;
1059                         lp = chklp;
1060                 }
1061         }
1062 #endif
1063
1064         TAILQ_REMOVE(q, lp, lwp_procq);
1065         --bsd4_runqcount;
1066         if (TAILQ_EMPTY(q))
1067                 *which &= ~(1 << pri);
1068         KASSERT((lp->lwp_flag & LWP_ONRUNQ) != 0, ("not on runq6!"));
1069         lp->lwp_flag &= ~LWP_ONRUNQ;
1070         return lp;
1071 }
1072
1073 #ifdef SMP
1074
1075 static
1076 void
1077 need_user_resched_remote(void *dummy)
1078 {
1079         globaldata_t gd = mycpu;
1080         bsd4_pcpu_t  dd = &bsd4_pcpu[gd->gd_cpuid];
1081
1082         need_user_resched();
1083         lwkt_schedule(&dd->helper_thread);
1084 }
1085
1086 #endif
1087
1088 /*
1089  * bsd4_remrunqueue_locked() removes a given process from the run queue
1090  * that it is on, clearing the queue busy bit if it becomes empty.
1091  *
1092  * Note that user process scheduler is different from the LWKT schedule.
1093  * The user process scheduler only manages user processes but it uses LWKT
1094  * underneath, and a user process operating in the kernel will often be
1095  * 'released' from our management.
1096  *
1097  * MPSAFE - bsd4_spin must be held exclusively on call
1098  */
1099 static void
1100 bsd4_remrunqueue_locked(struct lwp *lp)
1101 {
1102         struct rq *q;
1103         u_int32_t *which;
1104         u_int8_t pri;
1105
1106         KKASSERT(lp->lwp_flag & LWP_ONRUNQ);
1107         lp->lwp_flag &= ~LWP_ONRUNQ;
1108         --bsd4_runqcount;
1109         KKASSERT(bsd4_runqcount >= 0);
1110
1111         pri = lp->lwp_rqindex;
1112         switch(lp->lwp_rqtype) {
1113         case RTP_PRIO_NORMAL:
1114                 q = &bsd4_queues[pri];
1115                 which = &bsd4_queuebits;
1116                 break;
1117         case RTP_PRIO_REALTIME:
1118         case RTP_PRIO_FIFO:
1119                 q = &bsd4_rtqueues[pri];
1120                 which = &bsd4_rtqueuebits;
1121                 break;
1122         case RTP_PRIO_IDLE:
1123                 q = &bsd4_idqueues[pri];
1124                 which = &bsd4_idqueuebits;
1125                 break;
1126         default:
1127                 panic("remrunqueue: invalid rtprio type");
1128                 /* NOT REACHED */
1129         }
1130         TAILQ_REMOVE(q, lp, lwp_procq);
1131         if (TAILQ_EMPTY(q)) {
1132                 KASSERT((*which & (1 << pri)) != 0,
1133                         ("remrunqueue: remove from empty queue"));
1134                 *which &= ~(1 << pri);
1135         }
1136 }
1137
1138 /*
1139  * bsd4_setrunqueue_locked()
1140  *
1141  * Add a process whos rqtype and rqindex had previously been calculated
1142  * onto the appropriate run queue.   Determine if the addition requires
1143  * a reschedule on a cpu and return the cpuid or -1.
1144  *
1145  * NOTE: Lower priorities are better priorities.
1146  *
1147  * MPSAFE - bsd4_spin must be held exclusively on call
1148  */
1149 static void
1150 bsd4_setrunqueue_locked(struct lwp *lp)
1151 {
1152         struct rq *q;
1153         u_int32_t *which;
1154         int pri;
1155
1156         KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
1157         lp->lwp_flag |= LWP_ONRUNQ;
1158         ++bsd4_runqcount;
1159
1160         pri = lp->lwp_rqindex;
1161
1162         switch(lp->lwp_rqtype) {
1163         case RTP_PRIO_NORMAL:
1164                 q = &bsd4_queues[pri];
1165                 which = &bsd4_queuebits;
1166                 break;
1167         case RTP_PRIO_REALTIME:
1168         case RTP_PRIO_FIFO:
1169                 q = &bsd4_rtqueues[pri];
1170                 which = &bsd4_rtqueuebits;
1171                 break;
1172         case RTP_PRIO_IDLE:
1173                 q = &bsd4_idqueues[pri];
1174                 which = &bsd4_idqueuebits;
1175                 break;
1176         default:
1177                 panic("remrunqueue: invalid rtprio type");
1178                 /* NOT REACHED */
1179         }
1180
1181         /*
1182          * Add to the correct queue and set the appropriate bit.  If no
1183          * lower priority (i.e. better) processes are in the queue then
1184          * we want a reschedule, calculate the best cpu for the job.
1185          *
1186          * Always run reschedules on the LWPs original cpu.
1187          */
1188         TAILQ_INSERT_TAIL(q, lp, lwp_procq);
1189         *which |= 1 << pri;
1190 }
1191
1192 #ifdef SMP
1193
1194 /*
1195  * For SMP systems a user scheduler helper thread is created for each
1196  * cpu and is used to allow one cpu to wakeup another for the purposes of
1197  * scheduling userland threads from setrunqueue().
1198  *
1199  * UP systems do not need the helper since there is only one cpu.
1200  *
1201  * We can't use the idle thread for this because we might block.
1202  * Additionally, doing things this way allows us to HLT idle cpus
1203  * on MP systems.
1204  *
1205  * MPSAFE
1206  */
1207 static void
1208 sched_thread(void *dummy)
1209 {
1210     globaldata_t gd;
1211     bsd4_pcpu_t  dd;
1212     bsd4_pcpu_t  tmpdd;
1213     struct lwp *nlp;
1214     cpumask_t mask;
1215     int cpuid;
1216 #ifdef SMP
1217     cpumask_t tmpmask;
1218     int tmpid;
1219 #endif
1220
1221     gd = mycpu;
1222     cpuid = gd->gd_cpuid;       /* doesn't change */
1223     mask = gd->gd_cpumask;      /* doesn't change */
1224     dd = &bsd4_pcpu[cpuid];
1225
1226     /*
1227      * Since we are woken up only when no user processes are scheduled
1228      * on a cpu, we can run at an ultra low priority.
1229      */
1230     lwkt_setpri_self(TDPRI_USER_SCHEDULER);
1231
1232     for (;;) {
1233         /*
1234          * We use the LWKT deschedule-interlock trick to avoid racing
1235          * bsd4_rdyprocmask.  This means we cannot block through to the
1236          * manual lwkt_switch() call we make below.
1237          */
1238         crit_enter_gd(gd);
1239         lwkt_deschedule_self(gd->gd_curthread);
1240         spin_lock(&bsd4_spin);
1241         atomic_set_cpumask(&bsd4_rdyprocmask, mask);
1242
1243         clear_user_resched();   /* This satisfied the reschedule request */
1244         dd->rrcount = 0;        /* Reset the round-robin counter */
1245
1246         if ((bsd4_curprocmask & mask) == 0) {
1247                 /*
1248                  * No thread is currently scheduled.
1249                  */
1250                 KKASSERT(dd->uschedcp == NULL);
1251                 if ((nlp = chooseproc_locked(NULL)) != NULL) {
1252                         atomic_set_cpumask(&bsd4_curprocmask, mask);
1253                         dd->upri = nlp->lwp_priority;
1254                         dd->uschedcp = nlp;
1255                         spin_unlock(&bsd4_spin);
1256 #ifdef SMP
1257                         lwkt_acquire(nlp->lwp_thread);
1258 #endif
1259                         lwkt_schedule(nlp->lwp_thread);
1260                 } else {
1261                         spin_unlock(&bsd4_spin);
1262                 }
1263         } else if (bsd4_runqcount) {
1264                 if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
1265                         dd->upri = nlp->lwp_priority;
1266                         dd->uschedcp = nlp;
1267                         spin_unlock(&bsd4_spin);
1268 #ifdef SMP
1269                         lwkt_acquire(nlp->lwp_thread);
1270 #endif
1271                         lwkt_schedule(nlp->lwp_thread);
1272                 } else {
1273                         /*
1274                          * CHAINING CONDITION TRAIN
1275                          *
1276                          * We could not deal with the scheduler wakeup
1277                          * request on this cpu, locate a ready scheduler
1278                          * with no current lp assignment and chain to it.
1279                          *
1280                          * This ensures that a wakeup race which fails due
1281                          * to priority test does not leave other unscheduled
1282                          * cpus idle when the runqueue is not empty.
1283                          */
1284                         tmpmask = ~bsd4_curprocmask & bsd4_rdyprocmask &
1285                                   smp_active_mask;
1286                         if (tmpmask) {
1287                                 tmpid = BSFCPUMASK(tmpmask);
1288                                 tmpdd = &bsd4_pcpu[tmpid];
1289                                 atomic_clear_cpumask(&bsd4_rdyprocmask,
1290                                                      CPUMASK(tmpid));
1291                                 spin_unlock(&bsd4_spin);
1292                                 lwkt_schedule(&tmpdd->helper_thread);
1293                         } else {
1294                                 spin_unlock(&bsd4_spin);
1295                         }
1296                 }
1297         } else {
1298                 /*
1299                  * The runq is empty.
1300                  */
1301                 spin_unlock(&bsd4_spin);
1302         }
1303
1304         /*
1305          * We're descheduled unless someone scheduled us.  Switch away.
1306          * Exiting the critical section will cause splz() to be called
1307          * for us if interrupts and such are pending.
1308          */
1309         crit_exit_gd(gd);
1310         lwkt_switch();
1311     }
1312 }
1313
1314 /*
1315  * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
1316  * been cleared by rqinit() and we should not mess with it further.
1317  */
1318 static void
1319 sched_thread_cpu_init(void)
1320 {
1321     int i;
1322
1323     if (bootverbose)
1324         kprintf("start scheduler helpers on cpus:");
1325
1326     for (i = 0; i < ncpus; ++i) {
1327         bsd4_pcpu_t dd = &bsd4_pcpu[i];
1328         cpumask_t mask = CPUMASK(i);
1329
1330         if ((mask & smp_active_mask) == 0)
1331             continue;
1332
1333         if (bootverbose)
1334             kprintf(" %d", i);
1335
1336         lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
1337                     TDF_STOPREQ, i, "usched %d", i);
1338
1339         /*
1340          * Allow user scheduling on the target cpu.  cpu #0 has already
1341          * been enabled in rqinit().
1342          */
1343         if (i)
1344             atomic_clear_cpumask(&bsd4_curprocmask, mask);
1345         atomic_set_cpumask(&bsd4_rdyprocmask, mask);
1346         dd->upri = PRIBASE_NULL;
1347     }
1348     if (bootverbose)
1349         kprintf("\n");
1350 }
1351 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
1352         sched_thread_cpu_init, NULL)
1353
1354 #endif
1355