sys/kern/usched_bsd4.c

   1 /*
   2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  *
  26  * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.22 2007/04/30 07:18:54 dillon Exp $
  27  */
  28
  29 #include <sys/param.h>
  30 #include <sys/systm.h>
  31 #include <sys/kernel.h>
  32 #include <sys/lock.h>
  33 #include <sys/queue.h>
  34 #include <sys/proc.h>
  35 #include <sys/rtprio.h>
  36 #include <sys/uio.h>
  37 #include <sys/sysctl.h>
  38 #include <sys/resourcevar.h>
  39 #include <sys/spinlock.h>
  40 #include <machine/cpu.h>
  41 #include <machine/smp.h>
  42
  43 #include <sys/thread2.h>
  44 #include <sys/spinlock2.h>
  45
  46 /*
  47  * Priorities.  Each scheduling class spans MAXPRI (128) levels; with NQS
  48  * (32) run queues per class each queue represents PPQ (4) priority levels.
  49  */
  50
  51 #define MAXPRI                  128             /* levels per class */
  52 #define PRIMASK                 (MAXPRI - 1)    /* level within a class */
  53 #define PRIBASE_REALTIME        0               /* RTP_PRIO_REALTIME, _FIFO */
  54 #define PRIBASE_NORMAL          MAXPRI          /* RTP_PRIO_NORMAL */
  55 #define PRIBASE_IDLE            (MAXPRI * 2)    /* RTP_PRIO_IDLE */
  56 #define PRIBASE_THREAD          (MAXPRI * 3)    /* RTP_PRIO_THREAD */
  57 #define PRIBASE_NULL            (MAXPRI * 4)    /* upri when no user lwp */
  58
  59 #define NQS     32                      /* 32 run queues. */
  60 #define PPQ     (MAXPRI / NQS)          /* priorities per queue */
  61 #define PPQMASK (PPQ - 1)               /* priority bits within one queue */
  62
  63 /*
  64  * NICEPPQ      - number of nice units per priority queue
  65  * ESTCPURAMP   - number of scheduler ticks for estcpu to switch queues
  66  *
  67  * ESTCPUPPQ    - number of estcpu units per priority queue
  68  * ESTCPUMAX    - upper limit of estcpu (ESTCPUPPQ units for each queue)
  69  * ESTCPUINCR   - amount we have to increment lwp_estcpu per scheduling tick
  70  *                at 100% cpu.
  71  */
  72 #define NICEPPQ         2
  73 #define ESTCPURAMP      4
  74 #define ESTCPUPPQ       512
  75 #define ESTCPUMAX       (ESTCPUPPQ * NQS)
  76 #define ESTCPUINCR      (ESTCPUPPQ / ESTCPURAMP)
  77 #define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)       /* size of PRIO_MIN..PRIO_MAX */
  78
  79 #define ESTCPULIM(v)    min((v), ESTCPUMAX)     /* clamp v to ESTCPUMAX */
  80
  81 TAILQ_HEAD(rq, lwp);            /* struct rq: tail queue head of lwps */
  82 /* Shorthand for the bsd4 scheduler fields stored in each lwp's lwp_usdata. */
  83 #define lwp_priority    lwp_usdata.bsd4.priority        /* PRIBASE_* + level */
  84 #define lwp_rqindex     lwp_usdata.bsd4.rqindex         /* (priority & PRIMASK) / PPQ */
  85 #define lwp_origcpu     lwp_usdata.bsd4.origcpu         /* decremented per clock tick */
  86 #define lwp_estcpu      lwp_usdata.bsd4.estcpu          /* cpu usage estimate */
  87 #define lwp_rqtype      lwp_usdata.bsd4.rqtype          /* RTP_PRIO_* class */
  88 /* Forward declarations of the scheduler routines defined in this file. */
  89 static void bsd4_acquire_curproc(struct lwp *lp);
  90 static void bsd4_release_curproc(struct lwp *lp);
  91 static void bsd4_select_curproc(globaldata_t gd);
  92 static void bsd4_setrunqueue(struct lwp *lp);
  93 static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
  94                                 sysclock_t cpstamp);
  95 static void bsd4_recalculate_estcpu(struct lwp *lp);
  96 static void bsd4_resetpriority(struct lwp *lp);
  97 static void bsd4_forking(struct lwp *plp, struct lwp *lp);
  98 static void bsd4_exiting(struct lwp *plp, struct lwp *lp);
  99 /* Helpers; the *_locked routines are called below with bsd4_spin held. */
 100 #ifdef SMP
 101 static void need_user_resched_remote(void *dummy);
 102 #endif
 103 static struct lwp *chooseproc_locked(struct lwp *chklp);
 104 static void bsd4_remrunqueue_locked(struct lwp *lp);
 105 static void bsd4_setrunqueue_locked(struct lwp *lp);
 106 /* Scheduler descriptor: short name, description, and the bsd4_* entry points. */
 107 struct usched usched_bsd4 = {
 108         { NULL },
 109         "bsd4", "Original DragonFly Scheduler",
 110         NULL,                   /* default registration */
 111         NULL,                   /* default deregistration */
 112         bsd4_acquire_curproc,
 113         bsd4_release_curproc,
 114         bsd4_setrunqueue,
 115         bsd4_schedulerclock,
 116         bsd4_recalculate_estcpu,
 117         bsd4_resetpriority,
 118         bsd4_forking,
 119         bsd4_exiting,
 120         NULL                    /* setcpumask not supported */
 121 };
 122 /* Per-cpu scheduler state; bsd4_pcpu[] holds one entry per cpu (by gd_cpuid). */
 123 struct usched_bsd4_pcpu {
 124         struct thread helper_thread;    /* woken via lwkt_schedule() for queued work */
 125         short   rrcount;                /* ticks counted toward the next round-robin */
 126         short   upri;                   /* priority tracked for this cpu; PRIBASE_NULL if none */
 127         struct lwp *uschedcp;           /* designated current user lwp, or NULL */
 128 };
 129
 130 typedef struct usched_bsd4_pcpu *bsd4_pcpu_t;
 131
 132 /*
 133  * We have NQS (32) run queues per scheduling class.  For the normal
 134  * class, there are 128 priorities scaled onto these 32 queues.  New
 135  * processes are added to the last entry in each queue, and processes
 136  * are selected for running by taking them from the head and maintaining
 137  * a simple FIFO arrangement.  Realtime and Idle priority processes have
 138  * an explicit 0-31 priority which maps directly onto their class queue
 139  * index.  When a queue has something in it, the corresponding bit is
 140  * set in the queuebits variable, allowing a single read to determine
 141  * the state of all 32 queues and then a ffs() to find the first busy
 142  * queue.
 143  */
 144 static struct rq bsd4_queues[NQS];
 145 static struct rq bsd4_rtqueues[NQS];
 146 static struct rq bsd4_idqueues[NQS];
 147 static u_int32_t bsd4_queuebits;
 148 static u_int32_t bsd4_rtqueuebits;
 149 static u_int32_t bsd4_idqueuebits;
 150 static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */
 151 static cpumask_t bsd4_rdyprocmask;      /* ready to accept a user process */
 152 static int       bsd4_runqcount;        /* 0 when the run queues are empty */
 153 #ifdef SMP
 154 static volatile int bsd4_scancpu;       /* rotator for the idle cpu search */
 155 #endif
 156 static struct spinlock bsd4_spin;       /* taken around the *_locked calls */
 157 static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];       /* indexed by gd_cpuid */
 158 /* Counters, debug knobs and tunables exported through SYSCTL_INT(). */
 159 SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0, "");
 160 #ifdef INVARIANTS
 161 static int usched_nonoptimal;
 162 SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
 163         &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
 164 static int usched_optimal;
 165 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
 166         &usched_optimal, 0, "acquire_curproc() was optimal");
 167 #endif
 168 static int usched_debug = -1;   /* pid traced in bsd4_recalculate_estcpu() */
 169 SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
 170 #ifdef SMP
 171 static int remote_resched_nonaffinity;
 172 static int remote_resched_affinity;
 173 static int choose_affinity;
 174 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
 175         &remote_resched_nonaffinity, 0, "Number of remote rescheds");
 176 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
 177         &remote_resched_affinity, 0, "Number of remote rescheds");
 178 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
 179         &choose_affinity, 0, "chooseproc() was smart");
 180 #endif
 181 /* Scheduler ticks between round-robin requests: a tenth of ESTCPUFREQ, rounded up. */
 182 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
 183 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
 184         &usched_bsd4_rrinterval, 0, "");
 185 static int usched_bsd4_decay = ESTCPUINCR / 2;  /* scales the estcpu decay */
 186 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
 187         &usched_bsd4_decay, 0, "");
 188
 189 /*
 190  * Boot-time SYSINIT hook: set up bsd4_spin and empty every run queue.
 191  */
 192 static void
 193 rqinit(void *dummy)
 194 {
 195         int qidx;
 196
 197         spin_init(&bsd4_spin);
 198         for (qidx = 0; qidx != NQS; ++qidx) {
 199                 TAILQ_INIT(&bsd4_queues[qidx]);
 200                 TAILQ_INIT(&bsd4_rtqueues[qidx]);
 201                 TAILQ_INIT(&bsd4_idqueues[qidx]);
 202         }
 203         atomic_clear_int(&bsd4_curprocmask, 1); /* bit 0, i.e. 1 << cpuid 0 */
 204 }
 205 SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL)
 206
 207 /*
 208  * BSD4_ACQUIRE_CURPROC
 209  *
 210  * This function is called when the kernel intends to return to userland.
 211  * It is responsible for making the thread the current designated userland
 212  * thread for this cpu, blocking if necessary.
 213  *
 214  * We are expected to handle userland reschedule requests here too.
 215  *
 216  * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 217  * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 218  * occur, this function is called only under very controlled circumstances.
 219  *
 220  * Basically we recalculate our estcpu to hopefully give us a more
 221  * favorable disposition, setrunqueue, then wait for the uschedcp
 222  * designation to be handed to us (if the setrunqueue didn't do it).
 223  *
 224  * MPSAFE
 225  */
 226 static void
 227 bsd4_acquire_curproc(struct lwp *lp)
 228 {
 229         globaldata_t gd = mycpu;
 230         bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
 231
 232         /*
 233          * A user reschedule is pending: reselect, possibly keeping ourselves.
 234          */
 235         if (user_resched_wanted())
 236                 bsd4_select_curproc(gd);
 237
 238         /*
 239          * If uschedcp is still pointing to us, we're done
 240          */
 241         if (dd->uschedcp == lp)
 242                 return;
 243
 244         /*
 245          * If this cpu has no current thread, and the run queue is
 246          * empty, we can safely select ourselves.
 247          */
 248         if (dd->uschedcp == NULL && bsd4_runqcount == 0) {
 249                 atomic_set_int(&bsd4_curprocmask, gd->gd_cpumask);
 250                 dd->uschedcp = lp;
 251                 dd->upri = lp->lwp_priority;
 252                 return;
 253         }
 254
 255         /*
 256          * Adjust estcpu and recalculate our priority, then put us back on
 257          * the user process scheduler's runq.  Only increment the involuntary
 258          * context switch count if the setrunqueue call did not immediately
 259          * schedule us.
 260          *
 261          * Loop until we become the currently scheduled process.  Note that
 262          * calling setrunqueue can cause us to be migrated to another cpu
 263          * after we switch away.
 264          */
 265         do {
 266                 crit_enter();
 267                 bsd4_recalculate_estcpu(lp);
 268                 lwkt_deschedule_self(gd->gd_curthread);
 269                 bsd4_setrunqueue(lp);
 270                 if ((gd->gd_curthread->td_flags & TDF_RUNQ) == 0)
 271                         ++lp->lwp_ru.ru_nivcsw;
 272                 lwkt_switch();
 273                 crit_exit();
 274                 gd = mycpu;             /* we may have migrated: reload cpu */
 275                 dd = &bsd4_pcpu[gd->gd_cpuid];  /* ...and its per-cpu state */
 276         } while (dd->uschedcp != lp);
 277         KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
 278 }
 279
 280 /*
 281  * BSD4_RELEASE_CURPROC
 282  *
 283  * Give up the current-user-thread designation on this cpu, usually
 284  * because the thread needs to run in the kernel (at kernel priority)
 285  * for a while, and select a new current thread for the cpu.
 286  *
 287  * Nothing is done unless lp is this cpu's uschedcp.
 288  *
 289  * NOTE: Unlike dummy_select_curproc() in the dummy example,
 290  * bsd4_select_curproc() is able to select the current process, so
 291  * uschedcp has to be set to NULL before it is called or lp could
 292  * simply be chosen again.
 293  *
 294  * Additionally, note that we may already be on a run queue if releasing
 295  * via the lwkt_switch() in bsd4_setrunqueue().
 296  *
 297  * WARNING!  The MP lock may be in an unsynchronized state due to the
 298  * way get_mplock() works and the fact that this function may be called
 299  * from a passive release during a lwkt_switch().   try_mplock() will deal
 300  * with this for us but you should be aware that td_mpcount may not be
 301  * useable.
 302  *
 303  * MPSAFE
 304  */
 305 static void
 306 bsd4_release_curproc(struct lwp *lp)
 307 {
 308         globaldata_t curgd = mycpu;
 309         bsd4_pcpu_t pcpu = &bsd4_pcpu[curgd->gd_cpuid];
 310
 311         /* Only the cpu's designated user thread has anything to release. */
 312         if (pcpu->uschedcp != lp)
 313                 return;
 314
 315         /*
 316          * Our curprocmask bit is intentionally left set so the scheduler
 317          * helper is not woken needlessly; bsd4_select_curproc() fixes it.
 318          */
 319         KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
 320         pcpu->uschedcp = NULL;          /* keeps lp from being re-selected */
 321         bsd4_select_curproc(curgd);
 322 }
 323
 324 /*
 325  * BSD4_SELECT_CURPROC
 326  *
 327  * Select a new current process for this cpu.  This satisfies a user
 328  * scheduler reschedule request so clear that too.
 329  *
 330  * This routine is also responsible for equal-priority round-robining,
 331  * typically triggered from bsd4_schedulerclock().  If nothing is chosen
 332  * and no lwp is current, the cpu's helper thread is scheduled when work
 333  * is queued and this cpu's bit is set in bsd4_rdyprocmask.
 334  *
 335  * MPSAFE
 336  */
 337 static
 338 void
 339 bsd4_select_curproc(globaldata_t gd)
 340 {
 341         bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
 342         struct lwp *nlp;
 343         int cpuid = gd->gd_cpuid;
 344
 345         crit_enter_gd(gd);
 346         clear_user_resched();   /* This satisfied the reschedule request */
 347         dd->rrcount = 0;        /* Reset the round-robin counter */
 348         /* Every branch below drops bsd4_spin before calling lwkt_schedule(). */
 349         spin_lock_wr(&bsd4_spin);
 350         if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
 351                 atomic_set_int(&bsd4_curprocmask, 1 << cpuid);
 352                 dd->upri = nlp->lwp_priority;
 353                 dd->uschedcp = nlp;
 354                 spin_unlock_wr(&bsd4_spin);
 355 #ifdef SMP
 356                 lwkt_acquire(nlp->lwp_thread);
 357 #endif
 358                 lwkt_schedule(nlp->lwp_thread);
 359         } else if (dd->uschedcp) {      /* nothing chosen: keep current lwp */
 360                 dd->upri = dd->uschedcp->lwp_priority;
 361                 spin_unlock_wr(&bsd4_spin);
 362                 KKASSERT(bsd4_curprocmask & (1 << cpuid));
 363         } else if (bsd4_runqcount && (bsd4_rdyprocmask & (1 << cpuid))) {
 364                 atomic_clear_int(&bsd4_curprocmask, 1 << cpuid);
 365                 atomic_clear_int(&bsd4_rdyprocmask, 1 << cpuid);
 366                 dd->uschedcp = NULL;
 367                 dd->upri = PRIBASE_NULL;
 368                 spin_unlock_wr(&bsd4_spin);
 369                 lwkt_schedule(&dd->helper_thread);      /* wake this cpu's helper */
 370         } else {                        /* no lwp chosen, no helper wakeup */
 371                 dd->uschedcp = NULL;
 372                 dd->upri = PRIBASE_NULL;
 373                 atomic_clear_int(&bsd4_curprocmask, 1 << cpuid);
 374                 spin_unlock_wr(&bsd4_spin);
 375         }
 376         crit_exit_gd(gd);
 377 }
 378
 379 /*
 380  * BSD4_SETRUNQUEUE
 381  *
 382  * This routine hands a runnable lwp to the user scheduler, e.g. after a fork.
 383  *
 384  * The caller may set P_PASSIVE_ACQ in p_flag to indicate that we should
 385  * attempt to leave the thread on the current cpu, i.e. not wake up
 386  * potential target cpus, to take advantage of locality of reference
 387  * (e.g. fork/exec or short fork/exit, and uio_yield()).
 388  *
 389  * NOTE: the code below does not actually test P_PASSIVE_ACQ yet; see
 390  * the XXX in the idle-cpu search at the end of the function.
 391  *
 392  * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 393  * or (user level) preempt on the same cpu that a process was previously
 394  * scheduled to.  If we cannot do this but we are at enough of a higher
 395  * priority than the processes running on other cpus, we will allow the
 396  * process to be stolen by another cpu.
 397  *
 398  * WARNING!  This routine cannot block.  bsd4_acquire_curproc() does
 399  * a deschedule/switch interlock and we can be moved to another cpu
 400  * the moment we are switched out.  Our LWKT run state is the only
 401  * thing preventing the transfer.
 402  *
 403  * The associated thread must NOT currently be scheduled (but can be the
 404  * current process after it has been LWKT descheduled).  It must NOT be on
 405  * a bsd4 scheduler queue either.  The purpose of this routine is to put
 406  * it on a scheduler queue or make it the current user process and LWKT
 407  * schedule it.  It is possible that the thread is in the middle of a LWKT
 408  * switchout on another cpu, lwkt_acquire() deals with that case.
 409  *
 410  * The process must be runnable.
 411  *
 412  * MPSAFE
 413  */
 414 static void
 415 bsd4_setrunqueue(struct lwp *lp)
 416 {
 417         globaldata_t gd;
 418         bsd4_pcpu_t dd;
 419         int cpuid;
 420 #ifdef SMP
 421         cpumask_t mask;
 422         cpumask_t tmpmask;
 423 #endif
 424
 425         /*
 426          * First validate the process state relative to the current cpu.
 427          * We don't need the spinlock for this, just a critical section.
 428          * We are in control of the process.
 429          */
 430         crit_enter();
 431         KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
 432         KASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0,
 433             ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
 434              lp->lwp_tid, lp->lwp_proc->p_flag, lp->lwp_flag));
 435         KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
 436
 437         /*
 438          * Note: gd and dd are relative to the target thread's last cpu,
 439          * NOT our current cpu.
 440          */
 441         gd = lp->lwp_thread->td_gd;
 442         dd = &bsd4_pcpu[gd->gd_cpuid];
 443
 444         /*
 445          * This process is not supposed to be scheduled anywhere or assigned
 446          * as the current process anywhere.  Assert the condition.
 447          */
 448         KKASSERT(dd->uschedcp != lp);
 449
 450         /*
 451          * Check local cpu affinity.  The associated thread is stable at
 452          * the moment.  Note that we may be checking another cpu here so we
 453          * have to be careful.  We can only assign uschedcp on OUR cpu.
 454          *
 455          * This allows us to avoid actually queueing the process.
 456          * acquire_curproc() will handle any threads we mistakenly schedule.
 457          */
 458         cpuid = gd->gd_cpuid;
 459         if (gd == mycpu && (bsd4_curprocmask & (1 << cpuid)) == 0) {
 460                 atomic_set_int(&bsd4_curprocmask, 1 << cpuid);
 461                 dd->uschedcp = lp;
 462                 dd->upri = lp->lwp_priority;
 463                 lwkt_schedule(lp->lwp_thread);
 464                 crit_exit();
 465                 return;
 466         }
 467
 468         /*
 469          * gd and cpuid may still 'hint' at another cpu.  Even so we have
 470          * to place this process on the userland scheduler's run queue for
 471          * action by the target cpu.
 472          */
 473 #ifdef SMP
 474         /*
 475          * XXX fixme.  Could be part of a remrunqueue/setrunqueue
 476          * operation when the priority is recalculated, so TDF_MIGRATING
 477          * may already be set.
 478          */
 479         if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
 480                 lwkt_giveaway(lp->lwp_thread);
 481 #endif
 482
 483         /*
 484          * We lose control of lp the moment we release the spinlock after
 485          * having placed lp on the queue.  i.e. another cpu could pick it
 486          * up and it could exit, or its priority could be further adjusted,
 487          * or something like that.
 488          */
 489         spin_lock_wr(&bsd4_spin);
 490         bsd4_setrunqueue_locked(lp);
 491
 492         /*
 493          * gd, dd, and cpuid are still our target cpu 'hint', not our current
 494          * cpu info.
 495          *
 496          * We always try to schedule a LWP to its original cpu first.  It
 497          * is possible for the scheduler helper or setrunqueue to assign
 498          * the LWP to a different cpu before the one we asked for wakes
 499          * up.
 500          *
 501          * If the LWP is in a better priority class than its target cpu's upri
 502          * (the compare masks both with ~PRIMASK), reschedule on that cpu.
 503          */
 504         if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
 505                 if ((dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK)) {
 506                         dd->upri = lp->lwp_priority;
 507                         spin_unlock_wr(&bsd4_spin);
 508 #ifdef SMP
 509                         if (gd == mycpu) {
 510                                 need_user_resched();
 511                         } else {
 512                                 lwkt_send_ipiq(gd, need_user_resched_remote,
 513                                                NULL);
 514                         }
 515 #else
 516                         need_user_resched();
 517 #endif
 518                         crit_exit();
 519                         return;
 520                 }
 521         }
 522         spin_unlock_wr(&bsd4_spin);
 523
 524 #ifdef SMP
 525         /*
 526          * Otherwise the LWP has a lower priority or we were asked not
 527          * to reschedule.  Look for an idle cpu whose scheduler helper
 528          * is ready to accept more work.
 529          *
 530          * Look for an idle cpu starting at our rotator (bsd4_scancpu).
 531          *
 532          * If no cpus are ready to accept work, just return.
 533          *
 534          * XXX P_PASSIVE_ACQ
 535          */
 536         mask = ~bsd4_curprocmask & bsd4_rdyprocmask & mycpu->gd_other_cpus &
 537             lp->lwp_cpumask;
 538         if (mask) {
 539                 cpuid = bsd4_scancpu;
 540                 if (++cpuid == ncpus)
 541                         cpuid = 0;
 542                 tmpmask = ~((1 << cpuid) - 1);  /* cpus numbered >= cpuid */
 543                 if (mask & tmpmask)
 544                         cpuid = bsfl(mask & tmpmask);
 545                 else
 546                         cpuid = bsfl(mask);     /* none above: wrap around */
 547                 atomic_clear_int(&bsd4_rdyprocmask, 1 << cpuid);
 548                 bsd4_scancpu = cpuid;
 549                 lwkt_schedule(&bsd4_pcpu[cpuid].helper_thread);
 550         }
 551 #endif
 552         crit_exit();
 553 }
 554
 555 /*
 556  * Per-cpu scheduler tick.  Called from a systimer IPI at ESTCPUFREQ on
 557  * each cpu; period and cpstamp are not used by this implementation.  It
 558  * MUST be MP-safe and THE BGL IS NOT HELD ON ENTRY.
 559  *
 560  * Since this is effectively a 'fast' interrupt, spinlocks are only safe
 561  * to use when gd_spinlock_rd is NULL and gd_spinlocks_wr is 0, even if
 562  * the spinlocks are 'non conflicting'.  This is due to the way spinlock
 563  * conflicts against cached read locks are handled.
 564  *
 565  * MPSAFE
 566  */
 567 static void
 568 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
 569 {
 570         globaldata_t curgd = mycpu;
 571         bsd4_pcpu_t pcpu = &bsd4_pcpu[curgd->gd_cpuid];
 572
 573         /*
 574          * Round-robin: once usched_bsd4_rrinterval ticks have been
 575          * counted request a user reschedule (by default 10 times a
 576          * second).  This should only occur for cpu-bound batch processes.
 577          */
 578         pcpu->rrcount++;
 579         if (pcpu->rrcount >= usched_bsd4_rrinterval) {
 580                 pcpu->rrcount = 0;
 581                 need_user_resched();
 582         }
 583
 584         /*
 585          * Charge the tick to the lwp.  The accumulated lwp_estcpu may push
 586          * it into another scheduling queue, typically after 4 ticks.
 587          */
 588         lp->lwp_estcpu = ESTCPULIM(ESTCPUINCR + lp->lwp_estcpu);
 589
 590         /*
 591          * Shrinking lwp_origcpu over time causes more of our estcpu to be
 592          * returned to the parent when we exit; a small tweak for the
 593          * batch detection heuristic.
 594          */
 595         if (lp->lwp_origcpu != 0)
 596                 lp->lwp_origcpu -= 1;
 597
 598         /*
 599          * bsd4_resetpriority() uses spinlocks, so it may only be called
 600          * when we did not interrupt a thread that is using spinlocks;
 601          * otherwise we can deadlock with another cpu waiting for our read
 602          * spinlocks to clear.  In that case just ask for a reschedule.
 603          */
 604         if (curgd->gd_spinlock_rd != NULL || curgd->gd_spinlocks_wr != 0)
 605                 need_user_resched();
 606         else
 607                 bsd4_resetpriority(lp);
 608 }
 609
 610 /*
 611  * Called from acquire and from kern_synch's one-second timer (one of the
 612  * callout helper threads) with a critical section held.
 613  *
 614  * Decay lwp_estcpu based on the number of ticks we haven't been running
 615  * and our p_nice.  As the load increases each process observes a larger
 616  * number of idle ticks (because other processes are running in them).
 617  * This observation leads to a larger correction which tends to make the
 618  * system more 'batchy'.
 619  *
 620  * Note that no recalculation occurs for a process which sleeps and wakes
 621  * up in the same tick.  That is, a system doing thousands of context
 622  * switches per second will still only do serious estcpu calculations
 623  * ESTCPUFREQ times per second.
 624  *
 625  * MPSAFE
 626  */
 627 static
 628 void
 629 bsd4_recalculate_estcpu(struct lwp *lp)
 630 {
 631         globaldata_t gd = mycpu;
 632         sysclock_t cpbase;
 633         int loadfac;
 634         int ndecay;
 635         int nticks;
 636         int nleft;
 637
 638         /*
 639          * We have to subtract periodic to get the last schedclock
 640          * timeout time, otherwise we would get the upcoming timeout.
 641          * Keep in mind that a process can migrate between cpus and
 642          * while the scheduler clock should be very close, boundary
 643          * conditions could lead to a small negative delta.
 644          */
 645         cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
 646
 647         if (lp->lwp_slptime > 1) {
 648                 /*
 649                  * Too much time has passed, do a coarse correction.
 650                  */
 651                 lp->lwp_estcpu = lp->lwp_estcpu >> 1;
 652                 bsd4_resetpriority(lp);
 653                 lp->lwp_cpbase = cpbase;
 654                 lp->lwp_cpticks = 0;
 655         } else if (lp->lwp_cpbase != cpbase) {
 656                 /*
 657                  * Adjust estcpu if we are in a different tick.  Don't waste
 658                  * time if we are in the same tick.
 659                  *
 660                  * First calculate the number of ticks in the measurement
 661                  * interval.  The nticks calculation can wind up 0 due to
 662                  * a bug in the handling of lwp_slptime  (as yet not found),
 663                  * so make sure we do not get a divide by 0 panic.
 664                  */
 665                 nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic;
 666                 if (nticks <= 0)
 667                         nticks = 1;
 668                 updatepcpu(lp, lp->lwp_cpticks, nticks);
 669
 670                 if ((nleft = nticks - lp->lwp_cpticks) < 0)
 671                         nleft = 0;      /* interval ticks not counted in lwp_cpticks */
 672                 if (usched_debug == lp->lwp_proc->p_pid) {
 673                         kprintf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d",
 674                                 lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu,
 675                                 lp->lwp_cpticks, nticks, nleft);
 676                 }
 677
 678                 /*
 679                  * Calculate a decay value based on ticks remaining scaled
 680                  * down by the instantaneous load and p_nice.
 681                  */
 682                 if ((loadfac = bsd4_runqcount) < 2)
 683                         loadfac = 2;
 684                 ndecay = nleft * usched_bsd4_decay * 2 *
 685                         (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2);
 686
 687                 /*
 688                  * Adjust lwp_estcpu.  Handle a border case where batch jobs
 689                  * can get stalled long enough to decay to zero when they
 690                  * shouldn't.
 691                  */
 692                 if (lp->lwp_estcpu > ndecay * 2)
 693                         lp->lwp_estcpu -= ndecay;
 694                 else
 695                         lp->lwp_estcpu >>= 1;   /* halve instead of subtracting */
 696
 697                 if (usched_debug == lp->lwp_proc->p_pid)
 698                         kprintf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu);
 699                 bsd4_resetpriority(lp);
 700                 lp->lwp_cpbase = cpbase;
 701                 lp->lwp_cpticks = 0;
 702         }
 703 }
 704
 705 /*
 706  * Compute the priority of a process when running in user mode.
 707  * Arrange to reschedule if the resulting priority is better
 708  * than that of the current process.
 709  *
 710  * This routine may be called with any process.  It acquires bsd4_spin.
 711  *
 712  * This routine is called by fork1() for initial setup with the process
 713  * off the run queue, and also may be called normally with the process on or
 714  * off the run queue.
 715  *
 716  * MPSAFE
 717  */
 718 static void
 719 bsd4_resetpriority(struct lwp *lp)
 720 {
 721         bsd4_pcpu_t dd;
 722         int newpriority;
 723         u_short newrqtype;
 724         int reschedcpu;
 725
 726         /*
 727          * Calculate the new priority and queue type
 728          */
 729         crit_enter();
 730         spin_lock_wr(&bsd4_spin);
 731
 732         newrqtype = lp->lwp_rtprio.type;
 733
 734         switch(newrqtype) {
 735         case RTP_PRIO_REALTIME:
 736         case RTP_PRIO_FIFO:
 737                 newpriority = PRIBASE_REALTIME +
 738                              (lp->lwp_rtprio.prio & PRIMASK);
 739                 break;
 740         case RTP_PRIO_NORMAL:           /* combine p_nice and lwp_estcpu */
 741                 newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
 742                 newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ;
 743                 newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
 744                               NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
 745                 newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
 746                 break;
 747         case RTP_PRIO_IDLE:
 748                 newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
 749                 break;
 750         case RTP_PRIO_THREAD:
 751                 newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
 752                 break;
 753         default:
 754                 panic("Bad RTP_PRIO %d", newrqtype);
 755                 /* NOT REACHED */
 756         }
 757
 758         /*
 759          * The newpriority incorporates the queue type so do a simple masked
 760          * check to determine if the process has moved to another queue.  If
 761          * it has, and it is currently on a run queue, then move it.
 762          */
 763         if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
 764                 lp->lwp_priority = newpriority;
 765                 if (lp->lwp_flag & LWP_ONRUNQ) {
 766                         bsd4_remrunqueue_locked(lp);
 767                         lp->lwp_rqtype = newrqtype;
 768                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
 769                         bsd4_setrunqueue_locked(lp);
 770                         reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
 771                 } else {
 772                         lp->lwp_rqtype = newrqtype;
 773                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
 774                         reschedcpu = -1;        /* not queued: no resched */
 775                 }
 776         } else {
 777                 lp->lwp_priority = newpriority;
 778                 reschedcpu = -1;                /* same queue: no resched */
 779         }
 780         spin_unlock_wr(&bsd4_spin);
 781
 782         /*
 783          * Determine if we need to reschedule the target cpu.  This only
 784          * occurs if the LWP is already on a scheduler queue, which means
 785          * that idle cpu notification has already occurred.  At most we
 786          * need only issue a need_user_resched() on the appropriate cpu.
 787          */
 788         if (reschedcpu >= 0) {
 789                 dd = &bsd4_pcpu[reschedcpu];
 790                 KKASSERT(dd->uschedcp != lp);
 791                 if ((dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK)) {
 792                         dd->upri = lp->lwp_priority;
 793 #ifdef SMP
 794                         if (reschedcpu == mycpu->gd_cpuid) {
 795                                 need_user_resched();
 796                         } else {
 797                                 lwkt_send_ipiq(lp->lwp_thread->td_gd,
 798                                                need_user_resched_remote, NULL);
 799                         }
 800 #else
 801                         need_user_resched();
 802 #endif
 803                 }
 804         }
 805         crit_exit();
 806 }
 807
 808 /*
 809  * Called from fork1() when a new child process is being created.
 810  *
 811  * Give the child process an initial estcpu that is more batch then
 812  * its parent and dock the parent for the fork (but do not
 813  * reschedule the parent).   This comprises the main part of our batch
 814  * detection heuristic for both parallel forking and sequential execs.
 815  *
 816  * Interactive processes will decay the boosted estcpu quickly while batch
 817  * processes will tend to compound it.
 818  * XXX lwp should be "spawning" instead of "forking"
 819  *
 820  * MPSAFE
 821  */
 822 static void
 823 bsd4_forking(struct lwp *plp, struct lwp *lp)
 824 {
 825         lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
 826         lp->lwp_origcpu = lp->lwp_estcpu;
 827         plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
 828 }
 829
 830 /*
 831  * Called when the parent reaps a child.   Propogate cpu use by the child
 832  * back to the parent.
 833  *
 834  * MPSAFE
 835  */
 836 static void
 837 bsd4_exiting(struct lwp *plp, struct lwp *lp)
 838 {
 839         int delta;
 840
 841         if (plp->lwp_proc->p_pid != 1) {
 842                 delta = lp->lwp_estcpu - lp->lwp_origcpu;
 843                 if (delta > 0)
 844                         plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta);
 845         }
 846 }
 847
 848
 849 /*
 850  * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 851  * it selects a user process and returns it.  If chklp is non-NULL and chklp
 852  * has a better or equal priority then the process that would otherwise be
 853  * chosen, NULL is returned.
 854  *
 855  * Until we fix the RUNQ code the chklp test has to be strict or we may
 856  * bounce between processes trying to acquire the current process designation.
 857  *
 858  * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
 859  *          left intact through the entire routine.
 860  */
 861 static
 862 struct lwp *
 863 chooseproc_locked(struct lwp *chklp)
 864 {
 865         struct lwp *lp;
 866         struct rq *q;
 867         u_int32_t *which, *which2;
 868         u_int32_t pri;
 869         u_int32_t rtqbits;
 870         u_int32_t tsqbits;
 871         u_int32_t idqbits;
 872         cpumask_t cpumask;
 873
 874         rtqbits = bsd4_rtqueuebits;
 875         tsqbits = bsd4_queuebits;
 876         idqbits = bsd4_idqueuebits;
 877         cpumask = mycpu->gd_cpumask;
 878
 879 #ifdef SMP
 880 again:
 881 #endif
 882         if (rtqbits) {
 883                 pri = bsfl(rtqbits);
 884                 q = &bsd4_rtqueues[pri];
 885                 which = &bsd4_rtqueuebits;
 886                 which2 = &rtqbits;
 887         } else if (tsqbits) {
 888                 pri = bsfl(tsqbits);
 889                 q = &bsd4_queues[pri];
 890                 which = &bsd4_queuebits;
 891                 which2 = &tsqbits;
 892         } else if (idqbits) {
 893                 pri = bsfl(idqbits);
 894                 q = &bsd4_idqueues[pri];
 895                 which = &bsd4_idqueuebits;
 896                 which2 = &idqbits;
 897         } else {
 898                 return NULL;
 899         }
 900         lp = TAILQ_FIRST(q);
 901         KASSERT(lp, ("chooseproc: no lwp on busy queue"));
 902
 903 #ifdef SMP
 904         while ((lp->lwp_cpumask & cpumask) == 0) {
 905                 lp = TAILQ_NEXT(lp, lwp_procq);
 906                 if (lp == NULL) {
 907                         *which2 &= ~(1 << pri);
 908                         goto again;
 909                 }
 910         }
 911 #endif
 912
 913         /*
 914          * If the passed lwp <chklp> is reasonably close to the selected
 915          * lwp <lp>, return NULL (indicating that <chklp> should be kept).
 916          *
 917          * Note that we must error on the side of <chklp> to avoid bouncing
 918          * between threads in the acquire code.
 919          */
 920         if (chklp) {
 921                 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
 922                         return(NULL);
 923         }
 924
 925 #ifdef SMP
 926         /*
 927          * If the chosen lwp does not reside on this cpu spend a few
 928          * cycles looking for a better candidate at the same priority level.
 929          * This is a fallback check, setrunqueue() tries to wakeup the
 930          * correct cpu and is our front-line affinity.
 931          */
 932         if (lp->lwp_thread->td_gd != mycpu &&
 933             (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
 934         ) {
 935                 if (chklp->lwp_thread->td_gd == mycpu) {
 936                         ++choose_affinity;
 937                         lp = chklp;
 938                 }
 939         }
 940 #endif
 941
 942         TAILQ_REMOVE(q, lp, lwp_procq);
 943         --bsd4_runqcount;
 944         if (TAILQ_EMPTY(q))
 945                 *which &= ~(1 << pri);
 946         KASSERT((lp->lwp_flag & LWP_ONRUNQ) != 0, ("not on runq6!"));
 947         lp->lwp_flag &= ~LWP_ONRUNQ;
 948         return lp;
 949 }
 950
 951 #ifdef SMP
 952 /*
 953  * Called via an ipi message to reschedule on another cpu.
 954  *
 955  * MPSAFE
 956  */
 957 static
 958 void
 959 need_user_resched_remote(void *dummy)
 960 {
 961         need_user_resched();
 962 }
 963
 964 #endif
 965
 966
 967 /*
 968  * bsd4_remrunqueue_locked() removes a given process from the run queue
 969  * that it is on, clearing the queue busy bit if it becomes empty.
 970  *
 971  * Note that user process scheduler is different from the LWKT schedule.
 972  * The user process scheduler only manages user processes but it uses LWKT
 973  * underneath, and a user process operating in the kernel will often be
 974  * 'released' from our management.
 975  *
 976  * MPSAFE - bsd4_spin must be held exclusively on call
 977  */
 978 static void
 979 bsd4_remrunqueue_locked(struct lwp *lp)
 980 {
 981         struct rq *q;
 982         u_int32_t *which;
 983         u_int8_t pri;
 984
 985         KKASSERT(lp->lwp_flag & LWP_ONRUNQ);
 986         lp->lwp_flag &= ~LWP_ONRUNQ;
 987         --bsd4_runqcount;
 988         KKASSERT(bsd4_runqcount >= 0);
 989
 990         pri = lp->lwp_rqindex;
 991         switch(lp->lwp_rqtype) {
 992         case RTP_PRIO_NORMAL:
 993                 q = &bsd4_queues[pri];
 994                 which = &bsd4_queuebits;
 995                 break;
 996         case RTP_PRIO_REALTIME:
 997         case RTP_PRIO_FIFO:
 998                 q = &bsd4_rtqueues[pri];
 999                 which = &bsd4_rtqueuebits;
1000                 break;
1001         case RTP_PRIO_IDLE:
1002                 q = &bsd4_idqueues[pri];
1003                 which = &bsd4_idqueuebits;
1004                 break;
1005         default:
1006                 panic("remrunqueue: invalid rtprio type");
1007                 /* NOT REACHED */
1008         }
1009         TAILQ_REMOVE(q, lp, lwp_procq);
1010         if (TAILQ_EMPTY(q)) {
1011                 KASSERT((*which & (1 << pri)) != 0,
1012                         ("remrunqueue: remove from empty queue"));
1013                 *which &= ~(1 << pri);
1014         }
1015 }
1016
1017 /*
1018  * bsd4_setrunqueue_locked()
1019  *
1020  * Add a process whos rqtype and rqindex had previously been calculated
1021  * onto the appropriate run queue.   Determine if the addition requires
1022  * a reschedule on a cpu and return the cpuid or -1.
1023  *
1024  * NOTE: Lower priorities are better priorities.
1025  *
1026  * MPSAFE - bsd4_spin must be held exclusively on call
1027  */
1028 static void
1029 bsd4_setrunqueue_locked(struct lwp *lp)
1030 {
1031         struct rq *q;
1032         u_int32_t *which;
1033         int pri;
1034
1035         KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
1036         lp->lwp_flag |= LWP_ONRUNQ;
1037         ++bsd4_runqcount;
1038
1039         pri = lp->lwp_rqindex;
1040
1041         switch(lp->lwp_rqtype) {
1042         case RTP_PRIO_NORMAL:
1043                 q = &bsd4_queues[pri];
1044                 which = &bsd4_queuebits;
1045                 break;
1046         case RTP_PRIO_REALTIME:
1047         case RTP_PRIO_FIFO:
1048                 q = &bsd4_rtqueues[pri];
1049                 which = &bsd4_rtqueuebits;
1050                 break;
1051         case RTP_PRIO_IDLE:
1052                 q = &bsd4_idqueues[pri];
1053                 which = &bsd4_idqueuebits;
1054                 break;
1055         default:
1056                 panic("remrunqueue: invalid rtprio type");
1057                 /* NOT REACHED */
1058         }
1059
1060         /*
1061          * Add to the correct queue and set the appropriate bit.  If no
1062          * lower priority (i.e. better) processes are in the queue then
1063          * we want a reschedule, calculate the best cpu for the job.
1064          *
1065          * Always run reschedules on the LWPs original cpu.
1066          */
1067         TAILQ_INSERT_TAIL(q, lp, lwp_procq);
1068         *which |= 1 << pri;
1069 }
1070
1071 #ifdef SMP
1072
1073 /*
1074  * For SMP systems a user scheduler helper thread is created for each
1075  * cpu and is used to allow one cpu to wakeup another for the purposes of
1076  * scheduling userland threads from setrunqueue().  UP systems do not
1077  * need the helper since there is only one cpu.  We can't use the idle
1078  * thread for this because we need to hold the MP lock.  Additionally,
1079  * doing things this way allows us to HLT idle cpus on MP systems.
1080  *
1081  * MPSAFE
1082  */
1083 static void
1084 sched_thread(void *dummy)
1085 {
1086     globaldata_t gd;
1087     bsd4_pcpu_t  dd;
1088     struct lwp *nlp;
1089     cpumask_t cpumask;
1090     cpumask_t tmpmask;
1091     int cpuid;
1092     int tmpid;
1093
1094     gd = mycpu;
1095     cpuid = gd->gd_cpuid;       /* doesn't change */
1096     cpumask = 1 << cpuid;       /* doesn't change */
1097     dd = &bsd4_pcpu[cpuid];
1098
1099     /*
1100      * The scheduler thread does not need to hold the MP lock.  Since we
1101      * are woken up only when no user processes are scheduled on a cpu, we
1102      * can run at an ultra low priority.
1103      */
1104     rel_mplock();
1105     lwkt_setpri_self(TDPRI_USER_SCHEDULER);
1106
1107     for (;;) {
1108         /*
1109          * We use the LWKT deschedule-interlock trick to avoid racing
1110          * bsd4_rdyprocmask.  This means we cannot block through to the
1111          * manual lwkt_switch() call we make below.
1112          */
1113         crit_enter_gd(gd);
1114         lwkt_deschedule_self(gd->gd_curthread);
1115         spin_lock_wr(&bsd4_spin);
1116         atomic_set_int(&bsd4_rdyprocmask, cpumask);
1117         if ((bsd4_curprocmask & cpumask) == 0) {
1118                 if ((nlp = chooseproc_locked(NULL)) != NULL) {
1119                         atomic_set_int(&bsd4_curprocmask, cpumask);
1120                         dd->upri = nlp->lwp_priority;
1121                         dd->uschedcp = nlp;
1122                         spin_unlock_wr(&bsd4_spin);
1123                         lwkt_acquire(nlp->lwp_thread);
1124                         lwkt_schedule(nlp->lwp_thread);
1125                 } else {
1126                         spin_unlock_wr(&bsd4_spin);
1127                 }
1128         } else {
1129                 /*
1130                  * Someone scheduled us but raced.  In order to not lose
1131                  * track of the fact that there may be a LWP ready to go,
1132                  * forward the request to another cpu if available.
1133                  *
1134                  * Rotate through cpus starting with cpuid + 1.  Since cpuid
1135                  * is already masked out by gd_other_cpus, just use ~cpumask.
1136                  */
1137                 tmpmask = ~bsd4_curprocmask & bsd4_rdyprocmask &
1138                           mycpu->gd_other_cpus;
1139                 if (tmpmask) {
1140                         if (tmpmask & ~(cpumask - 1))
1141                                 tmpid = bsfl(tmpmask & ~(cpumask - 1));
1142                         else
1143                                 tmpid = bsfl(tmpmask);
1144                         bsd4_scancpu = tmpid;
1145                         atomic_clear_int(&bsd4_rdyprocmask, 1 << tmpid);
1146                         spin_unlock_wr(&bsd4_spin);
1147                         lwkt_schedule(&bsd4_pcpu[tmpid].helper_thread);
1148                 } else {
1149                         spin_unlock_wr(&bsd4_spin);
1150                 }
1151         }
1152         crit_exit_gd(gd);
1153         lwkt_switch();
1154     }
1155 }
1156
1157 /*
1158  * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
1159  * been cleared by rqinit() and we should not mess with it further.
1160  */
1161 static void
1162 sched_thread_cpu_init(void)
1163 {
1164     int i;
1165
1166     if (bootverbose)
1167         kprintf("start scheduler helpers on cpus:");
1168
1169     for (i = 0; i < ncpus; ++i) {
1170         bsd4_pcpu_t dd = &bsd4_pcpu[i];
1171         cpumask_t mask = 1 << i;
1172
1173         if ((mask & smp_active_mask) == 0)
1174             continue;
1175
1176         if (bootverbose)
1177             kprintf(" %d", i);
1178
1179         lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
1180                     TDF_STOPREQ, i, "usched %d", i);
1181
1182         /*
1183          * Allow user scheduling on the target cpu.  cpu #0 has already
1184          * been enabled in rqinit().
1185          */
1186         if (i)
1187             atomic_clear_int(&bsd4_curprocmask, mask);
1188         atomic_set_int(&bsd4_rdyprocmask, mask);
1189     }
1190     if (bootverbose)
1191         kprintf("\n");
1192 }
1193 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
1194         sched_thread_cpu_init, NULL)
1195
1196 #endif
1197