1 /*
2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/kernel.h>
30 #include <sys/lock.h>
31 #include <sys/queue.h>
32 #include <sys/proc.h>
33 #include <sys/rtprio.h>
34 #include <sys/uio.h>
35 #include <sys/sysctl.h>
36 #include <sys/resourcevar.h>
37 #include <sys/spinlock.h>
38 #include <sys/cpu_topology.h>
39 #include <sys/thread2.h>
40 #include <sys/spinlock2.h>
41 #include <sys/mplock2.h>
42
43 #include <sys/ktr.h>
44
45 #include <machine/cpu.h>
46 #include <machine/smp.h>
47
48 /*
49  * Priorities.  Note that with 32 run queues per scheduler each queue
50  * represents four priority levels.
51  */
52
53 #define MAXPRI                  128
54 #define PRIMASK                 (MAXPRI - 1)
55 #define PRIBASE_REALTIME        0
56 #define PRIBASE_NORMAL          MAXPRI
57 #define PRIBASE_IDLE            (MAXPRI * 2)
58 #define PRIBASE_THREAD          (MAXPRI * 3)
59 #define PRIBASE_NULL            (MAXPRI * 4)
60
61 #define NQS     32                      /* 32 run queues. */
62 #define PPQ     (MAXPRI / NQS)          /* priorities per queue */
63 #define PPQMASK (PPQ - 1)
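/*
 * With MAXPRI 128 and NQS 32, PPQ is 4: priorities within the same
 * block of PPQ values map onto the same run queue, which is why the
 * priority comparisons below mask with ~PPQMASK to compare at queue
 * granularity.
 */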
64
65 /*
66  * NICEPPQ      - number of nice units per priority queue
67  *
68  * ESTCPUPPQ    - number of estcpu units per priority queue
69  * ESTCPUMAX    - number of estcpu units
70  */
71 #define NICEPPQ         2
72 #define ESTCPUPPQ       512
73 #define ESTCPUMAX       (ESTCPUPPQ * NQS)
74 #define BATCHMAX        (ESTCPUFREQ * 30)
75 #define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)
76
77 #define ESTCPULIM(v)    min((v), ESTCPUMAX)
78
79 TAILQ_HEAD(rq, lwp);
80
81 #define lwp_priority    lwp_usdata.bsd4.priority
82 #define lwp_rqindex     lwp_usdata.bsd4.rqindex
83 #define lwp_estcpu      lwp_usdata.bsd4.estcpu
84 #define lwp_batch       lwp_usdata.bsd4.batch
85 #define lwp_rqtype      lwp_usdata.bsd4.rqtype
86
87 static void bsd4_acquire_curproc(struct lwp *lp);
88 static void bsd4_release_curproc(struct lwp *lp);
89 static void bsd4_select_curproc(globaldata_t gd);
90 static void bsd4_setrunqueue(struct lwp *lp);
91 static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
92                                 sysclock_t cpstamp);
93 static void bsd4_recalculate_estcpu(struct lwp *lp);
94 static void bsd4_resetpriority(struct lwp *lp);
95 static void bsd4_forking(struct lwp *plp, struct lwp *lp);
96 static void bsd4_exiting(struct lwp *lp, struct proc *);
97 static void bsd4_yield(struct lwp *lp);
98
99 #ifdef SMP
100 static void need_user_resched_remote(void *dummy);
101 static int batchy_looser_pri_test(struct lwp* lp);
102 static struct lwp *chooseproc_locked_cache_coherent(struct lwp *chklp);
103 #endif
104 static struct lwp *chooseproc_locked(struct lwp *chklp);
105 static void bsd4_remrunqueue_locked(struct lwp *lp);
106 static void bsd4_setrunqueue_locked(struct lwp *lp);
107
108 struct usched usched_bsd4 = {
109         { NULL },
110         "bsd4", "Original DragonFly Scheduler",
111         NULL,                   /* default registration */
112         NULL,                   /* default deregistration */
113         bsd4_acquire_curproc,
114         bsd4_release_curproc,
115         bsd4_setrunqueue,
116         bsd4_schedulerclock,
117         bsd4_recalculate_estcpu,
118         bsd4_resetpriority,
119         bsd4_forking,
120         bsd4_exiting,
121         NULL,                   /* setcpumask not supported */
122         bsd4_yield
123 };
124
125 struct usched_bsd4_pcpu {
126         struct thread   helper_thread;
127         short           rrcount;
128         short           upri;
129         struct lwp      *uschedcp;
130         struct lwp      *old_uschedcp;
131 #ifdef SMP
132         cpu_node_t      *cpunode;
133 #endif
134 };
135
136 typedef struct usched_bsd4_pcpu *bsd4_pcpu_t;
137
138 /*
139  * We have NQS (32) run queues per scheduling class.  For the normal
140  * class, there are 128 priorities scaled onto these 32 queues.  New
141  * processes are added to the last entry in each queue, and processes
142  * are selected for running by taking them from the head and maintaining
143  * a simple FIFO arrangement.  Realtime and Idle priority processes have
144  * an explicit 0-31 priority which maps directly onto their class queue
145  * index.  When a queue has something in it, the corresponding bit is
146  * set in the queuebits variable, allowing a single read to determine
147  * the state of all 32 queues and then a ffs() to find the first busy
148  * queue.
149  */
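/*
 * Example: if only normal-class queues 3 and 9 are occupied then
 * bsd4_queuebits is (1 << 3) | (1 << 9) and bsfl(bsd4_queuebits)
 * returns 3, selecting bsd4_queues[3] as the first non-empty queue.
 */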
150 static struct rq bsd4_queues[NQS];
151 static struct rq bsd4_rtqueues[NQS];
152 static struct rq bsd4_idqueues[NQS];
153 static u_int32_t bsd4_queuebits;
154 static u_int32_t bsd4_rtqueuebits;
155 static u_int32_t bsd4_idqueuebits;
156 static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */
157 static cpumask_t bsd4_rdyprocmask;      /* ready to accept a user process */
158 static int       bsd4_runqcount;
159 #ifdef SMP
160 static volatile int bsd4_scancpu;
161 #endif
162 static struct spinlock bsd4_spin;
163 static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];
164 static struct sysctl_ctx_list usched_bsd4_sysctl_ctx;
165 static struct sysctl_oid *usched_bsd4_sysctl_tree;
166
167 /* Debug info exposed through debug.* sysctl */
168
169 SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0,
170     "Number of run queues");
171 #ifdef INVARIANTS
172 static int usched_nonoptimal;
173 SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
174         &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
175 static int usched_optimal;
176 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
177         &usched_optimal, 0, "acquire_curproc() was optimal");
178 #endif
179
180 static int usched_bsd4_debug = -1;
181 SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_bsd4_debug, 0,
182     "Print debug information for this pid");
183 static int usched_bsd4_pid_debug = -1;
184 SYSCTL_INT(_debug, OID_AUTO, pid_debug, CTLFLAG_RW, &usched_bsd4_pid_debug, 0,
185     "Print KTR debug information for this pid");
186
187 #ifdef SMP
188 static int remote_resched_nonaffinity;
189 static int remote_resched_affinity;
190 static int choose_affinity;
191 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
192         &remote_resched_nonaffinity, 0, "Number of remote rescheds without affinity");
193 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
194         &remote_resched_affinity, 0, "Number of remote rescheds with affinity");
195 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
196         &choose_affinity, 0, "chooseproc() was smart");
197 #endif
198
199
200 /* Tuning usched_bsd4 - configurable through kern.usched_bsd4.* */
201 #ifdef SMP
202 static int usched_bsd4_smt = 0;
203 static int usched_bsd4_cache_coherent = 0;
204 static int usched_bsd4_upri_affinity = 16; /* 32 queues - half-way */
205 static int usched_bsd4_queue_checks = 5;
206 static int usched_bsd4_stick_to_level = 0;
207 #endif
208 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
209 static int usched_bsd4_decay = 8;
210 static int usched_bsd4_batch_time = 10;
211
212 /* KTR debug printings */
213
214 KTR_INFO_MASTER(usched);
215
216 #if !defined(KTR_USCHED_BSD4)
217 #define KTR_USCHED_BSD4 KTR_ALL
218 #endif
219
220 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0,
221     "USCHED_BSD4(bsd4_acquire_curproc in user_reseched_wanted "
222     "after release: pid %d, cpuid %d, curr_cpuid %d)",
223     pid_t pid, int cpuid, int curr);
224 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0,
225     "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, "
226     "curr_cpuid %d)",
227     pid_t pid, int cpuid, int curr);
228 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0,
229     "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after "
230     "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)",
231     pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid);
232 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0,
233     "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, "
234     "cpuid %d, curr_cpuid %d)",
235     pid_t pid, int cpuid, int curr);
236
237 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0,
238     "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
239     "cpuid %d, curr_cpuid %d)",
240     pid_t pid, int cpuid, int curr);
241
242 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0,
243     "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
244     "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)",
245     pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr);
246
247 #ifdef SMP
248 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0,
249     "USCHED_BSD4(batchy_looser_pri_test false: pid %d, "
250     "cpuid %d, verify_mask %lu)",
251     pid_t pid, int cpuid, cpumask_t mask);
252 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0,
253     "USCHED_BSD4(batchy_looser_pri_test true: pid %d, "
254     "cpuid %d, verify_mask %lu)",
255     pid_t pid, int cpuid, cpumask_t mask);
256
257 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0,
258     "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, "
259     "mask %lu, curr_cpuid %d)",
260     pid_t pid, int cpuid, cpumask_t mask, int curr);
261 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0,
262     "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, "
263     "cpuid %d, mask %lu, curr_cpuid %d)",
264     pid_t pid, int cpuid, cpumask_t mask, int curr);
265 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0,
266     "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, "
267     "cpuid %d, mask %lu, curr_cpuid %d)",
268     pid_t pid, int cpuid, cpumask_t mask, int curr);
269 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0,
270     "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, "
271     "mask %lu, found_cpuid %d, curr_cpuid %d)",
272     pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
273 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0,
274     "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, "
275     "try_cpuid %d, curr_cpuid %d)",
276     pid_t pid, int cpuid, int try_cpuid, int curr);
277 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0,
278     "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, "
279     "mask %lu, found_cpuid %d, curr_cpuid %d)",
280     pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
281 #endif
282
283 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc, 0,
284     "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
285     pid_t pid, int old_cpuid, int curr);
286 #ifdef SMP
287 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0,
288     "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)",
289     pid_t pid, int old_cpuid, int curr);
290 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0,
291     "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, "
292     "sibling_mask %lu, curr_cpumask %lu)",
293     pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
294 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0,
295     "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, "
296     "sibling_mask %lu, curr_cpumask: %lu)",
297     pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
298
299 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0,
300     "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)",
301     int id, pid_t pid, int cpuid);
302 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0,
303     "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)",
304     int id, pid_t pid, int cpuid);
305 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0,
306     "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)",
307     int id, cpumask_t tmpmask);
308 #endif
309
310 /*
311  * Initialize the run queues at boot time.
312  */
313 static void
314 rqinit(void *dummy)
315 {
316         int i;
317
318         spin_init(&bsd4_spin);
319         for (i = 0; i < NQS; i++) {
320                 TAILQ_INIT(&bsd4_queues[i]);
321                 TAILQ_INIT(&bsd4_rtqueues[i]);
322                 TAILQ_INIT(&bsd4_idqueues[i]);
323         }
324         atomic_clear_cpumask(&bsd4_curprocmask, 1);
325 }
326 SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL)
327
328 /*
329  * BSD4_ACQUIRE_CURPROC
330  *
331  * This function is called when the kernel intends to return to userland.
332  * It is responsible for making the thread the current designated userland
333  * thread for this cpu, blocking if necessary.
334  *
335  * The kernel has already depressed our LWKT priority so we must not switch
336  * until we have either assigned or disposed of the thread.
337  *
338  * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
339  * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
340  * occur, this function is called only under very controlled circumstances.
341  *
342  * MPSAFE
343  */
344 static void
345 bsd4_acquire_curproc(struct lwp *lp)
346 {
347         globaldata_t gd;
348         bsd4_pcpu_t dd;
349         thread_t td;
350 #if 0
351         struct lwp *olp;
352 #endif
353
354         /*
355          * Make sure we aren't sitting on a tsleep queue.
356          */
357         td = lp->lwp_thread;
358         crit_enter_quick(td);
359         if (td->td_flags & TDF_TSLEEPQ)
360                 tsleep_remove(td);
361         bsd4_recalculate_estcpu(lp);
362
363         /*
364          * If a reschedule was requested give another thread the
365          * driver's seat.
366          */
367         if (user_resched_wanted()) {
368                 clear_user_resched();
369                 bsd4_release_curproc(lp);
370
371                 KTR_COND_LOG(usched_bsd4_acquire_curproc_urw,
372                     lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
373                     lp->lwp_proc->p_pid,
374                     lp->lwp_thread->td_gd->gd_cpuid,
375                     mycpu->gd_cpuid);
376         }
377
378         /*
379          * Loop until we are the current user thread
380          */
381         gd = mycpu;
382         dd = &bsd4_pcpu[gd->gd_cpuid];
383
384         KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop,
385             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
386             lp->lwp_proc->p_pid,
387             lp->lwp_thread->td_gd->gd_cpuid,
388             gd->gd_cpuid);
389
390         do {
391                 /*
392                  * Process any pending events and higher priority threads.
393                  */
394                 lwkt_yield();
395
396                 /*
397                  * Become the currently scheduled user thread for this cpu
398                  * if we can do so trivially.
399                  *
400                  * We can steal another thread's current-thread designation
401                  * on this cpu because if we are running, that other thread
402                  * cannot be, so we can safely deschedule it.
403                  */
404                 if (dd->uschedcp == lp) {
405                         /*
406                          * We are already the current lwp (hot path).
407                          */
408                         dd->upri = lp->lwp_priority;
409                 } else if (dd->uschedcp == NULL) {
410                         /*
411                          * We can trivially become the current lwp.
412                          */
413                         atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
414                         dd->uschedcp = lp;
415                         dd->upri = lp->lwp_priority;
416                 } else if (dd->upri > lp->lwp_priority) {
417                         /*
418                          * We can steal the current cpu's lwp designation
419                          * away simply by replacing it.  The other thread
420                          * will stall when it tries to return to userland.
421                          */
422                         dd->uschedcp = lp;
423                         dd->upri = lp->lwp_priority;
424                         /*
425                         lwkt_deschedule(olp->lwp_thread);
426                         bsd4_setrunqueue(olp);
427                         */
428                 } else {
429                         /*
430                          * We cannot become the current lwp, place the lp
431                          * on the bsd4 run-queue and deschedule ourselves.
432                          *
433                          * When we are reactivated we will have another
434                          * chance.
435                          */
436                         lwkt_deschedule(lp->lwp_thread);
437
438                         bsd4_setrunqueue(lp);
439
440                         KTR_COND_LOG(usched_bsd4_acquire_curproc_not,
441                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
442                             lp->lwp_proc->p_pid,
443                             lp->lwp_thread->td_gd->gd_cpuid,
444                             dd->uschedcp->lwp_proc->p_pid,
445                             gd->gd_cpuid);
446
447
448                         lwkt_switch();
449
450                         /*
451                          * Reload after a switch or setrunqueue/switch possibly
452                          * moved us to another cpu.
453                          */
454                         gd = mycpu;
455                         dd = &bsd4_pcpu[gd->gd_cpuid];
456
457                         KTR_COND_LOG(usched_bsd4_acquire_curproc_switch,
458                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
459                             lp->lwp_proc->p_pid,
460                             lp->lwp_thread->td_gd->gd_cpuid,
461                             gd->gd_cpuid);
462                 }
463         } while (dd->uschedcp != lp);
464
465         crit_exit_quick(td);
466         KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
467 }
468
469 /*
470  * BSD4_RELEASE_CURPROC
471  *
472  * This routine detaches the current thread from the userland scheduler,
473  * usually because the thread needs to run or block in the kernel (at
474  * kernel priority) for a while.
475  *
476  * This routine is also responsible for selecting a new thread to
477  * make the current thread.
478  *
479  * NOTE: This implementation differs from the dummy example in that
480  * bsd4_select_curproc() is able to select the current process, whereas
481  * dummy_select_curproc() is not able to select the current process.
482  * This means we have to NULL out uschedcp.
483  *
484  * Additionally, note that we may already be on a run queue if releasing
485  * via the lwkt_switch() in bsd4_setrunqueue().
486  *
487  * MPSAFE
488  */
489
490 static void
491 bsd4_release_curproc(struct lwp *lp)
492 {
493         globaldata_t gd = mycpu;
494         bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
495
496         if (dd->uschedcp == lp) {
497                 crit_enter();
498                 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
499
500                 KTR_COND_LOG(usched_bsd4_release_curproc,
501                     lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
502                     lp->lwp_proc->p_pid,
503                     lp->lwp_thread->td_gd->gd_cpuid,
504                     gd->gd_cpuid);
505
506                 dd->uschedcp = NULL;    /* don't let lp be selected */
507                 dd->upri = PRIBASE_NULL;
508                 atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
509                 dd->old_uschedcp = lp;  /* used only for KTR debug prints */
510                 bsd4_select_curproc(gd);
511                 crit_exit();
512         }
513 }
514
515 /*
516  * BSD4_SELECT_CURPROC
517  *
518  * Select a new current process for this cpu and clear any pending user
519  * reschedule request.  The cpu currently has no current process.
520  *
521  * This routine is also responsible for equal-priority round-robining,
522  * typically triggered from bsd4_schedulerclock().  In our dummy example
523  * all the 'user' threads are LWKT scheduled all at once and we just
524  * call lwkt_switch().
525  *
526  * The calling process is not on the queue and cannot be selected.
527  *
528  * MPSAFE
529  */
530 static
531 void
532 bsd4_select_curproc(globaldata_t gd)
533 {
534         bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
535         struct lwp *nlp;
536         int cpuid = gd->gd_cpuid;
537
538         crit_enter_gd(gd);
539
540         spin_lock(&bsd4_spin);
541 #ifdef SMP
542         if (usched_bsd4_cache_coherent)
543                 nlp = chooseproc_locked_cache_coherent(dd->uschedcp);
544         else
545 #endif
546                 nlp = chooseproc_locked(dd->uschedcp);
547
548         if (nlp) {
549
550                 KTR_COND_LOG(usched_bsd4_select_curproc,
551                     nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
552                     nlp->lwp_proc->p_pid,
553                     nlp->lwp_thread->td_gd->gd_cpuid,
554                     dd->old_uschedcp->lwp_proc->p_pid,
555                     dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid,
556                     gd->gd_cpuid);
557
558                 atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid));
559                 dd->upri = nlp->lwp_priority;
560                 dd->uschedcp = nlp;
561                 spin_unlock(&bsd4_spin);
562 #ifdef SMP
563                 lwkt_acquire(nlp->lwp_thread);
564 #endif
565                 lwkt_schedule(nlp->lwp_thread);
566         } else {
567                 spin_unlock(&bsd4_spin);
568         }
569
570 #if 0
571         } else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) {
572                 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
573                 spin_unlock(&bsd4_spin);
574                 lwkt_schedule(&dd->helper_thread);
575         } else {
576                 spin_unlock(&bsd4_spin);
577         }
578 #endif
579         crit_exit_gd(gd);
580 }
581 #ifdef SMP
582
583 /*
584  * batchy_looser_pri_test() - determine if a process is batchy or not
585  * relative to the other processes running in the system
586  */
587 static int
588 batchy_looser_pri_test(struct lwp* lp)
589 {
590         cpumask_t mask;
591         bsd4_pcpu_t other_dd;
592         int cpu;
593
594         /* Current running processes */
595         mask = bsd4_curprocmask & smp_active_mask
596             & usched_global_cpumask;
597
598         while (mask) {
599                 cpu = BSFCPUMASK(mask);
600                 other_dd = &bsd4_pcpu[cpu];
601                 if (other_dd->upri - lp->lwp_priority > usched_bsd4_upri_affinity * PPQ) {
602
603                         KTR_COND_LOG(usched_batchy_test_false,
604                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
605                             lp->lwp_proc->p_pid,
606                             lp->lwp_thread->td_gd->gd_cpuid,
607                             mask);
608
609                         return 0;
610                 }
611                 mask &= ~CPUMASK(cpu);
612         }
613
614         KTR_COND_LOG(usched_batchy_test_true,
615             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
616             lp->lwp_proc->p_pid,
617             lp->lwp_thread->td_gd->gd_cpuid,
618             mask);
619
620         return 1;
621 }
622
623 #endif
624 /*
625  *
626  * BSD4_SETRUNQUEUE
627  *
628  * Place the specified lwp on the user scheduler's run queue.  This routine
629  * must be called with the thread descheduled.  The lwp must be runnable.
630  *
631  * The thread may be the current thread as a special case.
632  *
633  * MPSAFE
634  */
635 static void
636 bsd4_setrunqueue(struct lwp *lp)
637 {
638         globaldata_t gd;
639         bsd4_pcpu_t dd;
640 #ifdef SMP
641         int cpuid;
642         cpumask_t mask;
643         cpumask_t tmpmask;
644 #endif
645
646         /*
647          * First validate the process state relative to the current cpu.
648          * We don't need the spinlock for this, just a critical section.
649          * We are in control of the process.
650          */
651         crit_enter();
652         KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
653         KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
654             ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
655              lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
656         KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
657
658         /*
659          * Note: gd and dd are relative to the target thread's last cpu,
660          * NOT our current cpu.
661          */
662         gd = lp->lwp_thread->td_gd;
663         dd = &bsd4_pcpu[gd->gd_cpuid];
664
665         /*
666          * This process is not supposed to be scheduled anywhere or assigned
667          * as the current process anywhere.  Assert the condition.
668          */
669         KKASSERT(dd->uschedcp != lp);
670
671 #ifndef SMP
672         /*
673          * If we are not SMP we do not have a scheduler helper to kick
674          * and must directly activate the process if none are scheduled.
675          *
676          * This is really only an issue when bootstrapping init since
677          * the caller in all other cases will be a user process, and
678          * even if released (dd->uschedcp == NULL), that process will
679          * kickstart the scheduler when it returns to user mode from
680          * the kernel.
681          */
682         if (dd->uschedcp == NULL) {
683                 atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
684                 dd->uschedcp = lp;
685                 dd->upri = lp->lwp_priority;
686                 lwkt_schedule(lp->lwp_thread);
687                 crit_exit();
688                 return;
689         }
690 #endif
691
692 #ifdef SMP
693         /*
694          * XXX fixme.  Could be part of a remrunqueue/setrunqueue
695          * operation when the priority is recalculated, so TDF_MIGRATING
696          * may already be set.
697          */
698         if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
699                 lwkt_giveaway(lp->lwp_thread);
700 #endif
701
702         /*
703          * We lose control of lp the moment we release the spinlock after
704          * having placed lp on the queue.  i.e. another cpu could pick it
705          * up and it could exit, or its priority could be further adjusted,
706          * or something like that.
707          */
708         spin_lock(&bsd4_spin);
709         bsd4_setrunqueue_locked(lp);
710         lp->lwp_setrunqueue_ticks = sched_ticks;
711
712 #ifdef SMP
713         /*
714          * Kick the scheduler helper on one of the other cpu's
715          * and request a reschedule if appropriate.
716          *
717          * NOTE: We check all cpus whose rdyprocmask bit is set.  First we
718          *       look for cpus without designated lps, then we look for
719          *       cpus with designated lps with a worse priority than our
720          *       process.
721          */
722         ++bsd4_scancpu;
723
724         if (usched_bsd4_smt) {
725
726                 /*
727                  * SMT heuristic - Try to schedule on a free physical core.  If none is
728                  * found, choose the cpu whose sibling runs the most interactive thread.
729                  */
730
731                 int best_cpuid = -1;
732                 int min_prio = MAXPRI * MAXPRI;
733                 int sibling;
734
735                 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
736                 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
737                     smp_active_mask & usched_global_cpumask;
738
739                 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt,
740                     lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
741                     lp->lwp_proc->p_pid,
742                     lp->lwp_thread->td_gd->gd_cpuid,
743                     mask,
744                     mycpu->gd_cpuid);
745
746                 while (mask) {
747                         tmpmask = ~(CPUMASK(cpuid) - 1);
748                         if (mask & tmpmask)
749                                 cpuid = BSFCPUMASK(mask & tmpmask);
750                         else
751                                 cpuid = BSFCPUMASK(mask);
752                         gd = globaldata_find(cpuid);
753                         dd = &bsd4_pcpu[cpuid];
754
755                         if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) {
756                                 if (dd->cpunode->parent_node->members & ~dd->cpunode->members & mask) {
757
758                                         KTR_COND_LOG(usched_bsd4_setrunqueue_found,
759                                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
760                                             lp->lwp_proc->p_pid,
761                                             lp->lwp_thread->td_gd->gd_cpuid,
762                                             mask,
763                                             cpuid,
764                                             mycpu->gd_cpuid);
765
766                                         goto found;
767                                 } else {
768                                         sibling = BSFCPUMASK(dd->cpunode->parent_node->members &
769                                             ~dd->cpunode->members);
770                                         if (min_prio > bsd4_pcpu[sibling].upri) {
771                                                 min_prio = bsd4_pcpu[sibling].upri;
772                                                 best_cpuid = cpuid;
773                                         }
774                                 }
775                         }
776                         mask &= ~CPUMASK(cpuid);
777                 }
778
779                 if (best_cpuid != -1) {
780                         cpuid = best_cpuid;
781                         gd = globaldata_find(cpuid);
782                         dd = &bsd4_pcpu[cpuid];
783
784                         KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid,
785                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
786                             lp->lwp_proc->p_pid,
787                             lp->lwp_thread->td_gd->gd_cpuid,
788                             mask,
789                             cpuid,
790                             mycpu->gd_cpuid);
791
792                         goto found;
793                 }
794         } else {
795                 /* Fallback to the original heuristic */
796                 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
797                 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
798                        smp_active_mask & usched_global_cpumask;
799
800                 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt,
801                     lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
802                     lp->lwp_proc->p_pid,
803                     lp->lwp_thread->td_gd->gd_cpuid,
804                     mask,
805                     mycpu->gd_cpuid);
806
807                 while (mask) {
808                         tmpmask = ~(CPUMASK(cpuid) - 1);
809                         if (mask & tmpmask)
810                                 cpuid = BSFCPUMASK(mask & tmpmask);
811                         else
812                                 cpuid = BSFCPUMASK(mask);
813                         gd = globaldata_find(cpuid);
814                         dd = &bsd4_pcpu[cpuid];
815
816                         if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) {
817
818                                 KTR_COND_LOG(usched_bsd4_setrunqueue_found,
819                                     lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
820                                     lp->lwp_proc->p_pid,
821                                     lp->lwp_thread->td_gd->gd_cpuid,
822                                     mask,
823                                     cpuid,
824                                     mycpu->gd_cpuid);
825
826                                 goto found;
827                         }
828                         mask &= ~CPUMASK(cpuid);
829                 }
830         }
831
832         /*
833          * Then cpus which might have a currently running lp
834          */
835         mask = bsd4_curprocmask & bsd4_rdyprocmask &
836                lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;
837
838         KTR_COND_LOG(usched_bsd4_setrunqueue_rc,
839             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
840             lp->lwp_proc->p_pid,
841             lp->lwp_thread->td_gd->gd_cpuid,
842             mask,
843             mycpu->gd_cpuid);
844
845         while (mask) {
846                 tmpmask = ~(CPUMASK(cpuid) - 1);
847                 if (mask & tmpmask)
848                         cpuid = BSFCPUMASK(mask & tmpmask);
849                 else
850                         cpuid = BSFCPUMASK(mask);
851                 gd = globaldata_find(cpuid);
852                 dd = &bsd4_pcpu[cpuid];
853
854                 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
855
856                         KTR_COND_LOG(usched_bsd4_setrunqueue_found,
857                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
858                             lp->lwp_proc->p_pid,
859                             lp->lwp_thread->td_gd->gd_cpuid,
860                             mask,
861                             cpuid,
862                             mycpu->gd_cpuid);
863
864                         goto found;
865                 }
866                 mask &= ~CPUMASK(cpuid);
867         }
868
869         /*
870          * If we cannot find a suitable cpu we reload from bsd4_scancpu
871          * and round-robin.  Other cpus will pickup as they release their
872          * current lwps or become ready.
873          *
874          * Avoid a degenerate system lockup case if usched_global_cpumask
875          * is set to 0 or otherwise does not cover lwp_cpumask.
876          *
877          * We only kick the target helper thread in this case; we do not
878          * set the user resched flag.
879          */
880         cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
881         if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) {
882                 cpuid = 0;
883         }
884         gd = globaldata_find(cpuid);
885         dd = &bsd4_pcpu[cpuid];
886
887         KTR_COND_LOG(usched_bsd4_setrunqueue_not_found,
888             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
889             lp->lwp_proc->p_pid,
890             lp->lwp_thread->td_gd->gd_cpuid,
891             cpuid,
892             mycpu->gd_cpuid);
893
894 found:
895         if (gd == mycpu) {
896                 spin_unlock(&bsd4_spin);
897                 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
898                         if (dd->uschedcp == NULL) {
899                                 wakeup(&dd->helper_thread);
900                         } else {
901                                 need_user_resched();
902                         }
903                 }
904         } else {
905                 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
906                 spin_unlock(&bsd4_spin);
907                 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
908                         lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
909                 else
910                         wakeup(&dd->helper_thread);
911         }
912 #else
913         /*
914          * Request a reschedule if appropriate.
915          */
916         spin_unlock(&bsd4_spin);
917         if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
918                 need_user_resched();
919         }
920 #endif
921         crit_exit();
922 }
923
924 /*
925  * This routine is called from a systimer IPI.  It MUST be MP-safe and
926  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
927  * each cpu.
928  *
929  * MPSAFE
930  */
931 static
932 void
933 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
934 {
935         globaldata_t gd = mycpu;
936         bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
937
938         /*
939          * Do we need to round-robin?  We round-robin 10 times a second.
940          * This should only occur for cpu-bound batch processes.
941          */
942         if (++dd->rrcount >= usched_bsd4_rrinterval) {
943                 dd->rrcount = 0;
944                 need_user_resched();
945         }
946
947         /*
948          * Adjust estcpu upward using a real time equivalent calculation.
949          */
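        /*
         * Each tick adds ESTCPUMAX / ESTCPUFREQ (+1) and this routine
         * runs ESTCPUFREQ times per second, so a continuously running
         * lwp accumulates roughly ESTCPUMAX worth of estcpu per second
         * of cpu time, i.e. it hits the ESTCPULIM() cap within about a
         * second of continuous execution.
         */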
950         lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);
951
952         /*
953          * Spinlocks also hold a critical section so there should not be
954          * any active.
955          */
956         KKASSERT(gd->gd_spinlocks_wr == 0);
957
958         bsd4_resetpriority(lp);
959 #if 0
960         /*
961          * If we can't call bsd4_resetpriority for some reason we must call
962          * need_user_resched().
963          */
964         need_user_resched();
965 #endif
966 }
967
968 /*
969  * Called from acquire and from kern_synch's one-second timer (one of the
970  * callout helper threads) with a critical section held.
971  *
972  * Decay p_estcpu based on the number of ticks we haven't been running
973  * and our p_nice.  As the load increases each process observes a larger
974  * number of idle ticks (because other processes are running in them).
975  * This observation leads to a larger correction which tends to make the
976  * system more 'batchy'.
977  *
978  * Note that no recalculation occurs for a process which sleeps and wakes
979  * up in the same tick.  That is, a system doing thousands of context
980  * switches per second will still only do serious estcpu calculations
981  * ESTCPUFREQ times per second.
982  *
983  * MPSAFE
984  */
985 static
986 void
987 bsd4_recalculate_estcpu(struct lwp *lp)
988 {
989         globaldata_t gd = mycpu;
990         sysclock_t cpbase;
991         sysclock_t ttlticks;
992         int estcpu;
993         int decay_factor;
994
995         /*
996          * We have to subtract periodic to get the last schedclock
997          * timeout time, otherwise we would get the upcoming timeout.
998          * Keep in mind that a process can migrate between cpus and
999          * while the scheduler clock should be very close, boundary
1000          * conditions could lead to a small negative delta.
1001          */
1002         cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
1003
1004         if (lp->lwp_slptime > 1) {
1005                 /*
1006                  * Too much time has passed, do a coarse correction.
1007                  */
1008                 lp->lwp_estcpu = lp->lwp_estcpu >> 1;
1009                 bsd4_resetpriority(lp);
1010                 lp->lwp_cpbase = cpbase;
1011                 lp->lwp_cpticks = 0;
1012                 lp->lwp_batch -= ESTCPUFREQ;
1013                 if (lp->lwp_batch < 0)
1014                         lp->lwp_batch = 0;
1015         } else if (lp->lwp_cpbase != cpbase) {
1016                 /*
1017                  * Adjust estcpu if we are in a different tick.  Don't waste
1018                  * time if we are in the same tick.
1019                  *
1020                  * First calculate the number of ticks in the measurement
1021                  * interval.  The ttlticks calculation can wind up 0 due to
1022                  * a bug in the handling of lwp_slptime (not yet found),
1023                  * so make sure we do not get a divide by 0 panic.
1024                  */
1025                 ttlticks = (cpbase - lp->lwp_cpbase) /
1026                            gd->gd_schedclock.periodic;
1027                 if (ttlticks < 0) {
1028                         ttlticks = 0;
1029                         lp->lwp_cpbase = cpbase;
1030                 }
1031                 if (ttlticks == 0)
1032                         return;
1033                 updatepcpu(lp, lp->lwp_cpticks, ttlticks);
1034
1035                 /*
1036                  * Calculate the percentage of one cpu used factoring in ncpus
1037                  * and the load and adjust estcpu.  Handle degenerate cases
1038                  * by adding 1 to bsd4_runqcount.
1039                  *
1040                  * estcpu is scaled by ESTCPUMAX.
1041                  *
1042                  * bsd4_runqcount is the excess number of user processes
1043                  * that cannot be immediately scheduled to cpus.  We want
1044                  * to count these as running to avoid range compression
1045                  * in the base calculation (which is the actual percentage
1046                  * of one cpu used).
1047                  */
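                /*
                 * Example: a lwp that consumed every one of its ttlticks
                 * on an otherwise idle single-cpu system (bsd4_runqcount
                 * 0, ncpus 1) computes estcpu == ESTCPUMAX, i.e. 100% of
                 * one cpu.
                 */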
1048                 estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
1049                          (bsd4_runqcount + ncpus) / (ncpus * ttlticks);
1050
1051                 /*
1052                  * If estcpu is > 50% we become more batch-like
1053                  * If estcpu is <= 50% we become less batch-like
1054                  *
1055                  * It takes 30 cpu seconds to traverse the entire range.
1056                  */
1057                 if (estcpu > ESTCPUMAX / 2) {
1058                         lp->lwp_batch += ttlticks;
1059                         if (lp->lwp_batch > BATCHMAX)
1060                                 lp->lwp_batch = BATCHMAX;
1061                 } else {
1062                         lp->lwp_batch -= ttlticks;
1063                         if (lp->lwp_batch < 0)
1064                                 lp->lwp_batch = 0;
1065                 }
1066
1067                 if (usched_bsd4_debug == lp->lwp_proc->p_pid) {
1068                         kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
1069                                 lp->lwp_proc->p_pid, lp,
1070                                 estcpu, lp->lwp_estcpu,
1071                                 lp->lwp_batch,
1072                                 lp->lwp_cpticks, ttlticks);
1073                 }
1074
1075                 /*
1076                  * Adjust lp->lwp_estcpu.  The decay factor determines how
1077                  * quickly lwp_estcpu collapses to its realtime calculation.
1078                  * A slower collapse gives us a more accurate number but
1079                  * can cause a cpu hog to eat too much cpu before the
1080                  * scheduler decides to downgrade it.
1081                  *
1082                  * NOTE: p_nice is accounted for in bsd4_resetpriority(),
1083                  *       and not here, but we must still ensure that a
1084                  *       cpu-bound nice -20 process does not completely
1085                  *       override a cpu-bound nice +20 process.
1086                  *
1087                  * NOTE: We must use ESTCPULIM() here to deal with any
1088                  *       overshoot.
1089                  */
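                /*
                 * With the default decay factor of 8 this moves
                 * lwp_estcpu 1/9th of the way toward the instantaneous
                 * estcpu on each recalculation.
                 */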
1090                 decay_factor = usched_bsd4_decay;
1091                 if (decay_factor < 1)
1092                         decay_factor = 1;
1093                 if (decay_factor > 1024)
1094                         decay_factor = 1024;
1095
1096                 lp->lwp_estcpu = ESTCPULIM(
1097                         (lp->lwp_estcpu * decay_factor + estcpu) /
1098                         (decay_factor + 1));
1099
1100                 if (usched_bsd4_debug == lp->lwp_proc->p_pid)
1101                         kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
1102                 bsd4_resetpriority(lp);
1103                 lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
1104                 lp->lwp_cpticks = 0;
1105         }
1106 }
1107
1108 /*
1109  * Compute the priority of a process when running in user mode.
1110  * Arrange to reschedule if the resulting priority is better
1111  * than that of the current process.
1112  *
1113  * This routine may be called with any process.
1114  *
1115  * This routine is called by fork1() for initial setup with the process
1116  * of the run queue, and also may be called normally with the process on or
1117  * off the run queue.
1118  *
1119  * MPSAFE
1120  */
1121 static void
1122 bsd4_resetpriority(struct lwp *lp)
1123 {
1124         bsd4_pcpu_t dd;
1125         int newpriority;
1126         u_short newrqtype;
1127         int reschedcpu;
1128         int checkpri;
1129         int estcpu;
1130
1131         /*
1132          * Calculate the new priority and queue type
1133          */
1134         crit_enter();
1135         spin_lock(&bsd4_spin);
1136
1137         newrqtype = lp->lwp_rtprio.type;
1138
1139         switch(newrqtype) {
1140         case RTP_PRIO_REALTIME:
1141         case RTP_PRIO_FIFO:
1142                 newpriority = PRIBASE_REALTIME +
1143                              (lp->lwp_rtprio.prio & PRIMASK);
1144                 break;
1145         case RTP_PRIO_NORMAL:
1146                 /*
1147                  * Detune estcpu based on batchiness.  lwp_batch ranges
1148                  * from 0 to  BATCHMAX.  Limit estcpu for the sake of
1149                  * the priority calculation to between 50% and 100%.
1150                  */
1151                 estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
1152                          (BATCHMAX * 2);
1153
1154                 /*
1155                  * p_nice piece         Adds (0-40) * 2         0-80
1156                  * estcpu               Adds 16384  * 4 / 512   0-128
1157                  */
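                /*
                 * Worked example: nice 0 contributes (20 * 4) / 2 == 40
                 * and a detuned estcpu of ESTCPUMAX / 2 contributes
                 * (8192 * 4) / 512 == 64, for a raw value of 104, which
                 * is then scaled by 128 / (82 + 128) and offset by
                 * PRIBASE_NORMAL.
                 */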
1158                 newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
1159                 newpriority += estcpu * PPQ / ESTCPUPPQ;
1160                 newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
1161                               NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
1162                 newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
1163                 break;
1164         case RTP_PRIO_IDLE:
1165                 newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
1166                 break;
1167         case RTP_PRIO_THREAD:
1168                 newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
1169                 break;
1170         default:
1171                 panic("Bad RTP_PRIO %d", newrqtype);
1172                 /* NOT REACHED */
1173         }
1174
1175         /*
1176          * The newpriority incorporates the queue type so do a simple masked
1177          * check to determine if the process has moved to another queue.  If
1178          * it has, and it is currently on a run queue, then move it.
1179          */
1180         if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
1181                 lp->lwp_priority = newpriority;
1182                 if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
1183                         bsd4_remrunqueue_locked(lp);
1184                         lp->lwp_rqtype = newrqtype;
1185                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1186                         bsd4_setrunqueue_locked(lp);
1187                         checkpri = 1;
1188                 } else {
1189                         lp->lwp_rqtype = newrqtype;
1190                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1191                         checkpri = 0;
1192                 }
1193                 reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
1194         } else {
1195                 lp->lwp_priority = newpriority;
1196                 reschedcpu = -1;
1197                 checkpri = 1;
1198         }
1199
1200         /*
1201          * Determine if we need to reschedule the target cpu.  This only
1202          * occurs if the LWP is already on a scheduler queue, which means
1203          * that idle cpu notification has already occurred.  At most we
1204          * need only issue a need_user_resched() on the appropriate cpu.
1205          *
1206          * The LWP may be owned by a CPU different from the current one,
1207          * in which case dd->uschedcp may be modified without an MP lock
1208          * or a spinlock held.  The worst that happens is that the code
1209          * below causes a spurious need_user_resched() on the target CPU
1210          * and dd->upri to be wrong for a short period of time, both of
1211          * which are harmless.
1212          *
1213          * If checkpri is 0 we are adjusting the priority of the current
1214          * process, possibly higher (less desirable), so ignore the upri
1215          * check which will fail in that case.
1216          */
1217         if (reschedcpu >= 0) {
1218                 dd = &bsd4_pcpu[reschedcpu];
1219                 if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
1220                     (checkpri == 0 ||
1221                      (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
1222 #ifdef SMP
1223                         if (reschedcpu == mycpu->gd_cpuid) {
1224                                 spin_unlock(&bsd4_spin);
1225                                 need_user_resched();
1226                         } else {
1227                                 spin_unlock(&bsd4_spin);
1228                                 atomic_clear_cpumask(&bsd4_rdyprocmask,
1229                                                      CPUMASK(reschedcpu));
1230                                 lwkt_send_ipiq(lp->lwp_thread->td_gd,
1231                                                need_user_resched_remote, NULL);
1232                         }
1233 #else
1234                         spin_unlock(&bsd4_spin);
1235                         need_user_resched();
1236 #endif
1237                 } else {
1238                         spin_unlock(&bsd4_spin);
1239                 }
1240         } else {
1241                 spin_unlock(&bsd4_spin);
1242         }
1243         crit_exit();
1244 }
1245
1246 /*
1247  * MPSAFE
1248  */
1249 static
1250 void
1251 bsd4_yield(struct lwp *lp)
1252 {
1253 #if 0
1254         /* FUTURE (or something similar) */
1255         switch(lp->lwp_rqtype) {
1256         case RTP_PRIO_NORMAL:
1257                 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
1258                 break;
1259         default:
1260                 break;
1261         }
1262 #endif
1263         need_user_resched();
1264 }
1265
1266 /*
1267  * Called from fork1() when a new child process is being created.
1268  *
1269  * Give the child process an initial estcpu that is more batchy than
1270  * its parent and dock the parent for the fork (but do not
1271  * reschedule the parent).   This comprises the main part of our batch
1272  * detection heuristic for both parallel forking and sequential execs.
1273  *
1274  * XXX lwp should be "spawning" instead of "forking"
1275  *
1276  * MPSAFE
1277  */
1278 static void
1279 bsd4_forking(struct lwp *plp, struct lwp *lp)
1280 {
1281         /*
1282          * Put the child 4 queue slots (out of 32) higher than the parent
1283          * (less desirable than the parent).
1284          */
1285         lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);
1286
1287         /*
1288          * The batch status of children always starts out centerline
1289          * and will inch up or inch down as appropriate.  It takes roughly
1290          * 15 seconds of >50% cpu to hit the limit.
1291          */
1292         lp->lwp_batch = BATCHMAX / 2;
1293
1294         /*
1295          * Dock the parent a cost for the fork, protecting us from fork
1296          * bombs.  If the parent is forking quickly make the child more
1297          * batchy.
1298          */
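        /*
         * ESTCPUPPQ / 16 is 32 estcpu per fork; since one queue spans
         * ESTCPUPPQ (512) estcpu, roughly 16 rapid forks push the
         * parent (and thus its subsequent children) down by a full
         * run queue.
         */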
1299         plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
1300 }
1301
1302 /*
1303  * Called when a parent waits for a child.
1304  *
1305  * MPSAFE
1306  */
1307 static void
1308 bsd4_exiting(struct lwp *lp, struct proc *child_proc)
1309 {
1310 }
1311
1312 /*
1313  * chooseproc() is called when a cpu needs a user process to LWKT schedule,
1314  * it selects a user process and returns it.  If chklp is non-NULL and chklp
1315  * has a better or equal priority than the process that would otherwise be
1316  * chosen, NULL is returned.
1317  *
1318  * Until we fix the RUNQ code the chklp test has to be strict or we may
1319  * bounce between processes trying to acquire the current process designation.
1320  *
1321  * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
1322  *          left intact through the entire routine.
1323  */
1324 static
1325 struct lwp *
1326 chooseproc_locked(struct lwp *chklp)
1327 {
1328         struct lwp *lp;
1329         struct rq *q;
1330         u_int32_t *which, *which2;
1331         u_int32_t pri;
1332         u_int32_t rtqbits;
1333         u_int32_t tsqbits;
1334         u_int32_t idqbits;
1335         cpumask_t cpumask;
1336
1337         rtqbits = bsd4_rtqueuebits;
1338         tsqbits = bsd4_queuebits;
1339         idqbits = bsd4_idqueuebits;
1340         cpumask = mycpu->gd_cpumask;
1341
1342
1343 #ifdef SMP
1344 again:
1345 #endif
1346         if (rtqbits) {
1347                 pri = bsfl(rtqbits);
1348                 q = &bsd4_rtqueues[pri];
1349                 which = &bsd4_rtqueuebits;
1350                 which2 = &rtqbits;
1351         } else if (tsqbits) {
1352                 pri = bsfl(tsqbits);
1353                 q = &bsd4_queues[pri];
1354                 which = &bsd4_queuebits;
1355                 which2 = &tsqbits;
1356         } else if (idqbits) {
1357                 pri = bsfl(idqbits);
1358                 q = &bsd4_idqueues[pri];
1359                 which = &bsd4_idqueuebits;
1360                 which2 = &idqbits;
1361         } else {
1362                 return NULL;
1363         }
1364         lp = TAILQ_FIRST(q);
1365         KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1366
1367 #ifdef SMP
1368         while ((lp->lwp_cpumask & cpumask) == 0) {
1369                 lp = TAILQ_NEXT(lp, lwp_procq);
1370                 if (lp == NULL) {
1371                         *which2 &= ~(1 << pri);
1372                         goto again;
1373                 }
1374         }
1375 #endif
1376
1377         /*
1378          * If the passed lwp <chklp> is reasonably close to the selected
1379          * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1380          *
1381          * Note that we must err on the side of <chklp> to avoid bouncing
1382          * between threads in the acquire code.
1383          */
1384         if (chklp) {
1385                 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1386                         return(NULL);
1387         }
1388
1389 #ifdef SMP
1390         /*
1391          * If the chosen lwp does not reside on this cpu spend a few
1392          * cycles looking for a better candidate at the same priority level.
1393          * This is a fallback check; setrunqueue() tries to wake up the
1394          * correct cpu and is our front-line affinity mechanism.
1395          */
1396         if (lp->lwp_thread->td_gd != mycpu &&
1397             (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
1398         ) {
1399                 if (chklp->lwp_thread->td_gd == mycpu) {
1400                         ++choose_affinity;
1401                         lp = chklp;
1402                 }
1403         }
1404 #endif
1405
1406         KTR_COND_LOG(usched_chooseproc,
1407             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1408             lp->lwp_proc->p_pid,
1409             lp->lwp_thread->td_gd->gd_cpuid,
1410             mycpu->gd_cpuid);
1411
1412         TAILQ_REMOVE(q, lp, lwp_procq);
1413         --bsd4_runqcount;
1414         if (TAILQ_EMPTY(q))
1415                 *which &= ~(1 << pri);
1416         KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1417         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1418         return lp;
1419 }
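
#if 0
/*
 * Illustrative sketch only (kept under #if 0, not compiled): a portable
 * stand-in for the bsfl()-based queue selection used above.  Given a
 * 32-bit occupancy mask with one bit per run queue, the lowest set bit
 * indexes the best non-empty queue because lower queue indices hold
 * better priorities.
 */
static int
example_find_best_queue(u_int32_t qbits)
{
        int bit;

        for (bit = 0; bit < NQS; ++bit) {
                if (qbits & (1U << bit))
                        return (bit);   /* best-priority non-empty queue */
        }
        return (-1);                    /* no runnable lwps queued */
}
#endif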
1420
1421 #ifdef SMP
1422 /*
1423  * chooseproc() - with a cache coherence heuristic.  Try to pull a process
1424  * that has its home on the current cpu.  If the process does not have its
1425  * home here and is a batchy one (see batchy_looser_pri_test), we can wait
1426  * a sched_tick; maybe its home cpu will become free and pull the process
1427  * in itself.  We cannot wait more than one tick, however: once that tick
1428  * has expired we pull the process in regardless.
1429  */
1430 static
1431 struct lwp *
1432 chooseproc_locked_cache_coherent(struct lwp *chklp)
1433 {
1434         struct lwp *lp;
1435         struct rq *q;
1436         u_int32_t *which, *which2;
1437         u_int32_t pri;
1438         u_int32_t checks;
1439         u_int32_t rtqbits;
1440         u_int32_t tsqbits;
1441         u_int32_t idqbits;
1442         cpumask_t cpumask;
1443
1444         struct lwp *min_level_lwp = NULL;
1445         struct rq *min_q = NULL;
1446         cpumask_t siblings;
1447         cpu_node_t *cpunode = NULL;
1448         u_int32_t min_level = MAXCPU;   /* number of levels < MAXCPU */
1449         u_int32_t *min_which = NULL;
1450         u_int32_t min_pri = 0;
1451         u_int32_t level = 0;
1452
1453         rtqbits = bsd4_rtqueuebits;
1454         tsqbits = bsd4_queuebits;
1455         idqbits = bsd4_idqueuebits;
1456         cpumask = mycpu->gd_cpumask;
1457
1458         /* Get the mask corresponding to the sysctl-configured level */
1459         cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode;
1460         level = usched_bsd4_stick_to_level;
1461         while (level) {
1462                 cpunode = cpunode->parent_node;
1463                 level--;
1464         }
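        /*
         * cpunode now references the topology node usched_bsd4_stick_to_level
         * levels above this cpu (level 0 being the cpu's own node).
         */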
1465         /* The cpus which can elect a process */
1466         siblings = cpunode->members;
1467
1468 again:
1469         if (rtqbits) {
1470                 pri = bsfl(rtqbits);
1471                 q = &bsd4_rtqueues[pri];
1472                 which = &bsd4_rtqueuebits;
1473                 which2 = &rtqbits;
1474         } else if (tsqbits) {
1475                 pri = bsfl(tsqbits);
1476                 q = &bsd4_queues[pri];
1477                 which = &bsd4_queuebits;
1478                 which2 = &tsqbits;
1479         } else if (idqbits) {
1480                 pri = bsfl(idqbits);
1481                 q = &bsd4_idqueues[pri];
1482                 which = &bsd4_idqueuebits;
1483                 which2 = &idqbits;
1484         } else {
1485                 return NULL;
1486         }
1487         lp = TAILQ_FIRST(q);
1488         KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1489
1490         /* Limit the number of checks per queue to a configurable value
1491          * to minimize contention (we are in a locked region).
1492          */
1493         for (checks = 0; checks < usched_bsd4_queue_checks; checks++) {
1494
1495                 if ((lp->lwp_cpumask & cpumask) == 0 ||
1496                     ((siblings & lp->lwp_thread->td_gd->gd_cpumask) == 0 &&
1497                       batchy_looser_pri_test(lp) &&
1498                       (lp->lwp_setrunqueue_ticks == sched_ticks ||
1499                        lp->lwp_setrunqueue_ticks == (int)(sched_ticks - 1)))) {
1500
1501                         KTR_COND_LOG(usched_chooseproc_cc_not_good,
1502                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1503                             lp->lwp_proc->p_pid,
1504                             lp->lwp_thread->td_gd->gd_cpumask,
1505                             siblings,
1506                             cpumask);
1507
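                        /*
                         * Compute the candidate's cache distance: climb its
                         * topology tree until a node's member mask includes
                         * this cpu, counting the levels climbed.  The closest
                         * deferred candidate is remembered as a fallback.
                         */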
1508                         cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode;
1509                         level = 0;
1510                         while (cpunode) {
1511                                 if (cpunode->members & cpumask) {
1512                                         break;
1513                                 }
1514                                 cpunode = cpunode->parent_node;
1515                                 level++;
1516                         }
1517                         if (level < min_level) {
1518                                 min_level_lwp = lp;
1519                                 min_level = level;
1520                                 min_q = q;
1521                                 min_which = which;
1522                                 min_pri = pri;
1523                         }
1524
1525                         lp = TAILQ_NEXT(lp, lwp_procq);
1526                         if (lp == NULL) {
1527                                 *which2 &= ~(1 << pri);
1528                                 goto again;
1529                         }
1530                 } else {
1531                         KTR_COND_LOG(usched_chooseproc_cc_elected,
1532                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1533                             lp->lwp_proc->p_pid,
1534                             lp->lwp_thread->td_gd->gd_cpumask,
1535                             siblings,
1536                             cpumask);
1537
1538                         goto found;
1539                 }
1540         }
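        /*
         * We ran out of checks without electing anyone outright; fall back
         * to the closest (smallest cache distance) candidate recorded above.
         */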
1541         lp = min_level_lwp;
1542         q = min_q;
1543         which = min_which;
1544         pri = min_pri;
1545         KASSERT(lp, ("chooseproc: at least the first lp was good"));
1546
1547 found:
1548
1549         /*
1550          * If the passed lwp <chklp> is reasonably close to the selected
1551          * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1552          *
1553          * Note that we must err on the side of <chklp> to avoid bouncing
1554          * between threads in the acquire code.
1555          */
1556         if (chklp) {
1557                 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1558                         return(NULL);
1559         }
1560
1561         KTR_COND_LOG(usched_chooseproc_cc,
1562             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1563             lp->lwp_proc->p_pid,
1564             lp->lwp_thread->td_gd->gd_cpuid,
1565             mycpu->gd_cpuid);
1566
1567         TAILQ_REMOVE(q, lp, lwp_procq);
1568         --bsd4_runqcount;
1569         if (TAILQ_EMPTY(q))
1570                 *which &= ~(1 << pri);
1571         KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1572         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1573         return lp;
1574 }
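
#if 0
/*
 * Illustrative sketch only (kept under #if 0, not compiled): the
 * cache-distance walk used in chooseproc_locked_cache_coherent() above.
 * Starting from a candidate's home topology node, count how many parent
 * levels must be climbed before the node's member mask covers the
 * current cpu.  Smaller is "closer".
 */
static int
example_topology_distance(cpu_node_t *node, cpumask_t here)
{
        int level = 0;

        while (node && (node->members & here) == 0) {
                node = node->parent_node;
                ++level;
        }
        return (node ? level : -1);     /* -1: cpu not found in the tree */
}
#endif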
1575
1576
1577 static
1578 void
1579 need_user_resched_remote(void *dummy)
1580 {
1581         globaldata_t gd = mycpu;
1582         bsd4_pcpu_t  dd = &bsd4_pcpu[gd->gd_cpuid];
1583
1584         need_user_resched();
1585         wakeup(&dd->helper_thread);
1586 }
1587
1588 #endif
1589
1590 /*
1591  * bsd4_remrunqueue_locked() removes a given process from the run queue
1592  * that it is on, clearing the queue busy bit if it becomes empty.
1593  *
1594  * Note that the user process scheduler is different from the LWKT scheduler.
1595  * The user process scheduler only manages user processes but it uses LWKT
1596  * underneath, and a user process operating in the kernel will often be
1597  * 'released' from our management.
1598  *
1599  * MPSAFE - bsd4_spin must be held exclusively on call
1600  */
1601 static void
1602 bsd4_remrunqueue_locked(struct lwp *lp)
1603 {
1604         struct rq *q;
1605         u_int32_t *which;
1606         u_int8_t pri;
1607
1608         KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
1609         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1610         --bsd4_runqcount;
1611         KKASSERT(bsd4_runqcount >= 0);
1612
1613         pri = lp->lwp_rqindex;
1614         switch(lp->lwp_rqtype) {
1615         case RTP_PRIO_NORMAL:
1616                 q = &bsd4_queues[pri];
1617                 which = &bsd4_queuebits;
1618                 break;
1619         case RTP_PRIO_REALTIME:
1620         case RTP_PRIO_FIFO:
1621                 q = &bsd4_rtqueues[pri];
1622                 which = &bsd4_rtqueuebits;
1623                 break;
1624         case RTP_PRIO_IDLE:
1625                 q = &bsd4_idqueues[pri];
1626                 which = &bsd4_idqueuebits;
1627                 break;
1628         default:
1629                 panic("remrunqueue: invalid rtprio type");
1630                 /* NOT REACHED */
1631         }
1632         TAILQ_REMOVE(q, lp, lwp_procq);
1633         if (TAILQ_EMPTY(q)) {
1634                 KASSERT((*which & (1 << pri)) != 0,
1635                         ("remrunqueue: remove from empty queue"));
1636                 *which &= ~(1 << pri);
1637         }
1638 }
1639
1640 /*
1641  * bsd4_setrunqueue_locked()
1642  *
1643  * Add a process whose rqtype and rqindex have previously been calculated
1644  * onto the appropriate run queue.  The caller is responsible for deciding
1645  * whether the addition requires a reschedule and, if so, on which cpu.
1646  *
1647  * NOTE: Lower priorities are better priorities.
1648  *
1649  * MPSAFE - bsd4_spin must be held exclusively on call
1650  */
1651 static void
1652 bsd4_setrunqueue_locked(struct lwp *lp)
1653 {
1654         struct rq *q;
1655         u_int32_t *which;
1656         int pri;
1657
1658         KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
1659         atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1660         ++bsd4_runqcount;
1661
1662         pri = lp->lwp_rqindex;
1663
1664         switch(lp->lwp_rqtype) {
1665         case RTP_PRIO_NORMAL:
1666                 q = &bsd4_queues[pri];
1667                 which = &bsd4_queuebits;
1668                 break;
1669         case RTP_PRIO_REALTIME:
1670         case RTP_PRIO_FIFO:
1671                 q = &bsd4_rtqueues[pri];
1672                 which = &bsd4_rtqueuebits;
1673                 break;
1674         case RTP_PRIO_IDLE:
1675                 q = &bsd4_idqueues[pri];
1676                 which = &bsd4_idqueuebits;
1677                 break;
1678         default:
1679                 panic("setrunqueue: invalid rtprio type");
1680                 /* NOT REACHED */
1681         }
1682
1683         /*
1684          * Add to the correct queue and set the appropriate bit.  If no
1685          * lower priority (i.e. better) processes are in the queue then
1686          * we want a reschedule; calculate the best cpu for the job.
1687          *
1688          * Always run reschedules on the LWP's original cpu.
1689          */
1690         TAILQ_INSERT_TAIL(q, lp, lwp_procq);
1691         *which |= 1 << pri;
1692 }
1693
1694 #ifdef SMP
1695
1696 /*
1697  * For SMP systems a user scheduler helper thread is created for each
1698  * cpu and is used to allow one cpu to wakeup another for the purposes of
1699  * scheduling userland threads from setrunqueue().
1700  *
1701  * UP systems do not need the helper since there is only one cpu.
1702  *
1703  * We can't use the idle thread for this because we might block.
1704  * Additionally, doing things this way allows us to HLT idle cpus
1705  * on MP systems.
1706  *
1707  * MPSAFE
1708  */
1709 static void
1710 sched_thread(void *dummy)
1711 {
1712     globaldata_t gd;
1713     bsd4_pcpu_t  dd;
1714     bsd4_pcpu_t  tmpdd;
1715     struct lwp *nlp;
1716     cpumask_t mask;
1717     int cpuid;
1718 #ifdef SMP
1719     cpumask_t tmpmask;
1720     int tmpid;
1721 #endif
1722
1723     gd = mycpu;
1724     cpuid = gd->gd_cpuid;       /* doesn't change */
1725     mask = gd->gd_cpumask;      /* doesn't change */
1726     dd = &bsd4_pcpu[cpuid];
1727
1728     /*
1729      * Since we are woken up only when no user processes are scheduled
1730      * on a cpu, we can run at an ultra low priority.
1731      */
1732     lwkt_setpri_self(TDPRI_USER_SCHEDULER);
1733
1734     tsleep(&dd->helper_thread, PINTERLOCKED, "sched_thread_sleep", 0);
1735
1736     for (;;) {
1737 //again:
1738         /*
1739          * We use the tsleep interlock trick to avoid racing
1740          * bsd4_rdyprocmask.  This means we cannot block between the
1741          * tsleep_interlock() below and the tsleep() at the bottom of the loop.
1742          */
1743         crit_enter_gd(gd);
1744         //lwkt_deschedule_self(gd->gd_curthread);
1745         tsleep_interlock(&dd->helper_thread, 0);
1746         spin_lock(&bsd4_spin);
1747         atomic_set_cpumask(&bsd4_rdyprocmask, mask);
1748
1749         clear_user_resched();   /* This satisfies the reschedule request */
1750         dd->rrcount = 0;        /* Reset the round-robin counter */
1751
1752         if ((bsd4_curprocmask & mask) == 0) {
1753                 /*
1754                  * No thread is currently scheduled.
1755                  */
1756                 KKASSERT(dd->uschedcp == NULL);
1757                 if ((nlp = chooseproc_locked(NULL)) != NULL) {
1758
1759                         KTR_COND_LOG(usched_sched_thread_no_process,
1760                             nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1761                             gd->gd_cpuid,
1762                             nlp->lwp_proc->p_pid,
1763                             nlp->lwp_thread->td_gd->gd_cpuid);
1764
1765                         atomic_set_cpumask(&bsd4_curprocmask, mask);
1766                         dd->upri = nlp->lwp_priority;
1767                         dd->uschedcp = nlp;
1768                         spin_unlock(&bsd4_spin);
1769 #ifdef SMP
1770                         lwkt_acquire(nlp->lwp_thread);
1771 #endif
1772                         lwkt_schedule(nlp->lwp_thread);
1773                 } else {
1774                         spin_unlock(&bsd4_spin);
1775                 }
1776         } else if (bsd4_runqcount) {
1777                 if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
1778
1779                         KTR_COND_LOG(usched_sched_thread_process,
1780                             nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1781                             gd->gd_cpuid,
1782                             nlp->lwp_proc->p_pid,
1783                             nlp->lwp_thread->td_gd->gd_cpuid);
1784
1785                         dd->upri = nlp->lwp_priority;
1786                         dd->uschedcp = nlp;
1787                         spin_unlock(&bsd4_spin);
1788 #ifdef SMP
1789                         lwkt_acquire(nlp->lwp_thread);
1790 #endif
1791                         lwkt_schedule(nlp->lwp_thread);
1792                 } else {
1793                         /*
1794                          * CHAINING CONDITION TRAIN
1795                          *
1796                          * We could not deal with the scheduler wakeup
1797                          * request on this cpu, locate a ready scheduler
1798                          * with no current lp assignment and chain to it.
1799                          *
1800                          * This ensures that a wakeup race which fails due
1801                          * to the priority test does not leave other unscheduled
1802                          * cpus idle when the runqueue is not empty.
1803                          */
1804                         tmpmask = ~bsd4_curprocmask &
1805                             bsd4_rdyprocmask & smp_active_mask;
1806                         if (tmpmask) {
1807                                 tmpid = BSFCPUMASK(tmpmask);
1808                                 tmpdd = &bsd4_pcpu[tmpid];
1809                                 atomic_clear_cpumask(&bsd4_rdyprocmask,
1810                                     CPUMASK(tmpid));
1811                                 spin_unlock(&bsd4_spin);
1812                                 wakeup(&tmpdd->helper_thread);
1813                         } else {
1814                                 spin_unlock(&bsd4_spin);
1815                         }
1816
1817                         KTR_LOG(usched_sched_thread_no_process_found,
1818                             gd->gd_cpuid,
1819                             tmpmask);
1820                 }
1821         } else {
1822                 /*
1823                  * The runq is empty.
1824                  */
1825                 spin_unlock(&bsd4_spin);
1826         }
1827
1828         /*
1829          * We're descheduled unless someone scheduled us.  Switch away.
1830          * Exiting the critical section will cause splz() to be called
1831          * for us if interrupts and such are pending.
1832          */
1833         crit_exit_gd(gd);
1834         tsleep(&dd->helper_thread, PINTERLOCKED, "sched_thread_sleep", 0);
1835 //      lwkt_switch();
1836     }
1837 }
1838
1839 /* sysctl stick_to_level parameter */
1840 static int
1841 sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS)
1842 {
1843         int error, new_val;
1844
1845         new_val = usched_bsd4_stick_to_level;
1846
1847         error = sysctl_handle_int(oidp, &new_val, 0, req);
1848         if (error != 0 || req->newptr == NULL)
1849                 return (error);
1850         if (new_val > cpu_topology_levels_number - 1 ||
1851             new_val < 0)
1852                 return (EINVAL);
1853         usched_bsd4_stick_to_level = new_val;
1854         return (0);
1855 }
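
/*
 * Example (assuming the sysctl tree built in sched_thread_cpu_init()
 * below): the level can be inspected and tuned at runtime with
 *
 *      sysctl kern.usched_bsd4.stick_to_level
 *      sysctl kern.usched_bsd4.stick_to_level=1
 *
 * Values outside [0, cpu_topology_levels_number - 1] are rejected with
 * EINVAL by the handler above.
 */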
1856
1857 /*
1858  * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
1859  * been cleared by rqinit() and we should not mess with it further.
1860  */
1861 static void
1862 sched_thread_cpu_init(void)
1863 {
1864         int i;
1865         int cpuid;
1866         int smt_not_supported = 0;
1867         int cache_coherent_not_supported = 0;
1868         if (bootverbose)
1869                 kprintf("Start scheduler helpers on cpus:\n");
1870
1871         sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
1872         usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
1873             SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
1874             "usched_bsd4", CTLFLAG_RD, 0, "");
1875
1876         for (i = 0; i < ncpus; ++i) {
1877                 bsd4_pcpu_t dd = &bsd4_pcpu[i];
1878                 cpumask_t mask = CPUMASK(i);
1879
1880                 if ((mask & smp_active_mask) == 0)
1881                     continue;
1882
1883                 dd->cpunode = get_cpu_node_by_cpuid(i);
1884
1885                 if (dd->cpunode == NULL) {
1886                         smt_not_supported = 1;
1887                         cache_coherent_not_supported = 1;
1888                         if (bootverbose)
1889                                 kprintf ("\tcpu%d - WARNING: No CPU NODE found for cpu\n", i);
1890
1891                 } else {
1892
1893                         switch (dd->cpunode->type) {
1894                                 case THREAD_LEVEL:
1895                                         if (bootverbose)
1896                                                 kprintf ("\tcpu%d - HyperThreading available. "
1897                                                     "Core siblings: ", i);
1898                                         break;
1899                                 case CORE_LEVEL:
1900                                         smt_not_supported = 1;
1901
1902                                         if (bootverbose)
1903                                                 kprintf ("\tcpu%d - No HT available, multi-core/physical "
1904                                                     "cpu. Physical siblings: ", i);
1905                                         break;
1906                                 case CHIP_LEVEL:
1907                                         smt_not_supported = 1;
1908
1909                                         if (bootverbose)
1910                                                 kprintf ("\tcpu%d - No HT available, single-core/physical cpu. "
1911                                                     "Package Siblings: ", i);
1912                                         break;
1913                                 default:
1914                                         if (bootverbose)
1915                                                 kprintf ("\tcpu%d - Unknown cpunode->type. Siblings: ", i);
1916                                         break;
1917                         }
1918
1919                         if (bootverbose) {
1920                                 if (dd->cpunode->parent_node != NULL) {
1921                                         CPUSET_FOREACH(cpuid, dd->cpunode->parent_node->members)
1922                                                 kprintf("cpu%d ", cpuid);
1923                                         kprintf("\n");
1924                                 } else {
1925                                         kprintf(" no siblings\n");
1926                                 }
1927                         }
1928                 }
1929
1930                 lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
1931                     0, i, "usched %d", i);
1932
1933                 /*
1934                  * Allow user scheduling on the target cpu.  cpu #0 has already
1935                  * been enabled in rqinit().
1936                  */
1937                 if (i)
1938                     atomic_clear_cpumask(&bsd4_curprocmask, mask);
1939                 atomic_set_cpumask(&bsd4_rdyprocmask, mask);
1940                 dd->upri = PRIBASE_NULL;
1941
1942         }
1943
1944         /* usched_bsd4 sysctl configurable parameters */
1945
1946         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1947             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1948             OID_AUTO, "rrinterval", CTLFLAG_RW,
1949             &usched_bsd4_rrinterval, 0, "");
1950         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1951             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1952             OID_AUTO, "decay", CTLFLAG_RW,
1953             &usched_bsd4_decay, 0, "Extra decay when not running");
1954         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1955             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1956             OID_AUTO, "batch_time", CTLFLAG_RW,
1957             &usched_bsd4_batch_time, 0, "Minimum batch counter value");
1958
1959         /* Add enable/disable option for SMT scheduling if supported */
1960         if (smt_not_supported) {
1961                 usched_bsd4_smt = 0;
1962                 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
1963                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1964                     OID_AUTO, "smt", CTLFLAG_RD,
1965                     "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
1966         } else {
1967                 usched_bsd4_smt = 1;
1968                 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1969                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1970                     OID_AUTO, "smt", CTLFLAG_RW,
1971                     &usched_bsd4_smt, 0, "Enable/Disable SMT scheduling");
1972
1973         }
1974
1975         /* Add enable/disable option for cache coherent scheduling if supported */
1976         if (cache_coherent_not_supported) {
1977 #ifdef SMP
1978                 usched_bsd4_cache_coherent = 0;
1979                 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
1980                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1981                     OID_AUTO, "cache_coherent", CTLFLAG_RD,
1982                     "NOT SUPPORTED", 0, "Cache coherence NOT SUPPORTED");
1983 #endif
1984         } else {
1985 #ifdef SMP
1986                 usched_bsd4_cache_coherent = 1;
1987                 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1988                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1989                     OID_AUTO, "cache_coherent", CTLFLAG_RW,
1990                     &usched_bsd4_cache_coherent, 0,
1991                     "Enable/Disable cache coherent scheduling");
1992 #endif
1993
1994                 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1995                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1996                     OID_AUTO, "upri_affinity", CTLFLAG_RW,
1997                     &usched_bsd4_upri_affinity, 1,
1998                     "Number of PPQs in user priority check");
1999
2000                 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2001                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2002                     OID_AUTO, "queue_checks", CTLFLAG_RW,
2003                     &usched_bsd4_queue_checks, 5,
2004                     "Number of LWPs to check from a queue before giving up");
2005
2006                 SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx,
2007                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2008                     OID_AUTO, "stick_to_level", CTLTYPE_INT | CTLFLAG_RW,
2009                     NULL, sizeof usched_bsd4_stick_to_level,
2010                     sysctl_usched_bsd4_stick_to_level, "I",
2011                     "Stick a process to this level. See sysctl "
2012                     "parameter hw.cpu_topology.level_description");
2013         }
2014 }
2015 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2016         sched_thread_cpu_init, NULL)
2017 #else /* No SMP options - just add the configurable parameters to sysctl */
2018
2019 static void
2020 sched_sysctl_tree_init(void)
2021 {
2022         sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
2023         usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
2024             SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
2025             "usched_bsd4", CTLFLAG_RD, 0, "");
2026
2027         /* usched_bsd4 sysctl configurable parameters */
2028         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2029             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2030             OID_AUTO, "rrinterval", CTLFLAG_RW,
2031             &usched_bsd4_rrinterval, 0, "");
2032         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2033             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2034             OID_AUTO, "decay", CTLFLAG_RW,
2035             &usched_bsd4_decay, 0, "Extra decay when not running");
2036         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2037             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2038             OID_AUTO, "batch_time", CTLFLAG_RW,
2039             &usched_bsd4_batch_time, 0, "Minimum batch counter value");
2040 }
2041 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2042         sched_sysctl_tree_init, NULL)
2043 #endif
2044