1 /*
2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/kernel.h>
30 #include <sys/lock.h>
31 #include <sys/queue.h>
32 #include <sys/proc.h>
33 #include <sys/rtprio.h>
34 #include <sys/uio.h>
35 #include <sys/sysctl.h>
36 #include <sys/resourcevar.h>
37 #include <sys/spinlock.h>
38 #include <sys/cpu_topology.h>
39 #include <sys/thread2.h>
40 #include <sys/spinlock2.h>
41 #include <sys/mplock2.h>
42
43 #include <sys/ktr.h>
44
45 #include <machine/cpu.h>
46 #include <machine/smp.h>
47
48 /*
49  * Priorities.  Note that with 32 run queues per scheduler each queue
50  * represents four priority levels.
51  */
52
53 #define MAXPRI                  128
54 #define PRIMASK                 (MAXPRI - 1)
55 #define PRIBASE_REALTIME        0
56 #define PRIBASE_NORMAL          MAXPRI
57 #define PRIBASE_IDLE            (MAXPRI * 2)
58 #define PRIBASE_THREAD          (MAXPRI * 3)
59 #define PRIBASE_NULL            (MAXPRI * 4)
60
61 #define NQS     32                      /* 32 run queues. */
62 #define PPQ     (MAXPRI / NQS)          /* priorities per queue */
63 #define PPQMASK (PPQ - 1)
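/*
 * With MAXPRI 128 and NQS 32, PPQ is 4: priorities within the same
 * block of PPQ values map onto the same run queue, which is why the
 * priority comparisons below mask with ~PPQMASK to compare at queue
 * granularity.
 */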
64
65 /*
66  * NICEPPQ      - number of nice units per priority queue
67  *
68  * ESTCPUPPQ    - number of estcpu units per priority queue
69  * ESTCPUMAX    - number of estcpu units
70  */
71 #define NICEPPQ         2
72 #define ESTCPUPPQ       512
73 #define ESTCPUMAX       (ESTCPUPPQ * NQS)
74 #define BATCHMAX        (ESTCPUFREQ * 30)
75 #define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)
76
77 #define ESTCPULIM(v)    min((v), ESTCPUMAX)
78
79 TAILQ_HEAD(rq, lwp);
80
81 #define lwp_priority    lwp_usdata.bsd4.priority
82 #define lwp_rqindex     lwp_usdata.bsd4.rqindex
83 #define lwp_estcpu      lwp_usdata.bsd4.estcpu
84 #define lwp_batch       lwp_usdata.bsd4.batch
85 #define lwp_rqtype      lwp_usdata.bsd4.rqtype
86
87 static void bsd4_acquire_curproc(struct lwp *lp);
88 static void bsd4_release_curproc(struct lwp *lp);
89 static void bsd4_select_curproc(globaldata_t gd);
90 static void bsd4_setrunqueue(struct lwp *lp);
91 static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
92                                 sysclock_t cpstamp);
93 static void bsd4_recalculate_estcpu(struct lwp *lp);
94 static void bsd4_resetpriority(struct lwp *lp);
95 static void bsd4_forking(struct lwp *plp, struct lwp *lp);
96 static void bsd4_exiting(struct lwp *lp, struct proc *);
97 static void bsd4_yield(struct lwp *lp);
98
99 #ifdef SMP
100 static void need_user_resched_remote(void *dummy);
101 static int batchy_looser_pri_test(struct lwp* lp);
102 static struct lwp *chooseproc_locked_cache_coherent(struct lwp *chklp);
103 #endif
104 static struct lwp *chooseproc_locked(struct lwp *chklp);
105 static void bsd4_remrunqueue_locked(struct lwp *lp);
106 static void bsd4_setrunqueue_locked(struct lwp *lp);
107
108 struct usched usched_bsd4 = {
109         { NULL },
110         "bsd4", "Original DragonFly Scheduler",
111         NULL,                   /* default registration */
112         NULL,                   /* default deregistration */
113         bsd4_acquire_curproc,
114         bsd4_release_curproc,
115         bsd4_setrunqueue,
116         bsd4_schedulerclock,
117         bsd4_recalculate_estcpu,
118         bsd4_resetpriority,
119         bsd4_forking,
120         bsd4_exiting,
121         NULL,                   /* setcpumask not supported */
122         bsd4_yield
123 };
124
125 struct usched_bsd4_pcpu {
126         struct thread   helper_thread;
127         short           rrcount;
128         short           upri;
129         struct lwp      *uschedcp;
130         struct lwp      *old_uschedcp;
131 #ifdef SMP
132         cpu_node_t      *cpunode;
133 #endif
134 };
135
136 typedef struct usched_bsd4_pcpu *bsd4_pcpu_t;
137
138 /*
139  * We have NQS (32) run queues per scheduling class.  For the normal
140  * class, there are 128 priorities scaled onto these 32 queues.  New
141  * processes are added to the last entry in each queue, and processes
142  * are selected for running by taking them from the head and maintaining
143  * a simple FIFO arrangement.  Realtime and Idle priority processes have
144  * an explicit 0-31 priority which maps directly onto their class queue
145  * index.  When a queue has something in it, the corresponding bit is
146  * set in the queuebits variable, allowing a single read to determine
147  * the state of all 32 queues and then a ffs() to find the first busy
148  * queue.
149  */
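/*
 * Example: if only normal-class queues 3 and 9 are occupied then
 * bsd4_queuebits is (1 << 3) | (1 << 9) and bsfl(bsd4_queuebits)
 * returns 3, selecting bsd4_queues[3] as the first non-empty queue.
 */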
150 static struct rq bsd4_queues[NQS];
151 static struct rq bsd4_rtqueues[NQS];
152 static struct rq bsd4_idqueues[NQS];
153 static u_int32_t bsd4_queuebits;
154 static u_int32_t bsd4_rtqueuebits;
155 static u_int32_t bsd4_idqueuebits;
156 static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */
157 static cpumask_t bsd4_rdyprocmask;      /* ready to accept a user process */
158 static int       bsd4_runqcount;
159 #ifdef SMP
160 static volatile int bsd4_scancpu;
161 #endif
162 static struct spinlock bsd4_spin;
163 static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];
164 static struct sysctl_ctx_list usched_bsd4_sysctl_ctx;
165 static struct sysctl_oid *usched_bsd4_sysctl_tree;
166
167 /* Debug info exposed through debug.* sysctl */
168
169 SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0,
170     "Number of run queues");
171 #ifdef INVARIANTS
172 static int usched_nonoptimal;
173 SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
174         &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
175 static int usched_optimal;
176 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
177         &usched_optimal, 0, "acquire_curproc() was optimal");
178 #endif
179
180 static int usched_bsd4_debug = -1;
181 SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_bsd4_debug, 0,
182     "Print debug information for this pid");
183 static int usched_bsd4_pid_debug = -1;
184 SYSCTL_INT(_debug, OID_AUTO, pid_debug, CTLFLAG_RW, &usched_bsd4_pid_debug, 0,
185     "Print KTR debug information for this pid");
186
187 #ifdef SMP
188 static int remote_resched_nonaffinity;
189 static int remote_resched_affinity;
190 static int choose_affinity;
191 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
192         &remote_resched_nonaffinity, 0, "Number of remote rescheds without affinity");
193 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
194         &remote_resched_affinity, 0, "Number of remote rescheds with affinity");
195 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
196         &choose_affinity, 0, "chooseproc() was smart");
197 #endif
198
199
200 /* Tuning usched_bsd4 - configurable through kern.usched_bsd4.* */
201 #ifdef SMP
202 static int usched_bsd4_smt = 0;
203 static int usched_bsd4_cache_coherent = 0;
204 static int usched_bsd4_upri_affinity = 16; /* 32 queues - half-way */
205 static int usched_bsd4_queue_checks = 5;
206 static int usched_bsd4_stick_to_level = 0;
207 #endif
208 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
209 static int usched_bsd4_decay = 8;
210 static int usched_bsd4_batch_time = 10;
211
212 /* KTR debug printings */
213
214 KTR_INFO_MASTER(usched);
215
216 #if !defined(KTR_USCHED_BSD4)
217 #define KTR_USCHED_BSD4 KTR_ALL
218 #endif
219
220 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0,
221     "USCHED_BSD4(bsd4_acquire_curproc in user_reseched_wanted "
222     "after release: pid %d, cpuid %d, curr_cpuid %d)",
223     pid_t pid, int cpuid, int curr);
224 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0,
225     "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, "
226     "curr_cpuid %d)",
227     pid_t pid, int cpuid, int curr);
228 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0,
229     "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after "
230     "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)",
231     pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid);
232 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0,
233     "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, "
234     "cpuid %d, curr_cpuid %d)",
235     pid_t pid, int cpuid, int curr);
236
237 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0,
238     "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
239     "cpuid %d, curr_cpuid %d)",
240     pid_t pid, int cpuid, int curr);
241
242 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0,
243     "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
244     "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)",
245     pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr);
246
247 #ifdef SMP
248 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0,
249     "USCHED_BSD4(batchy_looser_pri_test false: pid %d, "
250     "cpuid %d, verify_mask %lu)",
251     pid_t pid, int cpuid, cpumask_t mask);
252 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0,
253     "USCHED_BSD4(batchy_looser_pri_test true: pid %d, "
254     "cpuid %d, verify_mask %lu)",
255     pid_t pid, int cpuid, cpumask_t mask);
256
257 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0,
258     "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, "
259     "mask %lu, curr_cpuid %d)",
260     pid_t pid, int cpuid, cpumask_t mask, int curr);
261 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0,
262     "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, "
263     "cpuid %d, mask %lu, curr_cpuid %d)",
264     pid_t pid, int cpuid, cpumask_t mask, int curr);
265 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0,
266     "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, "
267     "cpuid %d, mask %lu, curr_cpuid %d)",
268     pid_t pid, int cpuid, cpumask_t mask, int curr);
269 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0,
270     "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, "
271     "mask %lu, found_cpuid %d, curr_cpuid %d)",
272     pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
273 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0,
274     "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, "
275     "try_cpuid %d, curr_cpuid %d)",
276     pid_t pid, int cpuid, int try_cpuid, int curr);
277 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0,
278     "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, "
279     "mask %lu, found_cpuid %d, curr_cpuid %d)",
280     pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
281 #endif
282
283 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc, 0,
284     "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
285     pid_t pid, int old_cpuid, int curr);
286 #ifdef SMP
287 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0,
288     "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)",
289     pid_t pid, int old_cpuid, int curr);
290 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0,
291     "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, "
292     "sibling_mask %lu, curr_cpumask %lu)",
293     pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
294 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0,
295     "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, "
296     "sibling_mask %lu, curr_cpumask: %lu)",
297     pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
298
299 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0,
300     "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)",
301     int id, pid_t pid, int cpuid);
302 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0,
303     "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)",
304     int id, pid_t pid, int cpuid);
305 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0,
306     "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)",
307     int id, cpumask_t tmpmask);
308 #endif
309
310 /*
311  * Initialize the run queues at boot time.
312  */
313 static void
314 rqinit(void *dummy)
315 {
316         int i;
317
318         spin_init(&bsd4_spin);
319         for (i = 0; i < NQS; i++) {
320                 TAILQ_INIT(&bsd4_queues[i]);
321                 TAILQ_INIT(&bsd4_rtqueues[i]);
322                 TAILQ_INIT(&bsd4_idqueues[i]);
323         }
324         atomic_clear_cpumask(&bsd4_curprocmask, 1);
325 }
326 SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL)
327
328 /*
329  * BSD4_ACQUIRE_CURPROC
330  *
331  * This function is called when the kernel intends to return to userland.
332  * It is responsible for making the thread the current designated userland
333  * thread for this cpu, blocking if necessary.
334  *
335  * The kernel has already depressed our LWKT priority so we must not switch
336  * until we have either assigned or disposed of the thread.
337  *
338  * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
339  * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
340  * occur, this function is called only under very controlled circumstances.
341  *
342  * MPSAFE
343  */
344 static void
345 bsd4_acquire_curproc(struct lwp *lp)
346 {
347         globaldata_t gd;
348         bsd4_pcpu_t dd;
349         thread_t td;
350 #if 0
351         struct lwp *olp;
352 #endif
353
354         /*
355          * Make sure we aren't sitting on a tsleep queue.
356          */
357         td = lp->lwp_thread;
358         crit_enter_quick(td);
359         if (td->td_flags & TDF_TSLEEPQ)
360                 tsleep_remove(td);
361         bsd4_recalculate_estcpu(lp);
362
363         /*
364          * If a reschedule was requested give another thread the
365          * driver's seat.
366          */
367         if (user_resched_wanted()) {
368                 clear_user_resched();
369                 bsd4_release_curproc(lp);
370
371                 KTR_COND_LOG(usched_bsd4_acquire_curproc_urw,
372                     lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
373                     lp->lwp_proc->p_pid,
374                     lp->lwp_thread->td_gd->gd_cpuid,
375                     mycpu->gd_cpuid);
376         }
377
378         /*
379          * Loop until we are the current user thread
380          */
381         gd = mycpu;
382         dd = &bsd4_pcpu[gd->gd_cpuid];
383
384         KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop,
385             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
386             lp->lwp_proc->p_pid,
387             lp->lwp_thread->td_gd->gd_cpuid,
388             gd->gd_cpuid);
389
390         do {
391                 /*
392                  * Process any pending events and higher priority threads.
393                  */
394                 lwkt_yield();
395
396                 /*
397                  * Become the currently scheduled user thread for this cpu
398                  * if we can do so trivially.
399                  *
400                  * We can steal another thread's current-thread designation
401                  * on this cpu because if we are running, that other thread
402                  * cannot be, so we can safely deschedule it.
403                  */
404                 if (dd->uschedcp == lp) {
405                         /*
406                          * We are already the current lwp (hot path).
407                          */
408                         dd->upri = lp->lwp_priority;
409                 } else if (dd->uschedcp == NULL) {
410                         /*
411                          * We can trivially become the current lwp.
412                          */
413                         atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
414                         dd->uschedcp = lp;
415                         dd->upri = lp->lwp_priority;
416                 } else if (dd->upri > lp->lwp_priority) {
417                         /*
418                          * We can steal the current cpu's lwp designation
419                          * away simply by replacing it.  The other thread
420                          * will stall when it tries to return to userland.
421                          */
422                         dd->uschedcp = lp;
423                         dd->upri = lp->lwp_priority;
424                         /*
425                         lwkt_deschedule(olp->lwp_thread);
426                         bsd4_setrunqueue(olp);
427                         */
428                 } else {
429                         /*
430                          * We cannot become the current lwp, place the lp
431                          * on the bsd4 run-queue and deschedule ourselves.
432                          *
433                          * When we are reactivated we will have another
434                          * chance.
435                          */
436                         lwkt_deschedule(lp->lwp_thread);
437
438                         bsd4_setrunqueue(lp);
439
440                         KTR_COND_LOG(usched_bsd4_acquire_curproc_not,
441                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
442                             lp->lwp_proc->p_pid,
443                             lp->lwp_thread->td_gd->gd_cpuid,
444                             dd->uschedcp->lwp_proc->p_pid,
445                             gd->gd_cpuid);
446
447
448                         lwkt_switch();
449
450                         /*
451                          * Reload after a switch or setrunqueue/switch possibly
452                          * moved us to another cpu.
453                          */
454                         gd = mycpu;
455                         dd = &bsd4_pcpu[gd->gd_cpuid];
456
457                         KTR_COND_LOG(usched_bsd4_acquire_curproc_switch,
458                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
459                             lp->lwp_proc->p_pid,
460                             lp->lwp_thread->td_gd->gd_cpuid,
461                             gd->gd_cpuid);
462                 }
463         } while (dd->uschedcp != lp);
464
465         crit_exit_quick(td);
466         KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
467 }
468
469 /*
470  * BSD4_RELEASE_CURPROC
471  *
472  * This routine detaches the current thread from the userland scheduler,
473  * usually because the thread needs to run or block in the kernel (at
474  * kernel priority) for a while.
475  *
476  * This routine is also responsible for selecting a new thread to
477  * make the current thread.
478  *
479  * NOTE: This implementation differs from the dummy example in that
480  * bsd4_select_curproc() is able to select the current process, whereas
481  * dummy_select_curproc() is not able to select the current process.
482  * This means we have to NULL out uschedcp.
483  *
484  * Additionally, note that we may already be on a run queue if releasing
485  * via the lwkt_switch() in bsd4_setrunqueue().
486  *
487  * MPSAFE
488  */
489
490 static void
491 bsd4_release_curproc(struct lwp *lp)
492 {
493         globaldata_t gd = mycpu;
494         bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
495
496         if (dd->uschedcp == lp) {
497                 crit_enter();
498                 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
499
500                 KTR_COND_LOG(usched_bsd4_release_curproc,
501                     lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
502                     lp->lwp_proc->p_pid,
503                     lp->lwp_thread->td_gd->gd_cpuid,
504                     gd->gd_cpuid);
505
506                 dd->uschedcp = NULL;    /* don't let lp be selected */
507                 dd->upri = PRIBASE_NULL;
508                 atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
509                 dd->old_uschedcp = lp;  /* used only for KTR debug prints */
510                 bsd4_select_curproc(gd);
511                 crit_exit();
512         }
513 }
514
515 /*
516  * BSD4_SELECT_CURPROC
517  *
518  * Select a new current process for this cpu and clear any pending user
519  * reschedule request.  The cpu currently has no current process.
520  *
521  * This routine is also responsible for equal-priority round-robining,
522  * typically triggered from bsd4_schedulerclock().  In our dummy example
523  * all the 'user' threads are LWKT scheduled all at once and we just
524  * call lwkt_switch().
525  *
526  * The calling process is not on the queue and cannot be selected.
527  *
528  * MPSAFE
529  */
530 static
531 void
532 bsd4_select_curproc(globaldata_t gd)
533 {
534         bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
535         struct lwp *nlp;
536         int cpuid = gd->gd_cpuid;
537
538         crit_enter_gd(gd);
539
540         spin_lock(&bsd4_spin);
541 #ifdef SMP
542         if (usched_bsd4_cache_coherent)
543                 nlp = chooseproc_locked_cache_coherent(dd->uschedcp);
544         else
545 #endif
546                 nlp = chooseproc_locked(dd->uschedcp);
547
548         if (nlp) {
549
550                 KTR_COND_LOG(usched_bsd4_select_curproc,
551                     nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
552                     nlp->lwp_proc->p_pid,
553                     nlp->lwp_thread->td_gd->gd_cpuid,
554                     dd->old_uschedcp->lwp_proc->p_pid,
555                     dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid,
556                     gd->gd_cpuid);
557
558                 atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid));
559                 dd->upri = nlp->lwp_priority;
560                 dd->uschedcp = nlp;
561                 spin_unlock(&bsd4_spin);
562 #ifdef SMP
563                 lwkt_acquire(nlp->lwp_thread);
564 #endif
565                 lwkt_schedule(nlp->lwp_thread);
566         } else {
567                 spin_unlock(&bsd4_spin);
568         }
569
570 #if 0
571         } else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) {
572                 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
573                 spin_unlock(&bsd4_spin);
574                 lwkt_schedule(&dd->helper_thread);
575         } else {
576                 spin_unlock(&bsd4_spin);
577         }
578 #endif
579         crit_exit_gd(gd);
580 }
581 #ifdef SMP
582
583 /*
584  * batchy_looser_pri_test() - determine if a process is batchy or not
585  * relative to the other processes running in the system
586  */
587 static int
588 batchy_looser_pri_test(struct lwp* lp)
589 {
590         cpumask_t mask;
591         bsd4_pcpu_t other_dd;
592         int cpu;
593
594         /* Current running processes */
595         mask = bsd4_curprocmask & smp_active_mask
596             & usched_global_cpumask;
597
598         while (mask) {
599                 cpu = BSFCPUMASK(mask);
600                 other_dd = &bsd4_pcpu[cpu];
601                 if (other_dd->upri - lp->lwp_priority > usched_bsd4_upri_affinity * PPQ) {
602
603                         KTR_COND_LOG(usched_batchy_test_false,
604                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
605                             lp->lwp_proc->p_pid,
606                             lp->lwp_thread->td_gd->gd_cpuid,
607                             mask);
608
609                         return 0;
610                 }
611                 mask &= ~CPUMASK(cpu);
612         }
613
614         KTR_COND_LOG(usched_batchy_test_true,
615             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
616             lp->lwp_proc->p_pid,
617             lp->lwp_thread->td_gd->gd_cpuid,
618             mask);
619
620         return 1;
621 }
622
623 #endif
624 /*
625  *
626  * BSD4_SETRUNQUEUE
627  *
628  * Place the specified lwp on the user scheduler's run queue.  This routine
629  * must be called with the thread descheduled.  The lwp must be runnable.
630  *
631  * The thread may be the current thread as a special case.
632  *
633  * MPSAFE
634  */
635 static void
636 bsd4_setrunqueue(struct lwp *lp)
637 {
638         globaldata_t gd;
639         bsd4_pcpu_t dd;
640 #ifdef SMP
641         int cpuid;
642         cpumask_t mask;
643         cpumask_t tmpmask;
644 #endif
645
646         /*
647          * First validate the process state relative to the current cpu.
648          * We don't need the spinlock for this, just a critical section.
649          * We are in control of the process.
650          */
651         crit_enter();
652         KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
653         KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
654             ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
655              lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
656         KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
657
658         /*
659          * Note: gd and dd are relative to the target thread's last cpu,
660          * NOT our current cpu.
661          */
662         gd = lp->lwp_thread->td_gd;
663         dd = &bsd4_pcpu[gd->gd_cpuid];
664
665         /*
666          * This process is not supposed to be scheduled anywhere or assigned
667          * as the current process anywhere.  Assert the condition.
668          */
669         KKASSERT(dd->uschedcp != lp);
670
671 #ifndef SMP
672         /*
673          * If we are not SMP we do not have a scheduler helper to kick
674          * and must directly activate the process if none are scheduled.
675          *
676          * This is really only an issue when bootstrapping init since
677          * the caller in all other cases will be a user process, and
678          * even if released (dd->uschedcp == NULL), that process will
679          * kickstart the scheduler when it returns to user mode from
680          * the kernel.
681          */
682         if (dd->uschedcp == NULL) {
683                 atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
684                 dd->uschedcp = lp;
685                 dd->upri = lp->lwp_priority;
686                 lwkt_schedule(lp->lwp_thread);
687                 crit_exit();
688                 return;
689         }
690 #endif
691
692 #ifdef SMP
693         /*
694          * XXX fixme.  Could be part of a remrunqueue/setrunqueue
695          * operation when the priority is recalculated, so TDF_MIGRATING
696          * may already be set.
697          */
698         if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
699                 lwkt_giveaway(lp->lwp_thread);
700 #endif
701
702         /*
703          * We lose control of lp the moment we release the spinlock after
704          * having placed lp on the queue.  i.e. another cpu could pick it
705          * up and it could exit, or its priority could be further adjusted,
706          * or something like that.
707          */
708         spin_lock(&bsd4_spin);
709         bsd4_setrunqueue_locked(lp);
710         lp->lwp_setrunqueue_ticks = sched_ticks;
711
712 #ifdef SMP
713         /*
714          * Kick the scheduler helper on one of the other cpu's
715          * and request a reschedule if appropriate.
716          *
717          * NOTE: We check all cpus whose rdyprocmask bit is set.  First we
718          *       look for cpus without designated lps, then we look for
719          *       cpus with designated lps with a worse priority than our
720          *       process.
721          */
722         ++bsd4_scancpu;
723
724         if (usched_bsd4_smt) {
725
726                 /*
727                  * SMT heuristic - Try to schedule on a free physical core.  If none is
728                  * found, choose the cpu whose sibling runs the most interactive thread.
729                  */
730
731                 int best_cpuid = -1;
732                 int min_prio = MAXPRI * MAXPRI;
733                 int sibling;
734
735                 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
736                 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
737                     smp_active_mask & usched_global_cpumask;
738
739                 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt,
740                     lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
741                     lp->lwp_proc->p_pid,
742                     lp->lwp_thread->td_gd->gd_cpuid,
743                     mask,
744                     mycpu->gd_cpuid);
745
746                 while (mask) {
747                         tmpmask = ~(CPUMASK(cpuid) - 1);
748                         if (mask & tmpmask)
749                                 cpuid = BSFCPUMASK(mask & tmpmask);
750                         else
751                                 cpuid = BSFCPUMASK(mask);
752                         gd = globaldata_find(cpuid);
753                         dd = &bsd4_pcpu[cpuid];
754
755                         if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) {
756                                 if (dd->cpunode->parent_node->members & ~dd->cpunode->members & mask) {
757
758                                         KTR_COND_LOG(usched_bsd4_setrunqueue_found,
759                                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
760                                             lp->lwp_proc->p_pid,
761                                             lp->lwp_thread->td_gd->gd_cpuid,
762                                             mask,
763                                             cpuid,
764                                             mycpu->gd_cpuid);
765
766                                         goto found;
767                                 } else {
768                                         sibling = BSFCPUMASK(dd->cpunode->parent_node->members &
769                                             ~dd->cpunode->members);
770                                         if (min_prio > bsd4_pcpu[sibling].upri) {
771                                                 min_prio = bsd4_pcpu[sibling].upri;
772                                                 best_cpuid = cpuid;
773                                         }
774                                 }
775                         }
776                         mask &= ~CPUMASK(cpuid);
777                 }
778
779                 if (best_cpuid != -1) {
780                         cpuid = best_cpuid;
781                         gd = globaldata_find(cpuid);
782                         dd = &bsd4_pcpu[cpuid];
783
784                         KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid,
785                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
786                             lp->lwp_proc->p_pid,
787                             lp->lwp_thread->td_gd->gd_cpuid,
788                             mask,
789                             cpuid,
790                             mycpu->gd_cpuid);
791
792                         goto found;
793                 }
794         } else {
795                 /* Fallback to the original heuristic */
796                 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
797                 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
798                        smp_active_mask & usched_global_cpumask;
799
800                 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt,
801                     lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
802                     lp->lwp_proc->p_pid,
803                     lp->lwp_thread->td_gd->gd_cpuid,
804                     mask,
805                     mycpu->gd_cpuid);
806
807                 while (mask) {
808                         tmpmask = ~(CPUMASK(cpuid) - 1);
809                         if (mask & tmpmask)
810                                 cpuid = BSFCPUMASK(mask & tmpmask);
811                         else
812                                 cpuid = BSFCPUMASK(mask);
813                         gd = globaldata_find(cpuid);
814                         dd = &bsd4_pcpu[cpuid];
815
816                         if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) {
817
818                                 KTR_COND_LOG(usched_bsd4_setrunqueue_found,
819                                     lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
820                                     lp->lwp_proc->p_pid,
821                                     lp->lwp_thread->td_gd->gd_cpuid,
822                                     mask,
823                                     cpuid,
824                                     mycpu->gd_cpuid);
825
826                                 goto found;
827                         }
828                         mask &= ~CPUMASK(cpuid);
829                 }
830         }
831
832         /*
833          * Then cpus which might have a currently running lp
834          */
835         mask = bsd4_curprocmask & bsd4_rdyprocmask &
836                lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;
837
838         KTR_COND_LOG(usched_bsd4_setrunqueue_rc,
839             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
840             lp->lwp_proc->p_pid,
841             lp->lwp_thread->td_gd->gd_cpuid,
842             mask,
843             mycpu->gd_cpuid);
844
845         while (mask) {
846                 tmpmask = ~(CPUMASK(cpuid) - 1);
847                 if (mask & tmpmask)
848                         cpuid = BSFCPUMASK(mask & tmpmask);
849                 else
850                         cpuid = BSFCPUMASK(mask);
851                 gd = globaldata_find(cpuid);
852                 dd = &bsd4_pcpu[cpuid];
853
854                 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
855
856                         KTR_COND_LOG(usched_bsd4_setrunqueue_found,
857                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
858                             lp->lwp_proc->p_pid,
859                             lp->lwp_thread->td_gd->gd_cpuid,
860                             mask,
861                             cpuid,
862                             mycpu->gd_cpuid);
863
864                         goto found;
865                 }
866                 mask &= ~CPUMASK(cpuid);
867         }
868
869         /*
870          * If we cannot find a suitable cpu we reload from bsd4_scancpu
871          * and round-robin.  Other cpus will pickup as they release their
872          * current lwps or become ready.
873          *
874          * Avoid a degenerate system lockup case if usched_global_cpumask
875          * is set to 0 or otherwise does not cover lwp_cpumask.
876          *
877          * We only kick the target helper thread in this case; we do not
878          * set the user resched flag.
879          */
880         cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
881         if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) {
882                 cpuid = 0;
883         }
884         gd = globaldata_find(cpuid);
885         dd = &bsd4_pcpu[cpuid];
886
887         KTR_COND_LOG(usched_bsd4_setrunqueue_not_found,
888             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
889             lp->lwp_proc->p_pid,
890             lp->lwp_thread->td_gd->gd_cpuid,
891             cpuid,
892             mycpu->gd_cpuid);
893
894 found:
895         if (gd == mycpu) {
896                 spin_unlock(&bsd4_spin);
897                 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
898                         if (dd->uschedcp == NULL) {
899                                 wakeup(&dd->helper_thread);
900                         } else {
901                                 need_user_resched();
902                         }
903                 }
904         } else {
905                 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
906                 spin_unlock(&bsd4_spin);
907                 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
908                         lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
909                 else
910                         wakeup(&dd->helper_thread);
911         }
912 #else
913         /*
914          * Request a reschedule if appropriate.
915          */
916         spin_unlock(&bsd4_spin);
917         if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
918                 need_user_resched();
919         }
920 #endif
921         crit_exit();
922 }
923
924 /*
925  * This routine is called from a systimer IPI.  It MUST be MP-safe and
926  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
927  * each cpu.
928  *
929  * MPSAFE
930  */
931 static
932 void
933 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
934 {
935         globaldata_t gd = mycpu;
936         bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
937
938         /*
939          * Do we need to round-robin?  We round-robin 10 times a second.
940          * This should only occur for cpu-bound batch processes.
941          */
942         if (++dd->rrcount >= usched_bsd4_rrinterval) {
943                 dd->rrcount = 0;
944                 need_user_resched();
945         }
946
947         /*
948          * Adjust estcpu upward using a real time equivalent calculation.
949          */
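        /*
         * Each tick adds ESTCPUMAX / ESTCPUFREQ (+1) and this routine
         * runs ESTCPUFREQ times per second, so a continuously running
         * lwp accumulates roughly ESTCPUMAX worth of estcpu per second
         * of cpu time, i.e. it hits the ESTCPULIM() cap within about a
         * second of continuous execution.
         */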
950         lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);
951
952         /*
953          * Spinlocks also hold a critical section so there should not be
954          * any active.
955          */
956         KKASSERT(gd->gd_spinlocks_wr == 0);
957
958         bsd4_resetpriority(lp);
959 #if 0
960         /*
961          * If we can't call bsd4_resetpriority for some reason we must call
962          * need_user_resched().
963          */
964         need_user_resched();
965 #endif
966 }
967
968 /*
969  * Called from acquire and from kern_synch's one-second timer (one of the
970  * callout helper threads) with a critical section held.
971  *
972  * Decay p_estcpu based on the number of ticks we haven't been running
973  * and our p_nice.  As the load increases each process observes a larger
974  * number of idle ticks (because other processes are running in them).
975  * This observation leads to a larger correction which tends to make the
976  * system more 'batchy'.
977  *
978  * Note that no recalculation occurs for a process which sleeps and wakes
979  * up in the same tick.  That is, a system doing thousands of context
980  * switches per second will still only do serious estcpu calculations
981  * ESTCPUFREQ times per second.
982  *
983  * MPSAFE
984  */
985 static
986 void
987 bsd4_recalculate_estcpu(struct lwp *lp)
988 {
989         globaldata_t gd = mycpu;
990         sysclock_t cpbase;
991         sysclock_t ttlticks;
992         int estcpu;
993         int decay_factor;
994
995         /*
996          * We have to subtract periodic to get the last schedclock
997          * timeout time, otherwise we would get the upcoming timeout.
998          * Keep in mind that a process can migrate between cpus and
999          * while the scheduler clock should be very close, boundary
1000          * conditions could lead to a small negative delta.
1001          */
1002         cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
1003
1004         if (lp->lwp_slptime > 1) {
1005                 /*
1006                  * Too much time has passed, do a coarse correction.
1007                  */
1008                 lp->lwp_estcpu = lp->lwp_estcpu >> 1;
1009                 bsd4_resetpriority(lp);
1010                 lp->lwp_cpbase = cpbase;
1011                 lp->lwp_cpticks = 0;
1012                 lp->lwp_batch -= ESTCPUFREQ;
1013                 if (lp->lwp_batch < 0)
1014                         lp->lwp_batch = 0;
1015         } else if (lp->lwp_cpbase != cpbase) {
1016                 /*
1017                  * Adjust estcpu if we are in a different tick.  Don't waste
1018                  * time if we are in the same tick.
1019                  *
1020                  * First calculate the number of ticks in the measurement
1021                  * interval.  The ttlticks calculation can wind up 0 due to
1022                  * a bug in the handling of lwp_slptime (not yet found),
1023                  * so make sure we do not get a divide by 0 panic.
1024                  */
1025                 ttlticks = (cpbase - lp->lwp_cpbase) /
1026                            gd->gd_schedclock.periodic;
1027                 if (ttlticks < 0) {
1028                         ttlticks = 0;
1029                         lp->lwp_cpbase = cpbase;
1030                 }
1031                 if (ttlticks == 0)
1032                         return;
1033                 updatepcpu(lp, lp->lwp_cpticks, ttlticks);
1034
1035                 /*
1036                  * Calculate the percentage of one cpu used factoring in ncpus
1037                  * and the load and adjust estcpu.  Handle degenerate cases
1038                  * by adding 1 to bsd4_runqcount.
1039                  *
1040                  * estcpu is scaled by ESTCPUMAX.
1041                  *
1042                  * bsd4_runqcount is the excess number of user processes
1043                  * that cannot be immediately scheduled to cpus.  We want
1044                  * to count these as running to avoid range compression
1045                  * in the base calculation (which is the actual percentage
1046                  * of one cpu used).
1047                  */
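                /*
                 * Example: a lwp that consumed every one of its ttlticks
                 * on an otherwise idle single-cpu system (bsd4_runqcount
                 * 0, ncpus 1) computes estcpu == ESTCPUMAX, i.e. 100% of
                 * one cpu.
                 */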
1048                 estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
1049                          (bsd4_runqcount + ncpus) / (ncpus * ttlticks);
1050
1051                 /*
1052                  * If estcpu is > 50% we become more batch-like
1053                  * If estcpu is <= 50% we become less batch-like
1054                  *
1055                  * It takes 30 cpu seconds to traverse the entire range.
1056                  */
1057                 if (estcpu > ESTCPUMAX / 2) {
1058                         lp->lwp_batch += ttlticks;
1059                         if (lp->lwp_batch > BATCHMAX)
1060                                 lp->lwp_batch = BATCHMAX;
1061                 } else {
1062                         lp->lwp_batch -= ttlticks;
1063                         if (lp->lwp_batch < 0)
1064                                 lp->lwp_batch = 0;
1065                 }
1066
1067                 if (usched_bsd4_debug == lp->lwp_proc->p_pid) {
1068                         kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
1069                                 lp->lwp_proc->p_pid, lp,
1070                                 estcpu, lp->lwp_estcpu,
1071                                 lp->lwp_batch,
1072                                 lp->lwp_cpticks, ttlticks);
1073                 }
1074
1075                 /*
1076                  * Adjust lp->lwp_estcpu.  The decay factor determines how
1077                  * quickly lwp_estcpu collapses to its realtime calculation.
1078                  * A slower collapse gives us a more accurate number but
1079                  * can cause a cpu hog to eat too much cpu before the
1080                  * scheduler decides to downgrade it.
1081                  *
1082                  * NOTE: p_nice is accounted for in bsd4_resetpriority(),
1083                  *       and not here, but we must still ensure that a
1084                  *       cpu-bound nice -20 process does not completely
1085                  *       override a cpu-bound nice +20 process.
1086                  *
1087                  * NOTE: We must use ESTCPULIM() here to deal with any
1088                  *       overshoot.
1089                  */
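                /*
                 * With the default decay factor of 8 this moves
                 * lwp_estcpu 1/9th of the way toward the instantaneous
                 * estcpu on each recalculation.
                 */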
1090                 decay_factor = usched_bsd4_decay;
1091                 if (decay_factor < 1)
1092                         decay_factor = 1;
1093                 if (decay_factor > 1024)
1094                         decay_factor = 1024;
1095
1096                 lp->lwp_estcpu = ESTCPULIM(
1097                         (lp->lwp_estcpu * decay_factor + estcpu) /
1098                         (decay_factor + 1));
1099
1100                 if (usched_bsd4_debug == lp->lwp_proc->p_pid)
1101                         kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
1102                 bsd4_resetpriority(lp);
1103                 lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
1104                 lp->lwp_cpticks = 0;
1105         }
1106 }
1107
1108 /*
1109  * Compute the priority of a process when running in user mode.
1110  * Arrange to reschedule if the resulting priority is better
1111  * than that of the current process.
1112  *
1113  * This routine may be called with any process.
1114  *
1115  * This routine is called by fork1() for initial setup with the process
1116  * of the run queue, and also may be called normally with the process on or
1117  * off the run queue.
1118  *
1119  * MPSAFE
1120  */
1121 static void
1122 bsd4_resetpriority(struct lwp *lp)
1123 {
1124         bsd4_pcpu_t dd;
1125         int newpriority;
1126         u_short newrqtype;
1127         int reschedcpu;
1128         int checkpri;
1129         int estcpu;
1130
1131         /*
1132          * Calculate the new priority and queue type
1133          */
1134         crit_enter();
1135         spin_lock(&bsd4_spin);
1136
1137         newrqtype = lp->lwp_rtprio.type;
1138
1139         switch(newrqtype) {
1140         case RTP_PRIO_REALTIME:
1141         case RTP_PRIO_FIFO:
1142                 newpriority = PRIBASE_REALTIME +
1143                              (lp->lwp_rtprio.prio & PRIMASK);
1144                 break;
1145         case RTP_PRIO_NORMAL:
1146                 /*
1147                  * Detune estcpu based on batchiness.  lwp_batch ranges
1148                  * from 0 to  BATCHMAX.  Limit estcpu for the sake of
1149                  * the priority calculation to between 50% and 100%.
1150                  */
1151                 estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
1152                          (BATCHMAX * 2);
1153
1154                 /*
1155                  * p_nice piece         Adds (0-40) * 2         0-80
1156                  * estcpu               Adds 16384  * 4 / 512   0-128
1157                  */
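                /*
                 * Worked example: nice 0 contributes (20 * 4) / 2 == 40
                 * and a detuned estcpu of ESTCPUMAX / 2 contributes
                 * (8192 * 4) / 512 == 64, for a raw value of 104, which
                 * is then scaled by 128 / (82 + 128) and offset by
                 * PRIBASE_NORMAL.
                 */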
1158                 newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
1159                 newpriority += estcpu * PPQ / ESTCPUPPQ;
1160                 newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
1161                               NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
1162                 newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
1163                 break;
1164         case RTP_PRIO_IDLE:
1165                 newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
1166                 break;
1167         case RTP_PRIO_THREAD:
1168                 newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
1169                 break;
1170         default:
1171                 panic("Bad RTP_PRIO %d", newrqtype);
1172                 /* NOT REACHED */
1173         }
1174
1175         /*
1176          * The newpriority incorporates the queue type so do a simple masked
1177          * check to determine if the process has moved to another queue.  If
1178          * it has, and it is currently on a run queue, then move it.
1179          */
1180         if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
1181                 lp->lwp_priority = newpriority;
1182                 if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
1183                         bsd4_remrunqueue_locked(lp);
1184                         lp->lwp_rqtype = newrqtype;
1185                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1186                         bsd4_setrunqueue_locked(lp);
1187                         checkpri = 1;
1188                 } else {
1189                         lp->lwp_rqtype = newrqtype;
1190                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1191                         checkpri = 0;
1192                 }
1193                 reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
1194         } else {
1195                 lp->lwp_priority = newpriority;
1196                 reschedcpu = -1;
1197                 checkpri = 1;
1198         }
1199
1200         /*
1201          * Determine if we need to reschedule the target cpu.  This only
1202          * occurs if the LWP is already on a scheduler queue, which means
1203          * that idle cpu notification has already occurred.  At most we
1204          * need only issue a need_user_resched() on the appropriate cpu.
1205          *
1206          * The LWP may be owned by a CPU different from the current one,
1207          * in which case dd->uschedcp may be modified without an MP lock
1208          * or a spinlock held.  The worst that happens is that the code
1209          * below causes a spurious need_user_resched() on the target CPU
1210          * and dd->upri to be wrong for a short period of time, both of
1211          * which are harmless.
1212          *
1213          * If checkpri is 0 we are adjusting the priority of the current
1214          * process, possibly higher (less desirable), so ignore the upri
1215          * check which will fail in that case.
1216          */
1217         if (reschedcpu >= 0) {
1218                 dd = &bsd4_pcpu[reschedcpu];
1219                 if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
1220                     (checkpri == 0 ||
1221                      (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
1222 #ifdef SMP
1223                         if (reschedcpu == mycpu->gd_cpuid) {
1224                                 spin_unlock(&bsd4_spin);
1225                                 need_user_resched();
1226                         } else {
1227                                 spin_unlock(&bsd4_spin);
1228                                 atomic_clear_cpumask(&bsd4_rdyprocmask,
1229                                                      CPUMASK(reschedcpu));
1230                                 lwkt_send_ipiq(lp->lwp_thread->td_gd,
1231                                                need_user_resched_remote, NULL);
1232                         }
1233 #else
1234                         spin_unlock(&bsd4_spin);
1235                         need_user_resched();
1236 #endif
1237                 } else {
1238                         spin_unlock(&bsd4_spin);
1239                 }
1240         } else {
1241                 spin_unlock(&bsd4_spin);
1242         }
1243         crit_exit();
1244 }
1245
1246 /*
1247  * MPSAFE
1248  */
1249 static
1250 void
1251 bsd4_yield(struct lwp *lp)
1252 {
1253 #if 0
1254         /* FUTURE (or something similar) */
1255         switch(lp->lwp_rqtype) {
1256         case RTP_PRIO_NORMAL:
1257                 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
1258                 break;
1259         default:
1260                 break;
1261         }
1262 #endif
1263         need_user_resched();
1264 }
1265
1266 /*
1267  * Called from fork1() when a new child process is being created.
1268  *
1269  * Give the child process an initial estcpu that is more batchy than
1270  * its parent and dock the parent for the fork (but do not
1271  * reschedule the parent).   This comprises the main part of our batch
1272  * detection heuristic for both parallel forking and sequential execs.
1273  *
1274  * XXX lwp should be "spawning" instead of "forking"
1275  *
1276  * MPSAFE
1277  */
1278 static void
1279 bsd4_forking(struct lwp *plp, struct lwp *lp)
1280 {
1281         /*
1282          * Put the child 4 queue slots (out of 32) higher than the parent
1283          * (less desirable than the parent).
1284          */
1285         lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);
1286
1287         /*
1288          * The batch status of children always starts out centerline
1289          * and will inch up or inch down as appropriate.  It takes roughly
1290          * 15 seconds of >50% cpu to hit the limit.
1291          */
1292         lp->lwp_batch = BATCHMAX / 2;
1293
1294         /*
1295          * Dock the parent a cost for the fork, protecting us from fork
1296          * bombs.  If the parent is forking quickly make the child more
1297          * batchy.
1298          */
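        /*
         * ESTCPUPPQ / 16 is 32 estcpu per fork; since one queue spans
         * ESTCPUPPQ (512) estcpu, roughly 16 rapid forks push the
         * parent (and thus its subsequent children) down by a full
         * run queue.
         */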
1299         plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
1300 }
1301
1302 /*
1303  * Called when a parent waits for a child.
1304  *
1305  * MPSAFE
1306  */
1307 static void
1308 bsd4_exiting(struct lwp *lp, struct proc *child_proc)
1309 {
1310 }
1311
1312 /*
1313  * chooseproc() is called when a cpu needs a user process to LWKT schedule,
1314  * it selects a user process and returns it.  If chklp is non-NULL and chklp
1315  * has a better or equal priority than the process that would otherwise be
1316  * chosen, NULL is returned.
1317  *
1318  * Until we fix the RUNQ code the chklp test has to be strict or we may
1319  * bounce between processes trying to acquire the current process designation.
1320  *
1321  * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
1322  *          left intact through the entire routine.
1323  */
1324 static
1325 struct lwp *
1326 chooseproc_locked(struct lwp *chklp)
1327 {
1328         struct lwp *lp;
1329         struct rq *q;
1330         u_int32_t *which, *which2;
1331         u_int32_t pri;
1332         u_int32_t rtqbits;
1333         u_int32_t tsqbits;
1334         u_int32_t idqbits;
1335         cpumask_t cpumask;
1336
1337         rtqbits = bsd4_rtqueuebits;
1338         tsqbits = bsd4_queuebits;
1339         idqbits = bsd4_idqueuebits;
1340         cpumask = mycpu->gd_cpumask;
1341
1342
1343 #ifdef SMP
1344 again:
1345 #endif
1346         if (rtqbits) {
1347                 pri = bsfl(rtqbits);
1348                 q = &bsd4_rtqueues[pri];
1349                 which = &bsd4_rtqueuebits;
1350                 which2 = &rtqbits;
1351         } else if (tsqbits) {
1352                 pri = bsfl(tsqbits);
1353                 q = &bsd4_queues[pri];
1354                 which = &bsd4_queuebits;
1355                 which2 = &tsqbits;
1356         } else if (idqbits) {
1357                 pri = bsfl(idqbits);
1358                 q = &bsd4_idqueues[pri];
1359                 which = &bsd4_idqueuebits;
1360                 which2 = &idqbits;
1361         } else {
1362                 return NULL;
1363         }
1364         lp = TAILQ_FIRST(q);
1365         KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1366
1367 #ifdef SMP
1368         while ((lp->lwp_cpumask & cpumask) == 0) {
1369                 lp = TAILQ_NEXT(lp, lwp_procq);
1370                 if (lp == NULL) {
1371                         *which2 &= ~(1 << pri);
1372                         goto again;
1373                 }
1374         }
1375 #endif
1376
1377         /*
1378          * If the passed lwp <chklp> is reasonably close to the selected
1379          * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1380          *
1381          * Note that we must err on the side of <chklp> to avoid bouncing
1382          * between threads in the acquire code.
1383          */
1384         if (chklp) {
1385                 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1386                         return(NULL);
1387         }
1388
1389 #ifdef SMP
1390         /*
1391          * If the chosen lwp does not reside on this cpu spend a few
1392          * cycles looking for a better candidate at the same priority level.
1393          * This is a fallback check; setrunqueue() tries to wake up the
1394          * correct cpu and is our front-line affinity mechanism.
1395          */
1396         if (lp->lwp_thread->td_gd != mycpu &&
1397             (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
1398         ) {
1399                 if (chklp->lwp_thread->td_gd == mycpu) {
1400                         ++choose_affinity;
1401                         lp = chklp;
1402                 }
1403         }
1404 #endif
1405
1406         KTR_COND_LOG(usched_chooseproc,
1407             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1408             lp->lwp_proc->p_pid,
1409             lp->lwp_thread->td_gd->gd_cpuid,
1410             mycpu->gd_cpuid);
1411
1412         TAILQ_REMOVE(q, lp, lwp_procq);
1413         --bsd4_runqcount;
1414         if (TAILQ_EMPTY(q))
1415                 *which &= ~(1 << pri);
1416         KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1417         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1418         return lp;
1419 }
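
#if 0
/*
 * Illustrative sketch only (kept under #if 0, not compiled): a portable
 * stand-in for the bsfl()-based queue selection used above.  Given a
 * 32-bit occupancy mask with one bit per run queue, the lowest set bit
 * indexes the best non-empty queue because lower queue indices hold
 * better priorities.
 */
static int
example_find_best_queue(u_int32_t qbits)
{
        int bit;

        for (bit = 0; bit < NQS; ++bit) {
                if (qbits & (1U << bit))
                        return (bit);   /* best-priority non-empty queue */
        }
        return (-1);                    /* no runnable lwps queued */
}
#endif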
1420
1421 #ifdef SMP
1422 /*
1423  * chooseproc() - with a cache coherence heuristic.  Try to pull a process
1424  * that has its home on the current cpu.  If the process does not have its
1425  * home here and is a batchy one (see batchy_looser_pri_test), we can wait
1426  * a sched_tick; maybe its home cpu will become free and pull the process
1427  * in itself.  We cannot wait more than one tick, however: once that tick
1428  * has expired we pull the process in regardless.
1429  */
1430 static
1431 struct lwp *
1432 chooseproc_locked_cache_coherent(struct lwp *chklp)
1433 {
1434         struct lwp *lp;
1435         struct rq *q;
1436         u_int32_t *which, *which2;
1437         u_int32_t pri;
1438         u_int32_t checks;
1439         u_int32_t rtqbits;
1440         u_int32_t tsqbits;
1441         u_int32_t idqbits;
1442         cpumask_t cpumask;
1443
1444         struct lwp *min_level_lwp = NULL;
1445         struct rq *min_q = NULL;
1446         cpumask_t siblings;
1447         cpu_node_t *cpunode = NULL;
1448         u_int32_t min_level = MAXCPU;   /* number of levels < MAXCPU */
1449         u_int32_t *min_which = NULL;
1450         u_int32_t min_pri = 0;
1451         u_int32_t level = 0;
1452
1453         rtqbits = bsd4_rtqueuebits;
1454         tsqbits = bsd4_queuebits;
1455         idqbits = bsd4_idqueuebits;
1456         cpumask = mycpu->gd_cpumask;
1457
1458         /* Get the mask corresponding to the sysctl-configured level */
1459         cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode;
1460         level = usched_bsd4_stick_to_level;
1461         while (level) {
1462                 cpunode = cpunode->parent_node;
1463                 level--;
1464         }
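        /*
         * cpunode now references the topology node usched_bsd4_stick_to_level
         * levels above this cpu (level 0 being the cpu's own node).
         */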
1465         /* The cpus which can elect a process */
1466         siblings = cpunode->members;
1467
1468 again:
1469         if (rtqbits) {
1470                 pri = bsfl(rtqbits);
1471                 q = &bsd4_rtqueues[pri];
1472                 which = &bsd4_rtqueuebits;
1473                 which2 = &rtqbits;
1474         } else if (tsqbits) {
1475                 pri = bsfl(tsqbits);
1476                 q = &bsd4_queues[pri];
1477                 which = &bsd4_queuebits;
1478                 which2 = &tsqbits;
1479         } else if (idqbits) {
1480                 pri = bsfl(idqbits);
1481                 q = &bsd4_idqueues[pri];
1482                 which = &bsd4_idqueuebits;
1483                 which2 = &idqbits;
1484         } else {
1485                 return NULL;
1486         }
1487         lp = TAILQ_FIRST(q);
1488         KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1489
1490         /* Limit the number of checks per queue to a configurable value
1491          * to minimize contention (we are in a locked region).
1492          */
1493         for (checks = 0; checks < usched_bsd4_queue_checks; checks++) {
1494
1495                 if ((lp->lwp_cpumask & cpumask) == 0 ||
1496                     ((siblings & lp->lwp_thread->td_gd->gd_cpumask) == 0 &&
1497                       batchy_looser_pri_test(lp) &&
1498                       (lp->lwp_setrunqueue_ticks == sched_ticks ||
1499                        lp->lwp_setrunqueue_ticks == (int)(sched_ticks - 1)))) {
1500
1501                         KTR_COND_LOG(usched_chooseproc_cc_not_good,
1502                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1503                             lp->lwp_proc->p_pid,
1504                             lp->lwp_thread->td_gd->gd_cpumask,
1505                             siblings,
1506                             cpumask);
1507
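                        /*
                         * Compute the candidate's cache distance: climb its
                         * topology tree until a node's member mask includes
                         * this cpu, counting the levels climbed.  The closest
                         * deferred candidate is remembered as a fallback.
                         */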
1508                         cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode;
1509                         level = 0;
1510                         while (cpunode) {
1511                                 if (cpunode->members & cpumask) {
1512                                         break;
1513                                 }
1514                                 cpunode = cpunode->parent_node;
1515                                 level++;
1516                         }
1517                         if (level < min_level) {
1518                                 min_level_lwp = lp;
1519                                 min_level = level;
1520                                 min_q = q;
1521                                 min_which = which;
1522                                 min_pri = pri;
1523                         }
1524
1525                         lp = TAILQ_NEXT(lp, lwp_procq);
1526                         if (lp == NULL) {
1527                                 *which2 &= ~(1 << pri);
1528                                 goto again;
1529                         }
1530                 } else {
1531                         KTR_COND_LOG(usched_chooseproc_cc_elected,
1532                             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1533                             lp->lwp_proc->p_pid,
1534                             lp->lwp_thread->td_gd->gd_cpumask,
1535                             siblings,
1536                             cpumask);
1537
1538                         goto found;
1539                 }
1540         }
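        /*
         * We ran out of checks without electing anyone outright; fall back
         * to the closest (smallest cache distance) candidate recorded above.
         */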
1541         lp = min_level_lwp;
1542         q = min_q;
1543         which = min_which;
1544         pri = min_pri;
1545         KASSERT(lp, ("chooseproc: at least the first lp was good"));
1546
1547 found:
1548
1549         /*
1550          * If the passed lwp <chklp> is reasonably close to the selected
1551          * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1552          *
1553          * Note that we must err on the side of <chklp> to avoid bouncing
1554          * between threads in the acquire code.
1555          */
1556         if (chklp) {
1557                 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1558                         return(NULL);
1559         }
1560
1561         KTR_COND_LOG(usched_chooseproc_cc,
1562             lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1563             lp->lwp_proc->p_pid,
1564             lp->lwp_thread->td_gd->gd_cpuid,
1565             mycpu->gd_cpuid);
1566
1567         TAILQ_REMOVE(q, lp, lwp_procq);
1568         --bsd4_runqcount;
1569         if (TAILQ_EMPTY(q))
1570                 *which &= ~(1 << pri);
1571         KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1572         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1573         return lp;
1574 }
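
#if 0
/*
 * Illustrative sketch only (kept under #if 0, not compiled): the
 * cache-distance walk used in chooseproc_locked_cache_coherent() above.
 * Starting from a candidate's home topology node, count how many parent
 * levels must be climbed before the node's member mask covers the
 * current cpu.  Smaller is "closer".
 */
static int
example_topology_distance(cpu_node_t *node, cpumask_t here)
{
        int level = 0;

        while (node && (node->members & here) == 0) {
                node = node->parent_node;
                ++level;
        }
        return (node ? level : -1);     /* -1: cpu not found in the tree */
}
#endif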
1575
1576
1577 static
1578 void
1579 need_user_resched_remote(void *dummy)
1580 {
1581         globaldata_t gd = mycpu;
1582         bsd4_pcpu_t  dd = &bsd4_pcpu[gd->gd_cpuid];
1583
1584         need_user_resched();
1585         wakeup(&dd->helper_thread);
1586 }
1587
1588 #endif
1589
1590 /*
1591  * bsd4_remrunqueue_locked() removes a given process from the run queue
1592  * that it is on, clearing the queue busy bit if it becomes empty.
1593  *
1594  * Note that the user process scheduler is different from the LWKT scheduler.
1595  * The user process scheduler only manages user processes but it uses LWKT
1596  * underneath, and a user process operating in the kernel will often be
1597  * 'released' from our management.
1598  *
1599  * MPSAFE - bsd4_spin must be held exclusively on call
1600  */
1601 static void
1602 bsd4_remrunqueue_locked(struct lwp *lp)
1603 {
1604         struct rq *q;
1605         u_int32_t *which;
1606         u_int8_t pri;
1607
1608         KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
1609         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1610         --bsd4_runqcount;
1611         KKASSERT(bsd4_runqcount >= 0);
1612
1613         pri = lp->lwp_rqindex;
1614         switch(lp->lwp_rqtype) {
1615         case RTP_PRIO_NORMAL:
1616                 q = &bsd4_queues[pri];
1617                 which = &bsd4_queuebits;
1618                 break;
1619         case RTP_PRIO_REALTIME:
1620         case RTP_PRIO_FIFO:
1621                 q = &bsd4_rtqueues[pri];
1622                 which = &bsd4_rtqueuebits;
1623                 break;
1624         case RTP_PRIO_IDLE:
1625                 q = &bsd4_idqueues[pri];
1626                 which = &bsd4_idqueuebits;
1627                 break;
1628         default:
1629                 panic("remrunqueue: invalid rtprio type");
1630                 /* NOT REACHED */
1631         }
1632         TAILQ_REMOVE(q, lp, lwp_procq);
1633         if (TAILQ_EMPTY(q)) {
1634                 KASSERT((*which & (1 << pri)) != 0,
1635                         ("remrunqueue: remove from empty queue"));
1636                 *which &= ~(1 << pri);
1637         }
1638 }
1639
1640 /*
1641  * bsd4_setrunqueue_locked()
1642  *
1643  * Add a process whose rqtype and rqindex have previously been calculated
1644  * onto the appropriate run queue.  The caller is responsible for deciding
1645  * whether the addition requires a reschedule and, if so, on which cpu.
1646  *
1647  * NOTE: Lower priorities are better priorities.
1648  *
1649  * MPSAFE - bsd4_spin must be held exclusively on call
1650  */
1651 static void
1652 bsd4_setrunqueue_locked(struct lwp *lp)
1653 {
1654         struct rq *q;
1655         u_int32_t *which;
1656         int pri;
1657
1658         KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
1659         atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1660         ++bsd4_runqcount;
1661
1662         pri = lp->lwp_rqindex;
1663
1664         switch(lp->lwp_rqtype) {
1665         case RTP_PRIO_NORMAL:
1666                 q = &bsd4_queues[pri];
1667                 which = &bsd4_queuebits;
1668                 break;
1669         case RTP_PRIO_REALTIME:
1670         case RTP_PRIO_FIFO:
1671                 q = &bsd4_rtqueues[pri];
1672                 which = &bsd4_rtqueuebits;
1673                 break;
1674         case RTP_PRIO_IDLE:
1675                 q = &bsd4_idqueues[pri];
1676                 which = &bsd4_idqueuebits;
1677                 break;
1678         default:
1679                 panic("setrunqueue: invalid rtprio type");
1680                 /* NOT REACHED */
1681         }
1682
1683         /*
1684          * Add to the correct queue and set the appropriate bit.  If no
1685          * lower priority (i.e. better) processes are in the queue then
1686          * we want a reschedule; calculate the best cpu for the job.
1687          *
1688          * Always run reschedules on the LWP's original cpu.
1689          */
1690         TAILQ_INSERT_TAIL(q, lp, lwp_procq);
1691         *which |= 1 << pri;
1692 }
1693
1694 #ifdef SMP
1695
1696 /*
1697  * For SMP systems a user scheduler helper thread is created for each
1698  * cpu and is used to allow one cpu to wakeup another for the purposes of
1699  * scheduling userland threads from setrunqueue().
1700  *
1701  * UP systems do not need the helper since there is only one cpu.
1702  *
1703  * We can't use the idle thread for this because we might block.
1704  * Additionally, doing things this way allows us to HLT idle cpus
1705  * on MP systems.
1706  *
1707  * MPSAFE
1708  */
1709 static void
1710 sched_thread(void *dummy)
1711 {
1712     globaldata_t gd;
1713     bsd4_pcpu_t  dd;
1714     bsd4_pcpu_t  tmpdd;
1715     struct lwp *nlp;
1716     cpumask_t mask;
1717     int cpuid;
1718 #ifdef SMP
1719     cpumask_t tmpmask;
1720     int tmpid;
1721 #endif
1722
1723     gd = mycpu;
1724     cpuid = gd->gd_cpuid;       /* doesn't change */
1725     mask = gd->gd_cpumask;      /* doesn't change */
1726     dd = &bsd4_pcpu[cpuid];
1727
1728     /*
1729      * Since we are woken up only when no user processes are scheduled
1730      * on a cpu, we can run at an ultra low priority.
1731      */
1732     lwkt_setpri_self(TDPRI_USER_SCHEDULER);
1733
1734     tsleep(&dd->helper_thread, PINTERLOCKED, "sched_thread_sleep", 0);
1735
1736     for (;;) {
1737 //again:
1738         /*
1739          * We use the tsleep interlock trick to avoid racing
1740          * bsd4_rdyprocmask.  This means we cannot block between the
1741          * tsleep_interlock() below and the tsleep() at the bottom of the loop.
1742          */
1743         crit_enter_gd(gd);
1744         //lwkt_deschedule_self(gd->gd_curthread);
1745         tsleep_interlock(&dd->helper_thread, 0);
1746         spin_lock(&bsd4_spin);
1747         atomic_set_cpumask(&bsd4_rdyprocmask, mask);
1748
1749         clear_user_resched();   /* This satisfies the reschedule request */
1750         dd->rrcount = 0;        /* Reset the round-robin counter */
1751
1752         if ((bsd4_curprocmask & mask) == 0) {
1753                 /*
1754                  * No thread is currently scheduled.
1755                  */
1756                 KKASSERT(dd->uschedcp == NULL);
1757                 if ((nlp = chooseproc_locked(NULL)) != NULL) {
1758
1759                         KTR_COND_LOG(usched_sched_thread_no_process,
1760                             nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1761                             gd->gd_cpuid,
1762                             nlp->lwp_proc->p_pid,
1763                             nlp->lwp_thread->td_gd->gd_cpuid);
1764
1765                         atomic_set_cpumask(&bsd4_curprocmask, mask);
1766                         dd->upri = nlp->lwp_priority;
1767                         dd->uschedcp = nlp;
1768                         spin_unlock(&bsd4_spin);
1769 #ifdef SMP
1770                         lwkt_acquire(nlp->lwp_thread);
1771 #endif
1772                         lwkt_schedule(nlp->lwp_thread);
1773                 } else {
1774                         spin_unlock(&bsd4_spin);
1775                 }
1776         } else if (bsd4_runqcount) {
1777                 if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
1778
1779                         KTR_COND_LOG(usched_sched_thread_process,
1780                             nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1781                             gd->gd_cpuid,
1782                             nlp->lwp_proc->p_pid,
1783                             nlp->lwp_thread->td_gd->gd_cpuid);
1784
1785                         dd->upri = nlp->lwp_priority;
1786                         dd->uschedcp = nlp;
1787                         spin_unlock(&bsd4_spin);
1788 #ifdef SMP
1789                         lwkt_acquire(nlp->lwp_thread);
1790 #endif
1791                         lwkt_schedule(nlp->lwp_thread);
1792                 } else {
1793                         /*
1794                          * CHAINING CONDITION TRAIN
1795                          *
1796                          * We could not deal with the scheduler wakeup
1797                          * request on this cpu, locate a ready scheduler
1798                          * with no current lp assignment and chain to it.
1799                          *
1800                          * This ensures that a wakeup race which fails due
1801                          * to the priority test does not leave other unscheduled
1802                          * cpus idle when the runqueue is not empty.
1803                          */
1804                         tmpmask = ~bsd4_curprocmask &
1805                             bsd4_rdyprocmask & smp_active_mask;
1806                         if (tmpmask) {
1807                                 tmpid = BSFCPUMASK(tmpmask);
1808                                 tmpdd = &bsd4_pcpu[tmpid];
1809                                 atomic_clear_cpumask(&bsd4_rdyprocmask,
1810                                     CPUMASK(tmpid));
1811                                 spin_unlock(&bsd4_spin);
1812                                 wakeup(&tmpdd->helper_thread);
1813                         } else {
1814                                 spin_unlock(&bsd4_spin);
1815                         }
1816
1817                         KTR_LOG(usched_sched_thread_no_process_found,
1818                             gd->gd_cpuid,
1819                             tmpmask);
1820                 }
1821         } else {
1822                 /*
1823                  * The runq is empty.
1824                  */
1825                 spin_unlock(&bsd4_spin);
1826         }
1827
1828         /*
1829          * We're descheduled unless someone scheduled us.  Switch away.
1830          * Exiting the critical section will cause splz() to be called
1831          * for us if interrupts and such are pending.
1832          */
1833         crit_exit_gd(gd);
1834         tsleep(&dd->helper_thread, PINTERLOCKED, "sched_thread_sleep", 0);
1835 //      lwkt_switch();
1836     }
1837 }
1838
1839 /* sysctl stick_to_level parameter */
1840 static int
1841 sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS)
1842 {
1843         int error, new_val;
1844
1845         new_val = usched_bsd4_stick_to_level;
1846
1847         error = sysctl_handle_int(oidp, &new_val, 0, req);
1848         if (error != 0 || req->newptr == NULL)
1849                 return (error);
1850         if (new_val > cpu_topology_levels_number - 1 ||
1851             new_val < 0)
1852                 return (EINVAL);
1853         usched_bsd4_stick_to_level = new_val;
1854         return (0);
1855 }
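
/*
 * Example (assuming the sysctl tree built in sched_thread_cpu_init()
 * below): the level can be inspected and tuned at runtime with
 *
 *      sysctl kern.usched_bsd4.stick_to_level
 *      sysctl kern.usched_bsd4.stick_to_level=1
 *
 * Values outside [0, cpu_topology_levels_number - 1] are rejected with
 * EINVAL by the handler above.
 */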
1856
1857 /*
1858  * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
1859  * been cleared by rqinit() and we should not mess with it further.
1860  */
1861 static void
1862 sched_thread_cpu_init(void)
1863 {
1864         int i;
1865         int cpuid;
1866         int smt_not_supported = 0;
1867         int cache_coherent_not_supported = 0;
1868         if (bootverbose)
1869                 kprintf("Start scheduler helpers on cpus:\n");
1870
1871         sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
1872         usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
1873             SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
1874             "usched_bsd4", CTLFLAG_RD, 0, "");
1875
1876         for (i = 0; i < ncpus; ++i) {
1877                 bsd4_pcpu_t dd = &bsd4_pcpu[i];
1878                 cpumask_t mask = CPUMASK(i);
1879
1880                 if ((mask & smp_active_mask) == 0)
1881                     continue;
1882
1883                 dd->cpunode = get_cpu_node_by_cpuid(i);
1884
1885                 if (dd->cpunode == NULL) {
1886                         smt_not_supported = 1;
1887                         cache_coherent_not_supported = 1;
1888                         if (bootverbose)
1889                                 kprintf ("\tcpu%d - WARNING: No CPU NODE found for cpu\n", i);
1890
1891                 } else {
1892
1893                         switch (dd->cpunode->type) {
1894                                 case THREAD_LEVEL:
1895                                         if (bootverbose)
1896                                                 kprintf ("\tcpu%d - HyperThreading available. "
1897                                                     "Core siblings: ", i);
1898                                         break;
1899                                 case CORE_LEVEL:
1900                                         smt_not_supported = 1;
1901
1902                                         if (bootverbose)
1903                                                 kprintf ("\tcpu%d - No HT available, multi-core/physical "
1904                                                     "cpu. Physical siblings: ", i);
1905                                         break;
1906                                 case CHIP_LEVEL:
1907                                         smt_not_supported = 1;
1908
1909                                         if (bootverbose)
1910                                                 kprintf ("\tcpu%d - No HT available, single-core/physical cpu. "
1911                                                     "Package Siblings: ", i);
1912                                         break;
1913                                 default:
1914                                         if (bootverbose)
1915                                                 kprintf ("\tcpu%d - Unknown cpunode->type. Siblings: ", i);
1916                                         break;
1917                         }
1918
1919                         if (bootverbose) {
1920                                 if (dd->cpunode->parent_node != NULL) {
1921                                         CPUSET_FOREACH(cpuid, dd->cpunode->parent_node->members)
1922                                                 kprintf("cpu%d ", cpuid);
1923                                         kprintf("\n");
1924                                 } else {
1925                                         kprintf(" no siblings\n");
1926                                 }
1927                         }
1928                 }
1929
1930                 lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
1931                     0, i, "usched %d", i);
1932
1933                 /*
1934                  * Allow user scheduling on the target cpu.  cpu #0 has already
1935                  * been enabled in rqinit().
1936                  */
1937                 if (i)
1938                     atomic_clear_cpumask(&bsd4_curprocmask, mask);
1939                 atomic_set_cpumask(&bsd4_rdyprocmask, mask);
1940                 dd->upri = PRIBASE_NULL;
1941
1942         }
1943
1944         /* usched_bsd4 sysctl configurable parameters */
1945
1946         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1947             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1948             OID_AUTO, "rrinterval", CTLFLAG_RW,
1949             &usched_bsd4_rrinterval, 0, "");
1950         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1951             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1952             OID_AUTO, "decay", CTLFLAG_RW,
1953             &usched_bsd4_decay, 0, "Extra decay when not running");
1954         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1955             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1956             OID_AUTO, "batch_time", CTLFLAG_RW,
1957             &usched_bsd4_batch_time, 0, "Minimum batch counter value");
1958
1959         /* Add enable/disable option for SMT scheduling if supported */
1960         if (smt_not_supported) {
1961                 usched_bsd4_smt = 0;
1962                 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
1963                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1964                     OID_AUTO, "smt", CTLFLAG_RD,
1965                     "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
1966         } else {
1967                 usched_bsd4_smt = 1;
1968                 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1969                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1970                     OID_AUTO, "smt", CTLFLAG_RW,
1971                     &usched_bsd4_smt, 0, "Enable/Disable SMT scheduling");
1972
1973         }
1974
1975         /* Add enable/disable option for cache coherent scheduling if supported */
1976         if (cache_coherent_not_supported) {
1977 #ifdef SMP
1978                 usched_bsd4_cache_coherent = 0;
1979                 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
1980                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1981                     OID_AUTO, "cache_coherent", CTLFLAG_RD,
1982                     "NOT SUPPORTED", 0, "Cache coherence NOT SUPPORTED");
1983 #endif
1984         } else {
1985 #ifdef SMP
1986                 usched_bsd4_cache_coherent = 1;
1987                 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1988                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1989                     OID_AUTO, "cache_coherent", CTLFLAG_RW,
1990                     &usched_bsd4_cache_coherent, 0,
1991                     "Enable/Disable cache coherent scheduling");
1992 #endif
1993
1994                 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1995                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1996                     OID_AUTO, "upri_affinity", CTLFLAG_RW,
1997                     &usched_bsd4_upri_affinity, 1,
1998                     "Number of PPQs in user priority check");
1999
2000                 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2001                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2002                     OID_AUTO, "queue_checks", CTLFLAG_RW,
2003                     &usched_bsd4_queue_checks, 5,
2004                     "Number of LWPs to check from a queue before giving up");
2005
2006                 SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx,
2007                     SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2008                     OID_AUTO, "stick_to_level", CTLTYPE_INT | CTLFLAG_RW,
2009                     NULL, sizeof usched_bsd4_stick_to_level,
2010                     sysctl_usched_bsd4_stick_to_level, "I",
2011                     "Stick a process to this level. See sysctl "
2012                     "parameter hw.cpu_topology.level_description");
2013         }
2014 }
2015 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2016         sched_thread_cpu_init, NULL)
2017 #else /* No SMP options - just add the configurable parameters to sysctl */
2018
2019 static void
2020 sched_sysctl_tree_init(void)
2021 {
2022         sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
2023         usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
2024             SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
2025             "usched_bsd4", CTLFLAG_RD, 0, "");
2026
2027         /* usched_bsd4 sysctl configurable parameters */
2028         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2029             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2030             OID_AUTO, "rrinterval", CTLFLAG_RW,
2031             &usched_bsd4_rrinterval, 0, "");
2032         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2033             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2034             OID_AUTO, "decay", CTLFLAG_RW,
2035             &usched_bsd4_decay, 0, "Extra decay when not running");
2036         SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2037             SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2038             OID_AUTO, "batch_time", CTLFLAG_RW,
2039             &usched_bsd4_batch_time, 0, "Minimum batch counter value");
2040 }
2041 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2042         sched_sysctl_tree_init, NULL)
2043 #endif
2044