sys/kern/kern_switch.c

   1 /*
   2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  *
  26  * $FreeBSD: src/sys/kern/kern_switch.c,v 1.3.2.1 2000/05/16 06:58:12 dillon Exp $
  27  * $DragonFly: src/sys/kern/Attic/kern_switch.c,v 1.17 2004/02/12 06:57:48 dillon Exp $
  28  */
  29
  30 #include <sys/param.h>
  31 #include <sys/systm.h>
  32 #include <sys/kernel.h>
  33 #include <sys/lock.h>
  34 #include <sys/queue.h>
  35 #include <sys/proc.h>
  36 #include <sys/rtprio.h>
  37 #include <sys/thread2.h>
  38 #include <sys/uio.h>
  39 #include <sys/sysctl.h>
  40 #include <machine/ipl.h>
  41 #include <machine/cpu.h>
  42
  43 /*
  44  * debugging only YYY Remove me!   define to schedule user processes only
  45  * on the BSP.  Interrupts can still be taken on the APs.
  46  */
  47 #undef ONLY_ONE_USER_CPU
  48
  49 /*
  50  * We have NQS (32) run queues per scheduling class.  For the normal
  51  * class, there are 128 priorities scaled onto these 32 queues.  New
  52  * processes are added to the last entry in each queue, and processes
  53  * are selected for running by taking them from the head and maintaining
  54  * a simple FIFO arrangement.  Realtime and Idle priority processes have
  55  * and explicit 0-31 priority which maps directly onto their class queue
  56  * index.  When a queue has something in it, the corresponding bit is
  57  * set in the queuebits variable, allowing a single read to determine
  58  * the state of all 32 queues and then a ffs() to find the first busy
  59  * queue.
  60  */
  61 static struct rq queues[NQS];
  62 static struct rq rtqueues[NQS];
  63 static struct rq idqueues[NQS];
  64 static u_int32_t queuebits;
  65 static u_int32_t rtqueuebits;
  66 static u_int32_t idqueuebits;
  67 static u_int32_t curprocmask = -1;      /* currently running a user process */
  68 static u_int32_t rdyprocmask;           /* ready to accept a user process */
  69 static int       runqcount;
  70 #ifdef SMP
  71 static int       scancpu;
  72 #endif
  73
  74 SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
  75 static int usched_steal;
  76 SYSCTL_INT(_debug, OID_AUTO, usched_steal, CTLFLAG_RW,
  77         &usched_steal, 0, "Passive Release was nonoptimal");
  78 static int usched_optimal;
  79 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
  80         &usched_optimal, 0, "Passive Release was nonoptimal");
  81 #ifdef SMP
  82 static int remote_resched = 1;
  83 static int remote_resched_nonaffinity;
  84 static int remote_resched_affinity;
  85 static int choose_affinity;
  86 SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
  87         &remote_resched, 0, "Resched to another cpu");
  88 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
  89         &remote_resched_nonaffinity, 0, "Number of remote rescheds");
  90 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
  91         &remote_resched_affinity, 0, "Number of remote rescheds");
  92 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
  93         &choose_affinity, 0, "chooseproc() was smart");
  94 #endif
  95
  96 #define USCHED_COUNTER(td)      ((td->td_gd == mycpu) ? ++usched_optimal : ++usched_steal)
  97
  98 /*
  99  * Initialize the run queues at boot time.
 100  */
 101 static void
 102 rqinit(void *dummy)
 103 {
 104         int i;
 105
 106         for (i = 0; i < NQS; i++) {
 107                 TAILQ_INIT(&queues[i]);
 108                 TAILQ_INIT(&rtqueues[i]);
 109                 TAILQ_INIT(&idqueues[i]);
 110         }
 111 #ifdef SMP
 112         sched_thread_init();
 113 #else
 114         curprocmask &= ~1;
 115 #endif
 116 }
 117 SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
 118
 119 static __inline
 120 int
 121 test_resched(struct proc *curp, struct proc *newp)
 122 {
 123         if (newp->p_priority / PPQ <= curp->p_priority / PPQ)
 124                 return(1);
 125         return(0);
 126 }
 127
 128 /*
 129  * chooseproc() is called when a cpu needs a user process to LWKT schedule.
 130  * chooseproc() will select a user process and return it.
 131  */
 132 static
 133 struct proc *
 134 chooseproc(struct proc *chkp)
 135 {
 136         struct proc *p;
 137         struct rq *q;
 138         u_int32_t *which;
 139         u_int32_t pri;
 140
 141         clear_resched();
 142         if (rtqueuebits) {
 143                 pri = bsfl(rtqueuebits);
 144                 q = &rtqueues[pri];
 145                 which = &rtqueuebits;
 146         } else if (queuebits) {
 147                 pri = bsfl(queuebits);
 148                 q = &queues[pri];
 149                 which = &queuebits;
 150         } else if (idqueuebits) {
 151                 pri = bsfl(idqueuebits);
 152                 q = &idqueues[pri];
 153                 which = &idqueuebits;
 154         } else {
 155                 return NULL;
 156         }
 157         p = TAILQ_FIRST(q);
 158         KASSERT(p, ("chooseproc: no proc on busy queue"));
 159
 160         /*
 161          * If the chosen process is not at a higher priority then chkp
 162          * then return NULL without dequeueing a new process.
 163          */
 164         if (chkp && !test_resched(chkp, p))
 165                 return(NULL);
 166
 167 #ifdef SMP
 168         /*
 169          * If the chosen process does not reside on this cpu spend a few
 170          * cycles looking for a better candidate at the same priority level.
 171          * This is a fallback check, setrunqueue() tries to wakeup the
 172          * correct cpu and is our front-line affinity.
 173          */
 174         if (p->p_thread->td_gd != mycpu &&
 175             (chkp = TAILQ_NEXT(p, p_procq)) != NULL
 176         ) {
 177                 if (chkp->p_thread->td_gd == mycpu) {
 178                         ++choose_affinity;
 179                         p = chkp;
 180                 }
 181         }
 182 #endif
 183
 184         TAILQ_REMOVE(q, p, p_procq);
 185         --runqcount;
 186         if (TAILQ_EMPTY(q))
 187                 *which &= ~(1 << pri);
 188         KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
 189         p->p_flag &= ~P_ONRUNQ;
 190         return p;
 191 }
 192
 193 #ifdef SMP
 194 /*
 195  * called via an ipi message to reschedule on another cpu.
 196  */
 197 static
 198 void
 199 need_resched_remote(void *dummy)
 200 {
 201         need_resched();
 202 }
 203
 204 #endif
 205
 206 /*
 207  * setrunqueue() 'wakes up' a 'user' process, which can mean several things.
 208  *
 209  * If P_CP_RELEASED is set the user process is under the control of the
 210  * LWKT subsystem and we simply wake the thread up.  This is ALWAYS the
 211  * case when setrunqueue() is called from wakeup() and, in fact wakeup()
 212  * asserts that P_CP_RELEASED is set.
 213  *
 214  * Note that acquire_curproc() already optimizes making the current process
 215  * P_CURPROC, so setrunqueue() does not need to.
 216  *
 217  * If P_CP_RELEASED is not set we place the process on the run queue and we
 218  * signal other cpus in the system that may need to be woken up to service
 219  * the new 'user' process.
 220  *
 221  * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 222  * cpus in an attempt to keep the process on the current cpu at least for
 223  * a little while to take advantage of locality of reference (e.g. fork/exec
 224  * or short fork/exit).
 225  *
 226  * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 227  * or (user level) preempt on the same cpu that a process was previously
 228  * scheduled to.  If we cannot do this but we are at enough of a higher
 229  * priority then the processes running on other cpus, we will allow the
 230  * process to be stolen by another cpu.
 231  *
 232  * WARNING! a thread can be acquired by another cpu the moment it is put
 233  * on the user scheduler's run queue AND we release the MP lock.  Since we
 234  * release the MP lock before switching out another cpu may begin stealing
 235  * our current thread before we are completely switched out!  The
 236  * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 237  * thread before stealing it.
 238  *
 239  * The associated thread must NOT be scheduled.
 240  * The process must be runnable.
 241  * This must be called at splhigh().
 242  */
 243 void
 244 setrunqueue(struct proc *p)
 245 {
 246         struct rq *q;
 247         struct globaldata *gd;
 248         int pri;
 249         int cpuid;
 250 #ifdef SMP
 251         int count;
 252         u_int32_t mask;
 253 #endif
 254
 255         crit_enter();
 256         KASSERT(p->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
 257         KASSERT((p->p_flag & (P_ONRUNQ|P_CURPROC)) == 0,
 258             ("process %d already on runq! flag %08x", p->p_pid, p->p_flag));
 259         KKASSERT((p->p_thread->td_flags & TDF_RUNQ) == 0);
 260
 261         /*
 262          * If we have been released from the userland scheduler we
 263          * directly schedule its thread.
 264          */
 265         if (p->p_flag & P_CP_RELEASED) {
 266                 lwkt_schedule(p->p_thread);
 267                 crit_exit();
 268                 return;
 269         }
 270
 271         /*
 272          * Check cpu affinity.  The associated thread is stable at the
 273          * moment.  Note that we may be checking another cpu here so we
 274          * have to be careful.  Note that gd_upri only counts when the
 275          * curprocmask bit is set for the cpu in question, and since it is
 276          * only a hint we can modify it on another cpu's globaldata structure.
 277          * We use it to prevent unnecessary IPIs (hence the - PPQ).
 278          */
 279         gd = p->p_thread->td_gd;
 280         cpuid = gd->gd_cpuid;
 281
 282         if ((curprocmask & (1 << cpuid)) == 0) {
 283                 curprocmask |= 1 << cpuid;
 284                 p->p_flag |= P_CURPROC;
 285                 gd->gd_upri = p->p_priority;
 286                 USCHED_COUNTER(p->p_thread);
 287                 lwkt_schedule(p->p_thread);
 288                 /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
 289                 crit_exit();
 290 #ifdef SMP
 291                 if (gd != mycpu)
 292                         ++remote_resched_affinity;
 293 #endif
 294                 return;
 295         }
 296
 297         /*
 298          * gd and cpuid may still 'hint' at another cpu.  Even so we have
 299          * to place this process on the userland scheduler's run queue for
 300          * action by the target cpu.
 301          */
 302         ++runqcount;
 303         p->p_flag |= P_ONRUNQ;
 304         if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
 305                 pri = (p->p_priority & PRIMASK) >> 2;
 306                 q = &queues[pri];
 307                 queuebits |= 1 << pri;
 308         } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
 309                    p->p_rtprio.type == RTP_PRIO_FIFO) {
 310                 pri = (u_int8_t)p->p_rtprio.prio;
 311                 q = &rtqueues[pri];
 312                 rtqueuebits |= 1 << pri;
 313         } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
 314                 pri = (u_int8_t)p->p_rtprio.prio;
 315                 q = &idqueues[pri];
 316                 idqueuebits |= 1 << pri;
 317         } else {
 318                 panic("setrunqueue: invalid rtprio type");
 319         }
 320         KKASSERT(pri < 32);
 321         p->p_rqindex = pri;             /* remember the queue index */
 322         TAILQ_INSERT_TAIL(q, p, p_procq);
 323
 324 #ifdef SMP
 325         /*
 326          * Either wakeup other cpus user thread scheduler or request
 327          * preemption on other cpus (which will also wakeup a HLT).
 328          *
 329          * NOTE!  gd and cpuid may still be our 'hint', not our current
 330          * cpu info.
 331          */
 332
 333         count = runqcount;
 334
 335         /*
 336          * Check cpu affinity for user preemption (when the curprocmask bit
 337          * is set)
 338          */
 339         if (gd == mycpu) {
 340                 if (p->p_priority / PPQ < gd->gd_upri / PPQ) {
 341                         need_resched();
 342                         --count;
 343                 }
 344         } else if (remote_resched) {
 345                 if (p->p_priority / PPQ < gd->gd_upri / PPQ) {
 346                         gd->gd_upri = p->p_priority;
 347                         lwkt_send_ipiq(gd, need_resched_remote, NULL);
 348                         --count;
 349                         ++remote_resched_affinity;
 350                 }
 351         }
 352
 353         /*
 354          * No affinity, first schedule to any cpus that do not have a current
 355          * process.  If there is a free cpu we always schedule to it.
 356          */
 357         if (count &&
 358             (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
 359             (p->p_flag & P_PASSIVE_ACQ) == 0) {
 360                 if (!mask)
 361                         printf("PROC %d nocpu to schedule it on\n", p->p_pid);
 362                 while (mask && count) {
 363                         cpuid = bsfl(mask);
 364                         KKASSERT((curprocmask & (1 << cpuid)) == 0);
 365                         rdyprocmask &= ~(1 << cpuid);
 366                         lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
 367                         --count;
 368                         mask &= ~(1 << cpuid);
 369                 }
 370         }
 371
 372         /*
 373          * If there are still runnable processes try to wakeup a random
 374          * cpu that is running a much lower priority process in order to
 375          * preempt on it.  Note that gd_upri is only a hint, so we can
 376          * overwrite it from the wrong cpu.   If we can't find one, we
 377          * are SOL.
 378          *
 379          * We depress the priority check so multiple cpu bound programs
 380          * do not bounce between cpus.  Remember that the clock interrupt
 381          * will also cause all cpus to reschedule.
 382          */
 383         if (count && remote_resched && ncpus > 1) {
 384                 cpuid = scancpu;
 385                 do {
 386                         if (++cpuid == ncpus)
 387                                 cpuid = 0;
 388                 } while (cpuid == mycpu->gd_cpuid);
 389                 scancpu = cpuid;
 390
 391                 gd = globaldata_find(cpuid);
 392
 393                 if (p->p_priority / PPQ < gd->gd_upri / PPQ - 2) {
 394                         gd->gd_upri = p->p_priority;
 395                         lwkt_send_ipiq(gd, need_resched_remote, NULL);
 396                         ++remote_resched_nonaffinity;
 397                 }
 398         }
 399 #else
 400         if (p->p_priority / PPQ < gd->gd_upri / PPQ) {
 401                 need_resched();
 402         }
 403 #endif
 404         crit_exit();
 405 }
 406
 407 /*
 408  * remrunqueue() removes a given process from the run queue that it is on,
 409  * clearing the queue busy bit if it becomes empty.  This function is called
 410  * when a userland process is selected for LWKT scheduling.  Note that
 411  * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 412  * several userland processes whos threads are scheduled or otherwise in
 413  * a special state, and such processes are NOT on the userland scheduler's
 414  * run queue.
 415  *
 416  * This must be called at splhigh().
 417  */
 418 void
 419 remrunqueue(struct proc *p)
 420 {
 421         struct rq *q;
 422         u_int32_t *which;
 423         u_int8_t pri;
 424
 425         crit_enter();
 426         KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
 427         p->p_flag &= ~P_ONRUNQ;
 428         --runqcount;
 429         KKASSERT(runqcount >= 0);
 430         pri = p->p_rqindex;
 431         if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
 432                 q = &queues[pri];
 433                 which = &queuebits;
 434         } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
 435                    p->p_rtprio.type == RTP_PRIO_FIFO) {
 436                 q = &rtqueues[pri];
 437                 which = &rtqueuebits;
 438         } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
 439                 q = &idqueues[pri];
 440                 which = &idqueuebits;
 441         } else {
 442                 panic("remrunqueue: invalid rtprio type");
 443         }
 444         TAILQ_REMOVE(q, p, p_procq);
 445         if (TAILQ_EMPTY(q)) {
 446                 KASSERT((*which & (1 << pri)) != 0,
 447                         ("remrunqueue: remove from empty queue"));
 448                 *which &= ~(1 << pri);
 449         }
 450         crit_exit();
 451 }
 452
 453 /*
 454  * Release the P_CURPROC designation on the current process for this cpu
 455  * and attempt to assign a new current process from the run queue.
 456  *
 457  * If we do not have or cannot get the MP lock we just wakeup the userland
 458  * helper scheduler thread for this cpu.
 459  *
 460  * WARNING!  The MP lock may be in an unsynchronized state due to the
 461  * way get_mplock() works and the fact that this function may be called
 462  * from a passive release during a lwkt_switch().   try_mplock() will deal
 463  * with this for us but you should be aware that td_mpcount may not be
 464  * useable.
 465  */
 466 void
 467 release_curproc(struct proc *p)
 468 {
 469         int cpuid;
 470         struct proc *np;
 471
 472 #ifdef ONLY_ONE_USER_CPU
 473         KKASSERT(mycpu->gd_cpuid == 0 && p->p_thread->td_gd == mycpu);
 474 #endif
 475         crit_enter();
 476         clear_resched();
 477         cpuid = p->p_thread->td_gd->gd_cpuid;
 478         if ((p->p_flag & P_CP_RELEASED) == 0) {
 479                 p->p_flag |= P_CP_RELEASED;
 480                 lwkt_setpri_self(TDPRI_KERN_USER);
 481         }
 482         if (p->p_flag & P_CURPROC) {
 483                 p->p_flag &= ~P_CURPROC;
 484                 curprocmask &= ~(1 << cpuid);
 485                 if (try_mplock()) {
 486                         /*
 487                          * Choose the next process to assign P_CURPROC to.
 488                          * Note that we cannot schedule gd_schedthread
 489                          * if runqcount is 0 without creating a scheduling
 490                          * loop.
 491                          */
 492                         if ((np = chooseproc(NULL)) != NULL) {
 493                                 curprocmask |= 1 << cpuid;
 494                                 np->p_flag |= P_CURPROC;
 495                                 mycpu->gd_upri = np->p_priority;
 496                                 USCHED_COUNTER(np->p_thread);
 497                                 lwkt_acquire(np->p_thread);
 498                                 lwkt_schedule(np->p_thread);
 499                         } else if (runqcount && (rdyprocmask & (1 << cpuid))) {
 500                                 rdyprocmask &= ~(1 << cpuid);
 501                                 lwkt_schedule(&mycpu->gd_schedthread);
 502                         }
 503                         rel_mplock();
 504                 } else {
 505                         KKASSERT(0);    /* MP LOCK ALWAYS HELD AT THE MOMENT */
 506                         if (runqcount && (rdyprocmask & (1 << cpuid))) {
 507                                 rdyprocmask &= ~(1 << cpuid);
 508                                 lwkt_schedule(&mycpu->gd_schedthread);
 509                         }
 510                 }
 511         }
 512         crit_exit();
 513 }
 514
 515 /*
 516  * Acquire the P_CURPROC designation on the CURRENT process only.  This
 517  * function is called prior to returning to userland.  If the system
 518  * call or trap did not block and if no reschedule was requested it is
 519  * highly likely that the P_CURPROC flag is still set in the proc, and
 520  * we do almost nothing here.
 521  */
 522 void
 523 acquire_curproc(struct proc *p)
 524 {
 525         int cpuid;
 526         struct proc *np;
 527
 528         /*
 529          * Short cut, we've already acquired the designation or we never
 530          * lost it in the first place.  P_CP_RELEASED is cleared, meaning
 531          * that the process is again under the control of the userland
 532          * scheduler.  We do not have to fiddle with the LWKT priority,
 533          * the trap code (userret/userexit) will do that for us.
 534          */
 535         if ((p->p_flag & P_CURPROC) != 0) {
 536                 p->p_flag &= ~P_CP_RELEASED;
 537                 return;
 538         }
 539
 540         /*
 541          * Long cut.  This pulls in a bit of the userland scheduler as
 542          * an optimization.  If our cpu has not scheduled a userland
 543          * process we gladly fill the slot, otherwise we choose the best
 544          * candidate from the run queue and compare it against ourselves,
 545          * scheduling either us or him depending.
 546          *
 547          * If our cpu's slot isn't free we put ourselves on the userland
 548          * run queue and switch away.  We should have P_CURPROC when we
 549          * come back.  Note that a cpu change can occur when we come back.
 550          *
 551          * YYY don't need critical section, we hold giant and no interrupt
 552          * will mess w/ this proc?  Or will it?  What about curprocmask?
 553          */
 554 #ifdef ONLY_ONE_USER_CPU
 555         KKASSERT(mycpu->gd_cpuid == 0 && p->p_thread->td_gd == mycpu);
 556 #endif
 557         crit_enter();
 558
 559         while ((p->p_flag & P_CURPROC) == 0) {
 560                 /*
 561                  * reload the cpuid
 562                  */
 563                 cpuid = p->p_thread->td_gd->gd_cpuid;
 564
 565                 /*
 566                  * (broken out from setrunqueue() as an optimization that
 567                  * allows us to avoid descheduling and rescheduling ourself)
 568                  *
 569                  * Interlock against the helper scheduler thread by setting
 570                  * curprocmask while we choose a new process.  Check our
 571                  * process against the new process to shortcut setrunqueue()
 572                  * and remrunqueue() operations.
 573                  */
 574                 if ((curprocmask & (1 << cpuid)) == 0) {
 575                         curprocmask |= 1 << cpuid;
 576
 577                         if ((np = chooseproc(p)) != NULL) {
 578                                 KKASSERT((np->p_flag & P_CP_RELEASED) == 0);
 579                                 np->p_flag |= P_CURPROC;
 580                                 mycpu->gd_upri = np->p_priority;
 581                                 USCHED_COUNTER(np->p_thread);
 582                                 lwkt_acquire(np->p_thread);
 583                                 lwkt_schedule(np->p_thread);
 584                         } else {
 585                                 p->p_flag |= P_CURPROC;
 586                         }
 587                         break;
 588                 }
 589                 lwkt_deschedule_self();
 590                 p->p_flag &= ~P_CP_RELEASED;
 591                 setrunqueue(p);
 592                 lwkt_switch();  /* CPU CAN CHANGE DUE TO SETRUNQUEUE() */
 593                 KASSERT((p->p_flag & (P_ONRUNQ|P_CURPROC|P_CP_RELEASED)) == P_CURPROC, ("unexpected p_flag %08x acquiring P_CURPROC\n", p->p_flag));
 594         }
 595         crit_exit();
 596 }
 597
 598 /*
 599  * Yield / synchronous reschedule.  This is a bit tricky because the trap
 600  * code might have set a lazy release on the switch function.   Setting
 601  * P_PASSIVE_ACQ will ensure that the lazy release executes when we call
 602  * switch, and that we will not be rescheduled to another cpu when we attempt
 603  * to re-acquire P_CURPROC.
 604  *
 605  * We have to release P_CURPROC (by calling lwkt_switch(), and acquire it
 606  * again to yield to another user process.  Note that the release will
 607  * ensure that we are running at a kernel LWKT priority, and this priority
 608  * is not lowered through the reacquisition and rerelease sequence to ensure
 609  * that we do not deadlock against a higher priority *user* process.
 610  */
 611 void
 612 uio_yield(void)
 613 {
 614         struct thread *td = curthread;
 615         struct proc *p = td->td_proc;
 616
 617         if (p) {
 618                 p->p_flag |= P_PASSIVE_ACQ;
 619                 lwkt_switch();
 620                 acquire_curproc(p);
 621                 release_curproc(p);
 622                 p->p_flag &= ~P_PASSIVE_ACQ;
 623         } else {
 624                 lwkt_switch();
 625         }
 626 }
 627
 628
 629 /*
 630  * For SMP systems a user scheduler helper thread is created for each
 631  * cpu and is used to allow one cpu to wakeup another for the purposes of
 632  * scheduling userland threads from setrunqueue().  UP systems do not
 633  * need the helper since there is only one cpu.  We can't use the idle
 634  * thread for this because we need to hold the MP lock.  Additionally,
 635  * doing things this way allows us to HLT idle cpus on MP systems.
 636  */
 637
 638 #ifdef SMP
 639
 640 static void
 641 sched_thread(void *dummy)
 642 {
 643     int cpuid = mycpu->gd_cpuid;        /* doesn't change */
 644     u_int32_t cpumask = 1 << cpuid;     /* doesn't change */
 645
 646 #ifdef ONLY_ONE_USER_CPU
 647     KKASSERT(cpuid == 0);
 648 #endif
 649
 650     get_mplock();                       /* hold the MP lock */
 651     for (;;) {
 652         struct proc *np;
 653
 654         lwkt_deschedule_self();         /* interlock */
 655         rdyprocmask |= cpumask;
 656         crit_enter();
 657         if ((curprocmask & cpumask) == 0 && (np = chooseproc(NULL)) != NULL) {
 658             curprocmask |= cpumask;
 659             np->p_flag |= P_CURPROC;
 660             mycpu->gd_upri = np->p_priority;
 661             USCHED_COUNTER(np->p_thread);
 662             lwkt_acquire(np->p_thread);
 663             lwkt_schedule(np->p_thread);
 664         }
 665         crit_exit();
 666         lwkt_switch();
 667     }
 668 }
 669
 670 void
 671 sched_thread_init(void)
 672 {
 673     int cpuid = mycpu->gd_cpuid;
 674
 675     lwkt_create(sched_thread, NULL, NULL, &mycpu->gd_schedthread,
 676                 TDF_STOPREQ, -1,
 677                 "usched %d", cpuid);
 678     curprocmask &= ~(1 << cpuid);       /* schedule user proc on cpu */
 679 #ifdef ONLY_ONE_USER_CPU
 680     if (cpuid)
 681         curprocmask |= 1 << cpuid;      /* DISABLE USER PROCS */
 682 #endif
 683     rdyprocmask |= 1 << cpuid;
 684 }
 685
 686 #endif
 687