sys/kern/kern_switch.c

   1 /*
   2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  *
  26  * $FreeBSD: src/sys/kern/kern_switch.c,v 1.3.2.1 2000/05/16 06:58:12 dillon Exp $
  27  * $DragonFly: src/sys/kern/Attic/kern_switch.c,v 1.13 2003/10/21 04:14:55 dillon Exp $
  28  */
  29
  30 #include <sys/param.h>
  31 #include <sys/systm.h>
  32 #include <sys/kernel.h>
  33 #include <sys/lock.h>
  34 #include <sys/queue.h>
  35 #include <sys/proc.h>
  36 #include <sys/rtprio.h>
  37 #include <sys/thread2.h>
  38 #include <sys/uio.h>
  39 #include <sys/sysctl.h>
  40 #include <machine/ipl.h>
  41 #include <machine/cpu.h>
  42
  43 /*
  44  * debugging only YYY Remove me!   define to schedule user processes only
  45  * on the BSP.  Interrupts can still be taken on the APs.
  46  */
  47 #undef ONLY_ONE_USER_CPU
  48
  49 /*
  50  * We have NQS (32) run queues per scheduling class.  For the normal
  51  * class, there are 128 priorities scaled onto these 32 queues.  New
  52  * processes are added to the last entry in each queue, and processes
  53  * are selected for running by taking them from the head and maintaining
  54  * a simple FIFO arrangement.  Realtime and Idle priority processes have
  55  * and explicit 0-31 priority which maps directly onto their class queue
  56  * index.  When a queue has something in it, the corresponding bit is
  57  * set in the queuebits variable, allowing a single read to determine
  58  * the state of all 32 queues and then a ffs() to find the first busy
  59  * queue.
  60  */
  61 static struct rq queues[NQS];
  62 static struct rq rtqueues[NQS];
  63 static struct rq idqueues[NQS];
  64 static u_int32_t queuebits;
  65 static u_int32_t rtqueuebits;
  66 static u_int32_t idqueuebits;
  67 static u_int32_t curprocmask = -1;      /* currently running a user process */
  68 static u_int32_t rdyprocmask;           /* ready to accept a user process */
  69 static int       runqcount;
  70 #ifdef SMP
  71 static int       scancpu;
  72 #endif
  73
  74 SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
  75 static int usched_steal;
  76 SYSCTL_INT(_debug, OID_AUTO, usched_steal, CTLFLAG_RW,
  77         &usched_steal, 0, "Passive Release was nonoptimal");
  78 static int usched_optimal;
  79 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
  80         &usched_optimal, 0, "Passive Release was nonoptimal");
  81 #ifdef SMP
  82 static int remote_resched = 1;
  83 static int remote_resched_nonaffinity;
  84 static int remote_resched_affinity;
  85 static int choose_affinity;
  86 SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
  87         &remote_resched, 0, "Resched to another cpu");
  88 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
  89         &remote_resched_nonaffinity, 0, "Number of remote rescheds");
  90 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
  91         &remote_resched_affinity, 0, "Number of remote rescheds");
  92 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
  93         &choose_affinity, 0, "chooseproc() was smart");
  94 #endif
  95
  96 #define USCHED_COUNTER(td)      ((td->td_gd == mycpu) ? ++usched_optimal : ++usched_steal)
  97
  98 /*
  99  * Initialize the run queues at boot time.
 100  */
 101 static void
 102 rqinit(void *dummy)
 103 {
 104         int i;
 105
 106         for (i = 0; i < NQS; i++) {
 107                 TAILQ_INIT(&queues[i]);
 108                 TAILQ_INIT(&rtqueues[i]);
 109                 TAILQ_INIT(&idqueues[i]);
 110         }
 111 #ifdef SMP
 112         sched_thread_init();
 113 #else
 114         curprocmask &= ~1;
 115 #endif
 116 }
 117 SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
 118
 119 static __inline
 120 int
 121 test_resched(struct proc *curp, struct proc *newp)
 122 {
 123         if (newp->p_priority / PPQ <= curp->p_priority / PPQ)
 124                 return(1);
 125         return(0);
 126 }
 127
 128 /*
 129  * chooseproc() is called when a cpu needs a user process to LWKT schedule.
 130  * chooseproc() will select a user process and return it.
 131  */
 132 static
 133 struct proc *
 134 chooseproc(struct proc *chkp)
 135 {
 136         struct proc *p;
 137         struct rq *q;
 138         u_int32_t *which;
 139         u_int32_t pri;
 140
 141         if (rtqueuebits) {
 142                 pri = bsfl(rtqueuebits);
 143                 q = &rtqueues[pri];
 144                 which = &rtqueuebits;
 145         } else if (queuebits) {
 146                 pri = bsfl(queuebits);
 147                 q = &queues[pri];
 148                 which = &queuebits;
 149         } else if (idqueuebits) {
 150                 pri = bsfl(idqueuebits);
 151                 q = &idqueues[pri];
 152                 which = &idqueuebits;
 153         } else {
 154                 return NULL;
 155         }
 156         p = TAILQ_FIRST(q);
 157         KASSERT(p, ("chooseproc: no proc on busy queue"));
 158
 159         /*
 160          * If the chosen process is not at a higher priority then chkp
 161          * then return NULL without dequeueing a new process.
 162          */
 163         if (chkp && !test_resched(chkp, p))
 164                 return(NULL);
 165
 166 #ifdef SMP
 167         /*
 168          * If the chosen process does not reside on this cpu spend a few
 169          * cycles looking for a better candidate at the same priority level.
 170          * This is a fallback check, setrunqueue() tries to wakeup the
 171          * correct cpu and is our front-line affinity.
 172          */
 173         if (p->p_thread->td_gd != mycpu &&
 174             (chkp = TAILQ_NEXT(p, p_procq)) != NULL
 175         ) {
 176                 if (chkp->p_thread->td_gd == mycpu) {
 177                         ++choose_affinity;
 178                         p = chkp;
 179                 }
 180         }
 181 #endif
 182
 183         TAILQ_REMOVE(q, p, p_procq);
 184         --runqcount;
 185         if (TAILQ_EMPTY(q))
 186                 *which &= ~(1 << pri);
 187         KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
 188         p->p_flag &= ~P_ONRUNQ;
 189         return p;
 190 }
 191
 192 #ifdef SMP
 193 /*
 194  * called via an ipi message to reschedule on another cpu.
 195  */
 196 static
 197 void
 198 need_resched_remote(void *dummy)
 199 {
 200         need_resched();
 201 }
 202
 203 #endif
 204
 205 /*
 206  * setrunqueue() 'wakes up' a 'user' process, which can mean several things.
 207  *
 208  * If P_CP_RELEASED is set the user process is under the control of the
 209  * LWKT subsystem and we simply wake the thread up.  This is ALWAYS the
 210  * case when setrunqueue() is called from wakeup() and, in fact wakeup()
 211  * asserts that P_CP_RELEASED is set.
 212  *
 213  * Note that acquire_curproc() already optimizes making the current process
 214  * P_CURPROC, so setrunqueue() does not need to.
 215  *
 216  * If P_CP_RELEASED is not set we place the process on the run queue and we
 217  * signal other cpus in the system that may need to be woken up to service
 218  * the new 'user' process.
 219  *
 220  * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 221  * cpus in an attempt to keep the process on the current cpu at least for
 222  * a little while to take advantage of locality of reference (e.g. fork/exec
 223  * or short fork/exit).
 224  *
 225  * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 226  * or (user level) preempt on the same cpu that a process was previously
 227  * scheduled to.  If we cannot do this but we are at enough of a higher
 228  * priority then the processes running on other cpus, we will allow the
 229  * process to be stolen by another cpu.
 230  *
 231  * WARNING! a thread can be acquired by another cpu the moment it is put
 232  * on the user scheduler's run queue AND we release the MP lock.  Since we
 233  * release the MP lock before switching out another cpu may begin stealing
 234  * our current thread before we are completely switched out!  The
 235  * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 236  * thread before stealing it.
 237  *
 238  * The associated thread must NOT be scheduled.
 239  * The process must be runnable.
 240  * This must be called at splhigh().
 241  */
 242 void
 243 setrunqueue(struct proc *p)
 244 {
 245         struct rq *q;
 246         struct globaldata *gd;
 247         int pri;
 248         int cpuid;
 249 #ifdef SMP
 250         int count;
 251         u_int32_t mask;
 252 #endif
 253
 254         crit_enter();
 255         KASSERT(p->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
 256         KASSERT((p->p_flag & (P_ONRUNQ|P_CURPROC)) == 0,
 257             ("process %d already on runq! flag %08x", p->p_pid, p->p_flag));
 258         KKASSERT((p->p_thread->td_flags & TDF_RUNQ) == 0);
 259
 260         /*
 261          * If we have been released from the userland scheduler we
 262          * directly schedule its thread.
 263          */
 264         if (p->p_flag & P_CP_RELEASED) {
 265                 lwkt_schedule(p->p_thread);
 266                 crit_exit();
 267                 return;
 268         }
 269
 270         /*
 271          * Check cpu affinity.  The associated thread is stable at the
 272          * moment.  Note that we may be checking another cpu here so we
 273          * have to be careful.  Note that gd_upri only counts when the
 274          * curprocmask bit is set for the cpu in question, and since it is
 275          * only a hint we can modify it on another cpu's globaldata structure.
 276          * We use it to prevent unnecessary IPIs (hence the - PPQ).
 277          */
 278         gd = p->p_thread->td_gd;
 279         cpuid = gd->gd_cpuid;
 280
 281         if ((curprocmask & (1 << cpuid)) == 0) {
 282                 curprocmask |= 1 << cpuid;
 283                 p->p_flag |= P_CURPROC;
 284                 gd->gd_upri = p->p_priority;
 285                 USCHED_COUNTER(p->p_thread);
 286                 lwkt_schedule(p->p_thread);
 287                 /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
 288                 crit_exit();
 289 #ifdef SMP
 290                 if (gd != mycpu)
 291                         ++remote_resched_affinity;
 292 #endif
 293                 return;
 294         }
 295
 296         /*
 297          * gd and cpuid may still 'hint' at another cpu.  Even so we have
 298          * to place this process on the userland scheduler's run queue for
 299          * action by the target cpu.
 300          */
 301         ++runqcount;
 302         p->p_flag |= P_ONRUNQ;
 303         if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
 304                 pri = (p->p_priority & PRIMASK) >> 2;
 305                 q = &queues[pri];
 306                 queuebits |= 1 << pri;
 307         } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
 308                    p->p_rtprio.type == RTP_PRIO_FIFO) {
 309                 pri = (u_int8_t)p->p_rtprio.prio;
 310                 q = &rtqueues[pri];
 311                 rtqueuebits |= 1 << pri;
 312         } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
 313                 pri = (u_int8_t)p->p_rtprio.prio;
 314                 q = &idqueues[pri];
 315                 idqueuebits |= 1 << pri;
 316         } else {
 317                 panic("setrunqueue: invalid rtprio type");
 318         }
 319         KKASSERT(pri < 32);
 320         p->p_rqindex = pri;             /* remember the queue index */
 321         TAILQ_INSERT_TAIL(q, p, p_procq);
 322
 323 #ifdef SMP
 324         /*
 325          * Either wakeup other cpus user thread scheduler or request
 326          * preemption on other cpus (which will also wakeup a HLT).
 327          *
 328          * NOTE!  gd and cpuid may still be our 'hint', not our current
 329          * cpu info.
 330          */
 331
 332         count = runqcount;
 333
 334         /*
 335          * Check cpu affinity for user preemption (when the curprocmask bit
 336          * is set)
 337          */
 338         if (gd == mycpu) {
 339                 if (p->p_priority / PPQ < gd->gd_upri / PPQ) {
 340                         need_resched();
 341                         --count;
 342                 }
 343         } else if (remote_resched) {
 344                 if (p->p_priority / PPQ < gd->gd_upri / PPQ) {
 345                         gd->gd_upri = p->p_priority;
 346                         lwkt_send_ipiq(cpuid, need_resched_remote, NULL);
 347                         --count;
 348                         ++remote_resched_affinity;
 349                 }
 350         }
 351
 352         /*
 353          * No affinity, first schedule to any cpus that do not have a current
 354          * process.  If there is a free cpu we always schedule to it.
 355          */
 356         if (count &&
 357             (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
 358             (p->p_flag & P_PASSIVE_ACQ) == 0) {
 359                 if (!mask)
 360                         printf("PROC %d nocpu to schedule it on\n", p->p_pid);
 361                 while (mask && count) {
 362                         cpuid = bsfl(mask);
 363                         KKASSERT((curprocmask & (1 << cpuid)) == 0);
 364                         rdyprocmask &= ~(1 << cpuid);
 365                         lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
 366                         --count;
 367                         mask &= ~(1 << cpuid);
 368                 }
 369         }
 370
 371         /*
 372          * If there are still runnable processes try to wakeup a random
 373          * cpu that is running a much lower priority process in order to
 374          * preempt on it.  Note that gd_upri is only a hint, so we can
 375          * overwrite it from the wrong cpu.   If we can't find one, we
 376          * are SOL.
 377          *
 378          * We depress the priority check so multiple cpu bound programs
 379          * do not bounce between cpus.  Remember that the clock interrupt
 380          * will also cause all cpus to reschedule.
 381          */
 382         if (count && remote_resched && ncpus > 1) {
 383                 cpuid = scancpu;
 384                 do {
 385                         if (++cpuid == ncpus)
 386                                 cpuid = 0;
 387                 } while (cpuid == mycpu->gd_cpuid);
 388                 scancpu = cpuid;
 389
 390                 gd = globaldata_find(cpuid);
 391
 392                 if (p->p_priority / PPQ < gd->gd_upri / PPQ - 2) {
 393                         gd->gd_upri = p->p_priority;
 394                         lwkt_send_ipiq(cpuid, need_resched_remote, NULL);
 395                         ++remote_resched_nonaffinity;
 396                 }
 397         }
 398 #else
 399         if (p->p_priority / PPQ < gd->gd_upri / PPQ) {
 400                 need_resched();
 401         }
 402 #endif
 403         crit_exit();
 404 }
 405
 406 /*
 407  * remrunqueue() removes a given process from the run queue that it is on,
 408  * clearing the queue busy bit if it becomes empty.  This function is called
 409  * when a userland process is selected for LWKT scheduling.  Note that
 410  * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 411  * several userland processes whos threads are scheduled or otherwise in
 412  * a special state, and such processes are NOT on the userland scheduler's
 413  * run queue.
 414  *
 415  * This must be called at splhigh().
 416  */
 417 void
 418 remrunqueue(struct proc *p)
 419 {
 420         struct rq *q;
 421         u_int32_t *which;
 422         u_int8_t pri;
 423
 424         crit_enter();
 425         KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
 426         p->p_flag &= ~P_ONRUNQ;
 427         --runqcount;
 428         KKASSERT(runqcount >= 0);
 429         pri = p->p_rqindex;
 430         if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
 431                 q = &queues[pri];
 432                 which = &queuebits;
 433         } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
 434                    p->p_rtprio.type == RTP_PRIO_FIFO) {
 435                 q = &rtqueues[pri];
 436                 which = &rtqueuebits;
 437         } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
 438                 q = &idqueues[pri];
 439                 which = &idqueuebits;
 440         } else {
 441                 panic("remrunqueue: invalid rtprio type");
 442         }
 443         TAILQ_REMOVE(q, p, p_procq);
 444         if (TAILQ_EMPTY(q)) {
 445                 KASSERT((*which & (1 << pri)) != 0,
 446                         ("remrunqueue: remove from empty queue"));
 447                 *which &= ~(1 << pri);
 448         }
 449         crit_exit();
 450 }
 451
 452 /*
 453  * Release the P_CURPROC designation on the current process for this cpu
 454  * and attempt to assign a new current process from the run queue.
 455  *
 456  * If we do not have or cannot get the MP lock we just wakeup the userland
 457  * helper scheduler thread for this cpu.
 458  *
 459  * WARNING!  The MP lock may be in an unsynchronized state due to the
 460  * way get_mplock() works and the fact that this function may be called
 461  * from a passive release during a lwkt_switch().   try_mplock() will deal
 462  * with this for us but you should be aware that td_mpcount may not be
 463  * useable.
 464  */
 465 void
 466 release_curproc(struct proc *p)
 467 {
 468         int cpuid;
 469         struct proc *np;
 470
 471 #ifdef ONLY_ONE_USER_CPU
 472         KKASSERT(mycpu->gd_cpuid == 0 && p->p_thread->td_gd == mycpu);
 473 #endif
 474         crit_enter();
 475         clear_resched();
 476         cpuid = p->p_thread->td_gd->gd_cpuid;
 477         if ((p->p_flag & P_CP_RELEASED) == 0) {
 478                 p->p_flag |= P_CP_RELEASED;
 479                 lwkt_setpri_self(TDPRI_KERN_USER);
 480         }
 481         if (p->p_flag & P_CURPROC) {
 482                 p->p_flag &= ~P_CURPROC;
 483                 curprocmask &= ~(1 << cpuid);
 484                 if (try_mplock()) {
 485                         /*
 486                          * Choose the next process to assign P_CURPROC to.
 487                          * Note that we cannot schedule gd_schedthread
 488                          * if runqcount is 0 without creating a scheduling
 489                          * loop.
 490                          */
 491                         if ((np = chooseproc(NULL)) != NULL) {
 492                                 curprocmask |= 1 << cpuid;
 493                                 np->p_flag |= P_CURPROC;
 494                                 mycpu->gd_upri = np->p_priority;
 495                                 USCHED_COUNTER(np->p_thread);
 496                                 lwkt_acquire(np->p_thread);
 497                                 lwkt_schedule(np->p_thread);
 498                         } else if (runqcount && (rdyprocmask & (1 << cpuid))) {
 499                                 rdyprocmask &= ~(1 << cpuid);
 500                                 lwkt_schedule(&mycpu->gd_schedthread);
 501                         }
 502                         rel_mplock();
 503                 } else {
 504                         KKASSERT(0);    /* MP LOCK ALWAYS HELD AT THE MOMENT */
 505                         if (runqcount && (rdyprocmask & (1 << cpuid))) {
 506                                 rdyprocmask &= ~(1 << cpuid);
 507                                 lwkt_schedule(&mycpu->gd_schedthread);
 508                         }
 509                 }
 510         }
 511         crit_exit();
 512 }
 513
 514 /*
 515  * Acquire the P_CURPROC designation on the CURRENT process only.  This
 516  * function is called prior to returning to userland.  If the system
 517  * call or trap did not block and if no reschedule was requested it is
 518  * highly likely that the P_CURPROC flag is still set in the proc, and
 519  * we do almost nothing here.
 520  */
 521 void
 522 acquire_curproc(struct proc *p)
 523 {
 524         int cpuid;
 525         struct proc *np;
 526
 527         /*
 528          * Short cut, we've already acquired the designation or we never
 529          * lost it in the first place.  P_CP_RELEASED is cleared, meaning
 530          * that the process is again under the control of the userland
 531          * scheduler.  We do not have to fiddle with the LWKT priority,
 532          * the trap code (userret/userexit) will do that for us.
 533          */
 534         if ((p->p_flag & P_CURPROC) != 0) {
 535                 p->p_flag &= ~P_CP_RELEASED;
 536                 return;
 537         }
 538
 539         /*
 540          * Long cut.  This pulls in a bit of the userland scheduler as
 541          * an optimization.  If our cpu has not scheduled a userland
 542          * process we gladly fill the slot, otherwise we choose the best
 543          * candidate from the run queue and compare it against ourselves,
 544          * scheduling either us or him depending.
 545          *
 546          * If our cpu's slot isn't free we put ourselves on the userland
 547          * run queue and switch away.  We should have P_CURPROC when we
 548          * come back.  Note that a cpu change can occur when we come back.
 549          *
 550          * YYY don't need critical section, we hold giant and no interrupt
 551          * will mess w/ this proc?  Or will it?  What about curprocmask?
 552          */
 553 #ifdef ONLY_ONE_USER_CPU
 554         KKASSERT(mycpu->gd_cpuid == 0 && p->p_thread->td_gd == mycpu);
 555 #endif
 556         crit_enter();
 557
 558         while ((p->p_flag & P_CURPROC) == 0) {
 559                 /*
 560                  * reload the cpuid
 561                  */
 562                 cpuid = p->p_thread->td_gd->gd_cpuid;
 563
 564                 /*
 565                  * (broken out from setrunqueue() as an optimization that
 566                  * allows us to avoid descheduling and rescheduling ourself)
 567                  *
 568                  * Interlock against the helper scheduler thread by setting
 569                  * curprocmask while we choose a new process.  Check our
 570                  * process against the new process to shortcut setrunqueue()
 571                  * and remrunqueue() operations.
 572                  */
 573                 if ((curprocmask & (1 << cpuid)) == 0) {
 574                         curprocmask |= 1 << cpuid;
 575
 576                         if ((np = chooseproc(p)) != NULL) {
 577                                 KKASSERT((np->p_flag & P_CP_RELEASED) == 0);
 578                                 np->p_flag |= P_CURPROC;
 579                                 mycpu->gd_upri = np->p_priority;
 580                                 USCHED_COUNTER(np->p_thread);
 581                                 lwkt_acquire(np->p_thread);
 582                                 lwkt_schedule(np->p_thread);
 583                         } else {
 584                                 p->p_flag |= P_CURPROC;
 585                         }
 586                         break;
 587                 }
 588                 lwkt_deschedule_self();
 589                 p->p_flag &= ~P_CP_RELEASED;
 590                 setrunqueue(p);
 591                 lwkt_switch();  /* CPU CAN CHANGE DUE TO SETRUNQUEUE() */
 592                 KASSERT((p->p_flag & (P_ONRUNQ|P_CURPROC|P_CP_RELEASED)) == P_CURPROC, ("unexpected p_flag %08x acquiring P_CURPROC\n", p->p_flag));
 593         }
 594         crit_exit();
 595 }
 596
 597 /*
 598  * Yield / synchronous reschedule.  This is a bit tricky because the trap
 599  * code might have set a lazy release on the switch function.  The lazy
 600  * release normally doesn't release the P_CURPROC designation unless we
 601  * are blocking at the time of the switch (no longer on the run queue), which
 602  * we aren't.  We need to release our P_CURPROC designation in order to
 603  * properly allow another user process to run.  This is done by creating
 604  * a special case by setting P_PASSIVE_ACQ prior to calling lwkt_switch().
 605  *
 606  * This code is confusing and really needs to be cleaned up.  Plus I don't
 607  * think it actually works as expected.
 608  */
 609 void
 610 uio_yield(void)
 611 {
 612         struct thread *td = curthread;
 613         struct proc *p = td->td_proc;
 614
 615         if (p) {
 616                 p->p_flag |= P_PASSIVE_ACQ;
 617                 lwkt_switch();
 618                 acquire_curproc(p);
 619                 release_curproc(p);
 620                 p->p_flag &= ~P_PASSIVE_ACQ;
 621         } else {
 622                 lwkt_switch();
 623         }
 624 }
 625
 626
 627 /*
 628  * For SMP systems a user scheduler helper thread is created for each
 629  * cpu and is used to allow one cpu to wakeup another for the purposes of
 630  * scheduling userland threads from setrunqueue().  UP systems do not
 631  * need the helper since there is only one cpu.  We can't use the idle
 632  * thread for this because we need to hold the MP lock.  Additionally,
 633  * doing things this way allows us to HLT idle cpus on MP systems.
 634  */
 635
 636 #ifdef SMP
 637
 638 static void
 639 sched_thread(void *dummy)
 640 {
 641     int cpuid = mycpu->gd_cpuid;        /* doesn't change */
 642     u_int32_t cpumask = 1 << cpuid;     /* doesn't change */
 643
 644 #ifdef ONLY_ONE_USER_CPU
 645     KKASSERT(cpuid == 0);
 646 #endif
 647
 648     get_mplock();                       /* hold the MP lock */
 649     for (;;) {
 650         struct proc *np;
 651
 652         lwkt_deschedule_self();         /* interlock */
 653         rdyprocmask |= cpumask;
 654         crit_enter();
 655         if ((curprocmask & cpumask) == 0 && (np = chooseproc(NULL)) != NULL) {
 656             curprocmask |= cpumask;
 657             np->p_flag |= P_CURPROC;
 658             mycpu->gd_upri = np->p_priority;
 659             USCHED_COUNTER(np->p_thread);
 660             lwkt_acquire(np->p_thread);
 661             lwkt_schedule(np->p_thread);
 662         }
 663         crit_exit();
 664         lwkt_switch();
 665     }
 666 }
 667
 668 void
 669 sched_thread_init(void)
 670 {
 671     int cpuid = mycpu->gd_cpuid;
 672
 673     lwkt_create(sched_thread, NULL, NULL, &mycpu->gd_schedthread,
 674         TDF_STOPREQ, "usched %d", cpuid);
 675     curprocmask &= ~(1 << cpuid);       /* schedule user proc on cpu */
 676 #ifdef ONLY_ONE_USER_CPU
 677     if (cpuid)
 678         curprocmask |= 1 << cpuid;      /* DISABLE USER PROCS */
 679 #endif
 680     rdyprocmask |= 1 << cpuid;
 681 }
 682
 683 #endif
 684