sys/kern/kern_synch.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)kern_synch.c        8.9 (Berkeley) 5/19/95
  39  * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
  40  * $DragonFly: src/sys/kern/kern_synch.c,v 1.5 2003/06/21 07:54:57 dillon Exp $
  41  */
  42
  43 #include "opt_ktrace.h"
  44
  45 #include <sys/param.h>
  46 #include <sys/systm.h>
  47 #include <sys/proc.h>
  48 #include <sys/kernel.h>
  49 #include <sys/signalvar.h>
  50 #include <sys/resourcevar.h>
  51 #include <sys/vmmeter.h>
  52 #include <sys/sysctl.h>
  53 #ifdef KTRACE
  54 #include <sys/uio.h>
  55 #include <sys/ktrace.h>
  56 #endif
  57 #include <sys/xwait.h>
  58
  59 #include <machine/cpu.h>
  60 #include <machine/ipl.h>
  61 #include <machine/smp.h>
  62
  63 static void sched_setup __P((void *dummy));
  64 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
  65
  66 u_char  curpriority;
  67 int     hogticks;
  68 int     lbolt;
  69 int     sched_quantum;          /* Roundrobin scheduling quantum in ticks. */
  70
  71 static struct callout loadav_callout;
  72
  73 struct loadavg averunnable =
  74         { {0, 0, 0}, FSCALE };  /* load average, of runnable procs */
  75 /*
  76  * Constants for averages over 1, 5, and 15 minutes
  77  * when sampling at 5 second intervals.
  78  */
  79 static fixpt_t cexp[3] = {
  80         0.9200444146293232 * FSCALE,    /* exp(-1/12) */
  81         0.9834714538216174 * FSCALE,    /* exp(-1/60) */
  82         0.9944598480048967 * FSCALE,    /* exp(-1/180) */
  83 };
  84
  85 static int      curpriority_cmp __P((struct proc *p));
  86 static void     endtsleep __P((void *));
  87 static void     loadav __P((void *arg));
  88 static void     maybe_resched __P((struct proc *chk));
  89 static void     roundrobin __P((void *arg));
  90 static void     schedcpu __P((void *arg));
  91 static void     updatepri __P((struct proc *p));
  92
  93 static int
  94 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
  95 {
  96         int error, new_val;
  97
  98         new_val = sched_quantum * tick;
  99         error = sysctl_handle_int(oidp, &new_val, 0, req);
 100         if (error != 0 || req->newptr == NULL)
 101                 return (error);
 102         if (new_val < tick)
 103                 return (EINVAL);
 104         sched_quantum = new_val / tick;
 105         hogticks = 2 * sched_quantum;
 106         return (0);
 107 }
 108
 109 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
 110         0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
 111
 112 /*-
 113  * Compare priorities.  Return:
 114  *     <0: priority of p < current priority
 115  *      0: priority of p == current priority
 116  *     >0: priority of p > current priority
 117  * The priorities are the normal priorities or the normal realtime priorities
 118  * if p is on the same scheduler as curproc.  Otherwise the process on the
 119  * more realtimeish scheduler has lowest priority.  As usual, a higher
 120  * priority really means a lower priority.
 121  */
 122 static int
 123 curpriority_cmp(p)
 124         struct proc *p;
 125 {
 126         int c_class, p_class;
 127
 128         c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
 129         p_class = RTP_PRIO_BASE(p->p_rtprio.type);
 130         if (p_class != c_class)
 131                 return (p_class - c_class);
 132         if (p_class == RTP_PRIO_NORMAL)
 133                 return (((int)p->p_priority - (int)curpriority) / PPQ);
 134         return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
 135 }
 136
 137 /*
 138  * Arrange to reschedule if necessary, taking the priorities and
 139  * schedulers into account.
 140  */
 141 static void
 142 maybe_resched(chk)
 143         struct proc *chk;
 144 {
 145         struct proc *p = curproc; /* XXX */
 146
 147         /*
 148          * XXX idle scheduler still broken because proccess stays on idle
 149          * scheduler during waits (such as when getting FS locks).  If a
 150          * standard process becomes runaway cpu-bound, the system can lockup
 151          * due to idle-scheduler processes in wakeup never getting any cpu.
 152          */
 153         if (p == NULL) {
 154 #if 0
 155                 need_resched();
 156 #endif
 157         } else if (chk == p) {
 158                 /* We may need to yield if our priority has been raised. */
 159                 if (curpriority_cmp(chk) > 0)
 160                         need_resched();
 161         } else if (curpriority_cmp(chk) < 0)
 162                 need_resched();
 163 }
 164
 165 int
 166 roundrobin_interval(void)
 167 {
 168         return (sched_quantum);
 169 }
 170
 171 /*
 172  * Force switch among equal priority processes every 100ms.
 173  */
 174 /* ARGSUSED */
 175 static void
 176 roundrobin(arg)
 177         void *arg;
 178 {
 179 #ifndef SMP
 180         struct proc *p = curproc; /* XXX */
 181 #endif
 182
 183 #ifdef SMP
 184         need_resched();
 185         forward_roundrobin();
 186 #else
 187         if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
 188                 need_resched();
 189 #endif
 190
 191         timeout(roundrobin, NULL, sched_quantum);
 192 }
 193
 194 /*
 195  * Constants for digital decay and forget:
 196  *      90% of (p_estcpu) usage in 5 * loadav time
 197  *      95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 198  *          Note that, as ps(1) mentions, this can let percentages
 199  *          total over 100% (I've seen 137.9% for 3 processes).
 200  *
 201  * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
 202  *
 203  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 204  * That is, the system wants to compute a value of decay such
 205  * that the following for loop:
 206  *      for (i = 0; i < (5 * loadavg); i++)
 207  *              p_estcpu *= decay;
 208  * will compute
 209  *      p_estcpu *= 0.1;
 210  * for all values of loadavg:
 211  *
 212  * Mathematically this loop can be expressed by saying:
 213  *      decay ** (5 * loadavg) ~= .1
 214  *
 215  * The system computes decay as:
 216  *      decay = (2 * loadavg) / (2 * loadavg + 1)
 217  *
 218  * We wish to prove that the system's computation of decay
 219  * will always fulfill the equation:
 220  *      decay ** (5 * loadavg) ~= .1
 221  *
 222  * If we compute b as:
 223  *      b = 2 * loadavg
 224  * then
 225  *      decay = b / (b + 1)
 226  *
 227  * We now need to prove two things:
 228  *      1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 229  *      2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 230  *
 231  * Facts:
 232  *         For x close to zero, exp(x) =~ 1 + x, since
 233  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 234  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 235  *         For x close to zero, ln(1+x) =~ x, since
 236  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 237  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 238  *         ln(.1) =~ -2.30
 239  *
 240  * Proof of (1):
 241  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 242  *      solving for factor,
 243  *      ln(factor) =~ (-2.30/5*loadav), or
 244  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 245  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 246  *
 247  * Proof of (2):
 248  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 249  *      solving for power,
 250  *      power*ln(b/(b+1)) =~ -2.30, or
 251  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 252  *
 253  * Actual power values for the implemented algorithm are as follows:
 254  *      loadav: 1       2       3       4
 255  *      power:  5.68    10.32   14.94   19.55
 256  */
 257
 258 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
 259 #define loadfactor(loadav)      (2 * (loadav))
 260 #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
 261
 262 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 263 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
 264 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 265
 266 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
 267 static int      fscale __unused = FSCALE;
 268 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
 269
 270 /*
 271  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 272  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 273  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 274  *
 275  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 276  *      1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 277  *
 278  * If you don't want to bother with the faster/more-accurate formula, you
 279  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 280  * (more general) method of calculating the %age of CPU used by a process.
 281  */
 282 #define CCPU_SHIFT      11
 283
 284 /*
 285  * Recompute process priorities, every hz ticks.
 286  */
 287 /* ARGSUSED */
 288 static void
 289 schedcpu(arg)
 290         void *arg;
 291 {
 292         register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 293         register struct proc *p;
 294         register int realstathz, s;
 295
 296         realstathz = stathz ? stathz : hz;
 297         LIST_FOREACH(p, &allproc, p_list) {
 298                 /*
 299                  * Increment time in/out of memory and sleep time
 300                  * (if sleeping).  We ignore overflow; with 16-bit int's
 301                  * (remember them?) overflow takes 45 days.
 302                  */
 303                 p->p_swtime++;
 304                 if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
 305                         p->p_slptime++;
 306                 p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
 307                 /*
 308                  * If the process has slept the entire second,
 309                  * stop recalculating its priority until it wakes up.
 310                  */
 311                 if (p->p_slptime > 1)
 312                         continue;
 313                 s = splhigh();  /* prevent state changes and protect run queue */
 314                 /*
 315                  * p_pctcpu is only for ps.
 316                  */
 317 #if     (FSHIFT >= CCPU_SHIFT)
 318                 p->p_pctcpu += (realstathz == 100)?
 319                         ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
 320                         100 * (((fixpt_t) p->p_cpticks)
 321                                 << (FSHIFT - CCPU_SHIFT)) / realstathz;
 322 #else
 323                 p->p_pctcpu += ((FSCALE - ccpu) *
 324                         (p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
 325 #endif
 326                 p->p_cpticks = 0;
 327                 p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
 328                 resetpriority(p);
 329                 if (p->p_priority >= PUSER) {
 330                         if ((p != curproc) &&
 331 #ifdef SMP
 332                             p->p_oncpu == 0xff &&       /* idle */
 333 #endif
 334                             p->p_stat == SRUN &&
 335                             (p->p_flag & P_INMEM) &&
 336                             (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
 337                                 remrunqueue(p);
 338                                 p->p_priority = p->p_usrpri;
 339                                 setrunqueue(p);
 340                         } else {
 341                                 p->p_priority = p->p_usrpri;
 342                         }
 343                 }
 344                 splx(s);
 345         }
 346         wakeup((caddr_t)&lbolt);
 347         timeout(schedcpu, (void *)0, hz);
 348 }
 349
 350 /*
 351  * Recalculate the priority of a process after it has slept for a while.
 352  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 353  * least six times the loadfactor will decay p_estcpu to zero.
 354  */
 355 static void
 356 updatepri(p)
 357         register struct proc *p;
 358 {
 359         register unsigned int newcpu = p->p_estcpu;
 360         register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 361
 362         if (p->p_slptime > 5 * loadfac)
 363                 p->p_estcpu = 0;
 364         else {
 365                 p->p_slptime--; /* the first time was done in schedcpu */
 366                 while (newcpu && --p->p_slptime)
 367                         newcpu = decay_cpu(loadfac, newcpu);
 368                 p->p_estcpu = newcpu;
 369         }
 370         resetpriority(p);
 371 }
 372
 373 /*
 374  * We're only looking at 7 bits of the address; everything is
 375  * aligned to 4, lots of things are aligned to greater powers
 376  * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 377  */
 378 #define TABLESIZE       128
 379 static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
 380 #define LOOKUP(x)       (((intptr_t)(x) >> 8) & (TABLESIZE - 1))
 381
 382 /*
 383  * During autoconfiguration or after a panic, a sleep will simply
 384  * lower the priority briefly to allow interrupts, then return.
 385  * The priority to be used (safepri) is machine-dependent, thus this
 386  * value is initialized and maintained in the machine-dependent layers.
 387  * This priority will typically be 0, or the lowest priority
 388  * that is safe for use on the interrupt stack; it can be made
 389  * higher to block network software interrupts after panics.
 390  */
 391 int safepri;
 392
 393 void
 394 sleepinit(void)
 395 {
 396         int i;
 397
 398         sched_quantum = hz/10;
 399         hogticks = 2 * sched_quantum;
 400         for (i = 0; i < TABLESIZE; i++)
 401                 TAILQ_INIT(&slpque[i]);
 402 }
 403
 404 void
 405 xwait_init(struct xwait *w)
 406 {
 407         bzero(w, sizeof(*w));
 408         TAILQ_INIT(&w->waitq);
 409 }
 410
 411 /*
 412  * General sleep call.  Suspends the current process until a wakeup is
 413  * performed on the specified identifier.  The process will then be made
 414  * runnable with the specified priority.  Sleeps at most timo/hz seconds
 415  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
 416  * before and after sleeping, else signals are not checked.  Returns 0 if
 417  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 418  * signal needs to be delivered, ERESTART is returned if the current system
 419  * call should be restarted if possible, and EINTR is returned if the system
 420  * call should be interrupted by the signal (return EINTR).
 421  */
 422 int
 423 tsleep(ident, priority, wmesg, timo)
 424         void *ident;
 425         int priority, timo;
 426         const char *wmesg;
 427 {
 428         struct proc *p = curproc;
 429         int s, sig, catch = priority & PCATCH;
 430         int id = LOOKUP(ident);
 431         struct callout_handle thandle;
 432
 433 #ifdef KTRACE
 434         if (p && KTRPOINT(p, KTR_CSW))
 435                 ktrcsw(p->p_tracep, 1, 0);
 436 #endif
 437         s = splhigh();
 438
 439         if (cold || panicstr) {
 440                 /*
 441                  * After a panic, or during autoconfiguration,
 442                  * just give interrupts a chance, then just return;
 443                  * don't run any other procs or panic below,
 444                  * in case this is the idle process and already asleep.
 445                  */
 446                 splx(safepri);
 447                 splx(s);
 448                 return (0);
 449         }
 450         KASSERT(p != NULL, ("tsleep1"));
 451         KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep"));
 452
 453         p->p_wchan = ident;
 454         p->p_wmesg = wmesg;
 455         p->p_slptime = 0;
 456         p->p_priority = priority & PRIMASK;
 457         TAILQ_INSERT_TAIL(&slpque[id], p, p_procq);
 458         if (timo)
 459                 thandle = timeout(endtsleep, (void *)p, timo);
 460         /*
 461          * We put ourselves on the sleep queue and start our timeout
 462          * before calling CURSIG, as we could stop there, and a wakeup
 463          * or a SIGCONT (or both) could occur while we were stopped.
 464          * A SIGCONT would cause us to be marked as SSLEEP
 465          * without resuming us, thus we must be ready for sleep
 466          * when CURSIG is called.  If the wakeup happens while we're
 467          * stopped, p->p_wchan will be 0 upon return from CURSIG.
 468          */
 469         if (catch) {
 470                 p->p_flag |= P_SINTR;
 471                 if ((sig = CURSIG(p))) {
 472                         if (p->p_wchan)
 473                                 unsleep(p);
 474                         p->p_stat = SRUN;
 475                         goto resume;
 476                 }
 477                 if (p->p_wchan == 0) {
 478                         catch = 0;
 479                         goto resume;
 480                 }
 481         } else
 482                 sig = 0;
 483         p->p_stat = SSLEEP;
 484         p->p_stats->p_ru.ru_nvcsw++;
 485         mi_switch();
 486 resume:
 487         curpriority = p->p_usrpri;
 488         splx(s);
 489         p->p_flag &= ~P_SINTR;
 490         if (p->p_flag & P_TIMEOUT) {
 491                 p->p_flag &= ~P_TIMEOUT;
 492                 if (sig == 0) {
 493 #ifdef KTRACE
 494                         if (KTRPOINT(p, KTR_CSW))
 495                                 ktrcsw(p->p_tracep, 0, 0);
 496 #endif
 497                         return (EWOULDBLOCK);
 498                 }
 499         } else if (timo)
 500                 untimeout(endtsleep, (void *)p, thandle);
 501         if (catch && (sig != 0 || (sig = CURSIG(p)))) {
 502 #ifdef KTRACE
 503                 if (KTRPOINT(p, KTR_CSW))
 504                         ktrcsw(p->p_tracep, 0, 0);
 505 #endif
 506                 if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
 507                         return (EINTR);
 508                 return (ERESTART);
 509         }
 510 #ifdef KTRACE
 511         if (KTRPOINT(p, KTR_CSW))
 512                 ktrcsw(p->p_tracep, 0, 0);
 513 #endif
 514         return (0);
 515 }
 516
 517 /*
 518  * General sleep call.  Suspends the current process until a wakeup is
 519  * performed on the specified xwait structure.  The process will then be made
 520  * runnable with the specified priority.  Sleeps at most timo/hz seconds
 521  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
 522  * before and after sleeping, else signals are not checked.  Returns 0 if
 523  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 524  * signal needs to be delivered, ERESTART is returned if the current system
 525  * call should be restarted if possible, and EINTR is returned if the system
 526  * call should be interrupted by the signal (return EINTR).
 527  *
 528  * If the passed generation number is different from the generation number
 529  * in the xwait, return immediately.
 530  */
 531 int
 532 xsleep(struct xwait *w, int priority, const char *wmesg, int timo, int *gen)
 533 {
 534         struct proc *p = curproc;
 535         int s, sig, catch = priority & PCATCH;
 536         struct callout_handle thandle;
 537
 538 #ifdef KTRACE
 539         if (p && KTRPOINT(p, KTR_CSW))
 540                 ktrcsw(p->p_tracep, 1, 0);
 541 #endif
 542         s = splhigh();
 543
 544         if (cold || panicstr) {
 545                 /*
 546                  * After a panic, or during autoconfiguration,
 547                  * just give interrupts a chance, then just return;
 548                  * don't run any other procs or panic below,
 549                  * in case this is the idle process and already asleep.
 550                  */
 551                 splx(safepri);
 552                 splx(s);
 553                 return (0);
 554         }
 555         KASSERT(p != NULL, ("tsleep1"));
 556         KASSERT(w != NULL && p->p_stat == SRUN, ("tsleep"));
 557
 558         /*
 559          * If the generation number does not match we return immediately.
 560          */
 561         if (*gen != w->gen) {
 562                 *gen = w->gen;
 563                 splx(s);
 564 #ifdef KTRACE
 565                 if (p && KTRPOINT(p, KTR_CSW))
 566                         ktrcsw(p->p_tracep, 0, 0);
 567 #endif
 568                 return(0);
 569         }
 570
 571         p->p_wchan = w;
 572         p->p_wmesg = wmesg;
 573         p->p_slptime = 0;
 574         p->p_priority = priority & PRIMASK;
 575         p->p_flag |= P_XSLEEP;
 576         TAILQ_INSERT_TAIL(&w->waitq, p, p_procq);
 577         if (timo)
 578                 thandle = timeout(endtsleep, (void *)p, timo);
 579         /*
 580          * We put ourselves on the sleep queue and start our timeout
 581          * before calling CURSIG, as we could stop there, and a wakeup
 582          * or a SIGCONT (or both) could occur while we were stopped.
 583          * A SIGCONT would cause us to be marked as SSLEEP
 584          * without resuming us, thus we must be ready for sleep
 585          * when CURSIG is called.  If the wakeup happens while we're
 586          * stopped, p->p_wchan will be 0 upon return from CURSIG.
 587          */
 588         if (catch) {
 589                 p->p_flag |= P_SINTR;
 590                 if ((sig = CURSIG(p))) {
 591                         if (p->p_wchan)
 592                                 unsleep(p);
 593                         p->p_stat = SRUN;
 594                         goto resume;
 595                 }
 596                 if (p->p_wchan == NULL) {
 597                         catch = 0;
 598                         goto resume;
 599                 }
 600         } else
 601                 sig = 0;
 602         p->p_stat = SSLEEP;
 603         p->p_stats->p_ru.ru_nvcsw++;
 604         mi_switch();
 605 resume:
 606         curpriority = p->p_usrpri;
 607         *gen = w->gen;  /* update generation number */
 608         splx(s);
 609         p->p_flag &= ~P_SINTR;
 610         if (p->p_flag & P_TIMEOUT) {
 611                 p->p_flag &= ~P_TIMEOUT;
 612                 if (sig == 0) {
 613 #ifdef KTRACE
 614                         if (KTRPOINT(p, KTR_CSW))
 615                                 ktrcsw(p->p_tracep, 0, 0);
 616 #endif
 617                         return (EWOULDBLOCK);
 618                 }
 619         } else if (timo)
 620                 untimeout(endtsleep, (void *)p, thandle);
 621         if (catch && (sig != 0 || (sig = CURSIG(p)))) {
 622 #ifdef KTRACE
 623                 if (KTRPOINT(p, KTR_CSW))
 624                         ktrcsw(p->p_tracep, 0, 0);
 625 #endif
 626                 if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
 627                         return (EINTR);
 628                 return (ERESTART);
 629         }
 630 #ifdef KTRACE
 631         if (KTRPOINT(p, KTR_CSW))
 632                 ktrcsw(p->p_tracep, 0, 0);
 633 #endif
 634         return (0);
 635 }
 636
 637 /*
 638  * Implement timeout for tsleep or xsleep
 639  *
 640  * If process hasn't been awakened (wchan non-zero),
 641  * set timeout flag and undo the sleep.  If proc
 642  * is stopped, just unsleep so it will remain stopped.
 643  */
 644 static void
 645 endtsleep(arg)
 646         void *arg;
 647 {
 648         register struct proc *p;
 649         int s;
 650
 651         p = (struct proc *)arg;
 652         s = splhigh();
 653         if (p->p_wchan) {
 654                 if (p->p_stat == SSLEEP)
 655                         setrunnable(p);
 656                 else
 657                         unsleep(p);
 658                 p->p_flag |= P_TIMEOUT;
 659         }
 660         splx(s);
 661 }
 662
 663 /*
 664  * Remove a process from its wait queue
 665  */
 666 void
 667 unsleep(p)
 668         register struct proc *p;
 669 {
 670         int s;
 671
 672         s = splhigh();
 673         if (p->p_wchan) {
 674                 if (p->p_flag & P_XSLEEP) {
 675                         struct xwait *w = p->p_wchan;
 676                         TAILQ_REMOVE(&w->waitq, p, p_procq);
 677                         p->p_flag &= ~P_XSLEEP;
 678                 } else {
 679                         TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
 680                 }
 681                 p->p_wchan = NULL;
 682         }
 683         splx(s);
 684 }
 685
 686 /*
 687  * Make all processes sleeping on the explicit lock structure runnable.
 688  */
 689 void
 690 xwakeup(struct xwait *w)
 691 {
 692         struct proc *p;
 693         int s;
 694
 695         s = splhigh();
 696         ++w->gen;
 697         while ((p = TAILQ_FIRST(&w->waitq)) != NULL) {
 698                 TAILQ_REMOVE(&w->waitq, p, p_procq);
 699                 KASSERT(p->p_wchan == w && (p->p_flag & P_XSLEEP),
 700                     ("xwakeup: wchan mismatch for %p (%p/%p) %08x", p, p->p_wchan, w, p->p_flag & P_XSLEEP));
 701                 p->p_wchan = NULL;
 702                 p->p_flag &= ~P_XSLEEP;
 703                 if (p->p_stat == SSLEEP) {
 704                         /* OPTIMIZED EXPANSION OF setrunnable(p); */
 705                         if (p->p_slptime > 1)
 706                                 updatepri(p);
 707                         p->p_slptime = 0;
 708                         p->p_stat = SRUN;
 709                         if (p->p_flag & P_INMEM) {
 710                                 setrunqueue(p);
 711                                 maybe_resched(p);
 712                         } else {
 713                                 p->p_flag |= P_SWAPINREQ;
 714                                 wakeup((caddr_t)&proc0);
 715                         }
 716                 }
 717         }
 718         splx(s);
 719 }
 720
 721 /*
 722  * Make all processes sleeping on the specified identifier runnable.
 723  */
 724 void
 725 wakeup(ident)
 726         register void *ident;
 727 {
 728         register struct slpquehead *qp;
 729         register struct proc *p;
 730         struct proc *np;
 731         int s;
 732         int id = LOOKUP(ident);
 733
 734         s = splhigh();
 735         qp = &slpque[id];
 736 restart:
 737         for (p = TAILQ_FIRST(qp); p != NULL; p = np) {
 738                 np = TAILQ_NEXT(p, p_procq);
 739                 if (p->p_wchan == ident) {
 740                         TAILQ_REMOVE(qp, p, p_procq);
 741                         p->p_wchan = NULL;
 742                         if (p->p_stat == SSLEEP) {
 743                                 /* OPTIMIZED EXPANSION OF setrunnable(p); */
 744                                 if (p->p_slptime > 1)
 745                                         updatepri(p);
 746                                 p->p_slptime = 0;
 747                                 p->p_stat = SRUN;
 748                                 if (p->p_flag & P_INMEM) {
 749                                         setrunqueue(p);
 750                                         maybe_resched(p);
 751                                 } else {
 752                                         p->p_flag |= P_SWAPINREQ;
 753                                         wakeup((caddr_t)&proc0);
 754                                 }
 755                                 /* END INLINE EXPANSION */
 756                                 goto restart;
 757                         }
 758                 }
 759         }
 760         splx(s);
 761 }
 762
 763 /*
 764  * Make a process sleeping on the specified identifier runnable.
 765  * May wake more than one process if a target process is currently
 766  * swapped out.
 767  */
 768 void
 769 wakeup_one(ident)
 770         register void *ident;
 771 {
 772         register struct slpquehead *qp;
 773         register struct proc *p;
 774         struct proc *np;
 775         int s;
 776         int id = LOOKUP(ident);
 777
 778         s = splhigh();
 779         qp = &slpque[id];
 780
 781 restart:
 782         for (p = TAILQ_FIRST(qp); p != NULL; p = np) {
 783                 np = TAILQ_NEXT(p, p_procq);
 784                 if (p->p_wchan == ident) {
 785                         TAILQ_REMOVE(qp, p, p_procq);
 786                         p->p_wchan = 0;
 787                         if (p->p_stat == SSLEEP) {
 788                                 /* OPTIMIZED EXPANSION OF setrunnable(p); */
 789                                 if (p->p_slptime > 1)
 790                                         updatepri(p);
 791                                 p->p_slptime = 0;
 792                                 p->p_stat = SRUN;
 793                                 if (p->p_flag & P_INMEM) {
 794                                         setrunqueue(p);
 795                                         maybe_resched(p);
 796                                         break;
 797                                 } else {
 798                                         p->p_flag |= P_SWAPINREQ;
 799                                         wakeup((caddr_t)&proc0);
 800                                 }
 801                                 /* END INLINE EXPANSION */
 802                                 goto restart;
 803                         }
 804                 }
 805         }
 806         splx(s);
 807 }
 808
 809 /*
 810  * The machine independent parts of mi_switch().
 811  * Must be called at splstatclock() or higher.
 812  */
 813 void
 814 mi_switch()
 815 {
 816         struct timeval new_switchtime;
 817         register struct proc *p = curproc;      /* XXX */
 818         register struct rlimit *rlim;
 819         int x;
 820
 821         /*
 822          * XXX this spl is almost unnecessary.  It is partly to allow for
 823          * sloppy callers that don't do it (issignal() via CURSIG() is the
 824          * main offender).  It is partly to work around a bug in the i386
 825          * cpu_switch() (the ipl is not preserved).  We ran for years
 826          * without it.  I think there was only a interrupt latency problem.
 827          * The main caller, tsleep(), does an splx() a couple of instructions
 828          * after calling here.  The buggy caller, issignal(), usually calls
 829          * here at spl0() and sometimes returns at splhigh().  The process
 830          * then runs for a little too long at splhigh().  The ipl gets fixed
 831          * when the process returns to user mode (or earlier).
 832          *
 833          * It would probably be better to always call here at spl0(). Callers
 834          * are prepared to give up control to another process, so they must
 835          * be prepared to be interrupted.  The clock stuff here may not
 836          * actually need splstatclock().
 837          */
 838         x = splstatclock();
 839         clear_resched();
 840
 841 #ifdef SIMPLELOCK_DEBUG
 842         if (p->p_simple_locks)
 843                 printf("sleep: holding simple lock\n");
 844 #endif
 845         /*
 846          * Compute the amount of time during which the current
 847          * process was running, and add that to its total so far.
 848          */
 849         microuptime(&new_switchtime);
 850         if (timevalcmp(&new_switchtime, &mycpu->gd_switchtime, <)) {
 851                 printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
 852                     mycpu->gd_switchtime.tv_sec, mycpu->gd_switchtime.tv_usec,
 853                     new_switchtime.tv_sec, new_switchtime.tv_usec);
 854                 new_switchtime = mycpu->gd_switchtime;
 855         } else {
 856                 p->p_runtime +=
 857                     (new_switchtime.tv_usec - mycpu->gd_switchtime.tv_usec) +
 858                     (new_switchtime.tv_sec - mycpu->gd_switchtime.tv_sec) *
 859                     (int64_t)1000000;
 860         }
 861
 862         /*
 863          * Check if the process exceeds its cpu resource allocation.
 864          * If over max, kill it.
 865          */
 866         if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
 867             p->p_runtime > p->p_limit->p_cpulimit) {
 868                 rlim = &p->p_rlimit[RLIMIT_CPU];
 869                 if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
 870                         killproc(p, "exceeded maximum CPU limit");
 871                 } else {
 872                         psignal(p, SIGXCPU);
 873                         if (rlim->rlim_cur < rlim->rlim_max) {
 874                                 /* XXX: we should make a private copy */
 875                                 rlim->rlim_cur += 5;
 876                         }
 877                 }
 878         }
 879
 880         /*
 881          * Pick a new current process and record its start time.
 882          * YYY lwkt_switch() will run the heavy weight process restoration
 883          * code, which removes the target thread and process from their
 884          * respective run queues to temporarily mimic 5.x behavior.
 885          * YYY the userland scheduler should pick only one user process
 886          * at a time to run per cpu.
 887          */
 888         cnt.v_swtch++;
 889         mycpu->gd_switchtime = new_switchtime;
 890         lwkt_switch();
 891         remrunqueue(p);
 892         if (mycpu->gd_switchtime.tv_sec == 0)
 893                 microuptime(&mycpu->gd_switchtime);
 894         mycpu->gd_switchticks = ticks;
 895
 896         splx(x);
 897 }
 898
 899 /*
 900  * Change process state to be runnable,
 901  * placing it on the run queue if it is in memory,
 902  * and awakening the swapper if it isn't in memory.
 903  */
 904 void
 905 setrunnable(p)
 906         register struct proc *p;
 907 {
 908         register int s;
 909
 910         s = splhigh();
 911         switch (p->p_stat) {
 912         case 0:
 913         case SRUN:
 914         case SZOMB:
 915         default:
 916                 panic("setrunnable");
 917         case SSTOP:
 918         case SSLEEP:
 919                 unsleep(p);             /* e.g. when sending signals */
 920                 break;
 921
 922         case SIDL:
 923                 break;
 924         }
 925         p->p_stat = SRUN;
 926         if (p->p_flag & P_INMEM)
 927                 setrunqueue(p);
 928         splx(s);
 929         if (p->p_slptime > 1)
 930                 updatepri(p);
 931         p->p_slptime = 0;
 932         if ((p->p_flag & P_INMEM) == 0) {
 933                 p->p_flag |= P_SWAPINREQ;
 934                 wakeup((caddr_t)&proc0);
 935         }
 936         else
 937                 maybe_resched(p);
 938 }
 939
 940 /*
 941  * Compute the priority of a process when running in user mode.
 942  * Arrange to reschedule if the resulting priority is better
 943  * than that of the current process.
 944  */
 945 void
 946 resetpriority(p)
 947         register struct proc *p;
 948 {
 949         register unsigned int newpriority;
 950
 951         if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
 952                 newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
 953                     NICE_WEIGHT * p->p_nice;
 954                 newpriority = min(newpriority, MAXPRI);
 955                 p->p_usrpri = newpriority;
 956         }
 957         maybe_resched(p);
 958 }
 959
 960 /*
 961  * Compute a tenex style load average of a quantity on
 962  * 1, 5 and 15 minute intervals.
 963  */
 964 static void
 965 loadav(void *arg)
 966 {
 967         int i, nrun;
 968         struct loadavg *avg;
 969         struct proc *p;
 970
 971         avg = &averunnable;
 972         nrun = 0;
 973         LIST_FOREACH(p, &allproc, p_list) {
 974                 switch (p->p_stat) {
 975                 case SRUN:
 976                 case SIDL:
 977                         nrun++;
 978                 }
 979         }
 980         for (i = 0; i < 3; i++)
 981                 avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
 982                     nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
 983
 984         /*
 985          * Schedule the next update to occur after 5 seconds, but add a
 986          * random variation to avoid synchronisation with processes that
 987          * run at regular intervals.
 988          */
 989         callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
 990             loadav, NULL);
 991 }
 992
 993 /* ARGSUSED */
 994 static void
 995 sched_setup(dummy)
 996         void *dummy;
 997 {
 998
 999         callout_init(&loadav_callout);
1000
1001         /* Kick off timeout driven events by calling first time. */
1002         roundrobin(NULL);
1003         schedcpu(NULL);
1004         loadav(NULL);
1005 }
1006
1007 /*
1008  * We adjust the priority of the current process.  The priority of
1009  * a process gets worse as it accumulates CPU time.  The cpu usage
1010  * estimator (p_estcpu) is increased here.  resetpriority() will
1011  * compute a different priority each time p_estcpu increases by
1012  * INVERSE_ESTCPU_WEIGHT
1013  * (until MAXPRI is reached).  The cpu usage estimator ramps up
1014  * quite quickly when the process is running (linearly), and decays
1015  * away exponentially, at a rate which is proportionally slower when
1016  * the system is busy.  The basic principle is that the system will
1017  * 90% forget that the process used a lot of CPU time in 5 * loadav
1018  * seconds.  This causes the system to favor processes which haven't
1019  * run much recently, and to round-robin among other processes.
1020  */
1021 void
1022 schedclock(p)
1023         struct proc *p;
1024 {
1025
1026         p->p_cpticks++;
1027         p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
1028         if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
1029                 resetpriority(p);
1030                 if (p->p_priority >= PUSER)
1031                         p->p_priority = p->p_usrpri;
1032         }
1033 }