sys/kern/kern_synch.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)kern_synch.c        8.9 (Berkeley) 5/19/95
  39  * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
  40  * $DragonFly: src/sys/kern/kern_synch.c,v 1.8 2003/06/25 03:55:57 dillon Exp $
  41  */
  42
  43 #include "opt_ktrace.h"
  44
  45 #include <sys/param.h>
  46 #include <sys/systm.h>
  47 #include <sys/proc.h>
  48 #include <sys/kernel.h>
  49 #include <sys/signalvar.h>
  50 #include <sys/resourcevar.h>
  51 #include <sys/vmmeter.h>
  52 #include <sys/sysctl.h>
  53 #ifdef KTRACE
  54 #include <sys/uio.h>
  55 #include <sys/ktrace.h>
  56 #endif
  57 #include <sys/xwait.h>
  58
  59 #include <machine/cpu.h>
  60 #include <machine/ipl.h>
  61 #include <machine/smp.h>
  62
  63 static void sched_setup __P((void *dummy));
  64 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
  65
  66 u_char  curpriority;
  67 int     hogticks;
  68 int     lbolt;
  69 int     sched_quantum;          /* Roundrobin scheduling quantum in ticks. */
  70
  71 static struct callout loadav_callout;
  72
  73 struct loadavg averunnable =
  74         { {0, 0, 0}, FSCALE };  /* load average, of runnable procs */
  75 /*
  76  * Constants for averages over 1, 5, and 15 minutes
  77  * when sampling at 5 second intervals.
  78  */
  79 static fixpt_t cexp[3] = {
  80         0.9200444146293232 * FSCALE,    /* exp(-1/12) */
  81         0.9834714538216174 * FSCALE,    /* exp(-1/60) */
  82         0.9944598480048967 * FSCALE,    /* exp(-1/180) */
  83 };
  84
  85 static int      curpriority_cmp __P((struct proc *p));
  86 static void     endtsleep __P((void *));
  87 static void     loadav __P((void *arg));
  88 static void     maybe_resched __P((struct proc *chk));
  89 static void     roundrobin __P((void *arg));
  90 static void     schedcpu __P((void *arg));
  91 static void     updatepri __P((struct proc *p));
  92
  93 static int
  94 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
  95 {
  96         int error, new_val;
  97
  98         new_val = sched_quantum * tick;
  99         error = sysctl_handle_int(oidp, &new_val, 0, req);
 100         if (error != 0 || req->newptr == NULL)
 101                 return (error);
 102         if (new_val < tick)
 103                 return (EINVAL);
 104         sched_quantum = new_val / tick;
 105         hogticks = 2 * sched_quantum;
 106         return (0);
 107 }
 108
 109 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
 110         0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
 111
 112 /*-
 113  * Compare priorities.  Return:
 114  *     <0: priority of p < current priority
 115  *      0: priority of p == current priority
 116  *     >0: priority of p > current priority
 117  * The priorities are the normal priorities or the normal realtime priorities
 118  * if p is on the same scheduler as curproc.  Otherwise the process on the
 119  * more realtimeish scheduler has lowest priority.  As usual, a higher
 120  * priority really means a lower priority.
 121  */
 122 static int
 123 curpriority_cmp(p)
 124         struct proc *p;
 125 {
 126         int c_class, p_class;
 127
 128         c_class = RTP_PRIO_BASE(curproc->p_rtprio.type);
 129         p_class = RTP_PRIO_BASE(p->p_rtprio.type);
 130         if (p_class != c_class)
 131                 return (p_class - c_class);
 132         if (p_class == RTP_PRIO_NORMAL)
 133                 return (((int)p->p_priority - (int)curpriority) / PPQ);
 134         return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio);
 135 }
 136
 137 /*
 138  * Arrange to reschedule if necessary, taking the priorities and
 139  * schedulers into account.
 140  */
 141 static void
 142 maybe_resched(chk)
 143         struct proc *chk;
 144 {
 145         struct proc *p = curproc; /* XXX */
 146
 147         /*
 148          * XXX idle scheduler still broken because proccess stays on idle
 149          * scheduler during waits (such as when getting FS locks).  If a
 150          * standard process becomes runaway cpu-bound, the system can lockup
 151          * due to idle-scheduler processes in wakeup never getting any cpu.
 152          */
 153         if (p == NULL) {
 154 #if 0
 155                 need_resched();
 156 #endif
 157         } else if (chk == p) {
 158                 /* We may need to yield if our priority has been raised. */
 159                 if (curpriority_cmp(chk) > 0)
 160                         need_resched();
 161         } else if (curpriority_cmp(chk) < 0)
 162                 need_resched();
 163 }
 164
 165 int
 166 roundrobin_interval(void)
 167 {
 168         return (sched_quantum);
 169 }
 170
 171 /*
 172  * Force switch among equal priority processes every 100ms.
 173  */
 174 /* ARGSUSED */
 175 static void
 176 roundrobin(arg)
 177         void *arg;
 178 {
 179 #ifndef SMP
 180         struct proc *p = curproc; /* XXX */
 181 #endif
 182
 183 #ifdef SMP
 184         need_resched();
 185         forward_roundrobin();
 186 #else
 187         if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
 188                 need_resched();
 189 #endif
 190
 191         timeout(roundrobin, NULL, sched_quantum);
 192 }
 193
 194 /*
 195  * Constants for digital decay and forget:
 196  *      90% of (p_estcpu) usage in 5 * loadav time
 197  *      95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 198  *          Note that, as ps(1) mentions, this can let percentages
 199  *          total over 100% (I've seen 137.9% for 3 processes).
 200  *
 201  * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
 202  *
 203  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 204  * That is, the system wants to compute a value of decay such
 205  * that the following for loop:
 206  *      for (i = 0; i < (5 * loadavg); i++)
 207  *              p_estcpu *= decay;
 208  * will compute
 209  *      p_estcpu *= 0.1;
 210  * for all values of loadavg:
 211  *
 212  * Mathematically this loop can be expressed by saying:
 213  *      decay ** (5 * loadavg) ~= .1
 214  *
 215  * The system computes decay as:
 216  *      decay = (2 * loadavg) / (2 * loadavg + 1)
 217  *
 218  * We wish to prove that the system's computation of decay
 219  * will always fulfill the equation:
 220  *      decay ** (5 * loadavg) ~= .1
 221  *
 222  * If we compute b as:
 223  *      b = 2 * loadavg
 224  * then
 225  *      decay = b / (b + 1)
 226  *
 227  * We now need to prove two things:
 228  *      1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 229  *      2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 230  *
 231  * Facts:
 232  *         For x close to zero, exp(x) =~ 1 + x, since
 233  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 234  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 235  *         For x close to zero, ln(1+x) =~ x, since
 236  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 237  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 238  *         ln(.1) =~ -2.30
 239  *
 240  * Proof of (1):
 241  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 242  *      solving for factor,
 243  *      ln(factor) =~ (-2.30/5*loadav), or
 244  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 245  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 246  *
 247  * Proof of (2):
 248  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 249  *      solving for power,
 250  *      power*ln(b/(b+1)) =~ -2.30, or
 251  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 252  *
 253  * Actual power values for the implemented algorithm are as follows:
 254  *      loadav: 1       2       3       4
 255  *      power:  5.68    10.32   14.94   19.55
 256  */
 257
 258 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
 259 #define loadfactor(loadav)      (2 * (loadav))
 260 #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
 261
 262 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 263 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
 264 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 265
 266 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
 267 static int      fscale __unused = FSCALE;
 268 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
 269
 270 /*
 271  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 272  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 273  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 274  *
 275  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 276  *      1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 277  *
 278  * If you don't want to bother with the faster/more-accurate formula, you
 279  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 280  * (more general) method of calculating the %age of CPU used by a process.
 281  */
 282 #define CCPU_SHIFT      11
 283
 284 /*
 285  * Recompute process priorities, every hz ticks.
 286  */
 287 /* ARGSUSED */
 288 static void
 289 schedcpu(arg)
 290         void *arg;
 291 {
 292         register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 293         register struct proc *p;
 294         register int realstathz, s;
 295
 296         realstathz = stathz ? stathz : hz;
 297         LIST_FOREACH(p, &allproc, p_list) {
 298                 /*
 299                  * Increment time in/out of memory and sleep time
 300                  * (if sleeping).  We ignore overflow; with 16-bit int's
 301                  * (remember them?) overflow takes 45 days.
 302                  */
 303                 p->p_swtime++;
 304                 if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
 305                         p->p_slptime++;
 306                 p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
 307                 /*
 308                  * If the process has slept the entire second,
 309                  * stop recalculating its priority until it wakes up.
 310                  */
 311                 if (p->p_slptime > 1)
 312                         continue;
 313                 s = splhigh();  /* prevent state changes and protect run queue */
 314                 /*
 315                  * p_pctcpu is only for ps.
 316                  */
 317 #if     (FSHIFT >= CCPU_SHIFT)
 318                 p->p_pctcpu += (realstathz == 100)?
 319                         ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
 320                         100 * (((fixpt_t) p->p_cpticks)
 321                                 << (FSHIFT - CCPU_SHIFT)) / realstathz;
 322 #else
 323                 p->p_pctcpu += ((FSCALE - ccpu) *
 324                         (p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
 325 #endif
 326                 p->p_cpticks = 0;
 327                 p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
 328                 resetpriority(p);
 329                 if (p->p_priority >= PUSER) {
 330                         if ((p != curproc) &&
 331 #ifdef SMP
 332                             p->p_oncpu == 0xff &&       /* idle */
 333 #endif
 334                             p->p_stat == SRUN &&
 335                             (p->p_flag & P_INMEM) &&
 336                             (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
 337                                 remrunqueue(p);
 338                                 p->p_priority = p->p_usrpri;
 339                                 setrunqueue(p);
 340                         } else {
 341                                 p->p_priority = p->p_usrpri;
 342                         }
 343                 }
 344                 splx(s);
 345         }
 346         wakeup((caddr_t)&lbolt);
 347         timeout(schedcpu, (void *)0, hz);
 348 }
 349
 350 /*
 351  * Recalculate the priority of a process after it has slept for a while.
 352  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 353  * least six times the loadfactor will decay p_estcpu to zero.
 354  */
 355 static void
 356 updatepri(p)
 357         register struct proc *p;
 358 {
 359         register unsigned int newcpu = p->p_estcpu;
 360         register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 361
 362         if (p->p_slptime > 5 * loadfac)
 363                 p->p_estcpu = 0;
 364         else {
 365                 p->p_slptime--; /* the first time was done in schedcpu */
 366                 while (newcpu && --p->p_slptime)
 367                         newcpu = decay_cpu(loadfac, newcpu);
 368                 p->p_estcpu = newcpu;
 369         }
 370         resetpriority(p);
 371 }
 372
 373 /*
 374  * We're only looking at 7 bits of the address; everything is
 375  * aligned to 4, lots of things are aligned to greater powers
 376  * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 377  */
 378 #define TABLESIZE       128
 379 static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
 380 #define LOOKUP(x)       (((intptr_t)(x) >> 8) & (TABLESIZE - 1))
 381
 382 /*
 383  * During autoconfiguration or after a panic, a sleep will simply
 384  * lower the priority briefly to allow interrupts, then return.
 385  * The priority to be used (safepri) is machine-dependent, thus this
 386  * value is initialized and maintained in the machine-dependent layers.
 387  * This priority will typically be 0, or the lowest priority
 388  * that is safe for use on the interrupt stack; it can be made
 389  * higher to block network software interrupts after panics.
 390  */
 391 int safepri;
 392
 393 void
 394 sleepinit(void)
 395 {
 396         int i;
 397
 398         sched_quantum = hz/10;
 399         hogticks = 2 * sched_quantum;
 400         for (i = 0; i < TABLESIZE; i++)
 401                 TAILQ_INIT(&slpque[i]);
 402 }
 403
 404 /*
 405  * General sleep call.  Suspends the current process until a wakeup is
 406  * performed on the specified identifier.  The process will then be made
 407  * runnable with the specified priority.  Sleeps at most timo/hz seconds
 408  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
 409  * before and after sleeping, else signals are not checked.  Returns 0 if
 410  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 411  * signal needs to be delivered, ERESTART is returned if the current system
 412  * call should be restarted if possible, and EINTR is returned if the system
 413  * call should be interrupted by the signal (return EINTR).
 414  */
 415 int
 416 tsleep(ident, priority, wmesg, timo)
 417         void *ident;
 418         int priority, timo;
 419         const char *wmesg;
 420 {
 421         struct thread *td = curthread;
 422         struct proc *p = td->td_proc;
 423         int s, sig, catch = priority & PCATCH;
 424         int id = LOOKUP(ident);
 425         struct callout_handle thandle;
 426
 427 #ifdef KTRACE
 428         if (KTRPOINT(td, KTR_CSW))
 429                 ktrcsw(p->p_tracep, 1, 0);
 430 #endif
 431         s = splhigh();
 432
 433         if (cold || panicstr) {
 434                 /*
 435                  * After a panic, or during autoconfiguration,
 436                  * just give interrupts a chance, then just return;
 437                  * don't run any other procs or panic below,
 438                  * in case this is the idle process and already asleep.
 439                  */
 440                 splx(safepri);
 441                 splx(s);
 442                 return (0);
 443         }
 444         KASSERT(p != NULL, ("tsleep1"));
 445         KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep %p %s %d", ident, wmesg, p->p_stat));
 446
 447         p->p_wchan = ident;
 448         p->p_wmesg = wmesg;
 449         p->p_slptime = 0;
 450         p->p_priority = priority & PRIMASK;
 451         TAILQ_INSERT_TAIL(&slpque[id], p, p_procq);
 452         if (timo)
 453                 thandle = timeout(endtsleep, (void *)p, timo);
 454         /*
 455          * We put ourselves on the sleep queue and start our timeout
 456          * before calling CURSIG, as we could stop there, and a wakeup
 457          * or a SIGCONT (or both) could occur while we were stopped.
 458          * A SIGCONT would cause us to be marked as SSLEEP
 459          * without resuming us, thus we must be ready for sleep
 460          * when CURSIG is called.  If the wakeup happens while we're
 461          * stopped, p->p_wchan will be 0 upon return from CURSIG.
 462          */
 463         if (catch) {
 464                 p->p_flag |= P_SINTR;
 465                 if ((sig = CURSIG(p))) {
 466                         if (p->p_wchan)
 467                                 unsleep(p);
 468                         p->p_stat = SRUN;
 469                         goto resume;
 470                 }
 471                 if (p->p_wchan == 0) {
 472                         catch = 0;
 473                         goto resume;
 474                 }
 475         } else
 476                 sig = 0;
 477         p->p_stat = SSLEEP;
 478         p->p_stats->p_ru.ru_nvcsw++;
 479         mi_switch();
 480 resume:
 481         curpriority = p->p_usrpri;
 482         splx(s);
 483         p->p_flag &= ~P_SINTR;
 484         if (p->p_flag & P_TIMEOUT) {
 485                 p->p_flag &= ~P_TIMEOUT;
 486                 if (sig == 0) {
 487 #ifdef KTRACE
 488                         if (KTRPOINT(td, KTR_CSW))
 489                                 ktrcsw(p->p_tracep, 0, 0);
 490 #endif
 491                         return (EWOULDBLOCK);
 492                 }
 493         } else if (timo)
 494                 untimeout(endtsleep, (void *)p, thandle);
 495         if (catch && (sig != 0 || (sig = CURSIG(p)))) {
 496 #ifdef KTRACE
 497                 if (KTRPOINT(td, KTR_CSW))
 498                         ktrcsw(p->p_tracep, 0, 0);
 499 #endif
 500                 if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
 501                         return (EINTR);
 502                 return (ERESTART);
 503         }
 504 #ifdef KTRACE
 505         if (KTRPOINT(td, KTR_CSW))
 506                 ktrcsw(p->p_tracep, 0, 0);
 507 #endif
 508         return (0);
 509 }
 510
 511 /*
 512  * General sleep call.  Suspends the current process until a wakeup is
 513  * performed on the specified xwait structure.  The process will then be made
 514  * runnable with the specified priority.  Sleeps at most timo/hz seconds
 515  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
 516  * before and after sleeping, else signals are not checked.  Returns 0 if
 517  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 518  * signal needs to be delivered, ERESTART is returned if the current system
 519  * call should be restarted if possible, and EINTR is returned if the system
 520  * call should be interrupted by the signal (return EINTR).
 521  *
 522  * If the passed generation number is different from the generation number
 523  * in the xwait, return immediately.
 524  */
 525 int
 526 xsleep(struct xwait *w, int priority, const char *wmesg, int timo, int *gen)
 527 {
 528         struct thread *td = curthread;
 529         struct proc *p = td->td_proc;
 530         int s, sig, catch = priority & PCATCH;
 531         struct callout_handle thandle;
 532
 533 #ifdef KTRACE
 534         if (KTRPOINT(td, KTR_CSW))
 535                 ktrcsw(p->p_tracep, 1, 0);
 536 #endif
 537         s = splhigh();
 538
 539         if (cold || panicstr) {
 540                 /*
 541                  * After a panic, or during autoconfiguration,
 542                  * just give interrupts a chance, then just return;
 543                  * don't run any other procs or panic below,
 544                  * in case this is the idle process and already asleep.
 545                  */
 546                 splx(safepri);
 547                 splx(s);
 548                 return (0);
 549         }
 550         KASSERT(p != NULL, ("xsleep1"));
 551         KASSERT(w != NULL && p->p_stat == SRUN, ("xsleep"));
 552
 553         /*
 554          * If the generation number does not match we return immediately.
 555          */
 556         if (*gen != w->gen) {
 557                 *gen = w->gen;
 558                 splx(s);
 559 #ifdef KTRACE
 560                 if (KTRPOINT(td, KTR_CSW))
 561                         ktrcsw(p->p_tracep, 0, 0);
 562 #endif
 563                 return(0);
 564         }
 565
 566         p->p_wchan = w;
 567         p->p_wmesg = wmesg;
 568         p->p_slptime = 0;
 569         p->p_priority = priority & PRIMASK;
 570         p->p_flag |= P_XSLEEP;
 571         TAILQ_INSERT_TAIL(&w->waitq, p, p_procq);
 572         if (timo)
 573                 thandle = timeout(endtsleep, (void *)p, timo);
 574         /*
 575          * We put ourselves on the sleep queue and start our timeout
 576          * before calling CURSIG, as we could stop there, and a wakeup
 577          * or a SIGCONT (or both) could occur while we were stopped.
 578          * A SIGCONT would cause us to be marked as SSLEEP
 579          * without resuming us, thus we must be ready for sleep
 580          * when CURSIG is called.  If the wakeup happens while we're
 581          * stopped, p->p_wchan will be 0 upon return from CURSIG.
 582          */
 583         if (catch) {
 584                 p->p_flag |= P_SINTR;
 585                 if ((sig = CURSIG(p))) {
 586                         if (p->p_wchan)
 587                                 unsleep(p);
 588                         p->p_stat = SRUN;
 589                         goto resume;
 590                 }
 591                 if (p->p_wchan == NULL) {
 592                         catch = 0;
 593                         goto resume;
 594                 }
 595         } else
 596                 sig = 0;
 597         p->p_stat = SSLEEP;
 598         p->p_stats->p_ru.ru_nvcsw++;
 599         mi_switch();
 600 resume:
 601         curpriority = p->p_usrpri;
 602         *gen = w->gen;  /* update generation number */
 603         splx(s);
 604         p->p_flag &= ~P_SINTR;
 605         if (p->p_flag & P_TIMEOUT) {
 606                 p->p_flag &= ~P_TIMEOUT;
 607                 if (sig == 0) {
 608 #ifdef KTRACE
 609                         if (KTRPOINT(td, KTR_CSW))
 610                                 ktrcsw(p->p_tracep, 0, 0);
 611 #endif
 612                         return (EWOULDBLOCK);
 613                 }
 614         } else if (timo)
 615                 untimeout(endtsleep, (void *)p, thandle);
 616         if (catch && (sig != 0 || (sig = CURSIG(p)))) {
 617 #ifdef KTRACE
 618                 if (KTRPOINT(td, KTR_CSW))
 619                         ktrcsw(p->p_tracep, 0, 0);
 620 #endif
 621                 if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
 622                         return (EINTR);
 623                 return (ERESTART);
 624         }
 625 #ifdef KTRACE
 626         if (KTRPOINT(td, KTR_CSW))
 627                 ktrcsw(p->p_tracep, 0, 0);
 628 #endif
 629         return (0);
 630 }
 631
 632 /*
 633  * Implement timeout for tsleep or xsleep
 634  *
 635  * If process hasn't been awakened (wchan non-zero),
 636  * set timeout flag and undo the sleep.  If proc
 637  * is stopped, just unsleep so it will remain stopped.
 638  */
 639 static void
 640 endtsleep(arg)
 641         void *arg;
 642 {
 643         register struct proc *p;
 644         int s;
 645
 646         p = (struct proc *)arg;
 647         s = splhigh();
 648         if (p->p_wchan) {
 649                 if (p->p_stat == SSLEEP)
 650                         setrunnable(p);
 651                 else
 652                         unsleep(p);
 653                 p->p_flag |= P_TIMEOUT;
 654         }
 655         splx(s);
 656 }
 657
 658 /*
 659  * Remove a process from its wait queue
 660  */
 661 void
 662 unsleep(p)
 663         register struct proc *p;
 664 {
 665         int s;
 666
 667         s = splhigh();
 668         if (p->p_wchan) {
 669                 if (p->p_flag & P_XSLEEP) {
 670                         struct xwait *w = p->p_wchan;
 671                         TAILQ_REMOVE(&w->waitq, p, p_procq);
 672                         p->p_flag &= ~P_XSLEEP;
 673                 } else {
 674                         TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
 675                 }
 676                 p->p_wchan = NULL;
 677         }
 678         splx(s);
 679 }
 680
 681 /*
 682  * Make all processes sleeping on the explicit lock structure runnable.
 683  */
 684 void
 685 xwakeup(struct xwait *w)
 686 {
 687         struct proc *p;
 688         int s;
 689
 690         s = splhigh();
 691         ++w->gen;
 692         while ((p = TAILQ_FIRST(&w->waitq)) != NULL) {
 693                 TAILQ_REMOVE(&w->waitq, p, p_procq);
 694                 KASSERT(p->p_wchan == w && (p->p_flag & P_XSLEEP),
 695                     ("xwakeup: wchan mismatch for %p (%p/%p) %08x", p, p->p_wchan, w, p->p_flag & P_XSLEEP));
 696                 p->p_wchan = NULL;
 697                 p->p_flag &= ~P_XSLEEP;
 698                 if (p->p_stat == SSLEEP) {
 699                         /* OPTIMIZED EXPANSION OF setrunnable(p); */
 700                         if (p->p_slptime > 1)
 701                                 updatepri(p);
 702                         p->p_slptime = 0;
 703                         p->p_stat = SRUN;
 704                         if (p->p_flag & P_INMEM) {
 705                                 setrunqueue(p);
 706                                 maybe_resched(p);
 707                         } else {
 708                                 p->p_flag |= P_SWAPINREQ;
 709                                 wakeup((caddr_t)&proc0);
 710                         }
 711                 }
 712         }
 713         splx(s);
 714 }
 715
 716 /*
 717  * Make all processes sleeping on the specified identifier runnable.
 718  */
 719 void
 720 wakeup(ident)
 721         register void *ident;
 722 {
 723         register struct slpquehead *qp;
 724         register struct proc *p;
 725         struct proc *np;
 726         int s;
 727         int id = LOOKUP(ident);
 728
 729         s = splhigh();
 730         qp = &slpque[id];
 731 restart:
 732         for (p = TAILQ_FIRST(qp); p != NULL; p = np) {
 733                 np = TAILQ_NEXT(p, p_procq);
 734                 if (p->p_wchan == ident) {
 735                         TAILQ_REMOVE(qp, p, p_procq);
 736                         p->p_wchan = NULL;
 737                         if (p->p_stat == SSLEEP) {
 738                                 /* OPTIMIZED EXPANSION OF setrunnable(p); */
 739                                 if (p->p_slptime > 1)
 740                                         updatepri(p);
 741                                 p->p_slptime = 0;
 742                                 p->p_stat = SRUN;
 743                                 if (p->p_flag & P_INMEM) {
 744                                         setrunqueue(p);
 745                                         maybe_resched(p);
 746                                 } else {
 747                                         p->p_flag |= P_SWAPINREQ;
 748                                         wakeup((caddr_t)&proc0);
 749                                 }
 750                                 /* END INLINE EXPANSION */
 751                                 goto restart;
 752                         }
 753                 }
 754         }
 755         splx(s);
 756 }
 757
 758 /*
 759  * Make a process sleeping on the specified identifier runnable.
 760  * May wake more than one process if a target process is currently
 761  * swapped out.
 762  */
 763 void
 764 wakeup_one(ident)
 765         register void *ident;
 766 {
 767         register struct slpquehead *qp;
 768         register struct proc *p;
 769         struct proc *np;
 770         int s;
 771         int id = LOOKUP(ident);
 772
 773         s = splhigh();
 774         qp = &slpque[id];
 775
 776 restart:
 777         for (p = TAILQ_FIRST(qp); p != NULL; p = np) {
 778                 np = TAILQ_NEXT(p, p_procq);
 779                 if (p->p_wchan == ident) {
 780                         TAILQ_REMOVE(qp, p, p_procq);
 781                         p->p_wchan = 0;
 782                         if (p->p_stat == SSLEEP) {
 783                                 /* OPTIMIZED EXPANSION OF setrunnable(p); */
 784                                 if (p->p_slptime > 1)
 785                                         updatepri(p);
 786                                 p->p_slptime = 0;
 787                                 p->p_stat = SRUN;
 788                                 if (p->p_flag & P_INMEM) {
 789                                         setrunqueue(p);
 790                                         maybe_resched(p);
 791                                         break;
 792                                 } else {
 793                                         p->p_flag |= P_SWAPINREQ;
 794                                         wakeup((caddr_t)&proc0);
 795                                 }
 796                                 /* END INLINE EXPANSION */
 797                                 goto restart;
 798                         }
 799                 }
 800         }
 801         splx(s);
 802 }
 803
 804 /*
 805  * The machine independent parts of mi_switch().
 806  * Must be called at splstatclock() or higher.
 807  */
 808 void
 809 mi_switch()
 810 {
 811         struct thread *td = curthread;
 812         struct proc *p = td->td_proc;   /* XXX */
 813         struct rlimit *rlim;
 814         int x;
 815         u_int64_t ttime;
 816
 817         /*
 818          * XXX this spl is almost unnecessary.  It is partly to allow for
 819          * sloppy callers that don't do it (issignal() via CURSIG() is the
 820          * main offender).  It is partly to work around a bug in the i386
 821          * cpu_switch() (the ipl is not preserved).  We ran for years
 822          * without it.  I think there was only a interrupt latency problem.
 823          * The main caller, tsleep(), does an splx() a couple of instructions
 824          * after calling here.  The buggy caller, issignal(), usually calls
 825          * here at spl0() and sometimes returns at splhigh().  The process
 826          * then runs for a little too long at splhigh().  The ipl gets fixed
 827          * when the process returns to user mode (or earlier).
 828          *
 829          * It would probably be better to always call here at spl0(). Callers
 830          * are prepared to give up control to another process, so they must
 831          * be prepared to be interrupted.  The clock stuff here may not
 832          * actually need splstatclock().
 833          */
 834         x = splstatclock();
 835         clear_resched();
 836
 837 #ifdef SIMPLELOCK_DEBUG
 838         if (p->p_simple_locks)
 839                 printf("sleep: holding simple lock\n");
 840 #endif
 841
 842         /*
 843          * Check if the process exceeds its cpu resource allocation.
 844          * If over max, kill it.  Time spent in interrupts is not
 845          * included.  YYY 64 bit match is expensive.  Ick.
 846          */
 847         ttime = td->td_sticks + td->td_uticks;
 848         if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
 849             ttime > p->p_limit->p_cpulimit) {
 850                 rlim = &p->p_rlimit[RLIMIT_CPU];
 851                 if (ttime / (rlim_t)1000000 >= rlim->rlim_max) {
 852                         killproc(p, "exceeded maximum CPU limit");
 853                 } else {
 854                         psignal(p, SIGXCPU);
 855                         if (rlim->rlim_cur < rlim->rlim_max) {
 856                                 /* XXX: we should make a private copy */
 857                                 rlim->rlim_cur += 5;
 858                         }
 859                 }
 860         }
 861
 862         /*
 863          * Pick a new current process and record its start time.
 864          * YYY lwkt_switch() will run the heavy weight process restoration
 865          * code, which removes the target thread and process from their
 866          * respective run queues to temporarily mimic 5.x behavior.
 867          * YYY the userland scheduler should pick only one user process
 868          * at a time to run per cpu.
 869          */
 870         cnt.v_swtch++;
 871         lwkt_switch();
 872         remrunqueue(p);
 873
 874         splx(x);
 875 }
 876
 877 /*
 878  * Change process state to be runnable,
 879  * placing it on the run queue if it is in memory,
 880  * and awakening the swapper if it isn't in memory.
 881  */
 882 void
 883 setrunnable(p)
 884         register struct proc *p;
 885 {
 886         register int s;
 887
 888         s = splhigh();
 889         switch (p->p_stat) {
 890         case 0:
 891         case SRUN:
 892         case SZOMB:
 893         default:
 894                 panic("setrunnable");
 895         case SSTOP:
 896         case SSLEEP:
 897                 unsleep(p);             /* e.g. when sending signals */
 898                 break;
 899
 900         case SIDL:
 901                 break;
 902         }
 903         p->p_stat = SRUN;
 904         if (p->p_flag & P_INMEM)
 905                 setrunqueue(p);
 906         splx(s);
 907         if (p->p_slptime > 1)
 908                 updatepri(p);
 909         p->p_slptime = 0;
 910         if ((p->p_flag & P_INMEM) == 0) {
 911                 p->p_flag |= P_SWAPINREQ;
 912                 wakeup((caddr_t)&proc0);
 913         }
 914         else
 915                 maybe_resched(p);
 916 }
 917
 918 /*
 919  * Compute the priority of a process when running in user mode.
 920  * Arrange to reschedule if the resulting priority is better
 921  * than that of the current process.
 922  */
 923 void
 924 resetpriority(p)
 925         register struct proc *p;
 926 {
 927         register unsigned int newpriority;
 928
 929         if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
 930                 newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
 931                     NICE_WEIGHT * p->p_nice;
 932                 newpriority = min(newpriority, MAXPRI);
 933                 p->p_usrpri = newpriority;
 934         }
 935         maybe_resched(p);
 936 }
 937
 938 /*
 939  * Compute a tenex style load average of a quantity on
 940  * 1, 5 and 15 minute intervals.
 941  */
 942 static void
 943 loadav(void *arg)
 944 {
 945         int i, nrun;
 946         struct loadavg *avg;
 947         struct proc *p;
 948
 949         avg = &averunnable;
 950         nrun = 0;
 951         LIST_FOREACH(p, &allproc, p_list) {
 952                 switch (p->p_stat) {
 953                 case SRUN:
 954                 case SIDL:
 955                         nrun++;
 956                 }
 957         }
 958         for (i = 0; i < 3; i++)
 959                 avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
 960                     nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
 961
 962         /*
 963          * Schedule the next update to occur after 5 seconds, but add a
 964          * random variation to avoid synchronisation with processes that
 965          * run at regular intervals.
 966          */
 967         callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
 968             loadav, NULL);
 969 }
 970
 971 /* ARGSUSED */
 972 static void
 973 sched_setup(dummy)
 974         void *dummy;
 975 {
 976
 977         callout_init(&loadav_callout);
 978
 979         /* Kick off timeout driven events by calling first time. */
 980         roundrobin(NULL);
 981         schedcpu(NULL);
 982         loadav(NULL);
 983 }
 984
 985 /*
 986  * We adjust the priority of the current process.  The priority of
 987  * a process gets worse as it accumulates CPU time.  The cpu usage
 988  * estimator (p_estcpu) is increased here.  resetpriority() will
 989  * compute a different priority each time p_estcpu increases by
 990  * INVERSE_ESTCPU_WEIGHT
 991  * (until MAXPRI is reached).  The cpu usage estimator ramps up
 992  * quite quickly when the process is running (linearly), and decays
 993  * away exponentially, at a rate which is proportionally slower when
 994  * the system is busy.  The basic principle is that the system will
 995  * 90% forget that the process used a lot of CPU time in 5 * loadav
 996  * seconds.  This causes the system to favor processes which haven't
 997  * run much recently, and to round-robin among other processes.
 998  */
 999 void
1000 schedclock(p)
1001         struct proc *p;
1002 {
1003
1004         p->p_cpticks++;
1005         p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
1006         if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
1007                 resetpriority(p);
1008                 if (p->p_priority >= PUSER)
1009                         p->p_priority = p->p_usrpri;
1010         }
1011 }