sys/kern/kern_synch.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)kern_synch.c        8.9 (Berkeley) 5/19/95
  39  * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
  40  * $DragonFly: src/sys/kern/kern_synch.c,v 1.91 2008/09/09 04:06:13 dillon Exp $
  41  */
  42
  43 #include "opt_ktrace.h"
  44
  45 #include <sys/param.h>
  46 #include <sys/systm.h>
  47 #include <sys/proc.h>
  48 #include <sys/kernel.h>
  49 #include <sys/signalvar.h>
  50 #include <sys/resourcevar.h>
  51 #include <sys/vmmeter.h>
  52 #include <sys/sysctl.h>
  53 #include <sys/lock.h>
  54 #include <sys/uio.h>
  55 #ifdef KTRACE
  56 #include <sys/ktrace.h>
  57 #endif
  58 #include <sys/xwait.h>
  59 #include <sys/ktr.h>
  60 #include <sys/serialize.h>
  61
  62 #include <sys/signal2.h>
  63 #include <sys/thread2.h>
  64 #include <sys/spinlock2.h>
  65 #include <sys/mutex2.h>
  66
  67 #include <machine/cpu.h>
  68 #include <machine/smp.h>
  69
  70 TAILQ_HEAD(tslpque, thread);
  71
  72 static void sched_setup (void *dummy);
  73 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
  74
  75 int     hogticks;
  76 int     lbolt;
  77 int     lbolt_syncer;
  78 int     sched_quantum;          /* Roundrobin scheduling quantum in ticks. */
  79 int     ncpus;
  80 int     ncpus2, ncpus2_shift, ncpus2_mask;      /* note: mask not cpumask_t */
  81 int     ncpus_fit, ncpus_fit_mask;              /* note: mask not cpumask_t */
  82 int     safepri;
  83 int     tsleep_now_works;
  84 int     tsleep_crypto_dump = 0;
  85
  86 static struct callout loadav_callout;
  87 static struct callout schedcpu_callout;
  88 MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues");
  89
  90 #define __DEALL(ident)  __DEQUALIFY(void *, ident)
  91
  92 #if !defined(KTR_TSLEEP)
  93 #define KTR_TSLEEP      KTR_ALL
  94 #endif
  95 KTR_INFO_MASTER(tsleep);
  96 KTR_INFO(KTR_TSLEEP, tsleep, tsleep_beg, 0, "tsleep enter %p", sizeof(void *));
  97 KTR_INFO(KTR_TSLEEP, tsleep, tsleep_end, 1, "tsleep exit", 0);
  98 KTR_INFO(KTR_TSLEEP, tsleep, wakeup_beg, 2, "wakeup enter %p", sizeof(void *));
  99 KTR_INFO(KTR_TSLEEP, tsleep, wakeup_end, 3, "wakeup exit", 0);
 100 KTR_INFO(KTR_TSLEEP, tsleep, ilockfail,  4, "interlock failed %p", sizeof(void *));
 101
 102 #define logtsleep1(name)        KTR_LOG(tsleep_ ## name)
 103 #define logtsleep2(name, val)   KTR_LOG(tsleep_ ## name, val)
 104
 105 struct loadavg averunnable =
 106         { {0, 0, 0}, FSCALE };  /* load average, of runnable procs */
 107 /*
 108  * Constants for averages over 1, 5, and 15 minutes
 109  * when sampling at 5 second intervals.
 110  */
 111 static fixpt_t cexp[3] = {
 112         0.9200444146293232 * FSCALE,    /* exp(-1/12) */
 113         0.9834714538216174 * FSCALE,    /* exp(-1/60) */
 114         0.9944598480048967 * FSCALE,    /* exp(-1/180) */
 115 };
 116
 117 static void     endtsleep (void *);
 118 static void     loadav (void *arg);
 119 static void     schedcpu (void *arg);
 120 #ifdef SMP
 121 static void     tsleep_wakeup_remote(struct thread *td);
 122 #endif
 123
 124 /*
 125  * Adjust the scheduler quantum.  The quantum is specified in microseconds.
 126  * Note that 'tick' is in microseconds per tick.
 127  */
 128 static int
 129 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
 130 {
 131         int error, new_val;
 132
 133         new_val = sched_quantum * ustick;
 134         error = sysctl_handle_int(oidp, &new_val, 0, req);
 135         if (error != 0 || req->newptr == NULL)
 136                 return (error);
 137         if (new_val < ustick)
 138                 return (EINVAL);
 139         sched_quantum = new_val / ustick;
 140         hogticks = 2 * sched_quantum;
 141         return (0);
 142 }
 143
 144 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
 145         0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
 146
 147 /*
 148  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 149  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 150  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 151  *
 152  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 153  *     1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 154  *
 155  * If you don't want to bother with the faster/more-accurate formula, you
 156  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 157  * (more general) method of calculating the %age of CPU used by a process.
 158  *
 159  * decay 95% of `lwp_pctcpu' in 60 seconds; see CCPU_SHIFT before changing
 160  */
 161 #define CCPU_SHIFT      11
 162
 163 static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
 164 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 165
 166 /*
 167  * kernel uses `FSCALE', userland (SHOULD) use kern.fscale
 168  */
 169 int     fscale __unused = FSCALE;       /* exported to systat */
 170 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
 171
 172 /*
 173  * Recompute process priorities, once a second.
 174  *
 175  * Since the userland schedulers are typically event oriented, if the
 176  * estcpu calculation at wakeup() time is not sufficient to make a
 177  * process runnable relative to other processes in the system we have
 178  * a 1-second recalc to help out.
 179  *
 180  * This code also allows us to store sysclock_t data in the process structure
 181  * without fear of an overrun, since sysclock_t are guarenteed to hold
 182  * several seconds worth of count.
 183  *
 184  * WARNING!  callouts can preempt normal threads.  However, they will not
 185  * preempt a thread holding a spinlock so we *can* safely use spinlocks.
 186  */
 187 static int schedcpu_stats(struct proc *p, void *data __unused);
 188 static int schedcpu_resource(struct proc *p, void *data __unused);
 189
 190 static void
 191 schedcpu(void *arg)
 192 {
 193         allproc_scan(schedcpu_stats, NULL);
 194         allproc_scan(schedcpu_resource, NULL);
 195         wakeup((caddr_t)&lbolt);
 196         wakeup((caddr_t)&lbolt_syncer);
 197         callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
 198 }
 199
 200 /*
 201  * General process statistics once a second
 202  */
 203 static int
 204 schedcpu_stats(struct proc *p, void *data __unused)
 205 {
 206         struct lwp *lp;
 207
 208         /*
 209          * Threads may not be completely set up if process in SIDL state.
 210          */
 211         if (p->p_stat == SIDL)
 212                 return(0);
 213
 214         PHOLD(p);
 215         lwkt_gettoken(&p->p_token);
 216
 217         p->p_swtime++;
 218         FOREACH_LWP_IN_PROC(lp, p) {
 219                 if (lp->lwp_stat == LSSLEEP)
 220                         lp->lwp_slptime++;
 221
 222                 /*
 223                  * Only recalculate processes that are active or have slept
 224                  * less then 2 seconds.  The schedulers understand this.
 225                  */
 226                 if (lp->lwp_slptime <= 1) {
 227                         p->p_usched->recalculate(lp);
 228                 } else {
 229                         lp->lwp_pctcpu = (lp->lwp_pctcpu * ccpu) >> FSHIFT;
 230                 }
 231         }
 232         lwkt_reltoken(&p->p_token);
 233         PRELE(p);
 234         return(0);
 235 }
 236
 237 /*
 238  * Resource checks.  XXX break out since ksignal/killproc can block,
 239  * limiting us to one process killed per second.  There is probably
 240  * a better way.
 241  */
 242 static int
 243 schedcpu_resource(struct proc *p, void *data __unused)
 244 {
 245         u_int64_t ttime;
 246         struct lwp *lp;
 247
 248         if (p->p_stat == SIDL)
 249                 return(0);
 250
 251         PHOLD(p);
 252         lwkt_gettoken(&p->p_token);
 253
 254         if (p->p_stat == SZOMB || p->p_limit == NULL) {
 255                 lwkt_reltoken(&p->p_token);
 256                 PRELE(p);
 257                 return(0);
 258         }
 259
 260         ttime = 0;
 261         FOREACH_LWP_IN_PROC(lp, p) {
 262                 /*
 263                  * We may have caught an lp in the middle of being
 264                  * created, lwp_thread can be NULL.
 265                  */
 266                 if (lp->lwp_thread) {
 267                         ttime += lp->lwp_thread->td_sticks;
 268                         ttime += lp->lwp_thread->td_uticks;
 269                 }
 270         }
 271
 272         switch(plimit_testcpulimit(p->p_limit, ttime)) {
 273         case PLIMIT_TESTCPU_KILL:
 274                 killproc(p, "exceeded maximum CPU limit");
 275                 break;
 276         case PLIMIT_TESTCPU_XCPU:
 277                 if ((p->p_flag & P_XCPU) == 0) {
 278                         p->p_flag |= P_XCPU;
 279                         ksignal(p, SIGXCPU);
 280                 }
 281                 break;
 282         default:
 283                 break;
 284         }
 285         lwkt_reltoken(&p->p_token);
 286         PRELE(p);
 287         return(0);
 288 }
 289
 290 /*
 291  * This is only used by ps.  Generate a cpu percentage use over
 292  * a period of one second.
 293  *
 294  * MPSAFE
 295  */
 296 void
 297 updatepcpu(struct lwp *lp, int cpticks, int ttlticks)
 298 {
 299         fixpt_t acc;
 300         int remticks;
 301
 302         acc = (cpticks << FSHIFT) / ttlticks;
 303         if (ttlticks >= ESTCPUFREQ) {
 304                 lp->lwp_pctcpu = acc;
 305         } else {
 306                 remticks = ESTCPUFREQ - ttlticks;
 307                 lp->lwp_pctcpu = (acc * ttlticks + lp->lwp_pctcpu * remticks) /
 308                                 ESTCPUFREQ;
 309         }
 310 }
 311
 312 /*
 313  * tsleep/wakeup hash table parameters.  Try to find the sweet spot for
 314  * like addresses being slept on.
 315  */
 316 #define TABLESIZE       1024
 317 #define LOOKUP(x)       (((intptr_t)(x) >> 6) & (TABLESIZE - 1))
 318
 319 static cpumask_t slpque_cpumasks[TABLESIZE];
 320
 321 /*
 322  * General scheduler initialization.  We force a reschedule 25 times
 323  * a second by default.  Note that cpu0 is initialized in early boot and
 324  * cannot make any high level calls.
 325  *
 326  * Each cpu has its own sleep queue.
 327  */
 328 void
 329 sleep_gdinit(globaldata_t gd)
 330 {
 331         static struct tslpque slpque_cpu0[TABLESIZE];
 332         int i;
 333
 334         if (gd->gd_cpuid == 0) {
 335                 sched_quantum = (hz + 24) / 25;
 336                 hogticks = 2 * sched_quantum;
 337
 338                 gd->gd_tsleep_hash = slpque_cpu0;
 339         } else {
 340                 gd->gd_tsleep_hash = kmalloc(sizeof(slpque_cpu0),
 341                                             M_TSLEEP, M_WAITOK | M_ZERO);
 342         }
 343         for (i = 0; i < TABLESIZE; ++i)
 344                 TAILQ_INIT(&gd->gd_tsleep_hash[i]);
 345 }
 346
 347 /*
 348  * This is a dandy function that allows us to interlock tsleep/wakeup
 349  * operations with unspecified upper level locks, such as lockmgr locks,
 350  * simply by holding a critical section.  The sequence is:
 351  *
 352  *      (acquire upper level lock)
 353  *      tsleep_interlock(blah)
 354  *      (release upper level lock)
 355  *      tsleep(blah, ...)
 356  *
 357  * Basically this functions queues us on the tsleep queue without actually
 358  * descheduling us.  When tsleep() is later called with PINTERLOCK it
 359  * assumes the thread was already queued, otherwise it queues it there.
 360  *
 361  * Thus it is possible to receive the wakeup prior to going to sleep and
 362  * the race conditions are covered.
 363  */
 364 static __inline void
 365 _tsleep_interlock(globaldata_t gd, const volatile void *ident, int flags)
 366 {
 367         thread_t td = gd->gd_curthread;
 368         int id;
 369
 370         crit_enter_quick(td);
 371         if (td->td_flags & TDF_TSLEEPQ) {
 372                 id = LOOKUP(td->td_wchan);
 373                 TAILQ_REMOVE(&gd->gd_tsleep_hash[id], td, td_sleepq);
 374                 if (TAILQ_FIRST(&gd->gd_tsleep_hash[id]) == NULL)
 375                         atomic_clear_cpumask(&slpque_cpumasks[id], gd->gd_cpumask);
 376         } else {
 377                 td->td_flags |= TDF_TSLEEPQ;
 378         }
 379         id = LOOKUP(ident);
 380         TAILQ_INSERT_TAIL(&gd->gd_tsleep_hash[id], td, td_sleepq);
 381         atomic_set_cpumask(&slpque_cpumasks[id], gd->gd_cpumask);
 382         td->td_wchan = ident;
 383         td->td_wdomain = flags & PDOMAIN_MASK;
 384         crit_exit_quick(td);
 385 }
 386
 387 void
 388 tsleep_interlock(const volatile void *ident, int flags)
 389 {
 390         _tsleep_interlock(mycpu, ident, flags);
 391 }
 392
 393 /*
 394  * Remove thread from sleepq.  Must be called with a critical section held.
 395  */
 396 static __inline void
 397 _tsleep_remove(thread_t td)
 398 {
 399         globaldata_t gd = mycpu;
 400         int id;
 401
 402         KKASSERT(td->td_gd == gd);
 403         if (td->td_flags & TDF_TSLEEPQ) {
 404                 td->td_flags &= ~TDF_TSLEEPQ;
 405                 id = LOOKUP(td->td_wchan);
 406                 TAILQ_REMOVE(&gd->gd_tsleep_hash[id], td, td_sleepq);
 407                 if (TAILQ_FIRST(&gd->gd_tsleep_hash[id]) == NULL)
 408                         atomic_clear_cpumask(&slpque_cpumasks[id], gd->gd_cpumask);
 409                 td->td_wchan = NULL;
 410                 td->td_wdomain = 0;
 411         }
 412 }
 413
 414 void
 415 tsleep_remove(thread_t td)
 416 {
 417         _tsleep_remove(td);
 418 }
 419
 420 /*
 421  * This function removes a thread from the tsleep queue and schedules
 422  * it.  This function may act asynchronously.  The target thread may be
 423  * sleeping on a different cpu.
 424  *
 425  * This function mus be called while in a critical section but if the
 426  * target thread is sleeping on a different cpu we cannot safely probe
 427  * td_flags.
 428  *
 429  * This function is only called from a different cpu via setrunnable()
 430  * when the thread is in a known sleep.  However, multiple wakeups are
 431  * possible and we must hold the td to prevent a race against the thread
 432  * exiting.
 433  */
 434 static __inline
 435 void
 436 _tsleep_wakeup(struct thread *td)
 437 {
 438 #ifdef SMP
 439         globaldata_t gd = mycpu;
 440
 441         if (td->td_gd != gd) {
 442                 lwkt_hold(td);
 443                 lwkt_send_ipiq(td->td_gd, (ipifunc1_t)tsleep_wakeup_remote, td);
 444                 return;
 445         }
 446 #endif
 447         _tsleep_remove(td);
 448         if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
 449                 td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;
 450                 lwkt_schedule(td);
 451         }
 452 }
 453
 454 #ifdef SMP
 455 static
 456 void
 457 tsleep_wakeup_remote(struct thread *td)
 458 {
 459         _tsleep_wakeup(td);
 460         lwkt_rele(td);
 461 }
 462 #endif
 463
 464
 465 /*
 466  * General sleep call.  Suspends the current process until a wakeup is
 467  * performed on the specified identifier.  The process will then be made
 468  * runnable with the specified priority.  Sleeps at most timo/hz seconds
 469  * (0 means no timeout).  If flags includes PCATCH flag, signals are checked
 470  * before and after sleeping, else signals are not checked.  Returns 0 if
 471  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 472  * signal needs to be delivered, ERESTART is returned if the current system
 473  * call should be restarted if possible, and EINTR is returned if the system
 474  * call should be interrupted by the signal (return EINTR).
 475  *
 476  * Note that if we are a process, we release_curproc() before messing with
 477  * the LWKT scheduler.
 478  *
 479  * During autoconfiguration or after a panic, a sleep will simply
 480  * lower the priority briefly to allow interrupts, then return.
 481  */
 482 int
 483 tsleep(const volatile void *ident, int flags, const char *wmesg, int timo)
 484 {
 485         struct thread *td = curthread;
 486         struct lwp *lp = td->td_lwp;
 487         struct proc *p = td->td_proc;           /* may be NULL */
 488         globaldata_t gd;
 489         int sig;
 490         int catch;
 491         int id;
 492         int error;
 493         int oldpri;
 494         struct callout thandle;
 495
 496         /*
 497          * NOTE: removed KTRPOINT, it could cause races due to blocking
 498          * even in stable.  Just scrap it for now.
 499          */
 500         if (!tsleep_crypto_dump && (tsleep_now_works == 0 || panicstr)) {
 501                 /*
 502                  * After a panic, or before we actually have an operational
 503                  * softclock, just give interrupts a chance, then just return;
 504                  *
 505                  * don't run any other procs or panic below,
 506                  * in case this is the idle process and already asleep.
 507                  */
 508                 splz();
 509                 oldpri = td->td_pri;
 510                 lwkt_setpri_self(safepri);
 511                 lwkt_switch();
 512                 lwkt_setpri_self(oldpri);
 513                 return (0);
 514         }
 515         logtsleep2(tsleep_beg, ident);
 516         gd = td->td_gd;
 517         KKASSERT(td != &gd->gd_idlethread);     /* you must be kidding! */
 518
 519         /*
 520          * NOTE: all of this occurs on the current cpu, including any
 521          * callout-based wakeups, so a critical section is a sufficient
 522          * interlock.
 523          *
 524          * The entire sequence through to where we actually sleep must
 525          * run without breaking the critical section.
 526          */
 527         catch = flags & PCATCH;
 528         error = 0;
 529         sig = 0;
 530
 531         crit_enter_quick(td);
 532
 533         KASSERT(ident != NULL, ("tsleep: no ident"));
 534         KASSERT(lp == NULL ||
 535                 lp->lwp_stat == LSRUN ||        /* Obvious */
 536                 lp->lwp_stat == LSSTOP,         /* Set in tstop */
 537                 ("tsleep %p %s %d",
 538                         ident, wmesg, lp->lwp_stat));
 539
 540         /*
 541          * We interlock the sleep queue if the caller has not already done
 542          * it for us.  This must be done before we potentially acquire any
 543          * tokens or we can loose the wakeup.
 544          */
 545         if ((flags & PINTERLOCKED) == 0) {
 546                 id = LOOKUP(ident);
 547                 _tsleep_interlock(gd, ident, flags);
 548         }
 549
 550         /*
 551          * Setup for the current process (if this is a process).
 552          *
 553          * We hold the process token if lp && catch.  The resume
 554          * code will release it.
 555          */
 556         if (lp) {
 557                 if (catch) {
 558                         /*
 559                          * Early termination if PCATCH was set and a
 560                          * signal is pending, interlocked with the
 561                          * critical section.
 562                          *
 563                          * Early termination only occurs when tsleep() is
 564                          * entered while in a normal LSRUN state.
 565                          */
 566                         lwkt_gettoken(&p->p_token);
 567                         if ((sig = CURSIG(lp)) != 0)
 568                                 goto resume;
 569
 570                         /*
 571                          * Early termination if PCATCH was set and a
 572                          * mailbox signal was possibly delivered prior to
 573                          * the system call even being made, in order to
 574                          * allow the user to interlock without having to
 575                          * make additional system calls.
 576                          */
 577                         if (p->p_flag & P_MAILBOX)
 578                                 goto resume;
 579
 580                         /*
 581                          * Causes ksignal to wake us up if a signal is
 582                          * received (interlocked with p->p_token).
 583                          */
 584                         lp->lwp_flag |= LWP_SINTR;
 585                 }
 586         } else {
 587                 KKASSERT(p == NULL);
 588         }
 589
 590         /*
 591          * Make sure the current process has been untangled from
 592          * the userland scheduler and initialize slptime to start
 593          * counting.
 594          */
 595         if (lp) {
 596                 p->p_usched->release_curproc(lp);
 597                 lp->lwp_slptime = 0;
 598         }
 599
 600         /*
 601          * If the interlocked flag is set but our cpu bit in the slpqueue
 602          * is no longer set, then a wakeup was processed inbetween the
 603          * tsleep_interlock() (ours or the callers), and here.  This can
 604          * occur under numerous circumstances including when we release the
 605          * current process.
 606          *
 607          * Extreme loads can cause the sending of an IPI (e.g. wakeup()'s)
 608          * to process incoming IPIs, thus draining incoming wakeups.
 609          */
 610         if ((td->td_flags & TDF_TSLEEPQ) == 0) {
 611                 logtsleep2(ilockfail, ident);
 612                 goto resume;
 613         }
 614
 615         /*
 616          * scheduling is blocked while in a critical section.  Coincide
 617          * the descheduled-by-tsleep flag with the descheduling of the
 618          * lwkt.
 619          *
 620          * The timer callout is localized on our cpu and interlocked by
 621          * our critical section.
 622          */
 623         lwkt_deschedule_self(td);
 624         td->td_flags |= TDF_TSLEEP_DESCHEDULED;
 625         td->td_wmesg = wmesg;
 626
 627         /*
 628          * Setup the timeout, if any.  The timeout is only operable while
 629          * the thread is flagged descheduled.
 630          */
 631         KKASSERT((td->td_flags & TDF_TIMEOUT) == 0);
 632         if (timo) {
 633                 callout_init_mp(&thandle);
 634                 callout_reset(&thandle, timo, endtsleep, td);
 635         }
 636
 637         /*
 638          * Beddy bye bye.
 639          */
 640         if (lp) {
 641                 /*
 642                  * Ok, we are sleeping.  Place us in the SSLEEP state.
 643                  */
 644                 KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
 645                 /*
 646                  * tstop() sets LSSTOP, so don't fiddle with that.
 647                  */
 648                 if (lp->lwp_stat != LSSTOP)
 649                         lp->lwp_stat = LSSLEEP;
 650                 lp->lwp_ru.ru_nvcsw++;
 651                 lwkt_switch();
 652                 td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;
 653
 654                 /*
 655                  * And when we are woken up, put us back in LSRUN.  If we
 656                  * slept for over a second, recalculate our estcpu.
 657                  */
 658                 lp->lwp_stat = LSRUN;
 659                 if (lp->lwp_slptime)
 660                         p->p_usched->recalculate(lp);
 661                 lp->lwp_slptime = 0;
 662         } else {
 663                 lwkt_switch();
 664                 td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;
 665         }
 666
 667         /*
 668          * Make sure we haven't switched cpus while we were asleep.  It's
 669          * not supposed to happen.  Cleanup our temporary flags.
 670          */
 671         KKASSERT(gd == td->td_gd);
 672
 673         /*
 674          * Cleanup the timeout.  If the timeout has already occured thandle
 675          * has already been stopped, otherwise stop thandle.
 676          */
 677         if (timo) {
 678                 if (td->td_flags & TDF_TIMEOUT) {
 679                         td->td_flags &= ~TDF_TIMEOUT;
 680                         error = EWOULDBLOCK;
 681                 } else {
 682                         /* does not block when on same cpu */
 683                         callout_stop(&thandle);
 684                 }
 685         }
 686
 687         /*
 688          * Make sure we have been removed from the sleepq.  In most
 689          * cases this will have been done for us already but it is
 690          * possible for a scheduling IPI to be in-flight from a
 691          * previous tsleep/tsleep_interlock() or due to a straight-out
 692          * call to lwkt_schedule() (in the case of an interrupt thread),
 693          * causing a spurious wakeup.
 694          */
 695         _tsleep_remove(td);
 696         td->td_wmesg = NULL;
 697
 698         /*
 699          * Figure out the correct error return.  If interrupted by a
 700          * signal we want to return EINTR or ERESTART.
 701          *
 702          * If P_MAILBOX is set no automatic system call restart occurs
 703          * and we return EINTR.  P_MAILBOX is meant to be used as an
 704          * interlock, the user must poll it prior to any system call
 705          * that it wishes to interlock a mailbox signal against since
 706          * the flag is cleared on *any* system call that sleeps.
 707          *
 708          * p->p_token is held in the p && catch case.
 709          */
 710 resume:
 711         if (p) {
 712                 if (catch && error == 0) {
 713                         if ((p->p_flag & P_MAILBOX) && sig == 0) {
 714                                 error = EINTR;
 715                         } else if (sig != 0 || (sig = CURSIG(lp))) {
 716                                 if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
 717                                         error = EINTR;
 718                                 else
 719                                         error = ERESTART;
 720                         }
 721                 }
 722                 if (catch)
 723                         lwkt_reltoken(&p->p_token);
 724                 lp->lwp_flag &= ~(LWP_BREAKTSLEEP | LWP_SINTR);
 725                 p->p_flag &= ~P_MAILBOX;
 726         }
 727         logtsleep1(tsleep_end);
 728         crit_exit_quick(td);
 729         return (error);
 730 }
 731
 732 /*
 733  * Interlocked spinlock sleep.  An exclusively held spinlock must
 734  * be passed to ssleep().  The function will atomically release the
 735  * spinlock and tsleep on the ident, then reacquire the spinlock and
 736  * return.
 737  *
 738  * This routine is fairly important along the critical path, so optimize it
 739  * heavily.
 740  */
 741 int
 742 ssleep(const volatile void *ident, struct spinlock *spin, int flags,
 743        const char *wmesg, int timo)
 744 {
 745         globaldata_t gd = mycpu;
 746         int error;
 747
 748         _tsleep_interlock(gd, ident, flags);
 749         spin_unlock_quick(gd, spin);
 750         error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
 751         spin_lock_quick(gd, spin);
 752
 753         return (error);
 754 }
 755
 756 int
 757 lksleep(const volatile void *ident, struct lock *lock, int flags,
 758         const char *wmesg, int timo)
 759 {
 760         globaldata_t gd = mycpu;
 761         int error;
 762
 763         _tsleep_interlock(gd, ident, flags);
 764         lockmgr(lock, LK_RELEASE);
 765         error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
 766         lockmgr(lock, LK_EXCLUSIVE);
 767
 768         return (error);
 769 }
 770
 771 /*
 772  * Interlocked mutex sleep.  An exclusively held mutex must be passed
 773  * to mtxsleep().  The function will atomically release the mutex
 774  * and tsleep on the ident, then reacquire the mutex and return.
 775  */
 776 int
 777 mtxsleep(const volatile void *ident, struct mtx *mtx, int flags,
 778          const char *wmesg, int timo)
 779 {
 780         globaldata_t gd = mycpu;
 781         int error;
 782
 783         _tsleep_interlock(gd, ident, flags);
 784         mtx_unlock(mtx);
 785         error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
 786         mtx_lock_ex_quick(mtx, wmesg);
 787
 788         return (error);
 789 }
 790
 791 /*
 792  * Interlocked serializer sleep.  An exclusively held serializer must
 793  * be passed to zsleep().  The function will atomically release
 794  * the serializer and tsleep on the ident, then reacquire the serializer
 795  * and return.
 796  */
 797 int
 798 zsleep(const volatile void *ident, struct lwkt_serialize *slz, int flags,
 799        const char *wmesg, int timo)
 800 {
 801         globaldata_t gd = mycpu;
 802         int ret;
 803
 804         ASSERT_SERIALIZED(slz);
 805
 806         _tsleep_interlock(gd, ident, flags);
 807         lwkt_serialize_exit(slz);
 808         ret = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
 809         lwkt_serialize_enter(slz);
 810
 811         return ret;
 812 }
 813
 814 /*
 815  * Directly block on the LWKT thread by descheduling it.  This
 816  * is much faster then tsleep(), but the only legal way to wake
 817  * us up is to directly schedule the thread.
 818  *
 819  * Setting TDF_SINTR will cause new signals to directly schedule us.
 820  *
 821  * This routine must be called while in a critical section.
 822  */
 823 int
 824 lwkt_sleep(const char *wmesg, int flags)
 825 {
 826         thread_t td = curthread;
 827         int sig;
 828
 829         if ((flags & PCATCH) == 0 || td->td_lwp == NULL) {
 830                 td->td_flags |= TDF_BLOCKED;
 831                 td->td_wmesg = wmesg;
 832                 lwkt_deschedule_self(td);
 833                 lwkt_switch();
 834                 td->td_wmesg = NULL;
 835                 td->td_flags &= ~TDF_BLOCKED;
 836                 return(0);
 837         }
 838         if ((sig = CURSIG(td->td_lwp)) != 0) {
 839                 if (SIGISMEMBER(td->td_proc->p_sigacts->ps_sigintr, sig))
 840                         return(EINTR);
 841                 else
 842                         return(ERESTART);
 843
 844         }
 845         td->td_flags |= TDF_BLOCKED | TDF_SINTR;
 846         td->td_wmesg = wmesg;
 847         lwkt_deschedule_self(td);
 848         lwkt_switch();
 849         td->td_flags &= ~(TDF_BLOCKED | TDF_SINTR);
 850         td->td_wmesg = NULL;
 851         return(0);
 852 }
 853
 854 /*
 855  * Implement the timeout for tsleep.
 856  *
 857  * We set LWP_BREAKTSLEEP to indicate that an event has occured, but
 858  * we only call setrunnable if the process is not stopped.
 859  *
 860  * This type of callout timeout is scheduled on the same cpu the process
 861  * is sleeping on.  Also, at the moment, the MP lock is held.
 862  */
 863 static void
 864 endtsleep(void *arg)
 865 {
 866         thread_t td = arg;
 867         struct lwp *lp;
 868
 869         crit_enter();
 870
 871         /*
 872          * Do this before we potentially block acquiring the token.  Setting
 873          * TDF_TIMEOUT tells tsleep that we have already stopped the callout.
 874          */
 875         lwkt_hold(td);
 876         td->td_flags |= TDF_TIMEOUT;
 877
 878         /*
 879          * This can block
 880          */
 881         if ((lp = td->td_lwp) != NULL)
 882                 lwkt_gettoken(&lp->lwp_proc->p_token);
 883
 884         /*
 885          * Only do nominal wakeup processing if TDF_TIMEOUT and
 886          * TDF_TSLEEP_DESCHEDULED are both still set.  Otherwise
 887          * we raced a wakeup or we began executed and raced due to
 888          * blocking in the token above, and should do nothing.
 889          */
 890         if ((td->td_flags & (TDF_TIMEOUT | TDF_TSLEEP_DESCHEDULED)) ==
 891             (TDF_TIMEOUT | TDF_TSLEEP_DESCHEDULED)) {
 892                 if (lp) {
 893                         lp->lwp_flag |= LWP_BREAKTSLEEP;
 894                         if (lp->lwp_proc->p_stat != SSTOP)
 895                                 setrunnable(lp);
 896                 } else {
 897                         _tsleep_wakeup(td);
 898                 }
 899         }
 900         if (lp)
 901                 lwkt_reltoken(&lp->lwp_proc->p_token);
 902         lwkt_rele(td);
 903         crit_exit();
 904 }
 905
 906 /*
 907  * Make all processes sleeping on the specified identifier runnable.
 908  * count may be zero or one only.
 909  *
 910  * The domain encodes the sleep/wakeup domain AND the first cpu to check
 911  * (which is always the current cpu).  As we iterate across cpus
 912  *
 913  * This call may run without the MP lock held.  We can only manipulate thread
 914  * state on the cpu owning the thread.  We CANNOT manipulate process state
 915  * at all.
 916  *
 917  * _wakeup() can be passed to an IPI so we can't use (const volatile
 918  * void *ident).
 919  */
 920 static void
 921 _wakeup(void *ident, int domain)
 922 {
 923         struct tslpque *qp;
 924         struct thread *td;
 925         struct thread *ntd;
 926         globaldata_t gd;
 927 #ifdef SMP
 928         cpumask_t mask;
 929 #endif
 930         int id;
 931
 932         crit_enter();
 933         logtsleep2(wakeup_beg, ident);
 934         gd = mycpu;
 935         id = LOOKUP(ident);
 936         qp = &gd->gd_tsleep_hash[id];
 937 restart:
 938         for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
 939                 ntd = TAILQ_NEXT(td, td_sleepq);
 940                 if (td->td_wchan == ident &&
 941                     td->td_wdomain == (domain & PDOMAIN_MASK)
 942                 ) {
 943                         KKASSERT(td->td_gd == gd);
 944                         _tsleep_remove(td);
 945                         if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
 946                                 td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;
 947                                 lwkt_schedule(td);
 948                                 if (domain & PWAKEUP_ONE)
 949                                         goto done;
 950                         }
 951                         goto restart;
 952                 }
 953         }
 954
 955 #ifdef SMP
 956         /*
 957          * We finished checking the current cpu but there still may be
 958          * more work to do.  Either wakeup_one was requested and no matching
 959          * thread was found, or a normal wakeup was requested and we have
 960          * to continue checking cpus.
 961          *
 962          * It should be noted that this scheme is actually less expensive then
 963          * the old scheme when waking up multiple threads, since we send
 964          * only one IPI message per target candidate which may then schedule
 965          * multiple threads.  Before we could have wound up sending an IPI
 966          * message for each thread on the target cpu (!= current cpu) that
 967          * needed to be woken up.
 968          *
 969          * NOTE: Wakeups occuring on remote cpus are asynchronous.  This
 970          * should be ok since we are passing idents in the IPI rather then
 971          * thread pointers.
 972          */
 973         if ((domain & PWAKEUP_MYCPU) == 0 &&
 974             (mask = slpque_cpumasks[id] & gd->gd_other_cpus) != 0) {
 975                 lwkt_send_ipiq2_mask(mask, _wakeup, ident,
 976                                      domain | PWAKEUP_MYCPU);
 977         }
 978 #endif
 979 done:
 980         logtsleep1(wakeup_end);
 981         crit_exit();
 982 }
 983
 984 /*
 985  * Wakeup all threads tsleep()ing on the specified ident, on all cpus
 986  */
 987 void
 988 wakeup(const volatile void *ident)
 989 {
 990     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid));
 991 }
 992
 993 /*
 994  * Wakeup one thread tsleep()ing on the specified ident, on any cpu.
 995  */
 996 void
 997 wakeup_one(const volatile void *ident)
 998 {
 999     /* XXX potentially round-robin the first responding cpu */
1000     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) | PWAKEUP_ONE);
1001 }
1002
1003 /*
1004  * Wakeup threads tsleep()ing on the specified ident on the current cpu
1005  * only.
1006  */
1007 void
1008 wakeup_mycpu(const volatile void *ident)
1009 {
1010     _wakeup(__DEALL(ident), PWAKEUP_MYCPU);
1011 }
1012
1013 /*
1014  * Wakeup one thread tsleep()ing on the specified ident on the current cpu
1015  * only.
1016  */
1017 void
1018 wakeup_mycpu_one(const volatile void *ident)
1019 {
1020     /* XXX potentially round-robin the first responding cpu */
1021     _wakeup(__DEALL(ident), PWAKEUP_MYCPU|PWAKEUP_ONE);
1022 }
1023
1024 /*
1025  * Wakeup all thread tsleep()ing on the specified ident on the specified cpu
1026  * only.
1027  */
1028 void
1029 wakeup_oncpu(globaldata_t gd, const volatile void *ident)
1030 {
1031 #ifdef SMP
1032     if (gd == mycpu) {
1033         _wakeup(__DEALL(ident), PWAKEUP_MYCPU);
1034     } else {
1035         lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident), PWAKEUP_MYCPU);
1036     }
1037 #else
1038     _wakeup(__DEALL(ident), PWAKEUP_MYCPU);
1039 #endif
1040 }
1041
1042 /*
1043  * Wakeup one thread tsleep()ing on the specified ident on the specified cpu
1044  * only.
1045  */
1046 void
1047 wakeup_oncpu_one(globaldata_t gd, const volatile void *ident)
1048 {
1049 #ifdef SMP
1050     if (gd == mycpu) {
1051         _wakeup(__DEALL(ident), PWAKEUP_MYCPU | PWAKEUP_ONE);
1052     } else {
1053         lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
1054                         PWAKEUP_MYCPU | PWAKEUP_ONE);
1055     }
1056 #else
1057     _wakeup(__DEALL(ident), PWAKEUP_MYCPU | PWAKEUP_ONE);
1058 #endif
1059 }
1060
1061 /*
1062  * Wakeup all threads waiting on the specified ident that slept using
1063  * the specified domain, on all cpus.
1064  */
1065 void
1066 wakeup_domain(const volatile void *ident, int domain)
1067 {
1068     _wakeup(__DEALL(ident), PWAKEUP_ENCODE(domain, mycpu->gd_cpuid));
1069 }
1070
1071 /*
1072  * Wakeup one thread waiting on the specified ident that slept using
1073  * the specified  domain, on any cpu.
1074  */
1075 void
1076 wakeup_domain_one(const volatile void *ident, int domain)
1077 {
1078     /* XXX potentially round-robin the first responding cpu */
1079     _wakeup(__DEALL(ident),
1080             PWAKEUP_ENCODE(domain, mycpu->gd_cpuid) | PWAKEUP_ONE);
1081 }
1082
1083 /*
1084  * setrunnable()
1085  *
1086  * Make a process runnable.  lp->lwp_proc->p_token must be held on call.
1087  * This only has an effect if we are in SSLEEP.  We only break out of the
1088  * tsleep if LWP_BREAKTSLEEP is set, otherwise we just fix-up the state.
1089  *
1090  * NOTE: With p_token held we can only safely manipulate the process
1091  * structure and the lp's lwp_stat.
1092  */
1093 void
1094 setrunnable(struct lwp *lp)
1095 {
1096         ASSERT_LWKT_TOKEN_HELD(&lp->lwp_proc->p_token);
1097         crit_enter();
1098         if (lp->lwp_stat == LSSTOP)
1099                 lp->lwp_stat = LSSLEEP;
1100         if (lp->lwp_stat == LSSLEEP && (lp->lwp_flag & LWP_BREAKTSLEEP))
1101                 _tsleep_wakeup(lp->lwp_thread);
1102         crit_exit();
1103 }
1104
1105 /*
1106  * The process is stopped due to some condition, usually because p_stat is
1107  * set to SSTOP, but also possibly due to being traced.
1108  *
1109  * NOTE!  If the caller sets SSTOP, the caller must also clear P_WAITED
1110  * because the parent may check the child's status before the child actually
1111  * gets to this routine.
1112  *
1113  * This routine is called with the current lwp only, typically just
1114  * before returning to userland.
1115  *
1116  * Setting LWP_BREAKTSLEEP before entering the tsleep will cause a passive
1117  * SIGCONT to break out of the tsleep.
1118  */
1119 void
1120 tstop(void)
1121 {
1122         struct lwp *lp = curthread->td_lwp;
1123         struct proc *p = lp->lwp_proc;
1124
1125         crit_enter();
1126         /*
1127          * If LWP_WSTOP is set, we were sleeping
1128          * while our process was stopped.  At this point
1129          * we were already counted as stopped.
1130          */
1131         if ((lp->lwp_flag & LWP_WSTOP) == 0) {
1132                 /*
1133                  * If we're the last thread to stop, signal
1134                  * our parent.
1135                  */
1136                 p->p_nstopped++;
1137                 lp->lwp_flag |= LWP_WSTOP;
1138                 wakeup(&p->p_nstopped);
1139                 if (p->p_nstopped == p->p_nthreads) {
1140                         p->p_flag &= ~P_WAITED;
1141                         wakeup(p->p_pptr);
1142                         if ((p->p_pptr->p_sigacts->ps_flag & PS_NOCLDSTOP) == 0)
1143                                 ksignal(p->p_pptr, SIGCHLD);
1144                 }
1145         }
1146         while (p->p_stat == SSTOP) {
1147                 lp->lwp_flag |= LWP_BREAKTSLEEP;
1148                 lp->lwp_stat = LSSTOP;
1149                 tsleep(p, 0, "stop", 0);
1150         }
1151         p->p_nstopped--;
1152         lp->lwp_flag &= ~LWP_WSTOP;
1153         crit_exit();
1154 }
1155
1156 /*
1157  * Compute a tenex style load average of a quantity on
1158  * 1, 5 and 15 minute intervals.
1159  */
1160 static int loadav_count_runnable(struct lwp *p, void *data);
1161
1162 static void
1163 loadav(void *arg)
1164 {
1165         struct loadavg *avg;
1166         int i, nrun;
1167
1168         nrun = 0;
1169         alllwp_scan(loadav_count_runnable, &nrun);
1170         avg = &averunnable;
1171         for (i = 0; i < 3; i++) {
1172                 avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
1173                     nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
1174         }
1175
1176         /*
1177          * Schedule the next update to occur after 5 seconds, but add a
1178          * random variation to avoid synchronisation with processes that
1179          * run at regular intervals.
1180          */
1181         callout_reset(&loadav_callout, hz * 4 + (int)(krandom() % (hz * 2 + 1)),
1182                       loadav, NULL);
1183 }
1184
1185 static int
1186 loadav_count_runnable(struct lwp *lp, void *data)
1187 {
1188         int *nrunp = data;
1189         thread_t td;
1190
1191         switch (lp->lwp_stat) {
1192         case LSRUN:
1193                 if ((td = lp->lwp_thread) == NULL)
1194                         break;
1195                 if (td->td_flags & TDF_BLOCKED)
1196                         break;
1197                 ++*nrunp;
1198                 break;
1199         default:
1200                 break;
1201         }
1202         return(0);
1203 }
1204
1205 /* ARGSUSED */
1206 static void
1207 sched_setup(void *dummy)
1208 {
1209         callout_init_mp(&loadav_callout);
1210         callout_init_mp(&schedcpu_callout);
1211
1212         /* Kick off timeout driven events by calling first time. */
1213         schedcpu(NULL);
1214         loadav(NULL);
1215 }
1216