bsd.libnames.mk: Adjust for recent libobjc removal.
[dragonfly.git] / sys / kern / kern_synch.c
CommitLineData
984263bc
MD
1/*-
2 * Copyright (c) 1982, 1986, 1990, 1991, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
dc71b7ab 18 * 3. Neither the name of the University nor the names of its contributors
984263bc
MD
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
35 * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
36 */
37
38#include "opt_ktrace.h"
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/proc.h>
43#include <sys/kernel.h>
44#include <sys/signalvar.h>
45#include <sys/resourcevar.h>
46#include <sys/vmmeter.h>
47#include <sys/sysctl.h>
344ad853 48#include <sys/lock.h>
984263bc 49#include <sys/uio.h>
63c41f05 50#include <sys/priv.h>
f6aeec64 51#include <sys/kcollect.h>
fc9ae81d 52#ifdef KTRACE
984263bc
MD
53#include <sys/ktrace.h>
54#endif
9afb0ffd 55#include <sys/ktr.h>
684a93c4 56#include <sys/serialize.h>
984263bc 57
684a93c4 58#include <sys/signal2.h>
bf765287
MD
59#include <sys/thread2.h>
60#include <sys/spinlock2.h>
7f6220a9 61#include <sys/mutex2.h>
bf765287 62
984263bc 63#include <machine/cpu.h>
984263bc
MD
64#include <machine/smp.h>
65
1bc11bc6
SZ
66#include <vm/vm_extern.h>
67
8acf0617
MD
/*
 * Per-cpu tsleep hash bucket.  Sleeping threads are queued here by
 * _tsleep_interlock().  ident0-ident3 cache up to four distinct wait
 * idents currently present on the queue; on overflow ident0 is set to
 * (void *)-1 (wakeup side must then treat the bucket conservatively).
 */
struct tslpque {
	TAILQ_HEAD(, thread) queue;		/* sleeping threads */
	const volatile void *ident0;		/* cached wait idents */
	const volatile void *ident1;
	const volatile void *ident2;
	const volatile void *ident3;
};
fc17ad60 75
402ed7e1 76static void sched_setup (void *dummy);
f3f3eadb 77SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL);
666ff13c
MD
78static void sched_dyninit (void *dummy);
79SYSINIT(sched_dyninit, SI_BOOT1_DYNALLOC, SI_ORDER_FIRST, sched_dyninit, NULL);
984263bc 80
984263bc 81int lbolt;
50e4012a 82void *lbolt_syncer;
17a9f566 83int ncpus;
da23a592 84int ncpus_fit, ncpus_fit_mask; /* note: mask not cpumask_t */
e43a034f 85int safepri;
dbcd0c9b 86int tsleep_now_works;
5ea440eb 87int tsleep_crypto_dump = 0;
984263bc 88
fc17ad60 89MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues");
984263bc 90
5decebc7
MD
91#define __DEALL(ident) __DEQUALIFY(void *, ident)
92
9afb0ffd
MD
93#if !defined(KTR_TSLEEP)
94#define KTR_TSLEEP KTR_ALL
95#endif
96KTR_INFO_MASTER(tsleep);
5bf48697
AE
97KTR_INFO(KTR_TSLEEP, tsleep, tsleep_beg, 0, "tsleep enter %p", const volatile void *ident);
98KTR_INFO(KTR_TSLEEP, tsleep, tsleep_end, 1, "tsleep exit");
99KTR_INFO(KTR_TSLEEP, tsleep, wakeup_beg, 2, "wakeup enter %p", const volatile void *ident);
100KTR_INFO(KTR_TSLEEP, tsleep, wakeup_end, 3, "wakeup exit");
101KTR_INFO(KTR_TSLEEP, tsleep, ilockfail, 4, "interlock failed %p", const volatile void *ident);
8aa3430c
MD
102
103#define logtsleep1(name) KTR_LOG(tsleep_ ## name)
104#define logtsleep2(name, val) KTR_LOG(tsleep_ ## name, val)
9afb0ffd 105
984263bc
MD
106struct loadavg averunnable =
107 { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */
108/*
109 * Constants for averages over 1, 5, and 15 minutes
110 * when sampling at 5 second intervals.
111 */
112static fixpt_t cexp[3] = {
113 0.9200444146293232 * FSCALE, /* exp(-1/12) */
114 0.9834714538216174 * FSCALE, /* exp(-1/60) */
115 0.9944598480048967 * FSCALE, /* exp(-1/180) */
116};
117
402ed7e1
RG
118static void endtsleep (void *);
119static void loadav (void *arg);
402ed7e1 120static void schedcpu (void *arg);
984263bc 121
bc55d64f 122static int pctcpu_decay = 10;
a8eec89c
MD
123SYSCTL_INT(_kern, OID_AUTO, pctcpu_decay, CTLFLAG_RW,
124 &pctcpu_decay, 0, "");
dcc99b62
MD
125
126/*
127 * kernel uses `FSCALE', userland (SHOULD) use kern.fscale
984263bc 128 */
460426e6 129int fscale __unused = FSCALE; /* exported to systat */
dcc99b62 130SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
984263bc 131
a8eec89c
MD
132/*
133 * Issue a wakeup() from userland (debugging)
134 */
135static int
136sysctl_wakeup(SYSCTL_HANDLER_ARGS)
137{
138 uint64_t ident = 1;
139 int error = 0;
140
141 if (req->newptr != NULL) {
142 if (priv_check(curthread, PRIV_ROOT))
143 return (EPERM);
144 error = SYSCTL_IN(req, &ident, sizeof(ident));
145 if (error)
146 return error;
147 kprintf("issue wakeup %016jx\n", ident);
148 wakeup((void *)(intptr_t)ident);
149 }
150 if (req->oldptr != NULL) {
151 error = SYSCTL_OUT(req, &ident, sizeof(ident));
152 }
153 return error;
154}
155
63c41f05
MD
156static int
157sysctl_wakeup_umtx(SYSCTL_HANDLER_ARGS)
158{
159 uint64_t ident = 1;
160 int error = 0;
161
162 if (req->newptr != NULL) {
163 if (priv_check(curthread, PRIV_ROOT))
164 return (EPERM);
165 error = SYSCTL_IN(req, &ident, sizeof(ident));
166 if (error)
167 return error;
168 kprintf("issue wakeup %016jx, PDOMAIN_UMTX\n", ident);
169 wakeup_domain((void *)(intptr_t)ident, PDOMAIN_UMTX);
170 }
171 if (req->oldptr != NULL) {
172 error = SYSCTL_OUT(req, &ident, sizeof(ident));
173 }
174 return error;
175}
176
a8eec89c
MD
177SYSCTL_PROC(_debug, OID_AUTO, wakeup, CTLTYPE_UQUAD|CTLFLAG_RW, 0, 0,
178 sysctl_wakeup, "Q", "issue wakeup(addr)");
63c41f05
MD
179SYSCTL_PROC(_debug, OID_AUTO, wakeup_umtx, CTLTYPE_UQUAD|CTLFLAG_RW, 0, 0,
180 sysctl_wakeup_umtx, "Q", "issue wakeup(addr, PDOMAIN_UMTX)");
a8eec89c 181
984263bc 182/*
0a3f9b47 183 * Recompute process priorities, once a second.
dcc99b62
MD
184 *
185 * Since the userland schedulers are typically event oriented, if the
186 * estcpu calculation at wakeup() time is not sufficient to make a
187 * process runnable relative to other processes in the system we have
188 * a 1-second recalc to help out.
189 *
190 * This code also allows us to store sysclock_t data in the process structure
191 * without fear of an overrun, since sysclock_t are guarenteed to hold
192 * several seconds worth of count.
8fa76237
MD
193 *
194 * WARNING! callouts can preempt normal threads. However, they will not
195 * preempt a thread holding a spinlock so we *can* safely use spinlocks.
984263bc 196 */
8fa76237
MD
197static int schedcpu_stats(struct proc *p, void *data __unused);
198static int schedcpu_resource(struct proc *p, void *data __unused);
199
static void
schedcpu(void *arg)
{
	/* two passes: per-lwp statistics first, then cpu-limit enforcement */
	allproc_scan(schedcpu_stats, NULL, 1);
	allproc_scan(schedcpu_resource, NULL, 1);

	/* cpu 0 owns the once-a-second lbolt/syncer wakeups */
	if (mycpu->gd_cpuid == 0) {
		wakeup((caddr_t)&lbolt);
		wakeup(lbolt_syncer);
	}

	/* re-arm ourselves for one second from now on this cpu */
	callout_reset(&mycpu->gd_schedcpu_callout, hz, schedcpu, NULL);
}
211
212/*
213 * General process statistics once a second
214 */
static int
schedcpu_stats(struct proc *p, void *data __unused)
{
	struct lwp *lp;

	/*
	 * Threads may not be completely set up if process in SIDL state.
	 */
	if (p->p_stat == SIDL)
		return(0);

	PHOLD(p);
	if (lwkt_trytoken(&p->p_token) == FALSE) {
		/* don't block the callout; just skip this process this tick */
		PRELE(p);
		return(0);
	}

	p->p_swtime++;
	FOREACH_LWP_IN_PROC(lp, p) {
		if (lp->lwp_stat == LSSLEEP) {
			++lp->lwp_slptime;
			/* first tick asleep: tell the scheduler the load changed */
			if (lp->lwp_slptime == 1)
				p->p_usched->uload_update(lp);
		}

		/*
		 * Only recalculate processes that are active or have slept
		 * less then 2 seconds.  The schedulers understand this.
		 * Otherwise decay lwp_pctcpu by (decay-1)/decay per second,
		 * i.e. 1/pctcpu_decay (10% with the default of 10).
		 */
		if (lp->lwp_slptime <= 1) {
			p->p_usched->recalculate(lp);
		} else {
			int decay;

			/* snapshot the tunable; clamp to a sane range */
			decay = pctcpu_decay;
			cpu_ccfence();
			if (decay <= 1)
				decay = 1;
			if (decay > 100)
				decay = 100;
			lp->lwp_pctcpu = (lp->lwp_pctcpu * (decay - 1)) / decay;
		}
	}
	lwkt_reltoken(&p->p_token);
	lwkt_yield();
	PRELE(p);
	return(0);
}
a46fac56 264
8fa76237 265/*
84204577 266 * Resource checks. XXX break out since ksignal/killproc can block,
8fa76237
MD
267 * limiting us to one process killed per second. There is probably
268 * a better way.
269 */
static int
schedcpu_resource(struct proc *p, void *data __unused)
{
	u_int64_t ttime;
	struct lwp *lp;

	/* threads may not be completely set up in SIDL state */
	if (p->p_stat == SIDL)
		return(0);

	PHOLD(p);
	if (lwkt_trytoken(&p->p_token) == FALSE) {
		/* don't block the callout; skip this process this tick */
		PRELE(p);
		return(0);
	}

	if (p->p_stat == SZOMB || p->p_limit == NULL) {
		lwkt_reltoken(&p->p_token);
		PRELE(p);
		return(0);
	}

	/* sum system + user ticks over all lwps */
	ttime = 0;
	FOREACH_LWP_IN_PROC(lp, p) {
		/*
		 * We may have caught an lp in the middle of being
		 * created, lwp_thread can be NULL.
		 */
		if (lp->lwp_thread) {
			ttime += lp->lwp_thread->td_sticks;
			ttime += lp->lwp_thread->td_uticks;
		}
	}

	switch(plimit_testcpulimit(p, ttime)) {
	case PLIMIT_TESTCPU_KILL:
		killproc(p, "exceeded maximum CPU limit");
		break;
	case PLIMIT_TESTCPU_XCPU:
		/* deliver SIGXCPU only once per process */
		if ((p->p_flags & P_XCPU) == 0) {
			p->p_flags |= P_XCPU;
			ksignal(p, SIGXCPU);
		}
		break;
	default:
		break;
	}
	lwkt_reltoken(&p->p_token);
	lwkt_yield();
	PRELE(p);
	return(0);
}
321
322/*
dcc99b62
MD
323 * This is only used by ps. Generate a cpu percentage use over
324 * a period of one second.
984263bc 325 */
dcc99b62 326void
553ea3c8 327updatepcpu(struct lwp *lp, int cpticks, int ttlticks)
984263bc 328{
dcc99b62
MD
329 fixpt_t acc;
330 int remticks;
331
332 acc = (cpticks << FSHIFT) / ttlticks;
333 if (ttlticks >= ESTCPUFREQ) {
553ea3c8 334 lp->lwp_pctcpu = acc;
dcc99b62
MD
335 } else {
336 remticks = ESTCPUFREQ - ttlticks;
553ea3c8 337 lp->lwp_pctcpu = (acc * ttlticks + lp->lwp_pctcpu * remticks) /
dcc99b62 338 ESTCPUFREQ;
a46fac56 339 }
984263bc
MD
340}
341
342/*
666ff13c
MD
343 * Handy macros to calculate hash indices. LOOKUP() calculates the
344 * global cpumask hash index, TCHASHSHIFT() converts that into the
345 * pcpu hash index.
fc17ad60 346 *
666ff13c
MD
347 * By making the pcpu hash arrays smaller we save a significant amount
348 * of memory at very low cost. The real cost is in IPIs, which are handled
349 * by the much larger global cpumask hash table.
984263bc 350 */
41c11378
SZ
351#define LOOKUP_PRIME 66555444443333333ULL
352#define LOOKUP(x) ((((uintptr_t)(x) + ((uintptr_t)(x) >> 18)) ^ \
353 LOOKUP_PRIME) % slpque_tablesize)
666ff13c 354#define TCHASHSHIFT(x) ((x) >> 4)
984263bc 355
666ff13c
MD
356static uint32_t slpque_tablesize;
357static cpumask_t *slpque_cpumasks;
984263bc 358
4e8fa957
SZ
359SYSCTL_UINT(_kern, OID_AUTO, slpque_tablesize, CTLFLAG_RD, &slpque_tablesize,
360 0, "");
361
ae8e83e6
MD
362/*
363 * This is a dandy function that allows us to interlock tsleep/wakeup
364 * operations with unspecified upper level locks, such as lockmgr locks,
365 * simply by holding a critical section. The sequence is:
366 *
367 * (acquire upper level lock)
368 * tsleep_interlock(blah)
369 * (release upper level lock)
370 * tsleep(blah, ...)
371 *
372 * Basically this functions queues us on the tsleep queue without actually
373 * descheduling us. When tsleep() is later called with PINTERLOCK it
374 * assumes the thread was already queued, otherwise it queues it there.
375 *
376 * Thus it is possible to receive the wakeup prior to going to sleep and
377 * the race conditions are covered.
378 */
static __inline void
_tsleep_interlock(globaldata_t gd, const volatile void *ident, int flags)
{
	thread_t td = gd->gd_curthread;
	struct tslpque *qp;
	uint32_t cid;		/* global cpumask hash index */
	uint32_t gid;		/* per-cpu hash index */

	if (ident == NULL) {
		kprintf("tsleep_interlock: NULL ident %s\n", td->td_comm);
		print_backtrace(5);
	}

	crit_enter_quick(td);
	if (td->td_flags & TDF_TSLEEPQ) {
		/*
		 * Shortcut if unchanged
		 */
		if (td->td_wchan == ident &&
		    td->td_wdomain == (flags & PDOMAIN_MASK)) {
			crit_exit_quick(td);
			return;
		}

		/*
		 * Remove current sleepq
		 */
		cid = LOOKUP(td->td_wchan);
		gid = TCHASHSHIFT(cid);
		qp = &gd->gd_tsleep_hash[gid];
		TAILQ_REMOVE(&qp->queue, td, td_sleepq);
		if (TAILQ_FIRST(&qp->queue) == NULL) {
			/* bucket now empty: clear ident cache and cpu bit */
			qp->ident0 = NULL;
			qp->ident1 = NULL;
			qp->ident2 = NULL;
			qp->ident3 = NULL;
			ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
					       gd->gd_cpuid);
		}
	} else {
		td->td_flags |= TDF_TSLEEPQ;
	}
	cid = LOOKUP(ident);
	gid = TCHASHSHIFT(cid);
	qp = &gd->gd_tsleep_hash[gid];
	TAILQ_INSERT_TAIL(&qp->queue, td, td_sleepq);

	/*
	 * Record ident in the bucket's 4-entry cache; on overflow poison
	 * ident0 with -1 so the cache cannot be trusted for this bucket.
	 */
	if (qp->ident0 != ident && qp->ident1 != ident &&
	    qp->ident2 != ident && qp->ident3 != ident) {
		if (qp->ident0 == NULL)
			qp->ident0 = ident;
		else if (qp->ident1 == NULL)
			qp->ident1 = ident;
		else if (qp->ident2 == NULL)
			qp->ident2 = ident;
		else if (qp->ident3 == NULL)
			qp->ident3 = ident;
		else
			qp->ident0 = (void *)(intptr_t)-1;
	}
	ATOMIC_CPUMASK_ORBIT(slpque_cpumasks[cid], gd->gd_cpuid);
	td->td_wchan = ident;
	td->td_wdomain = flags & PDOMAIN_MASK;
	crit_exit_quick(td);
}
443
/*
 * Public wrapper: queue the current thread on the sleepq for ident
 * without descheduling it (see comment above _tsleep_interlock()).
 */
void
tsleep_interlock(const volatile void *ident, int flags)
{
	_tsleep_interlock(mycpu, ident, flags);
}
449
450/*
451 * Remove thread from sleepq. Must be called with a critical section held.
4643740a 452 * The thread must not be migrating.
ae8e83e6
MD
453 */
static __inline void
_tsleep_remove(thread_t td)
{
	globaldata_t gd = mycpu;
	struct tslpque *qp;
	uint32_t cid;		/* global cpumask hash index */
	uint32_t gid;		/* per-cpu hash index */

	/* td must be on our cpu and we must be in a critical section */
	KKASSERT(td->td_gd == gd && IN_CRITICAL_SECT(td));
	KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
	if (td->td_flags & TDF_TSLEEPQ) {
		td->td_flags &= ~TDF_TSLEEPQ;
		cid = LOOKUP(td->td_wchan);
		gid = TCHASHSHIFT(cid);
		qp = &gd->gd_tsleep_hash[gid];
		TAILQ_REMOVE(&qp->queue, td, td_sleepq);
		if (TAILQ_FIRST(&qp->queue) == NULL) {
			/* last sleeper in this bucket on this cpu */
			ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
					       gd->gd_cpuid);
		}
		td->td_wchan = NULL;
		td->td_wdomain = 0;
	}
}
478
/*
 * Public wrapper for _tsleep_remove().  Caller must hold a critical
 * section and td must not be migrating (asserted in the inline).
 */
void
tsleep_remove(thread_t td)
{
	_tsleep_remove(td);
}
484
984263bc
MD
485/*
486 * General sleep call. Suspends the current process until a wakeup is
487 * performed on the specified identifier. The process will then be made
488 * runnable with the specified priority. Sleeps at most timo/hz seconds
377d4740 489 * (0 means no timeout). If flags includes PCATCH flag, signals are checked
984263bc
MD
490 * before and after sleeping, else signals are not checked. Returns 0 if
491 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
492 * signal needs to be delivered, ERESTART is returned if the current system
493 * call should be restarted if possible, and EINTR is returned if the system
494 * call should be interrupted by the signal (return EINTR).
26a0694b 495 *
0a3f9b47
MD
496 * Note that if we are a process, we release_curproc() before messing with
497 * the LWKT scheduler.
a46fac56
MD
498 *
499 * During autoconfiguration or after a panic, a sleep will simply
500 * lower the priority briefly to allow interrupts, then return.
94f98873
MD
501 *
502 * WARNING! This code can't block (short of switching away), or bad things
503 * will happen. No getting tokens, no blocking locks, etc.
984263bc
MD
504 */
int
tsleep(const volatile void *ident, int flags, const char *wmesg, int timo)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct proc *p = td->td_proc;	/* may be NULL */
	globaldata_t gd;
	int sig;
	int catch;			/* PCATCH requested */
	int error;
	int oldpri;
	struct callout thandle;		/* timeout callout, local to our cpu */

	/*
	 * Currently a severe hack.  Make sure any delayed wakeups
	 * are flushed before we sleep or we might deadlock on whatever
	 * event we are sleeping on.
	 */
	if (td->td_flags & TDF_DELAYED_WAKEUP)
		wakeup_end_delayed();

	/*
	 * NOTE: removed KTRPOINT, it could cause races due to blocking
	 * even in stable.  Just scrap it for now.
	 */
	if (!tsleep_crypto_dump && (tsleep_now_works == 0 || panicstr)) {
		/*
		 * After a panic, or before we actually have an operational
		 * softclock, just give interrupts a chance, then just return;
		 *
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		splz();
		oldpri = td->td_pri;
		lwkt_setpri_self(safepri);
		lwkt_switch();
		lwkt_setpri_self(oldpri);
		return (0);
	}
	logtsleep2(tsleep_beg, ident);
	gd = td->td_gd;
	KKASSERT(td != &gd->gd_idlethread);	/* you must be kidding! */
	td->td_wakefromcpu = -1;		/* overwritten by _wakeup */

	/*
	 * NOTE: all of this occurs on the current cpu, including any
	 * callout-based wakeups, so a critical section is a sufficient
	 * interlock.
	 *
	 * The entire sequence through to where we actually sleep must
	 * run without breaking the critical section.
	 */
	catch = flags & PCATCH;
	error = 0;
	sig = 0;

	crit_enter_quick(td);

	KASSERT(ident != NULL, ("tsleep: no ident"));
	KASSERT(lp == NULL ||
		lp->lwp_stat == LSRUN ||	/* Obvious */
		lp->lwp_stat == LSSTOP,		/* Set in tstop */
		("tsleep %p %s %d",
			ident, wmesg, lp->lwp_stat));

	/*
	 * We interlock the sleep queue if the caller has not already done
	 * it for us.  This must be done before we potentially acquire any
	 * tokens or we can loose the wakeup.
	 */
	if ((flags & PINTERLOCKED) == 0) {
		_tsleep_interlock(gd, ident, flags);
	}

	/*
	 * Setup for the current process (if this is a process).  We must
	 * interlock with lwp_token to avoid remote wakeup races via
	 * setrunnable()
	 */
	if (lp) {
		lwkt_gettoken(&lp->lwp_token);

		/*
		 * If the umbrella process is in the SCORE state then
		 * make sure that the thread is flagged going into a
		 * normal sleep to allow the core dump to proceed, otherwise
		 * the coredump can end up waiting forever.  If the normal
		 * sleep is woken up, the thread will enter a stopped state
		 * upon return to userland.
		 *
		 * We do not want to interrupt or cause a thread exist at
		 * this juncture because that will mess-up the state the
		 * coredump is trying to save.
		 */
		if (p->p_stat == SCORE) {
			lwkt_gettoken(&p->p_token);
			if ((lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
				atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
				++p->p_nstopped;
			}
			lwkt_reltoken(&p->p_token);
		}

		/*
		 * PCATCH requested.
		 */
		if (catch) {
			/*
			 * Early termination if PCATCH was set and a
			 * signal is pending, interlocked with the
			 * critical section.
			 *
			 * Early termination only occurs when tsleep() is
			 * entered while in a normal LSRUN state.
			 */
			if ((sig = CURSIG(lp)) != 0)
				goto resume;

			/*
			 * Causes ksignal to wake us up if a signal is
			 * received (interlocked with lp->lwp_token).
			 */
			lp->lwp_flags |= LWP_SINTR;
		}
	} else {
		KKASSERT(p == NULL);
	}

	/*
	 * Make sure the current process has been untangled from
	 * the userland scheduler and initialize slptime to start
	 * counting.
	 *
	 * NOTE: td->td_wakefromcpu is pre-set by the release function
	 *	 for the dfly scheduler, and then adjusted by _wakeup()
	 */
	if (lp) {
		p->p_usched->release_curproc(lp);
		lp->lwp_slptime = 0;
	}

	/*
	 * For PINTERLOCKED operation, TDF_TSLEEPQ might not be set if
	 * a wakeup() was processed before the thread could go to sleep.
	 *
	 * If TDF_TSLEEPQ is set, make sure the ident matches the recorded
	 * ident.  If it does not then the thread slept inbetween the
	 * caller's initial tsleep_interlock() call and the caller's tsleep()
	 * call.
	 *
	 * Extreme loads can cause the sending of an IPI (e.g. wakeup()'s)
	 * to process incoming IPIs, thus draining incoming wakeups.
	 */
	if ((td->td_flags & TDF_TSLEEPQ) == 0) {
		logtsleep2(ilockfail, ident);
		goto resume;
	} else if (td->td_wchan != ident ||
		   td->td_wdomain != (flags & PDOMAIN_MASK)) {
		logtsleep2(ilockfail, ident);
		goto resume;
	}

	/*
	 * scheduling is blocked while in a critical section.  Coincide
	 * the descheduled-by-tsleep flag with the descheduling of the
	 * lwkt.
	 *
	 * The timer callout is localized on our cpu and interlocked by
	 * our critical section.
	 */
	lwkt_deschedule_self(td);
	td->td_flags |= TDF_TSLEEP_DESCHEDULED;
	td->td_wmesg = wmesg;

	/*
	 * Setup the timeout, if any.  The timeout is only operable while
	 * the thread is flagged descheduled.
	 */
	KKASSERT((td->td_flags & TDF_TIMEOUT) == 0);
	if (timo) {
		callout_init_mp(&thandle);
		callout_reset(&thandle, timo, endtsleep, td);
	}

	/*
	 * Beddy bye bye.
	 */
	if (lp) {
		/*
		 * Ok, we are sleeping.  Place us in the SSLEEP state.
		 */
		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);

		/*
		 * tstop() sets LSSTOP, so don't fiddle with that.
		 */
		if (lp->lwp_stat != LSSTOP)
			lp->lwp_stat = LSSLEEP;
		lp->lwp_ru.ru_nvcsw++;
		p->p_usched->uload_update(lp);
		lwkt_switch();

		/*
		 * And when we are woken up, put us back in LSRUN.  If we
		 * slept for over a second, recalculate our estcpu.
		 */
		lp->lwp_stat = LSRUN;
		if (lp->lwp_slptime) {
			p->p_usched->uload_update(lp);
			p->p_usched->recalculate(lp);
		}
		lp->lwp_slptime = 0;
	} else {
		lwkt_switch();
	}

	/*
	 * Make sure we haven't switched cpus while we were asleep.  It's
	 * not supposed to happen.  Cleanup our temporary flags.
	 */
	KKASSERT(gd == td->td_gd);

	/*
	 * Cleanup the timeout.  If the timeout has already occured thandle
	 * has already been stopped, otherwise stop thandle.  If the timeout
	 * is running (the callout thread must be blocked trying to get
	 * lwp_token) then wait for us to get scheduled.
	 */
	if (timo) {
		while (td->td_flags & TDF_TIMEOUT_RUNNING) {
			/* else we won't get rescheduled! */
			if (lp->lwp_stat != LSSTOP)
				lp->lwp_stat = LSSLEEP;
			lwkt_deschedule_self(td);
			td->td_wmesg = "tsrace";
			lwkt_switch();
			kprintf("td %p %s: timeout race\n", td, td->td_comm);
		}
		if (td->td_flags & TDF_TIMEOUT) {
			td->td_flags &= ~TDF_TIMEOUT;
			error = EWOULDBLOCK;
		} else {
			/* does not block when on same cpu */
			callout_cancel(&thandle);
		}
	}
	td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;

	/*
	 * Make sure we have been removed from the sleepq.  In most
	 * cases this will have been done for us already but it is
	 * possible for a scheduling IPI to be in-flight from a
	 * previous tsleep/tsleep_interlock() or due to a straight-out
	 * call to lwkt_schedule() (in the case of an interrupt thread),
	 * causing a spurious wakeup.
	 */
	_tsleep_remove(td);
	td->td_wmesg = NULL;

	/*
	 * Figure out the correct error return.  If interrupted by a
	 * signal we want to return EINTR or ERESTART.
	 */
resume:
	if (lp) {
		if (catch && error == 0) {
			if (sig != 0 || (sig = CURSIG(lp))) {
				if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
					error = EINTR;
				else
					error = ERESTART;
			}
		}

		lp->lwp_flags &= ~LWP_SINTR;

		/*
		 * Unconditionally set us to LSRUN on resume.  lwp_stat could
		 * be in a weird state due to the goto resume, particularly
		 * when tsleep() is called from tstop().
		 */
		lp->lwp_stat = LSRUN;
		lwkt_reltoken(&lp->lwp_token);
	}
	logtsleep1(tsleep_end);
	crit_exit_quick(td);

	return (error);
}
795
bf765287
MD
796/*
797 * Interlocked spinlock sleep. An exclusively held spinlock must
e590ee86 798 * be passed to ssleep(). The function will atomically release the
bf765287
MD
799 * spinlock and tsleep on the ident, then reacquire the spinlock and
800 * return.
801 *
802 * This routine is fairly important along the critical path, so optimize it
803 * heavily.
804 */
int
ssleep(const volatile void *ident, struct spinlock *spin, int flags,
       const char *wmesg, int timo)
{
	globaldata_t gd = mycpu;
	int error;

	/* queue on the sleepq before dropping the spinlock: no lost wakeup */
	_tsleep_interlock(gd, ident, flags);
	spin_unlock_quick(gd, spin);
	error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
	/* tsleep must not have migrated us; reacquire on the same cpu */
	KKASSERT(gd == mycpu);
	_spin_lock_quick(gd, spin, wmesg);

	return (error);
}
820
/*
 * Interlocked lockmgr sleep.  An exclusively held lock must be passed
 * to lksleep().  The function will atomically release the lock and
 * tsleep on the ident, then reacquire the lock (LK_EXCLUSIVE) and
 * return.
 */
int
lksleep(const volatile void *ident, struct lock *lock, int flags,
	const char *wmesg, int timo)
{
	globaldata_t gd = mycpu;
	int error;

	/* interlock before releasing so a wakeup in the window is kept */
	_tsleep_interlock(gd, ident, flags);
	lockmgr(lock, LK_RELEASE);
	error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
	lockmgr(lock, LK_EXCLUSIVE);

	return (error);
}
835
7f6220a9
MD
836/*
837 * Interlocked mutex sleep. An exclusively held mutex must be passed
838 * to mtxsleep(). The function will atomically release the mutex
839 * and tsleep on the ident, then reacquire the mutex and return.
840 */
int
mtxsleep(const volatile void *ident, struct mtx *mtx, int flags,
	 const char *wmesg, int timo)
{
	globaldata_t gd = mycpu;
	int error;

	/* interlock before releasing so a wakeup in the window is kept */
	_tsleep_interlock(gd, ident, flags);
	mtx_unlock(mtx);
	error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
	mtx_lock_ex_quick(mtx);

	return (error);
}
855
362e59be
SZ
856/*
857 * Interlocked serializer sleep. An exclusively held serializer must
ed3f6624 858 * be passed to zsleep(). The function will atomically release
362e59be
SZ
859 * the serializer and tsleep on the ident, then reacquire the serializer
860 * and return.
861 */
int
zsleep(const volatile void *ident, struct lwkt_serialize *slz, int flags,
       const char *wmesg, int timo)
{
	globaldata_t gd = mycpu;
	int ret;

	ASSERT_SERIALIZED(slz);

	/* interlock before releasing so a wakeup in the window is kept */
	_tsleep_interlock(gd, ident, flags);
	lwkt_serialize_exit(slz);
	ret = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
	lwkt_serialize_enter(slz);

	return ret;
}
878
a22c590e
MD
879/*
880 * Directly block on the LWKT thread by descheduling it. This
881 * is much faster then tsleep(), but the only legal way to wake
882 * us up is to directly schedule the thread.
883 *
884 * Setting TDF_SINTR will cause new signals to directly schedule us.
885 *
ae8e83e6 886 * This routine must be called while in a critical section.
a22c590e
MD
887 */
int
lwkt_sleep(const char *wmesg, int flags)
{
	thread_t td = curthread;
	int sig;

	/* non-catchable (or non-lwp) case: block until directly scheduled */
	if ((flags & PCATCH) == 0 || td->td_lwp == NULL) {
		td->td_flags |= TDF_BLOCKED;
		td->td_wmesg = wmesg;
		lwkt_deschedule_self(td);
		lwkt_switch();
		td->td_wmesg = NULL;
		td->td_flags &= ~TDF_BLOCKED;
		return(0);
	}

	/* PCATCH: a pending signal terminates the sleep before it starts */
	if ((sig = CURSIG(td->td_lwp)) != 0) {
		if (SIGISMEMBER(td->td_proc->p_sigacts->ps_sigintr, sig))
			return(EINTR);
		else
			return(ERESTART);

	}

	/* TDF_SINTR lets new signals directly reschedule us */
	td->td_flags |= TDF_BLOCKED | TDF_SINTR;
	td->td_wmesg = wmesg;
	lwkt_deschedule_self(td);
	lwkt_switch();
	td->td_flags &= ~(TDF_BLOCKED | TDF_SINTR);
	td->td_wmesg = NULL;
	return(0);
}
918
984263bc 919/*
344ad853 920 * Implement the timeout for tsleep.
fc17ad60 921 *
344ad853
MD
922 * This type of callout timeout is scheduled on the same cpu the process
923 * is sleeping on. Also, at the moment, the MP lock is held.
984263bc
MD
924 */
static void
endtsleep(void *arg)
{
	thread_t td = arg;
	struct lwp *lp;

	/*
	 * We are going to have to get the lwp_token, which means we might
	 * block.  This can race a tsleep getting woken up by other means
	 * so set TDF_TIMEOUT_RUNNING to force the tsleep to wait for our
	 * processing to complete (sorry tsleep!).
	 *
	 * We can safely set td_flags because td MUST be on the same cpu
	 * as we are.
	 */
	KKASSERT(td->td_gd == mycpu);
	crit_enter();
	td->td_flags |= TDF_TIMEOUT_RUNNING | TDF_TIMEOUT;

	/*
	 * This can block but TDF_TIMEOUT_RUNNING will prevent the thread
	 * from exiting the tsleep on us.  The flag is interlocked by virtue
	 * of lp being on the same cpu as we are.
	 */
	if ((lp = td->td_lwp) != NULL)
		lwkt_gettoken(&lp->lwp_token);

	KKASSERT(td->td_flags & TDF_TSLEEP_DESCHEDULED);

	if (lp) {
		/*
		 * callout timer should normally never be set in tstop()
		 * because it passes a timeout of 0.  However, there is a
		 * case during thread exit (which SSTOP's all the threads)
		 * for which tstop() must break out and can (properly) leave
		 * the thread in LSSTOP.
		 */
		KKASSERT(lp->lwp_stat != LSSTOP ||
			 (lp->lwp_mpflags & LWP_MP_WEXIT));
		setrunnable(lp);
		lwkt_reltoken(&lp->lwp_token);
	} else {
		/* pure kernel thread: dequeue and reschedule it directly */
		_tsleep_remove(td);
		lwkt_schedule(td);
	}
	KKASSERT(td->td_gd == mycpu);
	td->td_flags &= ~TDF_TIMEOUT_RUNNING;
	crit_exit();
}
974
8fb8bca6
EN
975/*
976 * Make all processes sleeping on the specified identifier runnable.
fc17ad60
MD
977 * count may be zero or one only.
978 *
c75e41b7
MD
979 * The domain encodes the sleep/wakeup domain, flags, plus the originating
980 * cpu.
344ad853
MD
981 *
982 * This call may run without the MP lock held. We can only manipulate thread
983 * state on the cpu owning the thread. We CANNOT manipulate process state
984 * at all.
5decebc7
MD
985 *
986 * _wakeup() can be passed to an IPI so we can't use (const volatile
987 * void *ident).
8fb8bca6
EN
988 */
989static void
fc17ad60 990_wakeup(void *ident, int domain)
984263bc 991{
fc17ad60 992 struct tslpque *qp;
0cfcada1
MD
993 struct thread *td;
994 struct thread *ntd;
fc17ad60 995 globaldata_t gd;
fc17ad60 996 cpumask_t mask;
666ff13c
MD
997 uint32_t cid;
998 uint32_t gid;
8acf0617 999 int wids = 0;
984263bc 1000
37af14fe 1001 crit_enter();
8aa3430c 1002 logtsleep2(wakeup_beg, ident);
fc17ad60 1003 gd = mycpu;
666ff13c
MD
1004 cid = LOOKUP(ident);
1005 gid = TCHASHSHIFT(cid);
1006 qp = &gd->gd_tsleep_hash[gid];
984263bc 1007restart:
8acf0617 1008 for (td = TAILQ_FIRST(&qp->queue); td != NULL; td = ntd) {
ae8e83e6 1009 ntd = TAILQ_NEXT(td, td_sleepq);
fc17ad60
MD
1010 if (td->td_wchan == ident &&
1011 td->td_wdomain == (domain & PDOMAIN_MASK)
1012 ) {
ae8e83e6
MD
1013 KKASSERT(td->td_gd == gd);
1014 _tsleep_remove(td);
c75e41b7 1015 td->td_wakefromcpu = PWAKEUP_DECODE(domain);
ae8e83e6 1016 if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
ae8e83e6
MD
1017 lwkt_schedule(td);
1018 if (domain & PWAKEUP_ONE)
1019 goto done;
fc17ad60 1020 }
0cfcada1 1021 goto restart;
984263bc 1022 }
8acf0617
MD
1023 if (td->td_wchan == qp->ident0)
1024 wids |= 1;
1025 else if (td->td_wchan == qp->ident1)
1026 wids |= 2;
1027 else if (td->td_wchan == qp->ident2)
1028 wids |= 4;
1029 else if (td->td_wchan == qp->ident3)
1030 wids |= 8;
1031 else
1032 wids |= 16; /* force ident0 to be retained (-1) */
984263bc 1033 }
fc17ad60 1034
f26f7bb3
MD
1035 /*
1036 * Because a bunch of cpumask array entries cover the same queue, it
1037 * is possible for our bit to remain set in some of them and cause
1038 * spurious wakeup IPIs later on. Make sure that the bit is cleared
1039 * when a spurious IPI occurs to prevent further spurious IPIs.
1040 */
8acf0617 1041 if (TAILQ_FIRST(&qp->queue) == NULL) {
f26f7bb3 1042 ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid], gd->gd_cpuid);
8acf0617
MD
1043 qp->ident0 = NULL;
1044 qp->ident1 = NULL;
1045 qp->ident2 = NULL;
1046 qp->ident3 = NULL;
1047 } else {
1048 if ((wids & 1) == 0) {
a8eec89c 1049 if ((wids & 16) == 0) {
8acf0617 1050 qp->ident0 = NULL;
a8eec89c
MD
1051 } else {
1052 KKASSERT(qp->ident0 == (void *)(intptr_t)-1);
1053 }
8acf0617
MD
1054 }
1055 if ((wids & 2) == 0)
1056 qp->ident1 = NULL;
1057 if ((wids & 4) == 0)
1058 qp->ident2 = NULL;
1059 if ((wids & 8) == 0)
1060 qp->ident3 = NULL;
f26f7bb3
MD
1061 }
1062
fc17ad60
MD
1063 /*
1064 * We finished checking the current cpu but there still may be
1065 * more work to do. Either wakeup_one was requested and no matching
1066 * thread was found, or a normal wakeup was requested and we have
1067 * to continue checking cpus.
1068 *
fc17ad60
MD
1069 * It should be noted that this scheme is actually less expensive then
1070 * the old scheme when waking up multiple threads, since we send
1071 * only one IPI message per target candidate which may then schedule
1072 * multiple threads. Before we could have wound up sending an IPI
1073 * message for each thread on the target cpu (!= current cpu) that
1074 * needed to be woken up.
1075 *
1076 * NOTE: Wakeups occuring on remote cpus are asynchronous. This
e676ebda
MD
1077 * should be ok since we are passing idents in the IPI rather
1078 * then thread pointers.
1079 *
9b302485 1080 * NOTE: We MUST mfence (or use an atomic op) prior to reading
e676ebda
MD
1081 * the cpumask, as another cpu may have written to it in
1082 * a fashion interlocked with whatever the caller did before
1083 * calling wakeup(). Otherwise we might miss the interaction
1084 * (kern_mutex.c can cause this problem).
9b302485
MD
1085 *
1086 * lfence is insufficient as it may allow a written state to
1087 * reorder around the cpumask load.
fc17ad60 1088 */
c07315c4 1089 if ((domain & PWAKEUP_MYCPU) == 0) {
8acf0617 1090 globaldata_t tgd;
35a64553 1091 const volatile void *id0;
8acf0617
MD
1092 int n;
1093
9b302485 1094 cpu_mfence();
a8eec89c 1095 /* cpu_lfence(); */
666ff13c 1096 mask = slpque_cpumasks[cid];
c07315c4 1097 CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
8acf0617
MD
1098 while (CPUMASK_TESTNZERO(mask)) {
1099 n = BSRCPUMASK(mask);
1100 CPUMASK_NANDBIT(mask, n);
1101 tgd = globaldata_find(n);
b4d1b684
MD
1102
1103 /*
1104 * Both ident0 compares must from a single load
1105 * to avoid ident0 update races crossing the two
1106 * compares.
1107 */
a8eec89c 1108 qp = &tgd->gd_tsleep_hash[gid];
b4d1b684
MD
1109 id0 = qp->ident0;
1110 cpu_ccfence();
1111 if (id0 == (void *)(intptr_t)-1) {
1112 lwkt_send_ipiq2(tgd, _wakeup, ident,
1113 domain | PWAKEUP_MYCPU);
8acf0617 1114 ++tgd->gd_cnt.v_wakeup_colls;
b4d1b684
MD
1115 } else if (id0 == ident ||
1116 qp->ident1 == ident ||
1117 qp->ident2 == ident ||
1118 qp->ident3 == ident) {
8acf0617
MD
1119 lwkt_send_ipiq2(tgd, _wakeup, ident,
1120 domain | PWAKEUP_MYCPU);
1121 }
a8eec89c 1122 }
8acf0617 1123#if 0
a8eec89c 1124 if (CPUMASK_TESTNZERO(mask)) {
c07315c4
MD
1125 lwkt_send_ipiq2_mask(mask, _wakeup, ident,
1126 domain | PWAKEUP_MYCPU);
1127 }
a8eec89c 1128#endif
fc17ad60 1129 }
fc17ad60 1130done:
8aa3430c 1131 logtsleep1(wakeup_end);
37af14fe 1132 crit_exit();
984263bc
MD
1133}
1134
b336a9b1
MD
1135/*
1136 * Wakeup all threads tsleep()ing on the specified ident, on all cpus
1137 */
984263bc 1138void
5decebc7 1139wakeup(const volatile void *ident)
984263bc 1140{
b0da0c88
MD
1141 globaldata_t gd = mycpu;
1142 thread_t td = gd->gd_curthread;
1143
1144 if (td && (td->td_flags & TDF_DELAYED_WAKEUP)) {
2e90abac
MD
1145 /*
1146 * If we are in a delayed wakeup section, record up to two wakeups in
1147 * a per-CPU queue and issue them when we block or exit the delayed
1148 * wakeup section.
1149 */
1150 if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[0], NULL, ident))
1151 return;
1152 if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[1], NULL, ident))
1153 return;
1154
1155 ident = atomic_swap_ptr(__DEQUALIFY(volatile void **, &gd->gd_delayed_wakeup[1]),
1156 __DEALL(ident));
1157 ident = atomic_swap_ptr(__DEQUALIFY(volatile void **, &gd->gd_delayed_wakeup[0]),
1158 __DEALL(ident));
b0da0c88 1159 }
2e90abac 1160
b0da0c88 1161 _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, gd->gd_cpuid));
0cfcada1 1162}
984263bc 1163
b336a9b1
MD
1164/*
1165 * Wakeup one thread tsleep()ing on the specified ident, on any cpu.
1166 */
0cfcada1 1167void
5decebc7 1168wakeup_one(const volatile void *ident)
0cfcada1 1169{
fc17ad60 1170 /* XXX potentially round-robin the first responding cpu */
c75e41b7
MD
1171 _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
1172 PWAKEUP_ONE);
da5fb9ef
MD
1173}
1174
b336a9b1
MD
1175/*
1176 * Wakeup threads tsleep()ing on the specified ident on the current cpu
1177 * only.
1178 */
1179void
5decebc7 1180wakeup_mycpu(const volatile void *ident)
b336a9b1 1181{
c75e41b7
MD
1182 _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
1183 PWAKEUP_MYCPU);
b336a9b1
MD
1184}
1185
1186/*
1187 * Wakeup one thread tsleep()ing on the specified ident on the current cpu
1188 * only.
1189 */
1190void
5decebc7 1191wakeup_mycpu_one(const volatile void *ident)
b336a9b1
MD
1192{
1193 /* XXX potentially round-robin the first responding cpu */
c75e41b7
MD
1194 _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
1195 PWAKEUP_MYCPU | PWAKEUP_ONE);
b336a9b1
MD
1196}
1197
1198/*
1199 * Wakeup all thread tsleep()ing on the specified ident on the specified cpu
1200 * only.
1201 */
1202void
5decebc7 1203wakeup_oncpu(globaldata_t gd, const volatile void *ident)
b336a9b1 1204{
c75e41b7 1205 globaldata_t mygd = mycpu;
b336a9b1 1206 if (gd == mycpu) {
c75e41b7
MD
1207 _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
1208 PWAKEUP_MYCPU);
b336a9b1 1209 } else {
c75e41b7
MD
1210 lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
1211 PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
1212 PWAKEUP_MYCPU);
b336a9b1
MD
1213 }
1214}
1215
1216/*
1217 * Wakeup one thread tsleep()ing on the specified ident on the specified cpu
1218 * only.
1219 */
1220void
5decebc7 1221wakeup_oncpu_one(globaldata_t gd, const volatile void *ident)
b336a9b1 1222{
c75e41b7
MD
1223 globaldata_t mygd = mycpu;
1224 if (gd == mygd) {
1225 _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
1226 PWAKEUP_MYCPU | PWAKEUP_ONE);
b336a9b1 1227 } else {
5decebc7 1228 lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
c75e41b7 1229 PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
5decebc7 1230 PWAKEUP_MYCPU | PWAKEUP_ONE);
b336a9b1
MD
1231 }
1232}
1233
1234/*
1235 * Wakeup all threads waiting on the specified ident that slept using
1236 * the specified domain, on all cpus.
1237 */
da5fb9ef 1238void
5decebc7 1239wakeup_domain(const volatile void *ident, int domain)
da5fb9ef 1240{
5decebc7 1241 _wakeup(__DEALL(ident), PWAKEUP_ENCODE(domain, mycpu->gd_cpuid));
da5fb9ef
MD
1242}
1243
b336a9b1
MD
1244/*
1245 * Wakeup one thread waiting on the specified ident that slept using
1246 * the specified domain, on any cpu.
1247 */
da5fb9ef 1248void
5decebc7 1249wakeup_domain_one(const volatile void *ident, int domain)
da5fb9ef 1250{
fc17ad60 1251 /* XXX potentially round-robin the first responding cpu */
5decebc7
MD
1252 _wakeup(__DEALL(ident),
1253 PWAKEUP_ENCODE(domain, mycpu->gd_cpuid) | PWAKEUP_ONE);
984263bc
MD
1254}
1255
b0da0c88
MD
1256void
1257wakeup_start_delayed(void)
1258{
1259 globaldata_t gd = mycpu;
1260
1261 crit_enter();
1262 gd->gd_curthread->td_flags |= TDF_DELAYED_WAKEUP;
1263 crit_exit();
1264}
1265
1266void
1267wakeup_end_delayed(void)
1268{
1269 globaldata_t gd = mycpu;
1270
1271 if (gd->gd_curthread->td_flags & TDF_DELAYED_WAKEUP) {
1272 crit_enter();
1273 gd->gd_curthread->td_flags &= ~TDF_DELAYED_WAKEUP;
1274 if (gd->gd_delayed_wakeup[0] || gd->gd_delayed_wakeup[1]) {
1275 if (gd->gd_delayed_wakeup[0]) {
1276 wakeup(gd->gd_delayed_wakeup[0]);
1277 gd->gd_delayed_wakeup[0] = NULL;
1278 }
1279 if (gd->gd_delayed_wakeup[1]) {
1280 wakeup(gd->gd_delayed_wakeup[1]);
1281 gd->gd_delayed_wakeup[1] = NULL;
1282 }
1283 }
1284 crit_exit();
1285 }
1286}
1287
984263bc 1288/*
344ad853
MD
1289 * setrunnable()
1290 *
4643740a
MD
1291 * Make a process runnable. lp->lwp_token must be held on call and this
1292 * function must be called from the cpu owning lp.
37af14fe 1293 *
4643740a 1294 * This only has an effect if we are in LSSTOP or LSSLEEP.
984263bc
MD
1295 */
1296void
9a379a4a 1297setrunnable(struct lwp *lp)
984263bc 1298{
4643740a
MD
1299 thread_t td = lp->lwp_thread;
1300
e2b148c6 1301 ASSERT_LWKT_TOKEN_HELD(&lp->lwp_token);
4643740a 1302 KKASSERT(td->td_gd == mycpu);
344ad853 1303 crit_enter();
2daf83b0
SS
1304 if (lp->lwp_stat == LSSTOP)
1305 lp->lwp_stat = LSSLEEP;
4643740a
MD
1306 if (lp->lwp_stat == LSSLEEP) {
1307 _tsleep_remove(td);
1308 lwkt_schedule(td);
1309 } else if (td->td_flags & TDF_SINTR) {
1310 lwkt_schedule(td);
1311 }
344ad853 1312 crit_exit();
984263bc
MD
1313}
1314
1315/*
164b8401
SS
1316 * The process is stopped due to some condition, usually because p_stat is
1317 * set to SSTOP, but also possibly due to being traced.
fc17ad60 1318 *
4643740a
MD
1319 * Caller must hold p->p_token
1320 *
164b8401 1321 * NOTE! If the caller sets SSTOP, the caller must also clear P_WAITED
344ad853
MD
1322 * because the parent may check the child's status before the child actually
1323 * gets to this routine.
1324 *
9a379a4a 1325 * This routine is called with the current lwp only, typically just
4643740a
MD
1326 * before returning to userland if the process state is detected as
1327 * possibly being in a stopped state.
984263bc
MD
1328 */
1329void
9a379a4a 1330tstop(void)
984263bc 1331{
9a379a4a 1332 struct lwp *lp = curthread->td_lwp;
7278a846 1333 struct proc *p = lp->lwp_proc;
8c986a82 1334 struct proc *q;
9a379a4a 1335
4643740a 1336 lwkt_gettoken(&lp->lwp_token);
7278a846 1337 crit_enter();
4643740a 1338
f33e8653 1339 /*
4643740a 1340 * If LWP_MP_WSTOP is set, we were sleeping
f33e8653
SS
1341 * while our process was stopped. At this point
1342 * we were already counted as stopped.
1343 */
4643740a 1344 if ((lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
f33e8653
SS
1345 /*
1346 * If we're the last thread to stop, signal
1347 * our parent.
1348 */
1349 p->p_nstopped++;
4643740a 1350 atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
ea59a697 1351 wakeup(&p->p_nstopped);
f33e8653 1352 if (p->p_nstopped == p->p_nthreads) {
8c986a82
MD
1353 /*
1354 * Token required to interlock kern_wait()
1355 */
1356 q = p->p_pptr;
1357 PHOLD(q);
1358 lwkt_gettoken(&q->p_token);
4643740a 1359 p->p_flags &= ~P_WAITED;
f33e8653 1360 wakeup(p->p_pptr);
8c986a82
MD
1361 if ((q->p_sigacts->ps_flag & PS_NOCLDSTOP) == 0)
1362 ksignal(q, SIGCHLD);
1363 lwkt_reltoken(&q->p_token);
1364 PRELE(q);
f33e8653
SS
1365 }
1366 }
0001762f
MD
1367
1368 /*
1369 * Wait here while in a stopped state, interlocked with lwp_token.
1370 * We must break-out if the whole process is trying to exit.
1371 */
9c960153 1372 while (STOPLWP(p, lp)) {
ea59a697
SS
1373 lp->lwp_stat = LSSTOP;
1374 tsleep(p, 0, "stop", 0);
1375 }
7278a846 1376 p->p_nstopped--;
4643740a 1377 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
7278a846 1378 crit_exit();
4643740a 1379 lwkt_reltoken(&lp->lwp_token);
26a0694b
MD
1380}
1381
984263bc
MD
1382/*
1383 * Compute a tenex style load average of a quantity on
586c4308
MD
1384 * 1, 5 and 15 minute intervals. This is a pcpu callout.
1385 *
1386 * We segment the lwp scan on a pcpu basis. This does NOT
1387 * mean the associated lwps are on this cpu, it is done
1388 * just to break the work up.
1389 *
1390 * The callout on cpu0 rolls up the stats from the other
1391 * cpus.
984263bc 1392 */
c7e98b2f 1393static int loadav_count_runnable(struct lwp *p, void *data);
8fa76237 1394
984263bc
MD
1395static void
1396loadav(void *arg)
1397{
586c4308 1398 globaldata_t gd = mycpu;
984263bc 1399 struct loadavg *avg;
8fa76237 1400 int i, nrun;
984263bc 1401
984263bc 1402 nrun = 0;
586c4308
MD
1403 alllwp_scan(loadav_count_runnable, &nrun, 1);
1404 gd->gd_loadav_nrunnable = nrun;
1405 if (gd->gd_cpuid == 0) {
1406 avg = &averunnable;
1407 nrun = 0;
1408 for (i = 0; i < ncpus; ++i)
1409 nrun += globaldata_find(i)->gd_loadav_nrunnable;
1410 for (i = 0; i < 3; i++) {
1411 avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
1412 (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
1413 }
8fa76237 1414 }
984263bc
MD
1415
1416 /*
1417 * Schedule the next update to occur after 5 seconds, but add a
1418 * random variation to avoid synchronisation with processes that
1419 * run at regular intervals.
1420 */
586c4308
MD
1421 callout_reset(&gd->gd_loadav_callout,
1422 hz * 4 + (int)(krandom() % (hz * 2 + 1)),
8fa76237
MD
1423 loadav, NULL);
1424}
1425
1426static int
c7e98b2f 1427loadav_count_runnable(struct lwp *lp, void *data)
8fa76237
MD
1428{
1429 int *nrunp = data;
1430 thread_t td;
1431
164b8401
SS
1432 switch (lp->lwp_stat) {
1433 case LSRUN:
08f2f1bb 1434 if ((td = lp->lwp_thread) == NULL)
8fa76237
MD
1435 break;
1436 if (td->td_flags & TDF_BLOCKED)
1437 break;
8fa76237
MD
1438 ++*nrunp;
1439 break;
1440 default:
1441 break;
1442 }
d2d8515b 1443 lwkt_yield();
8fa76237 1444 return(0);
984263bc
MD
1445}
1446
f6aeec64
MD
1447/*
1448 * Regular data collection
1449 */
1450static uint64_t
1451collect_load_callback(int n)
1452{
77bc82e1
MD
1453 int fscale = averunnable.fscale;
1454
1455 return ((averunnable.ldavg[0] * 100 + (fscale >> 1)) / fscale);
f6aeec64
MD
1456}
1457
984263bc 1458static void
666ff13c 1459sched_setup(void *dummy __unused)
984263bc 1460{
666ff13c
MD
1461 globaldata_t save_gd = mycpu;
1462 globaldata_t gd;
1463 int n;
1464
f6aeec64
MD
1465 kcollect_register(KCOLLECT_LOAD, "load", collect_load_callback,
1466 KCOLLECT_SCALE(KCOLLECT_LOAD_FORMAT, 0));
666ff13c
MD
1467
1468 /*
1469 * Kick off timeout driven events by calling first time. We
1470 * split the work across available cpus to help scale it,
1471 * it can eat a lot of cpu when there are a lot of processes
1472 * on the system.
1473 */
1474 for (n = 0; n < ncpus; ++n) {
1475 gd = globaldata_find(n);
1476 lwkt_setcpu_self(gd);
1477 callout_init_mp(&gd->gd_loadav_callout);
1478 callout_init_mp(&gd->gd_schedcpu_callout);
1479 schedcpu(NULL);
1480 loadav(NULL);
1481 }
1482 lwkt_setcpu_self(save_gd);
1483}
1484
1485/*
1486 * Extremely early initialization, dummy-up the tables so we don't have
1487 * to conditionalize for NULL in _wakeup() and tsleep_interlock(). Even
1488 * though the system isn't blocking this early, these functions still
1489 * try to access the hash table.
1490 *
1491 * This setup will be overridden once sched_dyninit() -> sleep_gdinit()
1492 * is called.
1493 */
1494void
1495sleep_early_gdinit(globaldata_t gd)
1496{
1497 static struct tslpque dummy_slpque;
1498 static cpumask_t dummy_cpumasks;
1499
1500 slpque_tablesize = 1;
1501 gd->gd_tsleep_hash = &dummy_slpque;
1502 slpque_cpumasks = &dummy_cpumasks;
8acf0617 1503 TAILQ_INIT(&dummy_slpque.queue);
666ff13c
MD
1504}
1505
1506/*
1507 * PCPU initialization. Called after KMALLOC is operational, by
1508 * sched_dyninit() for cpu 0, and by mi_gdinit() for other cpus later.
1509 *
1510 * WARNING! The pcpu hash table is smaller than the global cpumask
1511 * hash table, which can save us a lot of memory when maxproc
1512 * is set high.
1513 */
1514void
1515sleep_gdinit(globaldata_t gd)
1516{
1517 struct thread *td;
1bc11bc6 1518 size_t hash_size;
666ff13c
MD
1519 uint32_t n;
1520 uint32_t i;
1521
1522 /*
1523 * This shouldn't happen, that is there shouldn't be any threads
1524 * waiting on the dummy tsleep queue this early in the boot.
1525 */
1526 if (gd->gd_cpuid == 0) {
8acf0617
MD
1527 struct tslpque *qp = &gd->gd_tsleep_hash[0];
1528 TAILQ_FOREACH(td, &qp->queue, td_sleepq) {
666ff13c
MD
1529 kprintf("SLEEP_GDINIT SWITCH %s\n", td->td_comm);
1530 }
1531 }
1532
1533 /*
1534 * Note that we have to allocate one extra slot because we are
1535 * shifting a modulo value. TCHASHSHIFT(slpque_tablesize - 1) can
1536 * return the same value as TCHASHSHIFT(slpque_tablesize).
1537 */
1538 n = TCHASHSHIFT(slpque_tablesize) + 1;
1539
1bc11bc6
SZ
1540 hash_size = sizeof(struct tslpque) * n;
1541 gd->gd_tsleep_hash = (void *)kmem_alloc3(&kernel_map, hash_size,
1542 VM_SUBSYS_GD,
1543 KM_CPU(gd->gd_cpuid));
1544 memset(gd->gd_tsleep_hash, 0, hash_size);
666ff13c 1545 for (i = 0; i < n; ++i)
8acf0617 1546 TAILQ_INIT(&gd->gd_tsleep_hash[i].queue);
666ff13c
MD
1547}
1548
1549/*
1550 * Dynamic initialization after the memory system is operational.
1551 */
1552static void
1553sched_dyninit(void *dummy __unused)
1554{
1555 int tblsize;
1556 int tblsize2;
1557 int n;
1558
1559 /*
1560 * Calculate table size for slpque hash. We want a prime number
1561 * large enough to avoid overloading slpque_cpumasks when the
1562 * system has a large number of sleeping processes, which will
1563 * spam IPIs on wakeup().
1564 *
1565 * While it is true this is really a per-lwp factor, generally
1566 * speaking the maxproc limit is a good metric to go by.
1567 */
1568 for (tblsize = maxproc | 1; ; tblsize += 2) {
1569 if (tblsize % 3 == 0)
1570 continue;
1571 if (tblsize % 5 == 0)
1572 continue;
1573 tblsize2 = (tblsize / 2) | 1;
1574 for (n = 7; n < tblsize2; n += 2) {
1575 if (tblsize % n == 0)
1576 break;
1577 }
1578 if (n == tblsize2)
1579 break;
1580 }
1581
1582 /*
1583 * PIDs are currently limited to 6 digits. Cap the table size
1584 * at double this.
1585 */
1586 if (tblsize > 2000003)
1587 tblsize = 2000003;
1588
1589 slpque_tablesize = tblsize;
1590 slpque_cpumasks = kmalloc(sizeof(*slpque_cpumasks) * slpque_tablesize,
1591 M_TSLEEP, M_WAITOK | M_ZERO);
1592 sleep_gdinit(mycpu);
984263bc 1593}