Commit | Line | Data |
---|---|---|
984263bc MD |
1 | /*- |
2 | * Copyright (c) 1982, 1986, 1990, 1991, 1993 | |
3 | * The Regents of the University of California. All rights reserved. | |
4 | * (c) UNIX System Laboratories, Inc. | |
5 | * All or some portions of this file are derived from material licensed | |
6 | * to the University of California by American Telephone and Telegraph | |
7 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with | |
8 | * the permission of UNIX System Laboratories, Inc. | |
9 | * | |
10 | * Redistribution and use in source and binary forms, with or without | |
11 | * modification, are permitted provided that the following conditions | |
12 | * are met: | |
13 | * 1. Redistributions of source code must retain the above copyright | |
14 | * notice, this list of conditions and the following disclaimer. | |
15 | * 2. Redistributions in binary form must reproduce the above copyright | |
16 | * notice, this list of conditions and the following disclaimer in the | |
17 | * documentation and/or other materials provided with the distribution. | |
dc71b7ab | 18 | * 3. Neither the name of the University nor the names of its contributors |
984263bc MD |
19 | * may be used to endorse or promote products derived from this software |
20 | * without specific prior written permission. | |
21 | * | |
22 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
23 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
24 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
25 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
26 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
27 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
28 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
29 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
30 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
31 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
32 | * SUCH DAMAGE. | |
33 | * | |
34 | * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 | |
35 | * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $ | |
36 | */ | |
37 | ||
38 | #include "opt_ktrace.h" | |
39 | ||
40 | #include <sys/param.h> | |
41 | #include <sys/systm.h> | |
42 | #include <sys/proc.h> | |
43 | #include <sys/kernel.h> | |
44 | #include <sys/signalvar.h> | |
45 | #include <sys/resourcevar.h> | |
46 | #include <sys/vmmeter.h> | |
47 | #include <sys/sysctl.h> | |
344ad853 | 48 | #include <sys/lock.h> |
2b3f93ea | 49 | #include <sys/caps.h> |
f6aeec64 | 50 | #include <sys/kcollect.h> |
e2164e29 | 51 | #include <sys/malloc.h> |
fc9ae81d | 52 | #ifdef KTRACE |
984263bc MD |
53 | #include <sys/ktrace.h> |
54 | #endif | |
9afb0ffd | 55 | #include <sys/ktr.h> |
684a93c4 | 56 | #include <sys/serialize.h> |
984263bc | 57 | |
684a93c4 | 58 | #include <sys/signal2.h> |
bf765287 MD |
59 | #include <sys/thread2.h> |
60 | #include <sys/spinlock2.h> | |
7f6220a9 | 61 | #include <sys/mutex2.h> |
bf765287 | 62 | |
984263bc | 63 | #include <machine/cpu.h> |
984263bc MD |
64 | #include <machine/smp.h> |
65 | ||
1bc11bc6 SZ |
66 | #include <vm/vm_extern.h> |
67 | ||
8acf0617 MD |
68 | struct tslpque { |
69 | TAILQ_HEAD(, thread) queue; | |
70 | const volatile void *ident0; | |
71 | const volatile void *ident1; | |
72 | const volatile void *ident2; | |
73 | const volatile void *ident3; | |
74 | }; | |
fc17ad60 | 75 | |
402ed7e1 | 76 | static void sched_setup (void *dummy); |
f3f3eadb | 77 | SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL); |
666ff13c MD |
78 | static void sched_dyninit (void *dummy); |
79 | SYSINIT(sched_dyninit, SI_BOOT1_DYNALLOC, SI_ORDER_FIRST, sched_dyninit, NULL); | |
984263bc | 80 | |
984263bc | 81 | int lbolt; |
50e4012a | 82 | void *lbolt_syncer; |
ac7fb7ba | 83 | __read_mostly int tsleep_crypto_dump = 0; |
e3c330f0 MD |
84 | __read_mostly int ncpus; |
85 | __read_mostly int ncpus_fit, ncpus_fit_mask; /* note: mask not cpumask_t */ | |
86 | __read_mostly int safepri; | |
87 | __read_mostly int tsleep_now_works; | |
984263bc | 88 | |
fc17ad60 | 89 | MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues"); |
984263bc | 90 | |
5decebc7 MD |
91 | #define __DEALL(ident) __DEQUALIFY(void *, ident) |
92 | ||
9afb0ffd MD |
93 | #if !defined(KTR_TSLEEP) |
94 | #define KTR_TSLEEP KTR_ALL | |
95 | #endif | |
96 | KTR_INFO_MASTER(tsleep); | |
5bf48697 AE |
97 | KTR_INFO(KTR_TSLEEP, tsleep, tsleep_beg, 0, "tsleep enter %p", const volatile void *ident); |
98 | KTR_INFO(KTR_TSLEEP, tsleep, tsleep_end, 1, "tsleep exit"); | |
99 | KTR_INFO(KTR_TSLEEP, tsleep, wakeup_beg, 2, "wakeup enter %p", const volatile void *ident); | |
100 | KTR_INFO(KTR_TSLEEP, tsleep, wakeup_end, 3, "wakeup exit"); | |
101 | KTR_INFO(KTR_TSLEEP, tsleep, ilockfail, 4, "interlock failed %p", const volatile void *ident); | |
8aa3430c MD |
102 | |
103 | #define logtsleep1(name) KTR_LOG(tsleep_ ## name) | |
104 | #define logtsleep2(name, val) KTR_LOG(tsleep_ ## name, val) | |
9afb0ffd | 105 | |
ac7fb7ba | 106 | __exclusive_cache_line |
984263bc MD |
107 | struct loadavg averunnable = |
108 | { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ | |
109 | /* | |
110 | * Constants for averages over 1, 5, and 15 minutes | |
111 | * when sampling at 5 second intervals. | |
112 | */ | |
ac7fb7ba | 113 | __read_mostly |
984263bc MD |
114 | static fixpt_t cexp[3] = { |
115 | 0.9200444146293232 * FSCALE, /* exp(-1/12) */ | |
116 | 0.9834714538216174 * FSCALE, /* exp(-1/60) */ | |
117 | 0.9944598480048967 * FSCALE, /* exp(-1/180) */ | |
118 | }; | |
119 | ||
402ed7e1 RG |
120 | static void endtsleep (void *); |
121 | static void loadav (void *arg); | |
402ed7e1 | 122 | static void schedcpu (void *arg); |
984263bc | 123 | |
ac7fb7ba | 124 | __read_mostly static int pctcpu_decay = 10; |
a8eec89c MD |
125 | SYSCTL_INT(_kern, OID_AUTO, pctcpu_decay, CTLFLAG_RW, |
126 | &pctcpu_decay, 0, ""); | |
dcc99b62 MD |
127 | |
128 | /* | |
bce6845a | 129 | * kernel uses `FSCALE', userland (SHOULD) use kern.fscale |
984263bc | 130 | */ |
ac7fb7ba | 131 | __read_mostly int fscale __unused = FSCALE; /* exported to systat */ |
dcc99b62 | 132 | SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); |
984263bc | 133 | |
a8eec89c MD |
134 | /* |
135 | * Issue a wakeup() from userland (debugging) | |
136 | */ | |
137 | static int | |
138 | sysctl_wakeup(SYSCTL_HANDLER_ARGS) | |
139 | { | |
140 | uint64_t ident = 1; | |
141 | int error = 0; | |
142 | ||
143 | if (req->newptr != NULL) { | |
2b3f93ea | 144 | if (caps_priv_check_self(SYSCAP_RESTRICTEDROOT)) |
a8eec89c MD |
145 | return (EPERM); |
146 | error = SYSCTL_IN(req, &ident, sizeof(ident)); | |
147 | if (error) | |
148 | return error; | |
149 | kprintf("issue wakeup %016jx\n", ident); | |
150 | wakeup((void *)(intptr_t)ident); | |
151 | } | |
152 | if (req->oldptr != NULL) { | |
153 | error = SYSCTL_OUT(req, &ident, sizeof(ident)); | |
154 | } | |
155 | return error; | |
156 | } | |
157 | ||
63c41f05 MD |
158 | static int |
159 | sysctl_wakeup_umtx(SYSCTL_HANDLER_ARGS) | |
160 | { | |
161 | uint64_t ident = 1; | |
162 | int error = 0; | |
163 | ||
164 | if (req->newptr != NULL) { | |
2b3f93ea | 165 | if (caps_priv_check_self(SYSCAP_RESTRICTEDROOT)) |
63c41f05 MD |
166 | return (EPERM); |
167 | error = SYSCTL_IN(req, &ident, sizeof(ident)); | |
168 | if (error) | |
169 | return error; | |
170 | kprintf("issue wakeup %016jx, PDOMAIN_UMTX\n", ident); | |
171 | wakeup_domain((void *)(intptr_t)ident, PDOMAIN_UMTX); | |
172 | } | |
173 | if (req->oldptr != NULL) { | |
174 | error = SYSCTL_OUT(req, &ident, sizeof(ident)); | |
175 | } | |
176 | return error; | |
177 | } | |
178 | ||
a8eec89c MD |
179 | SYSCTL_PROC(_debug, OID_AUTO, wakeup, CTLTYPE_UQUAD|CTLFLAG_RW, 0, 0, |
180 | sysctl_wakeup, "Q", "issue wakeup(addr)"); | |
63c41f05 MD |
181 | SYSCTL_PROC(_debug, OID_AUTO, wakeup_umtx, CTLTYPE_UQUAD|CTLFLAG_RW, 0, 0, |
182 | sysctl_wakeup_umtx, "Q", "issue wakeup(addr, PDOMAIN_UMTX)"); | |
a8eec89c | 183 | |
984263bc | 184 | /* |
0a3f9b47 | 185 | * Recompute process priorities, once a second. |
dcc99b62 MD |
186 | * |
187 | * Since the userland schedulers are typically event oriented, if the | |
188 | * estcpu calculation at wakeup() time is not sufficient to make a | |
189 | * process runnable relative to other processes in the system we have | |
190 | * a 1-second recalc to help out. | |
191 | * | |
192 | * This code also allows us to store sysclock_t data in the process structure | |
bce6845a | 193 | * without fear of an overrun, since sysclock_t are guarenteed to hold |
dcc99b62 | 194 | * several seconds worth of count. |
8fa76237 MD |
195 | * |
196 | * WARNING! callouts can preempt normal threads. However, they will not | |
197 | * preempt a thread holding a spinlock so we *can* safely use spinlocks. | |
984263bc | 198 | */ |
8fa76237 MD |
199 | static int schedcpu_stats(struct proc *p, void *data __unused); |
200 | static int schedcpu_resource(struct proc *p, void *data __unused); | |
201 | ||
984263bc | 202 | static void |
26a0694b | 203 | schedcpu(void *arg) |
984263bc | 204 | { |
586c4308 MD |
205 | allproc_scan(schedcpu_stats, NULL, 1); |
206 | allproc_scan(schedcpu_resource, NULL, 1); | |
207 | if (mycpu->gd_cpuid == 0) { | |
208 | wakeup((caddr_t)&lbolt); | |
209 | wakeup(lbolt_syncer); | |
210 | } | |
211 | callout_reset(&mycpu->gd_schedcpu_callout, hz, schedcpu, NULL); | |
8fa76237 MD |
212 | } |
213 | ||
214 | /* | |
215 | * General process statistics once a second | |
216 | */ | |
217 | static int | |
218 | schedcpu_stats(struct proc *p, void *data __unused) | |
219 | { | |
08f2f1bb SS |
220 | struct lwp *lp; |
221 | ||
7bea4e64 MD |
222 | /* |
223 | * Threads may not be completely set up if process in SIDL state. | |
224 | */ | |
225 | if (p->p_stat == SIDL) | |
226 | return(0); | |
227 | ||
0d78b86e | 228 | PHOLD(p); |
85946b6c MD |
229 | if (lwkt_trytoken(&p->p_token) == FALSE) { |
230 | PRELE(p); | |
231 | return(0); | |
232 | } | |
0d78b86e | 233 | |
8fa76237 | 234 | p->p_swtime++; |
c7e98b2f | 235 | FOREACH_LWP_IN_PROC(lp, p) { |
de4d4cb0 MD |
236 | if (lp->lwp_stat == LSSLEEP) { |
237 | ++lp->lwp_slptime; | |
238 | if (lp->lwp_slptime == 1) | |
239 | p->p_usched->uload_update(lp); | |
240 | } | |
4b5f931b | 241 | |
c7e98b2f SS |
242 | /* |
243 | * Only recalculate processes that are active or have slept | |
244 | * less then 2 seconds. The schedulers understand this. | |
bc55d64f | 245 | * Otherwise decay by 50% per second. |
5f4788b4 MD |
246 | * |
247 | * NOTE: uload_update is called separately from kern_synch.c | |
248 | * when slptime == 1, removing the thread's | |
249 | * uload/ucount. | |
c7e98b2f SS |
250 | */ |
251 | if (lp->lwp_slptime <= 1) { | |
252 | p->p_usched->recalculate(lp); | |
253 | } else { | |
bc55d64f MD |
254 | int decay; |
255 | ||
256 | decay = pctcpu_decay; | |
257 | cpu_ccfence(); | |
258 | if (decay <= 1) | |
259 | decay = 1; | |
260 | if (decay > 100) | |
261 | decay = 100; | |
262 | lp->lwp_pctcpu = (lp->lwp_pctcpu * (decay - 1)) / decay; | |
c7e98b2f | 263 | } |
8fa76237 | 264 | } |
0d78b86e | 265 | lwkt_reltoken(&p->p_token); |
d2d8515b | 266 | lwkt_yield(); |
0d78b86e | 267 | PRELE(p); |
8fa76237 MD |
268 | return(0); |
269 | } | |
a46fac56 | 270 | |
8fa76237 | 271 | /* |
84204577 | 272 | * Resource checks. XXX break out since ksignal/killproc can block, |
8fa76237 MD |
273 | * limiting us to one process killed per second. There is probably |
274 | * a better way. | |
275 | */ | |
276 | static int | |
277 | schedcpu_resource(struct proc *p, void *data __unused) | |
278 | { | |
279 | u_int64_t ttime; | |
08f2f1bb | 280 | struct lwp *lp; |
8fa76237 | 281 | |
0d78b86e MD |
282 | if (p->p_stat == SIDL) |
283 | return(0); | |
284 | ||
285 | PHOLD(p); | |
85946b6c MD |
286 | if (lwkt_trytoken(&p->p_token) == FALSE) { |
287 | PRELE(p); | |
288 | return(0); | |
289 | } | |
0d78b86e MD |
290 | |
291 | if (p->p_stat == SZOMB || p->p_limit == NULL) { | |
292 | lwkt_reltoken(&p->p_token); | |
293 | PRELE(p); | |
8fa76237 | 294 | return(0); |
984263bc | 295 | } |
344ad853 | 296 | |
c7e98b2f SS |
297 | ttime = 0; |
298 | FOREACH_LWP_IN_PROC(lp, p) { | |
e595c6cd MD |
299 | /* |
300 | * We may have caught an lp in the middle of being | |
301 | * created, lwp_thread can be NULL. | |
302 | */ | |
303 | if (lp->lwp_thread) { | |
304 | ttime += lp->lwp_thread->td_sticks; | |
305 | ttime += lp->lwp_thread->td_uticks; | |
306 | } | |
c7e98b2f | 307 | } |
8fa76237 | 308 | |
384ee8f0 | 309 | switch(plimit_testcpulimit(p, ttime)) { |
8fa76237 MD |
310 | case PLIMIT_TESTCPU_KILL: |
311 | killproc(p, "exceeded maximum CPU limit"); | |
312 | break; | |
313 | case PLIMIT_TESTCPU_XCPU: | |
4643740a MD |
314 | if ((p->p_flags & P_XCPU) == 0) { |
315 | p->p_flags |= P_XCPU; | |
84204577 | 316 | ksignal(p, SIGXCPU); |
344ad853 | 317 | } |
8fa76237 MD |
318 | break; |
319 | default: | |
c0b8a06d | 320 | break; |
344ad853 | 321 | } |
0d78b86e | 322 | lwkt_reltoken(&p->p_token); |
d2d8515b | 323 | lwkt_yield(); |
0d78b86e | 324 | PRELE(p); |
8fa76237 | 325 | return(0); |
984263bc MD |
326 | } |
327 | ||
328 | /* | |
dcc99b62 MD |
329 | * This is only used by ps. Generate a cpu percentage use over |
330 | * a period of one second. | |
984263bc | 331 | */ |
dcc99b62 | 332 | void |
553ea3c8 | 333 | updatepcpu(struct lwp *lp, int cpticks, int ttlticks) |
984263bc | 334 | { |
dcc99b62 MD |
335 | fixpt_t acc; |
336 | int remticks; | |
337 | ||
338 | acc = (cpticks << FSHIFT) / ttlticks; | |
339 | if (ttlticks >= ESTCPUFREQ) { | |
553ea3c8 | 340 | lp->lwp_pctcpu = acc; |
dcc99b62 MD |
341 | } else { |
342 | remticks = ESTCPUFREQ - ttlticks; | |
553ea3c8 | 343 | lp->lwp_pctcpu = (acc * ttlticks + lp->lwp_pctcpu * remticks) / |
dcc99b62 | 344 | ESTCPUFREQ; |
a46fac56 | 345 | } |
984263bc MD |
346 | } |
347 | ||
/*
 * Hash helpers for the sleep queues.
 *
 * LOOKUP(x)      maps a sleep identifier to an index into the global
 *                cpumask hash (range 0 .. slpque_tablesize - 1).
 * TCHASHSHIFT(x) converts such an index into the index used for the
 *                per-cpu hash arrays by dropping the low four bits.
 *
 * By making the pcpu hash arrays smaller we save a significant amount
 * of memory at very low cost.  The real cost is in IPIs, which are
 * handled by the much larger global cpumask hash table.
 */
#define LOOKUP_PRIME	66555444443333333ULL
#define LOOKUP(x)	((((uintptr_t)(x) + ((uintptr_t)(x) >> 18)) ^	\
			  LOOKUP_PRIME) % slpque_tablesize)
#define TCHASHSHIFT(x)	((x) >> 4)
984263bc | 361 | |
a7c16d7a MD |
362 | __read_mostly static uint32_t slpque_tablesize; |
363 | __read_mostly static cpumask_t *slpque_cpumasks; | |
984263bc | 364 | |
4e8fa957 SZ |
365 | SYSCTL_UINT(_kern, OID_AUTO, slpque_tablesize, CTLFLAG_RD, &slpque_tablesize, |
366 | 0, ""); | |
367 | ||
ae8e83e6 MD |
368 | /* |
369 | * This is a dandy function that allows us to interlock tsleep/wakeup | |
370 | * operations with unspecified upper level locks, such as lockmgr locks, | |
371 | * simply by holding a critical section. The sequence is: | |
372 | * | |
373 | * (acquire upper level lock) | |
374 | * tsleep_interlock(blah) | |
375 | * (release upper level lock) | |
376 | * tsleep(blah, ...) | |
377 | * | |
378 | * Basically this functions queues us on the tsleep queue without actually | |
379 | * descheduling us. When tsleep() is later called with PINTERLOCK it | |
380 | * assumes the thread was already queued, otherwise it queues it there. | |
381 | * | |
382 | * Thus it is possible to receive the wakeup prior to going to sleep and | |
383 | * the race conditions are covered. | |
384 | */ | |
385 | static __inline void | |
5decebc7 | 386 | _tsleep_interlock(globaldata_t gd, const volatile void *ident, int flags) |
ae8e83e6 MD |
387 | { |
388 | thread_t td = gd->gd_curthread; | |
8acf0617 | 389 | struct tslpque *qp; |
666ff13c MD |
390 | uint32_t cid; |
391 | uint32_t gid; | |
ae8e83e6 | 392 | |
a8eec89c MD |
393 | if (ident == NULL) { |
394 | kprintf("tsleep_interlock: NULL ident %s\n", td->td_comm); | |
395 | print_backtrace(5); | |
396 | } | |
397 | ||
ae8e83e6 MD |
398 | crit_enter_quick(td); |
399 | if (td->td_flags & TDF_TSLEEPQ) { | |
afd7f124 MD |
400 | /* |
401 | * Shortcut if unchanged | |
402 | */ | |
403 | if (td->td_wchan == ident && | |
404 | td->td_wdomain == (flags & PDOMAIN_MASK)) { | |
405 | crit_exit_quick(td); | |
406 | return; | |
407 | } | |
408 | ||
409 | /* | |
410 | * Remove current sleepq | |
411 | */ | |
666ff13c MD |
412 | cid = LOOKUP(td->td_wchan); |
413 | gid = TCHASHSHIFT(cid); | |
8acf0617 MD |
414 | qp = &gd->gd_tsleep_hash[gid]; |
415 | TAILQ_REMOVE(&qp->queue, td, td_sleepq); | |
416 | if (TAILQ_FIRST(&qp->queue) == NULL) { | |
8acf0617 MD |
417 | qp->ident0 = NULL; |
418 | qp->ident1 = NULL; | |
419 | qp->ident2 = NULL; | |
420 | qp->ident3 = NULL; | |
b4d1b684 MD |
421 | ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid], |
422 | gd->gd_cpuid); | |
b12defdc | 423 | } |
ae8e83e6 MD |
424 | } else { |
425 | td->td_flags |= TDF_TSLEEPQ; | |
426 | } | |
666ff13c MD |
427 | cid = LOOKUP(ident); |
428 | gid = TCHASHSHIFT(cid); | |
8acf0617 MD |
429 | qp = &gd->gd_tsleep_hash[gid]; |
430 | TAILQ_INSERT_TAIL(&qp->queue, td, td_sleepq); | |
431 | if (qp->ident0 != ident && qp->ident1 != ident && | |
432 | qp->ident2 != ident && qp->ident3 != ident) { | |
433 | if (qp->ident0 == NULL) | |
434 | qp->ident0 = ident; | |
435 | else if (qp->ident1 == NULL) | |
436 | qp->ident1 = ident; | |
437 | else if (qp->ident2 == NULL) | |
438 | qp->ident2 = ident; | |
439 | else if (qp->ident3 == NULL) | |
440 | qp->ident3 = ident; | |
441 | else | |
442 | qp->ident0 = (void *)(intptr_t)-1; | |
443 | } | |
666ff13c | 444 | ATOMIC_CPUMASK_ORBIT(slpque_cpumasks[cid], gd->gd_cpuid); |
ae8e83e6 MD |
445 | td->td_wchan = ident; |
446 | td->td_wdomain = flags & PDOMAIN_MASK; | |
ae8e83e6 MD |
447 | crit_exit_quick(td); |
448 | } | |
449 | ||
450 | void | |
5decebc7 | 451 | tsleep_interlock(const volatile void *ident, int flags) |
ae8e83e6 MD |
452 | { |
453 | _tsleep_interlock(mycpu, ident, flags); | |
454 | } | |
455 | ||
456 | /* | |
457 | * Remove thread from sleepq. Must be called with a critical section held. | |
4643740a | 458 | * The thread must not be migrating. |
ae8e83e6 MD |
459 | */ |
460 | static __inline void | |
461 | _tsleep_remove(thread_t td) | |
462 | { | |
463 | globaldata_t gd = mycpu; | |
8acf0617 | 464 | struct tslpque *qp; |
666ff13c MD |
465 | uint32_t cid; |
466 | uint32_t gid; | |
ae8e83e6 | 467 | |
957625b2 | 468 | KKASSERT(td->td_gd == gd && IN_CRITICAL_SECT(td)); |
4643740a | 469 | KKASSERT((td->td_flags & TDF_MIGRATING) == 0); |
ae8e83e6 MD |
470 | if (td->td_flags & TDF_TSLEEPQ) { |
471 | td->td_flags &= ~TDF_TSLEEPQ; | |
666ff13c MD |
472 | cid = LOOKUP(td->td_wchan); |
473 | gid = TCHASHSHIFT(cid); | |
8acf0617 MD |
474 | qp = &gd->gd_tsleep_hash[gid]; |
475 | TAILQ_REMOVE(&qp->queue, td, td_sleepq); | |
476 | if (TAILQ_FIRST(&qp->queue) == NULL) { | |
666ff13c | 477 | ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid], |
c07315c4 MD |
478 | gd->gd_cpuid); |
479 | } | |
ae8e83e6 MD |
480 | td->td_wchan = NULL; |
481 | td->td_wdomain = 0; | |
482 | } | |
483 | } | |
484 | ||
485 | void | |
486 | tsleep_remove(thread_t td) | |
487 | { | |
488 | _tsleep_remove(td); | |
489 | } | |
490 | ||
984263bc MD |
491 | /* |
492 | * General sleep call. Suspends the current process until a wakeup is | |
493 | * performed on the specified identifier. The process will then be made | |
494 | * runnable with the specified priority. Sleeps at most timo/hz seconds | |
377d4740 | 495 | * (0 means no timeout). If flags includes PCATCH flag, signals are checked |
984263bc MD |
496 | * before and after sleeping, else signals are not checked. Returns 0 if |
497 | * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a | |
498 | * signal needs to be delivered, ERESTART is returned if the current system | |
499 | * call should be restarted if possible, and EINTR is returned if the system | |
500 | * call should be interrupted by the signal (return EINTR). | |
26a0694b | 501 | * |
0a3f9b47 MD |
502 | * Note that if we are a process, we release_curproc() before messing with |
503 | * the LWKT scheduler. | |
a46fac56 MD |
504 | * |
505 | * During autoconfiguration or after a panic, a sleep will simply | |
506 | * lower the priority briefly to allow interrupts, then return. | |
94f98873 MD |
507 | * |
508 | * WARNING! This code can't block (short of switching away), or bad things | |
509 | * will happen. No getting tokens, no blocking locks, etc. | |
984263bc MD |
510 | */ |
511 | int | |
5decebc7 | 512 | tsleep(const volatile void *ident, int flags, const char *wmesg, int timo) |
984263bc | 513 | { |
dadab5e9 | 514 | struct thread *td = curthread; |
08f2f1bb | 515 | struct lwp *lp = td->td_lwp; |
0cfcada1 | 516 | struct proc *p = td->td_proc; /* may be NULL */ |
fc17ad60 | 517 | globaldata_t gd; |
344ad853 MD |
518 | int sig; |
519 | int catch; | |
344ad853 | 520 | int error; |
e43a034f | 521 | int oldpri; |
fac0eb3c MD |
522 | struct callout thandle1; |
523 | struct _callout thandle2; | |
984263bc | 524 | |
b0da0c88 MD |
525 | /* |
526 | * Currently a severe hack. Make sure any delayed wakeups | |
527 | * are flushed before we sleep or we might deadlock on whatever | |
528 | * event we are sleeping on. | |
529 | */ | |
530 | if (td->td_flags & TDF_DELAYED_WAKEUP) | |
531 | wakeup_end_delayed(); | |
532 | ||
0cfcada1 MD |
533 | /* |
534 | * NOTE: removed KTRPOINT, it could cause races due to blocking | |
535 | * even in stable. Just scrap it for now. | |
536 | */ | |
5ea440eb | 537 | if (!tsleep_crypto_dump && (tsleep_now_works == 0 || panicstr)) { |
984263bc | 538 | /* |
dbcd0c9b MD |
539 | * After a panic, or before we actually have an operational |
540 | * softclock, just give interrupts a chance, then just return; | |
541 | * | |
984263bc MD |
542 | * don't run any other procs or panic below, |
543 | * in case this is the idle process and already asleep. | |
544 | */ | |
e43a034f | 545 | splz(); |
f9235b6d | 546 | oldpri = td->td_pri; |
e43a034f MD |
547 | lwkt_setpri_self(safepri); |
548 | lwkt_switch(); | |
549 | lwkt_setpri_self(oldpri); | |
984263bc MD |
550 | return (0); |
551 | } | |
8aa3430c | 552 | logtsleep2(tsleep_beg, ident); |
fc17ad60 MD |
553 | gd = td->td_gd; |
554 | KKASSERT(td != &gd->gd_idlethread); /* you must be kidding! */ | |
344ad853 MD |
555 | |
556 | /* | |
557 | * NOTE: all of this occurs on the current cpu, including any | |
558 | * callout-based wakeups, so a critical section is a sufficient | |
559 | * interlock. | |
560 | * | |
561 | * The entire sequence through to where we actually sleep must | |
562 | * run without breaking the critical section. | |
563 | */ | |
344ad853 MD |
564 | catch = flags & PCATCH; |
565 | error = 0; | |
566 | sig = 0; | |
567 | ||
37af14fe | 568 | crit_enter_quick(td); |
344ad853 | 569 | |
0cfcada1 | 570 | KASSERT(ident != NULL, ("tsleep: no ident")); |
7278a846 SS |
571 | KASSERT(lp == NULL || |
572 | lp->lwp_stat == LSRUN || /* Obvious */ | |
573 | lp->lwp_stat == LSSTOP, /* Set in tstop */ | |
574 | ("tsleep %p %s %d", | |
575 | ident, wmesg, lp->lwp_stat)); | |
0cfcada1 | 576 | |
5686ec5a MD |
577 | /* |
578 | * We interlock the sleep queue if the caller has not already done | |
579 | * it for us. This must be done before we potentially acquire any | |
580 | * tokens or we can loose the wakeup. | |
581 | */ | |
582 | if ((flags & PINTERLOCKED) == 0) { | |
5686ec5a MD |
583 | _tsleep_interlock(gd, ident, flags); |
584 | } | |
585 | ||
344ad853 | 586 | /* |
4643740a MD |
587 | * Setup for the current process (if this is a process). We must |
588 | * interlock with lwp_token to avoid remote wakeup races via | |
589 | * setrunnable() | |
344ad853 | 590 | */ |
08f2f1bb | 591 | if (lp) { |
4643740a | 592 | lwkt_gettoken(&lp->lwp_token); |
1a3f33f1 MD |
593 | |
594 | /* | |
595 | * If the umbrella process is in the SCORE state then | |
596 | * make sure that the thread is flagged going into a | |
597 | * normal sleep to allow the core dump to proceed, otherwise | |
598 | * the coredump can end up waiting forever. If the normal | |
599 | * sleep is woken up, the thread will enter a stopped state | |
600 | * upon return to userland. | |
601 | * | |
602 | * We do not want to interrupt or cause a thread exist at | |
603 | * this juncture because that will mess-up the state the | |
604 | * coredump is trying to save. | |
605 | */ | |
ac39aef5 MD |
606 | if (p->p_stat == SCORE) { |
607 | lwkt_gettoken(&p->p_token); | |
608 | if ((lp->lwp_mpflags & LWP_MP_WSTOP) == 0) { | |
609 | atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP); | |
610 | ++p->p_nstopped; | |
611 | } | |
612 | lwkt_reltoken(&p->p_token); | |
1a3f33f1 MD |
613 | } |
614 | ||
615 | /* | |
616 | * PCATCH requested. | |
617 | */ | |
344ad853 MD |
618 | if (catch) { |
619 | /* | |
620 | * Early termination if PCATCH was set and a | |
621 | * signal is pending, interlocked with the | |
622 | * critical section. | |
623 | * | |
624 | * Early termination only occurs when tsleep() is | |
164b8401 | 625 | * entered while in a normal LSRUN state. |
344ad853 | 626 | */ |
08f2f1bb | 627 | if ((sig = CURSIG(lp)) != 0) |
344ad853 MD |
628 | goto resume; |
629 | ||
630 | /* | |
5686ec5a | 631 | * Causes ksignal to wake us up if a signal is |
a094cc95 | 632 | * received (interlocked with lp->lwp_token). |
344ad853 | 633 | */ |
4643740a | 634 | lp->lwp_flags |= LWP_SINTR; |
344ad853 | 635 | } |
5686ec5a MD |
636 | } else { |
637 | KKASSERT(p == NULL); | |
4ecd8190 | 638 | } |
344ad853 | 639 | |
4ecd8190 | 640 | /* |
4ecd8190 MD |
641 | * Make sure the current process has been untangled from |
642 | * the userland scheduler and initialize slptime to start | |
5686ec5a | 643 | * counting. |
c75e41b7 MD |
644 | * |
645 | * NOTE: td->td_wakefromcpu is pre-set by the release function | |
646 | * for the dfly scheduler, and then adjusted by _wakeup() | |
4ecd8190 MD |
647 | */ |
648 | if (lp) { | |
08f2f1bb SS |
649 | p->p_usched->release_curproc(lp); |
650 | lp->lwp_slptime = 0; | |
0a3f9b47 | 651 | } |
fc17ad60 | 652 | |
d9345d3a | 653 | /* |
afd7f124 MD |
654 | * For PINTERLOCKED operation, TDF_TSLEEPQ might not be set if |
655 | * a wakeup() was processed before the thread could go to sleep. | |
656 | * | |
657 | * If TDF_TSLEEPQ is set, make sure the ident matches the recorded | |
658 | * ident. If it does not then the thread slept inbetween the | |
659 | * caller's initial tsleep_interlock() call and the caller's tsleep() | |
660 | * call. | |
d9345d3a | 661 | * |
4ecd8190 MD |
662 | * Extreme loads can cause the sending of an IPI (e.g. wakeup()'s) |
663 | * to process incoming IPIs, thus draining incoming wakeups. | |
d9345d3a | 664 | */ |
4ecd8190 MD |
665 | if ((td->td_flags & TDF_TSLEEPQ) == 0) { |
666 | logtsleep2(ilockfail, ident); | |
667 | goto resume; | |
afd7f124 MD |
668 | } else if (td->td_wchan != ident || |
669 | td->td_wdomain != (flags & PDOMAIN_MASK)) { | |
670 | logtsleep2(ilockfail, ident); | |
671 | goto resume; | |
d9345d3a | 672 | } |
4ecd8190 MD |
673 | |
674 | /* | |
675 | * scheduling is blocked while in a critical section. Coincide | |
676 | * the descheduled-by-tsleep flag with the descheduling of the | |
677 | * lwkt. | |
8d446850 MD |
678 | * |
679 | * The timer callout is localized on our cpu and interlocked by | |
680 | * our critical section. | |
4ecd8190 | 681 | */ |
37af14fe | 682 | lwkt_deschedule_self(td); |
ae8e83e6 | 683 | td->td_flags |= TDF_TSLEEP_DESCHEDULED; |
344ad853 | 684 | td->td_wmesg = wmesg; |
344ad853 MD |
685 | |
686 | /* | |
8d446850 MD |
687 | * Setup the timeout, if any. The timeout is only operable while |
688 | * the thread is flagged descheduled. | |
344ad853 | 689 | */ |
8d446850 | 690 | KKASSERT((td->td_flags & TDF_TIMEOUT) == 0); |
076fecef | 691 | if (timo) { |
fac0eb3c | 692 | _callout_setup_quick(&thandle1, &thandle2, timo, endtsleep, td); |
076fecef | 693 | } |
344ad853 | 694 | |
984263bc | 695 | /* |
344ad853 | 696 | * Beddy bye bye. |
984263bc | 697 | */ |
08f2f1bb | 698 | if (lp) { |
26a0694b | 699 | /* |
52eedfb5 | 700 | * Ok, we are sleeping. Place us in the SSLEEP state. |
26a0694b | 701 | */ |
4643740a | 702 | KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); |
6b4d33c2 | 703 | |
7278a846 SS |
704 | /* |
705 | * tstop() sets LSSTOP, so don't fiddle with that. | |
706 | */ | |
707 | if (lp->lwp_stat != LSSTOP) | |
708 | lp->lwp_stat = LSSLEEP; | |
08f2f1bb | 709 | lp->lwp_ru.ru_nvcsw++; |
e28d8b15 | 710 | p->p_usched->uload_update(lp); |
de4d4cb0 | 711 | lwkt_switch(); |
ab554892 MD |
712 | |
713 | /* | |
164b8401 | 714 | * And when we are woken up, put us back in LSRUN. If we |
ab554892 MD |
715 | * slept for over a second, recalculate our estcpu. |
716 | */ | |
164b8401 | 717 | lp->lwp_stat = LSRUN; |
de4d4cb0 MD |
718 | if (lp->lwp_slptime) { |
719 | p->p_usched->uload_update(lp); | |
08f2f1bb | 720 | p->p_usched->recalculate(lp); |
de4d4cb0 | 721 | } |
08f2f1bb | 722 | lp->lwp_slptime = 0; |
0cfcada1 MD |
723 | } else { |
724 | lwkt_switch(); | |
725 | } | |
344ad853 | 726 | |
bce6845a | 727 | /* |
fc17ad60 | 728 | * Make sure we haven't switched cpus while we were asleep. It's |
344ad853 | 729 | * not supposed to happen. Cleanup our temporary flags. |
fc17ad60 MD |
730 | */ |
731 | KKASSERT(gd == td->td_gd); | |
344ad853 MD |
732 | |
733 | /* | |
8d446850 | 734 | * Cleanup the timeout. If the timeout has already occured thandle |
8ef3e14a MD |
735 | * has already been stopped, otherwise stop thandle. |
736 | * | |
737 | * If the timeout is still running the callout thread must be blocked | |
738 | * trying to get lwp_token, or this is a VM where cpu-cpu races are | |
739 | * common, then wait for us to get scheduled. | |
344ad853 MD |
740 | */ |
741 | if (timo) { | |
4643740a | 742 | while (td->td_flags & TDF_TIMEOUT_RUNNING) { |
a739e4f8 MD |
743 | /* else we won't get rescheduled! */ |
744 | if (lp->lwp_stat != LSSTOP) | |
745 | lp->lwp_stat = LSSLEEP; | |
4643740a MD |
746 | lwkt_deschedule_self(td); |
747 | td->td_wmesg = "tsrace"; | |
748 | lwkt_switch(); | |
4643740a | 749 | } |
344ad853 MD |
750 | if (td->td_flags & TDF_TIMEOUT) { |
751 | td->td_flags &= ~TDF_TIMEOUT; | |
a40da8f0 | 752 | error = EWOULDBLOCK; |
344ad853 | 753 | } else { |
fac0eb3c MD |
754 | /* |
755 | * We are on the same cpu so use the quick version | |
756 | * which is guaranteed not to block or race. | |
757 | */ | |
758 | _callout_cancel_quick(&thandle2); | |
344ad853 | 759 | } |
0cfcada1 | 760 | } |
4643740a | 761 | td->td_flags &= ~TDF_TSLEEP_DESCHEDULED; |
344ad853 MD |
762 | |
763 | /* | |
8d446850 MD |
764 | * Make sure we have been removed from the sleepq. In most |
765 | * cases this will have been done for us already but it is | |
766 | * possible for a scheduling IPI to be in-flight from a | |
767 | * previous tsleep/tsleep_interlock() or due to a straight-out | |
768 | * call to lwkt_schedule() (in the case of an interrupt thread), | |
769 | * causing a spurious wakeup. | |
344ad853 | 770 | */ |
ae8e83e6 | 771 | _tsleep_remove(td); |
344ad853 | 772 | td->td_wmesg = NULL; |
344ad853 MD |
773 | |
774 | /* | |
7c1212ec | 775 | * Figure out the correct error return. If interrupted by a |
bce6845a | 776 | * signal we want to return EINTR or ERESTART. |
344ad853 MD |
777 | */ |
778 | resume: | |
4643740a | 779 | if (lp) { |
7c1212ec | 780 | if (catch && error == 0) { |
94f98873 | 781 | if (sig != 0 || (sig = CURSIG(lp))) { |
7c1212ec MD |
782 | if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) |
783 | error = EINTR; | |
784 | else | |
785 | error = ERESTART; | |
786 | } | |
984263bc | 787 | } |
5525fede | 788 | |
4643740a | 789 | lp->lwp_flags &= ~LWP_SINTR; |
5525fede MD |
790 | |
791 | /* | |
9c960153 MD |
792 | * Unconditionally set us to LSRUN on resume. lwp_stat could |
793 | * be in a weird state due to the goto resume, particularly | |
794 | * when tsleep() is called from tstop(). | |
5525fede | 795 | */ |
9c960153 | 796 | lp->lwp_stat = LSRUN; |
4643740a | 797 | lwkt_reltoken(&lp->lwp_token); |
984263bc | 798 | } |
8aa3430c | 799 | logtsleep1(tsleep_end); |
344ad853 | 800 | crit_exit_quick(td); |
afd7f124 | 801 | |
344ad853 | 802 | return (error); |
984263bc MD |
803 | } |
804 | ||
bf765287 MD |
805 | /* |
806 | * Interlocked spinlock sleep. An exclusively held spinlock must | |
e590ee86 | 807 | * be passed to ssleep(). The function will atomically release the |
bf765287 MD |
808 | * spinlock and tsleep on the ident, then reacquire the spinlock and |
809 | * return. | |
810 | * | |
811 | * This routine is fairly important along the critical path, so optimize it | |
812 | * heavily. | |
813 | */ | |
814 | int | |
5decebc7 | 815 | ssleep(const volatile void *ident, struct spinlock *spin, int flags, |
bf765287 MD |
816 | const char *wmesg, int timo) |
817 | { | |
818 | globaldata_t gd = mycpu; | |
819 | int error; | |
16523a43 | 820 | |
ae8e83e6 | 821 | _tsleep_interlock(gd, ident, flags); |
7cfe2b28 | 822 | spin_unlock_quick(gd, spin); |
ef48be0d | 823 | error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo); |
a8eec89c | 824 | KKASSERT(gd == mycpu); |
050032ec | 825 | _spin_lock_quick(gd, spin, wmesg); |
bf765287 MD |
826 | |
827 | return (error); | |
16523a43 MD |
828 | } |
829 | ||
bed060de | 830 | int |
5decebc7 MD |
831 | lksleep(const volatile void *ident, struct lock *lock, int flags, |
832 | const char *wmesg, int timo) | |
bed060de AH |
833 | { |
834 | globaldata_t gd = mycpu; | |
835 | int error; | |
836 | ||
837 | _tsleep_interlock(gd, ident, flags); | |
838 | lockmgr(lock, LK_RELEASE); | |
839 | error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo); | |
840 | lockmgr(lock, LK_EXCLUSIVE); | |
841 | ||
842 | return (error); | |
843 | } | |
844 | ||
7f6220a9 MD |
845 | /* |
846 | * Interlocked mutex sleep. An exclusively held mutex must be passed | |
847 | * to mtxsleep(). The function will atomically release the mutex | |
848 | * and tsleep on the ident, then reacquire the mutex and return. | |
849 | */ | |
850 | int | |
5decebc7 | 851 | mtxsleep(const volatile void *ident, struct mtx *mtx, int flags, |
7f6220a9 MD |
852 | const char *wmesg, int timo) |
853 | { | |
854 | globaldata_t gd = mycpu; | |
855 | int error; | |
856 | ||
857 | _tsleep_interlock(gd, ident, flags); | |
858 | mtx_unlock(mtx); | |
859 | error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo); | |
cabfc9f6 | 860 | mtx_lock_ex_quick(mtx); |
7f6220a9 MD |
861 | |
862 | return (error); | |
863 | } | |
864 | ||
362e59be SZ |
865 | /* |
866 | * Interlocked serializer sleep. An exclusively held serializer must | |
ed3f6624 | 867 | * be passed to zsleep(). The function will atomically release |
362e59be SZ |
868 | * the serializer and tsleep on the ident, then reacquire the serializer |
869 | * and return. | |
870 | */ | |
871 | int | |
5decebc7 | 872 | zsleep(const volatile void *ident, struct lwkt_serialize *slz, int flags, |
ed3f6624 | 873 | const char *wmesg, int timo) |
362e59be | 874 | { |
ae8e83e6 | 875 | globaldata_t gd = mycpu; |
362e59be SZ |
876 | int ret; |
877 | ||
878 | ASSERT_SERIALIZED(slz); | |
879 | ||
ae8e83e6 | 880 | _tsleep_interlock(gd, ident, flags); |
362e59be | 881 | lwkt_serialize_exit(slz); |
ef48be0d | 882 | ret = tsleep(ident, flags | PINTERLOCKED, wmesg, timo); |
362e59be | 883 | lwkt_serialize_enter(slz); |
362e59be SZ |
884 | |
885 | return ret; | |
886 | } | |
887 | ||
a22c590e MD |
888 | /* |
889 | * Directly block on the LWKT thread by descheduling it. This | |
890 | * is much faster then tsleep(), but the only legal way to wake | |
891 | * us up is to directly schedule the thread. | |
892 | * | |
893 | * Setting TDF_SINTR will cause new signals to directly schedule us. | |
894 | * | |
ae8e83e6 | 895 | * This routine must be called while in a critical section. |
a22c590e MD |
896 | */ |
897 | int | |
898 | lwkt_sleep(const char *wmesg, int flags) | |
899 | { | |
900 | thread_t td = curthread; | |
901 | int sig; | |
902 | ||
903 | if ((flags & PCATCH) == 0 || td->td_lwp == NULL) { | |
904 | td->td_flags |= TDF_BLOCKED; | |
905 | td->td_wmesg = wmesg; | |
906 | lwkt_deschedule_self(td); | |
907 | lwkt_switch(); | |
908 | td->td_wmesg = NULL; | |
909 | td->td_flags &= ~TDF_BLOCKED; | |
910 | return(0); | |
911 | } | |
912 | if ((sig = CURSIG(td->td_lwp)) != 0) { | |
913 | if (SIGISMEMBER(td->td_proc->p_sigacts->ps_sigintr, sig)) | |
914 | return(EINTR); | |
915 | else | |
916 | return(ERESTART); | |
bce6845a | 917 | |
a22c590e MD |
918 | } |
919 | td->td_flags |= TDF_BLOCKED | TDF_SINTR; | |
920 | td->td_wmesg = wmesg; | |
921 | lwkt_deschedule_self(td); | |
922 | lwkt_switch(); | |
923 | td->td_flags &= ~(TDF_BLOCKED | TDF_SINTR); | |
924 | td->td_wmesg = NULL; | |
925 | return(0); | |
926 | } | |
927 | ||
984263bc | 928 | /* |
344ad853 | 929 | * Implement the timeout for tsleep. |
fc17ad60 | 930 | * |
344ad853 MD |
931 | * This type of callout timeout is scheduled on the same cpu the process |
932 | * is sleeping on. Also, at the moment, the MP lock is held. | |
984263bc MD |
933 | */ |
934 | static void | |
0cfcada1 | 935 | endtsleep(void *arg) |
984263bc | 936 | { |
0cfcada1 | 937 | thread_t td = arg; |
9a379a4a | 938 | struct lwp *lp; |
984263bc | 939 | |
8d446850 | 940 | /* |
4643740a MD |
941 | * We are going to have to get the lwp_token, which means we might |
942 | * block. This can race a tsleep getting woken up by other means | |
943 | * so set TDF_TIMEOUT_RUNNING to force the tsleep to wait for our | |
944 | * processing to complete (sorry tsleep!). | |
945 | * | |
946 | * We can safely set td_flags because td MUST be on the same cpu | |
947 | * as we are. | |
8d446850 | 948 | */ |
4643740a MD |
949 | KKASSERT(td->td_gd == mycpu); |
950 | crit_enter(); | |
951 | td->td_flags |= TDF_TIMEOUT_RUNNING | TDF_TIMEOUT; | |
344ad853 MD |
952 | |
953 | /* | |
4643740a MD |
954 | * This can block but TDF_TIMEOUT_RUNNING will prevent the thread |
955 | * from exiting the tsleep on us. The flag is interlocked by virtue | |
956 | * of lp being on the same cpu as we are. | |
344ad853 | 957 | */ |
8d446850 | 958 | if ((lp = td->td_lwp) != NULL) |
e2b148c6 | 959 | lwkt_gettoken(&lp->lwp_token); |
344ad853 | 960 | |
4643740a MD |
961 | KKASSERT(td->td_flags & TDF_TSLEEP_DESCHEDULED); |
962 | ||
963 | if (lp) { | |
d4688a1e | 964 | /* |
9c960153 MD |
965 | * callout timer should normally never be set in tstop() |
966 | * because it passes a timeout of 0. However, there is a | |
967 | * case during thread exit (which SSTOP's all the threads) | |
968 | * for which tstop() must break out and can (properly) leave | |
969 | * the thread in LSSTOP. | |
d4688a1e | 970 | */ |
9c960153 MD |
971 | KKASSERT(lp->lwp_stat != LSSTOP || |
972 | (lp->lwp_mpflags & LWP_MP_WEXIT)); | |
d4688a1e | 973 | setrunnable(lp); |
e2b148c6 | 974 | lwkt_reltoken(&lp->lwp_token); |
4643740a MD |
975 | } else { |
976 | _tsleep_remove(td); | |
977 | lwkt_schedule(td); | |
978 | } | |
979 | KKASSERT(td->td_gd == mycpu); | |
980 | td->td_flags &= ~TDF_TIMEOUT_RUNNING; | |
37af14fe | 981 | crit_exit(); |
984263bc MD |
982 | } |
983 | ||
8fb8bca6 EN |
984 | /* |
985 | * Make all processes sleeping on the specified identifier runnable. | |
fc17ad60 MD |
986 | * count may be zero or one only. |
987 | * | |
c75e41b7 MD |
988 | * The domain encodes the sleep/wakeup domain, flags, plus the originating |
989 | * cpu. | |
344ad853 MD |
990 | * |
991 | * This call may run without the MP lock held. We can only manipulate thread | |
992 | * state on the cpu owning the thread. We CANNOT manipulate process state | |
993 | * at all. | |
5decebc7 MD |
994 | * |
995 | * _wakeup() can be passed to an IPI so we can't use (const volatile | |
996 | * void *ident). | |
8fb8bca6 EN |
997 | */ |
998 | static void | |
fc17ad60 | 999 | _wakeup(void *ident, int domain) |
984263bc | 1000 | { |
fc17ad60 | 1001 | struct tslpque *qp; |
0cfcada1 MD |
1002 | struct thread *td; |
1003 | struct thread *ntd; | |
fc17ad60 | 1004 | globaldata_t gd; |
fc17ad60 | 1005 | cpumask_t mask; |
666ff13c MD |
1006 | uint32_t cid; |
1007 | uint32_t gid; | |
8acf0617 | 1008 | int wids = 0; |
984263bc | 1009 | |
37af14fe | 1010 | crit_enter(); |
8aa3430c | 1011 | logtsleep2(wakeup_beg, ident); |
fc17ad60 | 1012 | gd = mycpu; |
666ff13c MD |
1013 | cid = LOOKUP(ident); |
1014 | gid = TCHASHSHIFT(cid); | |
1015 | qp = &gd->gd_tsleep_hash[gid]; | |
984263bc | 1016 | restart: |
8acf0617 | 1017 | for (td = TAILQ_FIRST(&qp->queue); td != NULL; td = ntd) { |
ae8e83e6 | 1018 | ntd = TAILQ_NEXT(td, td_sleepq); |
bce6845a | 1019 | if (td->td_wchan == ident && |
fc17ad60 MD |
1020 | td->td_wdomain == (domain & PDOMAIN_MASK) |
1021 | ) { | |
ae8e83e6 MD |
1022 | KKASSERT(td->td_gd == gd); |
1023 | _tsleep_remove(td); | |
c75e41b7 | 1024 | td->td_wakefromcpu = PWAKEUP_DECODE(domain); |
ae8e83e6 | 1025 | if (td->td_flags & TDF_TSLEEP_DESCHEDULED) { |
ae8e83e6 MD |
1026 | lwkt_schedule(td); |
1027 | if (domain & PWAKEUP_ONE) | |
1028 | goto done; | |
fc17ad60 | 1029 | } |
0cfcada1 | 1030 | goto restart; |
984263bc | 1031 | } |
8acf0617 MD |
1032 | if (td->td_wchan == qp->ident0) |
1033 | wids |= 1; | |
1034 | else if (td->td_wchan == qp->ident1) | |
1035 | wids |= 2; | |
1036 | else if (td->td_wchan == qp->ident2) | |
1037 | wids |= 4; | |
1038 | else if (td->td_wchan == qp->ident3) | |
1039 | wids |= 8; | |
1040 | else | |
1041 | wids |= 16; /* force ident0 to be retained (-1) */ | |
984263bc | 1042 | } |
fc17ad60 | 1043 | |
f26f7bb3 MD |
1044 | /* |
1045 | * Because a bunch of cpumask array entries cover the same queue, it | |
1046 | * is possible for our bit to remain set in some of them and cause | |
1047 | * spurious wakeup IPIs later on. Make sure that the bit is cleared | |
1048 | * when a spurious IPI occurs to prevent further spurious IPIs. | |
1049 | */ | |
8acf0617 | 1050 | if (TAILQ_FIRST(&qp->queue) == NULL) { |
f26f7bb3 | 1051 | ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid], gd->gd_cpuid); |
8acf0617 MD |
1052 | qp->ident0 = NULL; |
1053 | qp->ident1 = NULL; | |
1054 | qp->ident2 = NULL; | |
1055 | qp->ident3 = NULL; | |
1056 | } else { | |
1057 | if ((wids & 1) == 0) { | |
a8eec89c | 1058 | if ((wids & 16) == 0) { |
8acf0617 | 1059 | qp->ident0 = NULL; |
a8eec89c MD |
1060 | } else { |
1061 | KKASSERT(qp->ident0 == (void *)(intptr_t)-1); | |
1062 | } | |
8acf0617 MD |
1063 | } |
1064 | if ((wids & 2) == 0) | |
1065 | qp->ident1 = NULL; | |
1066 | if ((wids & 4) == 0) | |
1067 | qp->ident2 = NULL; | |
1068 | if ((wids & 8) == 0) | |
1069 | qp->ident3 = NULL; | |
f26f7bb3 MD |
1070 | } |
1071 | ||
fc17ad60 MD |
1072 | /* |
1073 | * We finished checking the current cpu but there still may be | |
1074 | * more work to do. Either wakeup_one was requested and no matching | |
1075 | * thread was found, or a normal wakeup was requested and we have | |
1076 | * to continue checking cpus. | |
1077 | * | |
fc17ad60 | 1078 | * It should be noted that this scheme is actually less expensive then |
bce6845a | 1079 | * the old scheme when waking up multiple threads, since we send |
fc17ad60 MD |
1080 | * only one IPI message per target candidate which may then schedule |
1081 | * multiple threads. Before we could have wound up sending an IPI | |
1082 | * message for each thread on the target cpu (!= current cpu) that | |
1083 | * needed to be woken up. | |
1084 | * | |
1085 | * NOTE: Wakeups occuring on remote cpus are asynchronous. This | |
e676ebda MD |
1086 | * should be ok since we are passing idents in the IPI rather |
1087 | * then thread pointers. | |
1088 | * | |
9b302485 | 1089 | * NOTE: We MUST mfence (or use an atomic op) prior to reading |
e676ebda MD |
1090 | * the cpumask, as another cpu may have written to it in |
1091 | * a fashion interlocked with whatever the caller did before | |
1092 | * calling wakeup(). Otherwise we might miss the interaction | |
1093 | * (kern_mutex.c can cause this problem). | |
9b302485 MD |
1094 | * |
1095 | * lfence is insufficient as it may allow a written state to | |
1096 | * reorder around the cpumask load. | |
fc17ad60 | 1097 | */ |
c07315c4 | 1098 | if ((domain & PWAKEUP_MYCPU) == 0) { |
8acf0617 | 1099 | globaldata_t tgd; |
35a64553 | 1100 | const volatile void *id0; |
8acf0617 MD |
1101 | int n; |
1102 | ||
9b302485 | 1103 | cpu_mfence(); |
a8eec89c | 1104 | /* cpu_lfence(); */ |
666ff13c | 1105 | mask = slpque_cpumasks[cid]; |
c07315c4 | 1106 | CPUMASK_ANDMASK(mask, gd->gd_other_cpus); |
8acf0617 MD |
1107 | while (CPUMASK_TESTNZERO(mask)) { |
1108 | n = BSRCPUMASK(mask); | |
1109 | CPUMASK_NANDBIT(mask, n); | |
1110 | tgd = globaldata_find(n); | |
b4d1b684 MD |
1111 | |
1112 | /* | |
1113 | * Both ident0 compares must from a single load | |
1114 | * to avoid ident0 update races crossing the two | |
1115 | * compares. | |
1116 | */ | |
a8eec89c | 1117 | qp = &tgd->gd_tsleep_hash[gid]; |
b4d1b684 MD |
1118 | id0 = qp->ident0; |
1119 | cpu_ccfence(); | |
1120 | if (id0 == (void *)(intptr_t)-1) { | |
1121 | lwkt_send_ipiq2(tgd, _wakeup, ident, | |
1122 | domain | PWAKEUP_MYCPU); | |
8acf0617 | 1123 | ++tgd->gd_cnt.v_wakeup_colls; |
b4d1b684 MD |
1124 | } else if (id0 == ident || |
1125 | qp->ident1 == ident || | |
1126 | qp->ident2 == ident || | |
1127 | qp->ident3 == ident) { | |
8acf0617 MD |
1128 | lwkt_send_ipiq2(tgd, _wakeup, ident, |
1129 | domain | PWAKEUP_MYCPU); | |
1130 | } | |
a8eec89c | 1131 | } |
8acf0617 | 1132 | #if 0 |
a8eec89c | 1133 | if (CPUMASK_TESTNZERO(mask)) { |
c07315c4 MD |
1134 | lwkt_send_ipiq2_mask(mask, _wakeup, ident, |
1135 | domain | PWAKEUP_MYCPU); | |
1136 | } | |
a8eec89c | 1137 | #endif |
fc17ad60 | 1138 | } |
fc17ad60 | 1139 | done: |
8aa3430c | 1140 | logtsleep1(wakeup_end); |
37af14fe | 1141 | crit_exit(); |
984263bc MD |
1142 | } |
1143 | ||
b336a9b1 MD |
1144 | /* |
1145 | * Wakeup all threads tsleep()ing on the specified ident, on all cpus | |
1146 | */ | |
984263bc | 1147 | void |
5decebc7 | 1148 | wakeup(const volatile void *ident) |
984263bc | 1149 | { |
b0da0c88 MD |
1150 | globaldata_t gd = mycpu; |
1151 | thread_t td = gd->gd_curthread; | |
1152 | ||
1153 | if (td && (td->td_flags & TDF_DELAYED_WAKEUP)) { | |
2e90abac MD |
1154 | /* |
1155 | * If we are in a delayed wakeup section, record up to two wakeups in | |
1156 | * a per-CPU queue and issue them when we block or exit the delayed | |
1157 | * wakeup section. | |
1158 | */ | |
1159 | if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[0], NULL, ident)) | |
1160 | return; | |
1161 | if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[1], NULL, ident)) | |
1162 | return; | |
1163 | ||
1164 | ident = atomic_swap_ptr(__DEQUALIFY(volatile void **, &gd->gd_delayed_wakeup[1]), | |
1165 | __DEALL(ident)); | |
1166 | ident = atomic_swap_ptr(__DEQUALIFY(volatile void **, &gd->gd_delayed_wakeup[0]), | |
1167 | __DEALL(ident)); | |
b0da0c88 | 1168 | } |
2e90abac | 1169 | |
b0da0c88 | 1170 | _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, gd->gd_cpuid)); |
0cfcada1 | 1171 | } |
984263bc | 1172 | |
b336a9b1 MD |
1173 | /* |
1174 | * Wakeup one thread tsleep()ing on the specified ident, on any cpu. | |
1175 | */ | |
0cfcada1 | 1176 | void |
5decebc7 | 1177 | wakeup_one(const volatile void *ident) |
0cfcada1 | 1178 | { |
fc17ad60 | 1179 | /* XXX potentially round-robin the first responding cpu */ |
c75e41b7 MD |
1180 | _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) | |
1181 | PWAKEUP_ONE); | |
da5fb9ef MD |
1182 | } |
1183 | ||
b336a9b1 MD |
1184 | /* |
1185 | * Wakeup threads tsleep()ing on the specified ident on the current cpu | |
1186 | * only. | |
1187 | */ | |
1188 | void | |
5decebc7 | 1189 | wakeup_mycpu(const volatile void *ident) |
b336a9b1 | 1190 | { |
c75e41b7 MD |
1191 | _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) | |
1192 | PWAKEUP_MYCPU); | |
b336a9b1 MD |
1193 | } |
1194 | ||
1195 | /* | |
1196 | * Wakeup one thread tsleep()ing on the specified ident on the current cpu | |
1197 | * only. | |
1198 | */ | |
1199 | void | |
5decebc7 | 1200 | wakeup_mycpu_one(const volatile void *ident) |
b336a9b1 MD |
1201 | { |
1202 | /* XXX potentially round-robin the first responding cpu */ | |
c75e41b7 MD |
1203 | _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) | |
1204 | PWAKEUP_MYCPU | PWAKEUP_ONE); | |
b336a9b1 MD |
1205 | } |
1206 | ||
1207 | /* | |
1208 | * Wakeup all thread tsleep()ing on the specified ident on the specified cpu | |
1209 | * only. | |
1210 | */ | |
1211 | void | |
5decebc7 | 1212 | wakeup_oncpu(globaldata_t gd, const volatile void *ident) |
b336a9b1 | 1213 | { |
c75e41b7 | 1214 | globaldata_t mygd = mycpu; |
b336a9b1 | 1215 | if (gd == mycpu) { |
c75e41b7 MD |
1216 | _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) | |
1217 | PWAKEUP_MYCPU); | |
b336a9b1 | 1218 | } else { |
c75e41b7 MD |
1219 | lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident), |
1220 | PWAKEUP_ENCODE(0, mygd->gd_cpuid) | | |
1221 | PWAKEUP_MYCPU); | |
b336a9b1 MD |
1222 | } |
1223 | } | |
1224 | ||
1225 | /* | |
1226 | * Wakeup one thread tsleep()ing on the specified ident on the specified cpu | |
1227 | * only. | |
1228 | */ | |
1229 | void | |
5decebc7 | 1230 | wakeup_oncpu_one(globaldata_t gd, const volatile void *ident) |
b336a9b1 | 1231 | { |
c75e41b7 MD |
1232 | globaldata_t mygd = mycpu; |
1233 | if (gd == mygd) { | |
1234 | _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) | | |
1235 | PWAKEUP_MYCPU | PWAKEUP_ONE); | |
b336a9b1 | 1236 | } else { |
5decebc7 | 1237 | lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident), |
c75e41b7 | 1238 | PWAKEUP_ENCODE(0, mygd->gd_cpuid) | |
5decebc7 | 1239 | PWAKEUP_MYCPU | PWAKEUP_ONE); |
b336a9b1 MD |
1240 | } |
1241 | } | |
1242 | ||
1243 | /* | |
1244 | * Wakeup all threads waiting on the specified ident that slept using | |
1245 | * the specified domain, on all cpus. | |
1246 | */ | |
da5fb9ef | 1247 | void |
5decebc7 | 1248 | wakeup_domain(const volatile void *ident, int domain) |
da5fb9ef | 1249 | { |
5decebc7 | 1250 | _wakeup(__DEALL(ident), PWAKEUP_ENCODE(domain, mycpu->gd_cpuid)); |
da5fb9ef MD |
1251 | } |
1252 | ||
b336a9b1 MD |
1253 | /* |
1254 | * Wakeup one thread waiting on the specified ident that slept using | |
1255 | * the specified domain, on any cpu. | |
1256 | */ | |
da5fb9ef | 1257 | void |
5decebc7 | 1258 | wakeup_domain_one(const volatile void *ident, int domain) |
da5fb9ef | 1259 | { |
fc17ad60 | 1260 | /* XXX potentially round-robin the first responding cpu */ |
5decebc7 MD |
1261 | _wakeup(__DEALL(ident), |
1262 | PWAKEUP_ENCODE(domain, mycpu->gd_cpuid) | PWAKEUP_ONE); | |
984263bc MD |
1263 | } |
1264 | ||
b0da0c88 MD |
1265 | void |
1266 | wakeup_start_delayed(void) | |
1267 | { | |
1268 | globaldata_t gd = mycpu; | |
1269 | ||
1270 | crit_enter(); | |
1271 | gd->gd_curthread->td_flags |= TDF_DELAYED_WAKEUP; | |
1272 | crit_exit(); | |
1273 | } | |
1274 | ||
1275 | void | |
1276 | wakeup_end_delayed(void) | |
1277 | { | |
1278 | globaldata_t gd = mycpu; | |
1279 | ||
1280 | if (gd->gd_curthread->td_flags & TDF_DELAYED_WAKEUP) { | |
1281 | crit_enter(); | |
1282 | gd->gd_curthread->td_flags &= ~TDF_DELAYED_WAKEUP; | |
1283 | if (gd->gd_delayed_wakeup[0] || gd->gd_delayed_wakeup[1]) { | |
1284 | if (gd->gd_delayed_wakeup[0]) { | |
1285 | wakeup(gd->gd_delayed_wakeup[0]); | |
1286 | gd->gd_delayed_wakeup[0] = NULL; | |
1287 | } | |
1288 | if (gd->gd_delayed_wakeup[1]) { | |
1289 | wakeup(gd->gd_delayed_wakeup[1]); | |
1290 | gd->gd_delayed_wakeup[1] = NULL; | |
1291 | } | |
1292 | } | |
1293 | crit_exit(); | |
1294 | } | |
1295 | } | |
1296 | ||
984263bc | 1297 | /* |
344ad853 MD |
1298 | * setrunnable() |
1299 | * | |
4643740a MD |
1300 | * Make a process runnable. lp->lwp_token must be held on call and this |
1301 | * function must be called from the cpu owning lp. | |
37af14fe | 1302 | * |
4643740a | 1303 | * This only has an effect if we are in LSSTOP or LSSLEEP. |
984263bc MD |
1304 | */ |
1305 | void | |
9a379a4a | 1306 | setrunnable(struct lwp *lp) |
984263bc | 1307 | { |
4643740a MD |
1308 | thread_t td = lp->lwp_thread; |
1309 | ||
e2b148c6 | 1310 | ASSERT_LWKT_TOKEN_HELD(&lp->lwp_token); |
4643740a | 1311 | KKASSERT(td->td_gd == mycpu); |
344ad853 | 1312 | crit_enter(); |
2daf83b0 SS |
1313 | if (lp->lwp_stat == LSSTOP) |
1314 | lp->lwp_stat = LSSLEEP; | |
4643740a MD |
1315 | if (lp->lwp_stat == LSSLEEP) { |
1316 | _tsleep_remove(td); | |
1317 | lwkt_schedule(td); | |
1318 | } else if (td->td_flags & TDF_SINTR) { | |
1319 | lwkt_schedule(td); | |
1320 | } | |
344ad853 | 1321 | crit_exit(); |
984263bc MD |
1322 | } |
1323 | ||
1324 | /* | |
164b8401 | 1325 | * The process is stopped due to some condition, usually because p_stat is |
bce6845a | 1326 | * set to SSTOP, but also possibly due to being traced. |
fc17ad60 | 1327 | * |
4643740a MD |
1328 | * Caller must hold p->p_token |
1329 | * | |
164b8401 | 1330 | * NOTE! If the caller sets SSTOP, the caller must also clear P_WAITED |
344ad853 MD |
1331 | * because the parent may check the child's status before the child actually |
1332 | * gets to this routine. | |
1333 | * | |
9a379a4a | 1334 | * This routine is called with the current lwp only, typically just |
4643740a MD |
1335 | * before returning to userland if the process state is detected as |
1336 | * possibly being in a stopped state. | |
984263bc MD |
1337 | */ |
1338 | void | |
9a379a4a | 1339 | tstop(void) |
984263bc | 1340 | { |
9a379a4a | 1341 | struct lwp *lp = curthread->td_lwp; |
7278a846 | 1342 | struct proc *p = lp->lwp_proc; |
8c986a82 | 1343 | struct proc *q; |
9a379a4a | 1344 | |
4643740a | 1345 | lwkt_gettoken(&lp->lwp_token); |
7278a846 | 1346 | crit_enter(); |
4643740a | 1347 | |
f33e8653 | 1348 | /* |
4643740a | 1349 | * If LWP_MP_WSTOP is set, we were sleeping |
f33e8653 SS |
1350 | * while our process was stopped. At this point |
1351 | * we were already counted as stopped. | |
1352 | */ | |
4643740a | 1353 | if ((lp->lwp_mpflags & LWP_MP_WSTOP) == 0) { |
f33e8653 SS |
1354 | /* |
1355 | * If we're the last thread to stop, signal | |
1356 | * our parent. | |
1357 | */ | |
1358 | p->p_nstopped++; | |
4643740a | 1359 | atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP); |
ea59a697 | 1360 | wakeup(&p->p_nstopped); |
f33e8653 | 1361 | if (p->p_nstopped == p->p_nthreads) { |
8c986a82 MD |
1362 | /* |
1363 | * Token required to interlock kern_wait() | |
1364 | */ | |
1365 | q = p->p_pptr; | |
1366 | PHOLD(q); | |
1367 | lwkt_gettoken(&q->p_token); | |
4643740a | 1368 | p->p_flags &= ~P_WAITED; |
f33e8653 | 1369 | wakeup(p->p_pptr); |
8c986a82 MD |
1370 | if ((q->p_sigacts->ps_flag & PS_NOCLDSTOP) == 0) |
1371 | ksignal(q, SIGCHLD); | |
1372 | lwkt_reltoken(&q->p_token); | |
1373 | PRELE(q); | |
f33e8653 SS |
1374 | } |
1375 | } | |
0001762f MD |
1376 | |
1377 | /* | |
1378 | * Wait here while in a stopped state, interlocked with lwp_token. | |
1379 | * We must break-out if the whole process is trying to exit. | |
1380 | */ | |
9c960153 | 1381 | while (STOPLWP(p, lp)) { |
ea59a697 SS |
1382 | lp->lwp_stat = LSSTOP; |
1383 | tsleep(p, 0, "stop", 0); | |
1384 | } | |
7278a846 | 1385 | p->p_nstopped--; |
4643740a | 1386 | atomic_clear_int(&lp->lwp_mpflags, LWP_MP_WSTOP); |
7278a846 | 1387 | crit_exit(); |
4643740a | 1388 | lwkt_reltoken(&lp->lwp_token); |
26a0694b MD |
1389 | } |
1390 | ||
984263bc MD |
1391 | /* |
1392 | * Compute a tenex style load average of a quantity on | |
586c4308 MD |
1393 | * 1, 5 and 15 minute intervals. This is a pcpu callout. |
1394 | * | |
1395 | * We segment the lwp scan on a pcpu basis. This does NOT | |
1396 | * mean the associated lwps are on this cpu, it is done | |
1397 | * just to break the work up. | |
1398 | * | |
1399 | * The callout on cpu0 rolls up the stats from the other | |
1400 | * cpus. | |
984263bc | 1401 | */ |
c7e98b2f | 1402 | static int loadav_count_runnable(struct lwp *p, void *data); |
8fa76237 | 1403 | |
984263bc MD |
1404 | static void |
1405 | loadav(void *arg) | |
1406 | { | |
586c4308 | 1407 | globaldata_t gd = mycpu; |
984263bc | 1408 | struct loadavg *avg; |
8fa76237 | 1409 | int i, nrun; |
984263bc | 1410 | |
984263bc | 1411 | nrun = 0; |
586c4308 MD |
1412 | alllwp_scan(loadav_count_runnable, &nrun, 1); |
1413 | gd->gd_loadav_nrunnable = nrun; | |
1414 | if (gd->gd_cpuid == 0) { | |
1415 | avg = &averunnable; | |
1416 | nrun = 0; | |
1417 | for (i = 0; i < ncpus; ++i) | |
1418 | nrun += globaldata_find(i)->gd_loadav_nrunnable; | |
1419 | for (i = 0; i < 3; i++) { | |
1420 | avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + | |
1421 | (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; | |
1422 | } | |
8fa76237 | 1423 | } |
984263bc MD |
1424 | |
1425 | /* | |
1426 | * Schedule the next update to occur after 5 seconds, but add a | |
1427 | * random variation to avoid synchronisation with processes that | |
1428 | * run at regular intervals. | |
1429 | */ | |
586c4308 MD |
1430 | callout_reset(&gd->gd_loadav_callout, |
1431 | hz * 4 + (int)(krandom() % (hz * 2 + 1)), | |
8fa76237 MD |
1432 | loadav, NULL); |
1433 | } | |
1434 | ||
1435 | static int | |
c7e98b2f | 1436 | loadav_count_runnable(struct lwp *lp, void *data) |
8fa76237 MD |
1437 | { |
1438 | int *nrunp = data; | |
1439 | thread_t td; | |
1440 | ||
164b8401 SS |
1441 | switch (lp->lwp_stat) { |
1442 | case LSRUN: | |
08f2f1bb | 1443 | if ((td = lp->lwp_thread) == NULL) |
8fa76237 MD |
1444 | break; |
1445 | if (td->td_flags & TDF_BLOCKED) | |
1446 | break; | |
8fa76237 MD |
1447 | ++*nrunp; |
1448 | break; | |
1449 | default: | |
1450 | break; | |
1451 | } | |
d2d8515b | 1452 | lwkt_yield(); |
8fa76237 | 1453 | return(0); |
984263bc MD |
1454 | } |
1455 | ||
f6aeec64 MD |
1456 | /* |
1457 | * Regular data collection | |
1458 | */ | |
1459 | static uint64_t | |
1460 | collect_load_callback(int n) | |
1461 | { | |
77bc82e1 MD |
1462 | int fscale = averunnable.fscale; |
1463 | ||
1464 | return ((averunnable.ldavg[0] * 100 + (fscale >> 1)) / fscale); | |
f6aeec64 MD |
1465 | } |
1466 | ||
984263bc | 1467 | static void |
666ff13c | 1468 | sched_setup(void *dummy __unused) |
984263bc | 1469 | { |
666ff13c MD |
1470 | globaldata_t save_gd = mycpu; |
1471 | globaldata_t gd; | |
1472 | int n; | |
1473 | ||
f6aeec64 MD |
1474 | kcollect_register(KCOLLECT_LOAD, "load", collect_load_callback, |
1475 | KCOLLECT_SCALE(KCOLLECT_LOAD_FORMAT, 0)); | |
666ff13c MD |
1476 | |
1477 | /* | |
1478 | * Kick off timeout driven events by calling first time. We | |
1479 | * split the work across available cpus to help scale it, | |
1480 | * it can eat a lot of cpu when there are a lot of processes | |
1481 | * on the system. | |
1482 | */ | |
1483 | for (n = 0; n < ncpus; ++n) { | |
1484 | gd = globaldata_find(n); | |
1485 | lwkt_setcpu_self(gd); | |
1486 | callout_init_mp(&gd->gd_loadav_callout); | |
1487 | callout_init_mp(&gd->gd_schedcpu_callout); | |
1488 | schedcpu(NULL); | |
1489 | loadav(NULL); | |
1490 | } | |
1491 | lwkt_setcpu_self(save_gd); | |
1492 | } | |
1493 | ||
1494 | /* | |
1495 | * Extremely early initialization, dummy-up the tables so we don't have | |
1496 | * to conditionalize for NULL in _wakeup() and tsleep_interlock(). Even | |
1497 | * though the system isn't blocking this early, these functions still | |
1498 | * try to access the hash table. | |
1499 | * | |
1500 | * This setup will be overridden once sched_dyninit() -> sleep_gdinit() | |
1501 | * is called. | |
1502 | */ | |
1503 | void | |
1504 | sleep_early_gdinit(globaldata_t gd) | |
1505 | { | |
1506 | static struct tslpque dummy_slpque; | |
1507 | static cpumask_t dummy_cpumasks; | |
1508 | ||
1509 | slpque_tablesize = 1; | |
1510 | gd->gd_tsleep_hash = &dummy_slpque; | |
1511 | slpque_cpumasks = &dummy_cpumasks; | |
8acf0617 | 1512 | TAILQ_INIT(&dummy_slpque.queue); |
666ff13c MD |
1513 | } |
1514 | ||
1515 | /* | |
1516 | * PCPU initialization. Called after KMALLOC is operational, by | |
1517 | * sched_dyninit() for cpu 0, and by mi_gdinit() for other cpus later. | |
1518 | * | |
1519 | * WARNING! The pcpu hash table is smaller than the global cpumask | |
1520 | * hash table, which can save us a lot of memory when maxproc | |
1521 | * is set high. | |
1522 | */ | |
1523 | void | |
1524 | sleep_gdinit(globaldata_t gd) | |
1525 | { | |
1526 | struct thread *td; | |
1bc11bc6 | 1527 | size_t hash_size; |
666ff13c MD |
1528 | uint32_t n; |
1529 | uint32_t i; | |
1530 | ||
1531 | /* | |
1532 | * This shouldn't happen, that is there shouldn't be any threads | |
1533 | * waiting on the dummy tsleep queue this early in the boot. | |
1534 | */ | |
1535 | if (gd->gd_cpuid == 0) { | |
8acf0617 MD |
1536 | struct tslpque *qp = &gd->gd_tsleep_hash[0]; |
1537 | TAILQ_FOREACH(td, &qp->queue, td_sleepq) { | |
666ff13c MD |
1538 | kprintf("SLEEP_GDINIT SWITCH %s\n", td->td_comm); |
1539 | } | |
1540 | } | |
1541 | ||
1542 | /* | |
1543 | * Note that we have to allocate one extra slot because we are | |
1544 | * shifting a modulo value. TCHASHSHIFT(slpque_tablesize - 1) can | |
1545 | * return the same value as TCHASHSHIFT(slpque_tablesize). | |
1546 | */ | |
1547 | n = TCHASHSHIFT(slpque_tablesize) + 1; | |
1548 | ||
1bc11bc6 | 1549 | hash_size = sizeof(struct tslpque) * n; |
1eeaf6b2 | 1550 | gd->gd_tsleep_hash = (void *)kmem_alloc3(kernel_map, hash_size, |
1bc11bc6 SZ |
1551 | VM_SUBSYS_GD, |
1552 | KM_CPU(gd->gd_cpuid)); | |
1553 | memset(gd->gd_tsleep_hash, 0, hash_size); | |
666ff13c | 1554 | for (i = 0; i < n; ++i) |
8acf0617 | 1555 | TAILQ_INIT(&gd->gd_tsleep_hash[i].queue); |
666ff13c MD |
1556 | } |
1557 | ||
1558 | /* | |
1559 | * Dynamic initialization after the memory system is operational. | |
1560 | */ | |
1561 | static void | |
1562 | sched_dyninit(void *dummy __unused) | |
1563 | { | |
1564 | int tblsize; | |
1565 | int tblsize2; | |
1566 | int n; | |
1567 | ||
1568 | /* | |
1569 | * Calculate table size for slpque hash. We want a prime number | |
1570 | * large enough to avoid overloading slpque_cpumasks when the | |
1571 | * system has a large number of sleeping processes, which will | |
1572 | * spam IPIs on wakeup(). | |
1573 | * | |
1574 | * While it is true this is really a per-lwp factor, generally | |
1575 | * speaking the maxproc limit is a good metric to go by. | |
1576 | */ | |
1577 | for (tblsize = maxproc | 1; ; tblsize += 2) { | |
1578 | if (tblsize % 3 == 0) | |
1579 | continue; | |
1580 | if (tblsize % 5 == 0) | |
1581 | continue; | |
1582 | tblsize2 = (tblsize / 2) | 1; | |
1583 | for (n = 7; n < tblsize2; n += 2) { | |
1584 | if (tblsize % n == 0) | |
1585 | break; | |
1586 | } | |
1587 | if (n == tblsize2) | |
1588 | break; | |
1589 | } | |
1590 | ||
1591 | /* | |
1592 | * PIDs are currently limited to 6 digits. Cap the table size | |
1593 | * at double this. | |
1594 | */ | |
1595 | if (tblsize > 2000003) | |
1596 | tblsize = 2000003; | |
1597 | ||
1598 | slpque_tablesize = tblsize; | |
1599 | slpque_cpumasks = kmalloc(sizeof(*slpque_cpumasks) * slpque_tablesize, | |
1600 | M_TSLEEP, M_WAITOK | M_ZERO); | |
1601 | sleep_gdinit(mycpu); | |
984263bc | 1602 | } |