Commit | Line | Data |
---|---|---|
8ad65e08 | 1 | /* |
3b998fa9 | 2 | * Copyright (c) 2003-2010 The DragonFly Project. All rights reserved. |
60f60350 | 3 | * |
8c10bfcf MD |
4 | * This code is derived from software contributed to The DragonFly Project |
5 | * by Matthew Dillon <dillon@backplane.com> | |
60f60350 | 6 | * |
8ad65e08 MD |
7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | |
60f60350 | 10 | * |
8ad65e08 MD |
11 | * 1. Redistributions of source code must retain the above copyright |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * 2. Redistributions in binary form must reproduce the above copyright | |
8c10bfcf MD |
14 | * notice, this list of conditions and the following disclaimer in |
15 | * the documentation and/or other materials provided with the | |
16 | * distribution. | |
17 | * 3. Neither the name of The DragonFly Project nor the names of its | |
18 | * contributors may be used to endorse or promote products derived | |
19 | * from this software without specific, prior written permission. | |
60f60350 | 20 | * |
8c10bfcf MD |
21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
8ad65e08 | 32 | * SUCH DAMAGE. |
75cdbe6c MD |
33 | */ |
34 | ||
35 | /* | |
36 | * Each cpu in a system has its own self-contained light weight kernel | |
37 | * thread scheduler, which means that generally speaking we only need | |
38 | * to use a critical section to avoid problems. Foreign thread | |
39 | * scheduling is queued via (async) IPIs. | |
8ad65e08 MD |
40 | */ |
41 | ||
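The comment above captures the central LWKT design decision: each cpu owns its run queue outright, local scheduling needs only a critical section, and work aimed at another cpu is queued as an asynchronous IPI message rather than touched directly. Below is a minimal, single-threaded userland sketch of that split (illustrative only, not DragonFly code; the toy_* names and the fixed-size array standing in for the IPI queue are invented here):

```c
#include <stdio.h>

#define TOY_IPIQ_SIZE 8

struct toy_thread {
    const char        *name;
    struct toy_thread *next;
};

struct toy_cpu {
    int                crit_nest;            /* critical section nesting   */
    struct toy_thread *runq;                 /* touched only by the owner  */
    struct toy_thread *ipiq[TOY_IPIQ_SIZE];  /* deferred foreign requests  */
    int                ipi_count;
};

static void toy_crit_enter(struct toy_cpu *cpu) { ++cpu->crit_nest; }
static void toy_crit_exit(struct toy_cpu *cpu)  { --cpu->crit_nest; }

/* Local scheduling: the owning cpu pushes onto its own run queue while
 * inside a critical section; no locks are involved. */
static void
toy_schedule_local(struct toy_cpu *cpu, struct toy_thread *td)
{
    toy_crit_enter(cpu);
    td->next = cpu->runq;
    cpu->runq = td;
    toy_crit_exit(cpu);
}

/* Foreign scheduling: never touch another cpu's run queue directly;
 * append an asynchronous request that the owner drains later (this is
 * the role the async IPI plays in the kernel). */
static void
toy_ipi_post(struct toy_cpu *target, struct toy_thread *td)
{
    if (target->ipi_count < TOY_IPIQ_SIZE)
        target->ipiq[target->ipi_count++] = td;
}

static void
toy_ipi_drain(struct toy_cpu *cpu)
{
    while (cpu->ipi_count > 0)
        toy_schedule_local(cpu, cpu->ipiq[--cpu->ipi_count]);
}

int
main(void)
{
    struct toy_cpu cpu0 = { 0 }, cpu1 = { 0 };
    struct toy_thread a = { "local", NULL }, b = { "remote", NULL };

    toy_schedule_local(&cpu0, &a);  /* cpu0 schedules its own thread    */
    toy_ipi_post(&cpu1, &b);        /* cpu0 asks cpu1 to schedule b     */
    toy_ipi_drain(&cpu1);           /* cpu1 processes the request later */
    printf("cpu0: %s, cpu1: %s\n", cpu0.runq->name, cpu1.runq->name);
    return 0;
}
```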
42 | #include <sys/param.h> | |
43 | #include <sys/systm.h> | |
44 | #include <sys/kernel.h> | |
45 | #include <sys/proc.h> | |
46 | #include <sys/rtprio.h> | |
b37f18d6 | 47 | #include <sys/kinfo.h> |
8ad65e08 | 48 | #include <sys/queue.h> |
7d0bac62 | 49 | #include <sys/sysctl.h> |
99df837e | 50 | #include <sys/kthread.h> |
f1d1c3fa | 51 | #include <machine/cpu.h> |
99df837e | 52 | #include <sys/lock.h> |
f6bf3af1 | 53 | #include <sys/caps.h> |
9d265729 | 54 | #include <sys/spinlock.h> |
57aa743c | 55 | #include <sys/ktr.h> |
9d265729 MD |
56 | |
57 | #include <sys/thread2.h> | |
58 | #include <sys/spinlock2.h> | |
684a93c4 | 59 | #include <sys/mplock2.h> |
f1d1c3fa | 60 | |
8c72e3d5 AH |
61 | #include <sys/dsched.h> |
62 | ||
7d0bac62 MD |
63 | #include <vm/vm.h> |
64 | #include <vm/vm_param.h> | |
65 | #include <vm/vm_kern.h> | |
66 | #include <vm/vm_object.h> | |
67 | #include <vm/vm_page.h> | |
68 | #include <vm/vm_map.h> | |
69 | #include <vm/vm_pager.h> | |
70 | #include <vm/vm_extern.h> | |
7d0bac62 | 71 | |
99df837e | 72 | #include <machine/stdarg.h> |
96728c05 | 73 | #include <machine/smp.h> |
99df837e | 74 | |
d850923c AE |
75 | #if !defined(KTR_CTXSW) |
76 | #define KTR_CTXSW KTR_ALL | |
77 | #endif | |
78 | KTR_INFO_MASTER(ctxsw); | |
a1f0fb66 AE |
79 | KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p", |
80 | sizeof(int) + sizeof(struct thread *)); | |
81 | KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p", | |
82 | sizeof(int) + sizeof(struct thread *)); | |
83 | KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s", | |
84 | sizeof(struct thread *) + sizeof(char *)); |
85 | KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>", sizeof(struct thread *)); |
1541028a | 86 | |
40aaf5fc NT |
87 | static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads"); |
88 | ||
0f7a3396 MD |
89 | #ifdef INVARIANTS |
90 | static int panic_on_cscount = 0; | |
91 | #endif | |
05220613 MD |
92 | static __int64_t switch_count = 0; |
93 | static __int64_t preempt_hit = 0; | |
94 | static __int64_t preempt_miss = 0; | |
95 | static __int64_t preempt_weird = 0; | |
f64b567c | 96 | static __int64_t token_contention_count __debugvar = 0; |
fb0f29c4 | 97 | static int lwkt_use_spin_port; |
40aaf5fc | 98 | static struct objcache *thread_cache; |
05220613 | 99 | |
88ebb169 | 100 | #ifdef SMP |
e381e77c | 101 | static void lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame); |
88ebb169 | 102 | #endif |
f9235b6d | 103 | static void lwkt_fairq_accumulate(globaldata_t gd, thread_t td); |
e381e77c | 104 | |
0855a2af JG |
105 | extern void cpu_heavy_restore(void); |
106 | extern void cpu_lwkt_restore(void); | |
107 | extern void cpu_kthread_restore(void); | |
108 | extern void cpu_idle_restore(void); | |
109 | ||
fb0f29c4 MD |
110 | /* |
111 | * We can make all thread ports use the spin backend instead of the thread | |
112 | * backend. This should only be set to debug the spin backend. | |
113 | */ | |
114 | TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port); | |
115 | ||
0f7a3396 | 116 | #ifdef INVARIANTS |
0c52fa62 SG |
117 | SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0, |
118 | "Panic if attempting to switch lwkt's while mastering cpusync"); | |
0f7a3396 | 119 | #endif |
0c52fa62 SG |
120 | SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0, |
121 | "Number of switched threads"); | |
9733f757 | 122 | SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0, |
0c52fa62 | 123 | "Successful preemption events"); |
9733f757 | 124 | SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0, |
0c52fa62 SG |
125 | "Failed preemption events"); |
126 | SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0, | |
127 | "Number of preempted threads."); | |
38717797 HP |
128 | #ifdef INVARIANTS |
129 | SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW, | |
130 | &token_contention_count, 0, "spinning due to token contention"); | |
38717797 | 131 | #endif |
f9235b6d | 132 | static int fairq_enable = 1; |
2a418930 MD |
133 | SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW, |
134 | &fairq_enable, 0, "Turn on fairq priority accumulators"); | |
135 | static int lwkt_spin_loops = 10; | |
136 | SYSCTL_INT(_lwkt, OID_AUTO, spin_loops, CTLFLAG_RW, | |
137 | &lwkt_spin_loops, 0, ""); | |
138 | static int lwkt_spin_delay = 1; | |
139 | SYSCTL_INT(_lwkt, OID_AUTO, spin_delay, CTLFLAG_RW, | |
140 | &lwkt_spin_delay, 0, "Scheduler spin delay in microseconds 0=auto"); | |
141 | static int lwkt_spin_method = 1; | |
142 | SYSCTL_INT(_lwkt, OID_AUTO, spin_method, CTLFLAG_RW, | |
143 | &lwkt_spin_method, 0, "LWKT scheduler behavior when contended"); | |
d5b2d319 MD |
144 | static int lwkt_spin_fatal = 0; /* disabled */ |
145 | SYSCTL_INT(_lwkt, OID_AUTO, spin_fatal, CTLFLAG_RW, | |
146 | &lwkt_spin_fatal, 0, "LWKT scheduler spin loops till fatal panic"); | |
fbc024e4 | 147 | static int preempt_enable = 1; |
2a418930 MD |
148 | SYSCTL_INT(_lwkt, OID_AUTO, preempt_enable, CTLFLAG_RW, |
149 | &preempt_enable, 0, "Enable preemption"); | |
fbc024e4 | 150 | |
2a418930 MD |
151 | static __cachealign int lwkt_cseq_rindex; |
152 | static __cachealign int lwkt_cseq_windex; | |
05220613 | 153 | |
4b5f931b MD |
154 | /* |
155 | * These helper procedures handle the runq, they can only be called from | |
156 | * within a critical section. | |
75cdbe6c MD |
157 | * |
158 | * WARNING! Prior to SMP being brought up it is possible to enqueue and | |
159 | * dequeue threads belonging to other cpus, so be sure to use td->td_gd | |
160 | * instead of 'mycpu' when referencing the globaldata structure. Once | |
161 | * SMP is live, enqueuing and dequeuing only occur on the current cpu. |
4b5f931b | 162 | */ |
f1d1c3fa MD |
163 | static __inline |
164 | void | |
165 | _lwkt_dequeue(thread_t td) | |
166 | { | |
167 | if (td->td_flags & TDF_RUNQ) { | |
75cdbe6c | 168 | struct globaldata *gd = td->td_gd; |
4b5f931b | 169 | |
f1d1c3fa | 170 | td->td_flags &= ~TDF_RUNQ; |
f9235b6d MD |
171 | TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq); |
172 | gd->gd_fairq_total_pri -= td->td_pri; | |
173 | if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL) | |
2a418930 | 174 | atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING); |
f1d1c3fa MD |
175 | } |
176 | } | |
177 | ||
f9235b6d MD |
178 | /* |
179 | * Priority enqueue. | |
180 | * | |
181 | * NOTE: There are a limited number of lwkt threads runnable since user | |
182 | * processes only schedule one at a time per cpu. | |
183 | */ | |
f1d1c3fa MD |
184 | static __inline |
185 | void | |
186 | _lwkt_enqueue(thread_t td) | |
187 | { | |
f9235b6d MD |
188 | thread_t xtd; |
189 | ||
7f5d7ed7 | 190 | if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_BLOCKQ)) == 0) { |
75cdbe6c | 191 | struct globaldata *gd = td->td_gd; |
4b5f931b | 192 | |
f1d1c3fa | 193 | td->td_flags |= TDF_RUNQ; |
f9235b6d MD |
194 | xtd = TAILQ_FIRST(&gd->gd_tdrunq); |
195 | if (xtd == NULL) { | |
196 | TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq); | |
2a418930 | 197 | atomic_set_int(&gd->gd_reqflags, RQF_RUNNING); |
f9235b6d MD |
198 | } else { |
199 | while (xtd && xtd->td_pri > td->td_pri) | |
200 | xtd = TAILQ_NEXT(xtd, td_threadq); | |
201 | if (xtd) | |
202 | TAILQ_INSERT_BEFORE(xtd, td, td_threadq); | |
203 | else | |
204 | TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq); | |
205 | } | |
206 | gd->gd_fairq_total_pri += td->td_pri; | |
f1d1c3fa MD |
207 | } |
208 | } | |
8ad65e08 | 209 | |
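For reference, here is a standalone sketch of the priority insertion performed by _lwkt_enqueue() above, assuming a BSD-style <sys/queue.h> (shipped by the BSDs and by glibc). The RQF_RUNNING and gd_fairq_total_pri bookkeeping is omitted and the toy_* names are invented:

```c
#include <sys/queue.h>
#include <stdio.h>

struct toy_td {
    int td_pri;
    TAILQ_ENTRY(toy_td) td_threadq;
};
TAILQ_HEAD(toy_runq, toy_td);

/* Keep the run queue sorted by descending priority: insert the new
 * thread in front of the first entry whose priority is not greater. */
static void
toy_enqueue(struct toy_runq *runq, struct toy_td *td)
{
    struct toy_td *xtd = TAILQ_FIRST(runq);

    if (xtd == NULL) {
        TAILQ_INSERT_TAIL(runq, td, td_threadq);
        return;
    }
    while (xtd && xtd->td_pri > td->td_pri)
        xtd = TAILQ_NEXT(xtd, td_threadq);
    if (xtd)
        TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
    else
        TAILQ_INSERT_TAIL(runq, td, td_threadq);
}

int
main(void)
{
    struct toy_runq runq;
    struct toy_td a = { 10 }, b = { 30 }, c = { 20 }, *scan;

    TAILQ_INIT(&runq);
    toy_enqueue(&runq, &a);
    toy_enqueue(&runq, &b);
    toy_enqueue(&runq, &c);
    TAILQ_FOREACH(scan, &runq, td_threadq)
        printf("%d ", scan->td_pri);    /* prints: 30 20 10 */
    printf("\n");
    return 0;
}
```

Note that the strict '>' comparison places a newly enqueued thread ahead of threads already queued at the same priority.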
40aaf5fc NT |
210 | static __boolean_t |
211 | _lwkt_thread_ctor(void *obj, void *privdata, int ocflags) | |
212 | { | |
213 | struct thread *td = (struct thread *)obj; | |
214 | ||
215 | td->td_kstack = NULL; | |
216 | td->td_kstack_size = 0; | |
217 | td->td_flags = TDF_ALLOCATED_THREAD; | |
218 | return (1); | |
219 | } | |
220 | ||
221 | static void | |
222 | _lwkt_thread_dtor(void *obj, void *privdata) | |
223 | { | |
224 | struct thread *td = (struct thread *)obj; | |
225 | ||
226 | KASSERT(td->td_flags & TDF_ALLOCATED_THREAD, | |
227 | ("_lwkt_thread_dtor: not allocated from objcache")); | |
228 | KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack && | |
229 | td->td_kstack_size > 0, | |
230 | ("_lwkt_thread_dtor: corrupted stack")); | |
231 | kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size); | |
232 | } | |
233 | ||
234 | /* | |
235 | * Initialize the lwkt subsystem. |
236 | */ | |
237 | void | |
238 | lwkt_init(void) | |
239 | { | |
240 | /* An objcache has 2 magazines per CPU so divide cache size by 2. */ | |
0aa16b5d SZ |
241 | thread_cache = objcache_create_mbacked(M_THREAD, sizeof(struct thread), |
242 | NULL, CACHE_NTHREADS/2, | |
243 | _lwkt_thread_ctor, _lwkt_thread_dtor, NULL); | |
40aaf5fc NT |
244 | } |
245 | ||
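The ctor/dtor pair above exists because an objcache constructs an object once, when backing memory first becomes a thread, and destroys it only when the cache finally releases that memory; objects recycled in between skip both steps. A minimal userland analogue of that lifecycle (illustrative only; the toy_cache API is invented and the real objcache is per-cpu and far more elaborate):

```c
#include <stdlib.h>
#include <stdio.h>

struct toy_obj { int constructed; struct toy_obj *next; };

struct toy_cache {
    struct toy_obj *freelist;
    int  (*ctor)(struct toy_obj *);
    void (*dtor)(struct toy_obj *);
};

static struct toy_obj *
toy_cache_get(struct toy_cache *oc)
{
    struct toy_obj *obj = oc->freelist;

    if (obj) {                          /* recycled: ctor already ran */
        oc->freelist = obj->next;
        return obj;
    }
    obj = calloc(1, sizeof(*obj));
    oc->ctor(obj);                      /* first use: construct once  */
    return obj;
}

static void
toy_cache_put(struct toy_cache *oc, struct toy_obj *obj)
{
    obj->next = oc->freelist;           /* stays constructed for reuse */
    oc->freelist = obj;
}

static int  toy_ctor(struct toy_obj *o) { o->constructed = 1; return 1; }
static void toy_dtor(struct toy_obj *o) { free(o); }

int
main(void)
{
    struct toy_cache oc = { NULL, toy_ctor, toy_dtor };
    struct toy_obj *o = toy_cache_get(&oc);

    toy_cache_put(&oc, o);
    o = toy_cache_get(&oc);             /* comes back still constructed */
    printf("constructed=%d\n", o->constructed);
    oc.dtor(o);                         /* a cache shrink would call dtor */
    return 0;
}
```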
37af14fe MD |
246 | /* |
247 | * Schedule a thread to run. As the current thread we can always safely | |
248 | * schedule ourselves, and a shortcut procedure is provided for that | |
249 | * function. | |
250 | * | |
251 | * (non-blocking, self contained on a per cpu basis) | |
252 | */ | |
253 | void | |
254 | lwkt_schedule_self(thread_t td) | |
255 | { | |
cfaeae2a | 256 | KKASSERT((td->td_flags & TDF_MIGRATING) == 0); |
37af14fe | 257 | crit_enter_quick(td); |
f9235b6d MD |
258 | KASSERT(td != &td->td_gd->gd_idlethread, |
259 | ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!")); | |
9388413d | 260 | KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); |
37af14fe | 261 | _lwkt_enqueue(td); |
37af14fe MD |
262 | crit_exit_quick(td); |
263 | } | |
264 | ||
265 | /* | |
266 | * Deschedule a thread. | |
267 | * | |
268 | * (non-blocking, self contained on a per cpu basis) | |
269 | */ | |
270 | void | |
271 | lwkt_deschedule_self(thread_t td) | |
272 | { | |
273 | crit_enter_quick(td); | |
37af14fe MD |
274 | _lwkt_dequeue(td); |
275 | crit_exit_quick(td); | |
276 | } | |
277 | ||
8ad65e08 MD |
278 | /* |
279 | * LWKTs operate on a per-cpu basis | |
280 | * | |
73e4f7b9 | 281 | * WARNING! Called from early boot, 'mycpu' may not work yet. |
8ad65e08 MD |
282 | */ |
283 | void | |
284 | lwkt_gdinit(struct globaldata *gd) | |
285 | { | |
f9235b6d | 286 | TAILQ_INIT(&gd->gd_tdrunq); |
73e4f7b9 | 287 | TAILQ_INIT(&gd->gd_tdallq); |
8ad65e08 MD |
288 | } |
289 | ||
7d0bac62 MD |
290 | /* |
291 | * Create a new thread. The thread must be associated with a process context | |
75cdbe6c MD |
292 | * or LWKT start address before it can be scheduled. If the target cpu is |
293 | * -1 the thread will be created on the current cpu. | |
0cfcada1 MD |
294 | * |
295 | * If you intend to create a thread without a process context this function | |
296 | * does everything except load the startup and switcher function. | |
7d0bac62 MD |
297 | */ |
298 | thread_t | |
d3d32139 | 299 | lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags) |
7d0bac62 | 300 | { |
c070746a | 301 | globaldata_t gd = mycpu; |
99df837e | 302 | void *stack; |
7d0bac62 | 303 | |
c070746a MD |
304 | /* |
305 | * If static thread storage is not supplied allocate a thread. Reuse | |
306 | * a cached free thread if possible. gd_freetd is used to keep an exiting | |
307 | * thread intact through the exit. | |
308 | */ | |
ef0fdad1 | 309 | if (td == NULL) { |
cf709dd2 MD |
310 | crit_enter_gd(gd); |
311 | if ((td = gd->gd_freetd) != NULL) { | |
312 | KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK| | |
313 | TDF_RUNQ)) == 0); | |
c070746a | 314 | gd->gd_freetd = NULL; |
cf709dd2 | 315 | } else { |
c070746a | 316 | td = objcache_get(thread_cache, M_WAITOK); |
cf709dd2 MD |
317 | KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK| |
318 | TDF_RUNQ)) == 0); | |
319 | } | |
320 | crit_exit_gd(gd); | |
40aaf5fc NT |
321 | KASSERT((td->td_flags & |
322 | (TDF_ALLOCATED_THREAD|TDF_RUNNING)) == TDF_ALLOCATED_THREAD, | |
323 | ("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags)); | |
324 | flags |= td->td_flags & (TDF_ALLOCATED_THREAD|TDF_ALLOCATED_STACK); | |
ef0fdad1 | 325 | } |
c070746a MD |
326 | |
327 | /* | |
328 | * Try to reuse cached stack. | |
329 | */ | |
f470d0c8 MD |
330 | if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) { |
331 | if (flags & TDF_ALLOCATED_STACK) { | |
e4846942 | 332 | kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size); |
f470d0c8 MD |
333 | stack = NULL; |
334 | } | |
335 | } | |
336 | if (stack == NULL) { | |
e40cfbd7 | 337 | stack = (void *)kmem_alloc_stack(&kernel_map, stksize); |
ef0fdad1 | 338 | flags |= TDF_ALLOCATED_STACK; |
99df837e | 339 | } |
75cdbe6c | 340 | if (cpu < 0) |
c070746a | 341 | lwkt_init_thread(td, stack, stksize, flags, gd); |
75cdbe6c | 342 | else |
f470d0c8 | 343 | lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu)); |
99df837e | 344 | return(td); |
7d0bac62 MD |
345 | } |
346 | ||
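A compact userland sketch of the reuse policy implemented above: keep at most one recently freed descriptor cached per cpu (gd_freetd in the kernel) and reuse a cached stack only when its size matches the request (illustrative only; toy_* names invented, error handling omitted):

```c
#include <stdio.h>
#include <stdlib.h>

struct toy_td {
    void  *kstack;
    size_t kstack_size;
};

struct toy_cpu {
    struct toy_td *freetd;              /* at most one cached, freed td */
};

static struct toy_td *
toy_alloc_td(struct toy_cpu *cpu, size_t stksize)
{
    struct toy_td *td;

    if ((td = cpu->freetd) != NULL)     /* fast path: reuse cached td */
        cpu->freetd = NULL;
    else
        td = calloc(1, sizeof(*td));

    /* Reuse the cached stack only if the size matches, else replace it. */
    if (td->kstack && td->kstack_size != stksize) {
        free(td->kstack);
        td->kstack = NULL;
    }
    if (td->kstack == NULL) {
        td->kstack = malloc(stksize);
        td->kstack_size = stksize;
    }
    return td;
}

static void
toy_free_td(struct toy_cpu *cpu, struct toy_td *td)
{
    if (cpu->freetd == NULL) {          /* cache one td per cpu */
        cpu->freetd = td;
    } else {
        free(td->kstack);
        free(td);
    }
}

int
main(void)
{
    struct toy_cpu cpu = { NULL };
    struct toy_td *td = toy_alloc_td(&cpu, 16384);

    toy_free_td(&cpu, td);
    td = toy_alloc_td(&cpu, 16384);     /* reuses both td and its stack */
    printf("stack %p size %zu\n", td->kstack, td->kstack_size);
    toy_free_td(&cpu, td);
    return 0;
}
```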
347 | /* | |
348 | * Initialize a preexisting thread structure. This function is used by | |
349 | * lwkt_alloc_thread() and also used to initialize the per-cpu idlethread. | |
350 | * | |
f8c3996b MD |
351 | * All threads start out in a critical section at a priority of |
352 | * TDPRI_KERN_DAEMON. Higher level code will modify the priority as | |
75cdbe6c MD |
353 | * appropriate. This function may send an IPI message when the |
354 | * requested cpu is not the current cpu and consequently gd_tdallq may | |
355 | * not be initialized synchronously from the point of view of the originating | |
356 | * cpu. | |
357 | * | |
358 | * NOTE! We have to be careful with regard to creating threads for other cpus |
359 | * if SMP has not yet been activated. | |
7d0bac62 | 360 | */ |
41a01a4d MD |
361 | #ifdef SMP |
362 | ||
75cdbe6c MD |
363 | static void |
364 | lwkt_init_thread_remote(void *arg) | |
365 | { | |
366 | thread_t td = arg; | |
367 | ||
52eedfb5 MD |
368 | /* |
369 | * Protected by critical section held by IPI dispatch | |
370 | */ | |
75cdbe6c MD |
371 | TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq); |
372 | } | |
373 | ||
41a01a4d MD |
374 | #endif |
375 | ||
fdce8919 MD |
376 | /* |
377 | * lwkt core thread structural initialization. | |
378 | * | |
379 | * NOTE: All threads are initialized as mpsafe threads. | |
380 | */ | |
7d0bac62 | 381 | void |
f470d0c8 MD |
382 | lwkt_init_thread(thread_t td, void *stack, int stksize, int flags, |
383 | struct globaldata *gd) | |
7d0bac62 | 384 | { |
37af14fe MD |
385 | globaldata_t mygd = mycpu; |
386 | ||
99df837e MD |
387 | bzero(td, sizeof(struct thread)); |
388 | td->td_kstack = stack; | |
f470d0c8 | 389 | td->td_kstack_size = stksize; |
d3d32139 | 390 | td->td_flags = flags; |
26a0694b | 391 | td->td_gd = gd; |
f9235b6d MD |
392 | td->td_pri = TDPRI_KERN_DAEMON; |
393 | td->td_critcount = 1; | |
3b998fa9 | 394 | td->td_toks_stop = &td->td_toks_base; |
fb0f29c4 MD |
395 | if (lwkt_use_spin_port) |
396 | lwkt_initport_spin(&td->td_msgport); | |
397 | else | |
398 | lwkt_initport_thread(&td->td_msgport, td); | |
99df837e | 399 | pmap_init_thread(td); |
0f7a3396 | 400 | #ifdef SMP |
5d21b981 MD |
401 | /* |
402 | * Normally initializing a thread for a remote cpu requires sending an | |
403 | * IPI. However, the idlethread is setup before the other cpus are | |
404 | * activated so we have to treat it as a special case. XXX manipulation | |
405 | * of gd_tdallq requires the BGL. | |
406 | */ | |
407 | if (gd == mygd || td == &gd->gd_idlethread) { | |
37af14fe | 408 | crit_enter_gd(mygd); |
75cdbe6c | 409 | TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq); |
37af14fe | 410 | crit_exit_gd(mygd); |
75cdbe6c | 411 | } else { |
2db3b277 | 412 | lwkt_send_ipiq(gd, lwkt_init_thread_remote, td); |
75cdbe6c | 413 | } |
0f7a3396 | 414 | #else |
37af14fe | 415 | crit_enter_gd(mygd); |
0f7a3396 | 416 | TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq); |
37af14fe | 417 | crit_exit_gd(mygd); |
0f7a3396 | 418 | #endif |
8c72e3d5 AH |
419 | |
420 | dsched_new_thread(td); | |
73e4f7b9 MD |
421 | } |
422 | ||
423 | void | |
424 | lwkt_set_comm(thread_t td, const char *ctl, ...) | |
425 | { | |
e2565a42 | 426 | __va_list va; |
73e4f7b9 | 427 | |
e2565a42 | 428 | __va_start(va, ctl); |
379210cb | 429 | kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va); |
e2565a42 | 430 | __va_end(va); |
e7c0dbba | 431 | KTR_LOG(ctxsw_newtd, td, &td->td_comm[0]); |
7d0bac62 MD |
432 | } |
433 | ||
99df837e | 434 | void |
73e4f7b9 | 435 | lwkt_hold(thread_t td) |
99df837e | 436 | { |
74c9628e | 437 | atomic_add_int(&td->td_refs, 1); |
73e4f7b9 MD |
438 | } |
439 | ||
440 | void | |
441 | lwkt_rele(thread_t td) | |
442 | { | |
443 | KKASSERT(td->td_refs > 0); | |
74c9628e | 444 | atomic_add_int(&td->td_refs, -1); |
73e4f7b9 MD |
445 | } |
446 | ||
447 | void | |
448 | lwkt_wait_free(thread_t td) | |
449 | { | |
450 | while (td->td_refs) | |
377d4740 | 451 | tsleep(td, 0, "tdreap", hz); |
73e4f7b9 MD |
452 | } |
453 | ||
454 | void | |
455 | lwkt_free_thread(thread_t td) | |
456 | { | |
74c9628e | 457 | KKASSERT(td->td_refs == 0); |
cf709dd2 | 458 | KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK|TDF_RUNQ)) == 0); |
40aaf5fc NT |
459 | if (td->td_flags & TDF_ALLOCATED_THREAD) { |
460 | objcache_put(thread_cache, td); | |
461 | } else if (td->td_flags & TDF_ALLOCATED_STACK) { | |
462 | /* client-allocated struct with internally allocated stack */ | |
463 | KASSERT(td->td_kstack && td->td_kstack_size > 0, | |
464 | ("lwkt_free_thread: corrupted stack")); | |
465 | kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size); | |
466 | td->td_kstack = NULL; | |
467 | td->td_kstack_size = 0; | |
99df837e | 468 | } |
e7c0dbba | 469 | KTR_LOG(ctxsw_deadtd, td); |
99df837e MD |
470 | } |
471 | ||
472 | ||
8ad65e08 MD |
473 | /* |
474 | * Switch to the next runnable lwkt. If no LWKTs are runnable then | |
f1d1c3fa MD |
475 | * switch to the idlethread. Switching must occur within a critical |
476 | * section to avoid races with the scheduling queue. | |
477 | * | |
478 | * We always have full control over our cpu's run queue. Other cpus | |
479 | * that wish to manipulate our queue must use the cpu_*msg() calls to | |
480 | * talk to our cpu, so a critical section is all that is needed and | |
481 | * the result is very, very fast thread switching. | |
482 | * | |
96728c05 MD |
483 | * The LWKT scheduler uses a fixed priority model and round-robins at |
484 | * each priority level. User process scheduling is a totally | |
485 | * different beast and LWKT priorities should not be confused with | |
486 | * user process priorities. | |
f1d1c3fa | 487 | * |
69d78e99 MD |
488 | * PREEMPTION NOTE: Preemption occurs via lwkt_preempt(). lwkt_switch() |
489 | * is not called by the current thread in the preemption case, only when | |
490 | * the preempting thread blocks (in order to return to the original thread). | |
cfaeae2a MD |
491 | * |
492 | * SPECIAL NOTE ON SWITCH ATOMICY: Certain operations such as thread | |
493 | * migration and tsleep deschedule the current lwkt thread and call | |
494 | * lwkt_switch(). In particular, the target cpu of the migration fully | |
495 | * expects the thread to become non-runnable and can deadlock against | |
496 | * cpusync operations if we run any IPIs prior to switching the thread out. | |
497 | * | |
498 | * WE MUST BE VERY CAREFUL NOT TO RUN SPLZ DIRECTLY OR INDIRECTLY IF | |
95858b91 | 499 | * THE CURRENT THREAD HAS BEEN DESCHEDULED! |
8ad65e08 MD |
500 | */ |
501 | void | |
502 | lwkt_switch(void) | |
503 | { | |
37af14fe MD |
504 | globaldata_t gd = mycpu; |
505 | thread_t td = gd->gd_curthread; | |
8ad65e08 | 506 | thread_t ntd; |
f9235b6d | 507 | thread_t xtd; |
2a418930 MD |
508 | int spinning = lwkt_spin_loops; /* loops before HLTing */ |
509 | int reqflags; | |
510 | int cseq; | |
0f0466c0 | 511 | int oseq; |
d5b2d319 | 512 | int fatal_count; |
8ad65e08 | 513 | |
46a3f46d | 514 | /* |
27e88a6e MD |
515 | * Switching from within a 'fast' (non-thread-switched) interrupt or IPI |
516 | * is illegal. However, we may have to do it anyway if we hit a fatal |
517 | * kernel trap or we have panicked. |
518 | * | |
519 | * If this case occurs save and restore the interrupt nesting level. | |
46a3f46d | 520 | */ |
27e88a6e MD |
521 | if (gd->gd_intr_nesting_level) { |
522 | int savegdnest; | |
523 | int savegdtrap; | |
524 | ||
5fddbda2 | 525 | if (gd->gd_trap_nesting_level == 0 && panic_cpu_gd != mycpu) { |
4a28fe22 MD |
526 | panic("lwkt_switch: Attempt to switch from a " |
527 | "a fast interrupt, ipi, or hard code section, " | |
528 | "td %p\n", | |
529 | td); | |
27e88a6e MD |
530 | } else { |
531 | savegdnest = gd->gd_intr_nesting_level; | |
532 | savegdtrap = gd->gd_trap_nesting_level; | |
533 | gd->gd_intr_nesting_level = 0; | |
534 | gd->gd_trap_nesting_level = 0; | |
a7422615 MD |
535 | if ((td->td_flags & TDF_PANICWARN) == 0) { |
536 | td->td_flags |= TDF_PANICWARN; | |
4a28fe22 MD |
537 | kprintf("Warning: thread switch from interrupt, IPI, " |
538 | "or hard code section.\n" | |
a7422615 | 539 | "thread %p (%s)\n", td, td->td_comm); |
7ce2998e | 540 | print_backtrace(-1); |
a7422615 | 541 | } |
27e88a6e MD |
542 | lwkt_switch(); |
543 | gd->gd_intr_nesting_level = savegdnest; | |
544 | gd->gd_trap_nesting_level = savegdtrap; | |
545 | return; | |
546 | } | |
96728c05 | 547 | } |
ef0fdad1 | 548 | |
cb973d15 MD |
549 | /* |
550 | * Passive release (used to transition from user to kernel mode | |
551 | * when we block or switch rather than when we enter the kernel). |
552 | * This function is NOT called if we are switching into a preemption | |
553 | * or returning from a preemption. Typically this causes us to lose | |
0a3f9b47 MD |
554 | * our current process designation (if we have one) and become a true |
555 | * LWKT thread, and may also hand the current process designation to | |
556 | * another process and schedule its thread. |
cb973d15 MD |
557 | */ |
558 | if (td->td_release) | |
559 | td->td_release(td); | |
560 | ||
37af14fe | 561 | crit_enter_gd(gd); |
3b998fa9 | 562 | if (TD_TOKS_HELD(td)) |
9d265729 MD |
563 | lwkt_relalltokens(td); |
564 | ||
565 | /* | |
b02926de MD |
566 | * We had better not be holding any spin locks, but don't get into an |
567 | * endless panic loop. | |
9d265729 | 568 | */ |
d666840a MD |
569 | KASSERT(gd->gd_spinlocks_wr == 0 || panicstr != NULL, |
570 | ("lwkt_switch: still holding %d exclusive spinlocks!", | |
571 | gd->gd_spinlocks_wr)); | |
9d265729 | 572 | |
8a8d5d85 MD |
573 | |
574 | #ifdef SMP | |
0f7a3396 MD |
575 | #ifdef INVARIANTS |
576 | if (td->td_cscount) { | |
6ea70f76 | 577 | kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n", |
0f7a3396 MD |
578 | td); |
579 | if (panic_on_cscount) | |
580 | panic("switching while mastering cpusync"); | |
581 | } | |
582 | #endif | |
8a8d5d85 | 583 | #endif |
f9235b6d MD |
584 | |
585 | /* | |
586 | * If we had preempted another thread on this cpu, resume the preempted | |
587 | * thread. This occurs transparently, whether the preempted thread | |
588 | * was scheduled or not (it may have been preempted after descheduling | |
589 | * itself). | |
590 | * | |
591 | * We have to setup the MP lock for the original thread after backing | |
592 | * out the adjustment that was made to curthread when the original | |
593 | * was preempted. | |
594 | */ | |
99df837e | 595 | if ((ntd = td->td_preempted) != NULL) { |
26a0694b MD |
596 | KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK); |
597 | ntd->td_flags |= TDF_PREEMPT_DONE; | |
8ec60c3f MD |
598 | |
599 | /* | |
b9eb1c19 MD |
600 | * The interrupt may have woken a thread up, we need to properly |
601 | * set the reschedule flag if the originally interrupted thread is | |
602 | * at a lower priority. | |
8ec60c3f | 603 | */ |
f9235b6d MD |
604 | if (TAILQ_FIRST(&gd->gd_tdrunq) && |
605 | TAILQ_FIRST(&gd->gd_tdrunq)->td_pri > ntd->td_pri) { | |
8ec60c3f | 606 | need_lwkt_resched(); |
f9235b6d | 607 | } |
8a8d5d85 | 608 | /* YYY release mp lock on switchback if original doesn't need it */ |
f9235b6d MD |
609 | goto havethread_preempted; |
610 | } | |
611 | ||
612 | /* | |
613 | * Implement round-robin fairq with priority insertion. The priority | |
614 | * insertion is handled by _lwkt_enqueue() | |
615 | * | |
f9235b6d | 616 | * If we cannot obtain ownership of the tokens we cannot immediately |
cfaeae2a MD |
617 | * schedule the target thread. |
618 | * | |
619 | * Reminder: Again, we cannot afford to run any IPIs in this path if | |
620 | * the current thread has been descheduled. | |
f9235b6d MD |
621 | */ |
622 | for (;;) { | |
2a418930 MD |
623 | /* |
624 | * Clear RQF_AST_LWKT_RESCHED (we handle the reschedule request) | |
625 | * and set RQF_WAKEUP (prevent unnecessary IPIs from being | |
626 | * received). | |
627 | */ | |
628 | for (;;) { | |
629 | reqflags = gd->gd_reqflags; | |
630 | if (atomic_cmpset_int(&gd->gd_reqflags, reqflags, | |
631 | (reqflags & ~RQF_AST_LWKT_RESCHED) | | |
632 | RQF_WAKEUP)) { | |
633 | break; | |
634 | } | |
635 | } | |
f9235b6d | 636 | |
4b5f931b | 637 | /* |
2a418930 MD |
638 | * Hotpath - pull the head of the run queue and attempt to schedule |
639 | * it. Fairq exhaustion moves the task to the end of the list. If | |
640 | * no threads are runnable we switch to the idle thread. | |
41a01a4d | 641 | */ |
2a418930 MD |
642 | for (;;) { |
643 | ntd = TAILQ_FIRST(&gd->gd_tdrunq); | |
644 | ||
645 | if (ntd == NULL) { | |
646 | /* | |
647 | * Runq is empty, switch to idle and clear RQF_WAKEUP | |
648 | * to allow it to halt. | |
649 | */ | |
650 | ntd = &gd->gd_idlethread; | |
6f207a2c | 651 | #ifdef SMP |
2a418930 | 652 | if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) |
b5d16701 | 653 | ASSERT_NO_TOKENS_HELD(ntd); |
6f207a2c | 654 | #endif |
2a418930 MD |
655 | cpu_time.cp_msg[0] = 0; |
656 | cpu_time.cp_stallpc = 0; | |
657 | atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP); | |
658 | goto haveidle; | |
659 | } | |
660 | ||
661 | if (ntd->td_fairq_accum >= 0) | |
662 | break; | |
663 | ||
cfaeae2a | 664 | /*splz_check(); cannot do this here, see above */ |
2a418930 MD |
665 | lwkt_fairq_accumulate(gd, ntd); |
666 | TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq); | |
667 | TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq); | |
f9235b6d | 668 | } |
41a01a4d | 669 | |
8ec60c3f | 670 | /* |
2a418930 MD |
671 | * Hotpath - schedule ntd. Leaves RQF_WAKEUP set to prevent |
672 | * unwanted decontention IPIs. | |
6f207a2c MD |
673 | * |
674 | * NOTE: For UP there is no mplock and lwkt_getalltokens() | |
675 | * always succeeds. | |
8ec60c3f | 676 | */ |
2a418930 | 677 | if (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd)) |
f9235b6d | 678 | goto havethread; |
f9235b6d | 679 | |
f9235b6d | 680 | /* |
2a418930 MD |
681 | * Coldpath (SMP only since tokens always succeed on UP) |
682 | * | |
683 | * We had some contention on the thread we wanted to schedule. | |
684 | * What we do now is try to find a thread that we can schedule | |
685 | * in its stead until decontention reschedules on our cpu. | |
686 | * | |
687 | * The coldpath scan does NOT rearrange threads in the run list | |
688 | * and it also ignores the accumulator. | |
689 | * | |
690 | * We do not immediately schedule a user priority thread, instead | |
691 | * we record it in xtd and continue looking for kernel threads. | |
692 | * A cpu can only have one user priority thread (normally) so just | |
693 | * record the first one. | |
694 | * | |
695 | * NOTE: This scan will also include threads whose fairqs were |
696 | * accumulated in the first loop. | |
f9235b6d | 697 | */ |
2a418930 MD |
698 | ++token_contention_count; |
699 | xtd = NULL; | |
700 | while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) { | |
41a01a4d | 701 | /* |
2a418930 MD |
702 | * Try to switch to this thread. If the thread is running at |
703 | * user priority we clear WAKEUP to allow decontention IPIs | |
704 | * (since this thread is simply running until the one we wanted | |
705 | * decontends), and we make sure that LWKT_RESCHED is not set. | |
df6b8ba0 | 706 | * |
2a418930 MD |
707 | * Otherwise for kernel threads we leave WAKEUP set to avoid |
708 | * unnecessary decontention IPIs. | |
41a01a4d | 709 | */ |
2a418930 MD |
710 | if (ntd->td_pri < TDPRI_KERN_LPSCHED) { |
711 | if (xtd == NULL) | |
712 | xtd = ntd; | |
713 | continue; | |
f9235b6d | 714 | } |
a453459d | 715 | |
f9235b6d | 716 | /* |
2a418930 MD |
717 | * Do not let the fairq get too negative. Even though we are |
718 | * ignoring it atm once the scheduler decontends a very negative | |
719 | * thread will get moved to the end of the queue. | |
f9235b6d | 720 | */ |
2a418930 MD |
721 | if (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd)) { |
722 | if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd)) | |
723 | ntd->td_fairq_accum = -TDFAIRQ_MAX(gd); | |
724 | goto havethread; | |
8a8d5d85 | 725 | } |
f9235b6d | 726 | |
df6b8ba0 | 727 | /* |
2a418930 | 728 | * Well fubar, this thread is contended as well, loop |
df6b8ba0 | 729 | */ |
2a418930 MD |
730 | /* */ |
731 | } | |
732 | ||
733 | /* | |
734 | * We exhausted the run list but we may have recorded a user | |
735 | * thread to try. We have three choices based on | |
736 | * lwkt.decontention_method. | |
737 | * | |
738 | * (0) Atomically clear RQF_WAKEUP in order to receive decontention | |
739 | * IPIs (to interrupt the user process) and test | |
740 | * RQF_AST_LWKT_RESCHED at the same time. | |
741 | * | |
742 | * This results in significant decontention IPI traffic but may | |
743 | * be more responsive. | |
744 | * | |
745 | * (1) Leave RQF_WAKEUP set so we do not receive a decontention IPI. | |
746 | * An automatic LWKT reschedule will occur on the next hardclock | |
747 | * (typically 100hz). | |
748 | * | |
749 | * This results in no decontention IPI traffic but may be less | |
750 | * responsive. This is the default. | |
751 | * | |
752 | * (2) Refuse to schedule the user process at this time. | |
753 | * | |
754 | * This is highly experimental and should not be used under | |
755 | * normal circumstances. This can cause a user process to | |
756 | * get starved out in situations where kernel threads are | |
757 | * fighting each other for tokens. | |
758 | */ | |
759 | if (xtd) { | |
760 | ntd = xtd; | |
761 | ||
762 | switch(lwkt_spin_method) { | |
763 | case 0: | |
764 | for (;;) { | |
765 | reqflags = gd->gd_reqflags; | |
766 | if (atomic_cmpset_int(&gd->gd_reqflags, | |
767 | reqflags, | |
768 | reqflags & ~RQF_WAKEUP)) { | |
769 | break; | |
770 | } | |
771 | } | |
772 | break; | |
773 | case 1: | |
774 | reqflags = gd->gd_reqflags; | |
775 | break; | |
776 | default: | |
777 | goto skip; | |
778 | break; | |
779 | } | |
780 | if ((reqflags & RQF_AST_LWKT_RESCHED) == 0 && | |
b5d16701 | 781 | (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd)) |
f9235b6d | 782 | ) { |
2a418930 MD |
783 | if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd)) |
784 | ntd->td_fairq_accum = -TDFAIRQ_MAX(gd); | |
785 | goto havethread; | |
df6b8ba0 | 786 | } |
9ac1ee6e | 787 | |
2a418930 | 788 | skip: |
9ac1ee6e | 789 | /* |
2a418930 MD |
790 | * Make sure RQF_WAKEUP is set if we failed to schedule the |
791 | * user thread to prevent the idle thread from halting. | |
9ac1ee6e | 792 | */ |
2a418930 MD |
793 | atomic_set_int(&gd->gd_reqflags, RQF_WAKEUP); |
794 | } | |
795 | ||
796 | /* | |
797 | * We exhausted the run list, meaning that all runnable threads | |
798 | * are contended. | |
799 | */ | |
800 | cpu_pause(); | |
801 | ntd = &gd->gd_idlethread; | |
802 | #ifdef SMP | |
803 | if (gd->gd_trap_nesting_level == 0 && panicstr == NULL) | |
804 | ASSERT_NO_TOKENS_HELD(ntd); | |
805 | /* contention case, do not clear contention mask */ | |
806 | #endif | |
807 | ||
808 | /* | |
809 | * Ok, we might want to spin a few times as some tokens are held for | |
810 | * very short periods of time and IPI overhead is 1uS or worse | |
811 | * (meaning it is usually better to spin). Regardless we have to | |
812 | * call splz_check() to be sure to service any interrupts blocked | |
813 | * by our critical section, otherwise we could livelock e.g. IPIs. | |
814 | * | |
815 | * The IPI mechanic is really a last resort. In nearly all other | |
816 | * cases RQF_WAKEUP is left set to prevent decontention IPIs. | |
817 | * | |
818 | * When we decide not to spin we clear RQF_WAKEUP and switch to | |
819 | * the idle thread. Clearing RQF_WAKEUP allows the idle thread |
820 | * to halt and decontended tokens will issue an IPI to us. The | |
821 | * idle thread will check for pending reschedules already set | |
822 | * (RQF_AST_LWKT_RESCHED) before actually halting so we don't have | |
823 | * to here. | |
cfaeae2a MD |
824 | * |
825 | * Also, if TDF_RUNQ is not set the current thread is trying to | |
826 | * deschedule, possibly in an atomic fashion. We cannot afford to | |
827 | * stay here. | |
2a418930 | 828 | */ |
cfaeae2a | 829 | if (spinning <= 0 || (td->td_flags & TDF_RUNQ) == 0) { |
2a418930 MD |
830 | atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP); |
831 | goto haveidle; | |
4b5f931b | 832 | } |
2a418930 | 833 | --spinning; |
c5724852 MD |
834 | |
835 | /* | |
2a418930 MD |
836 | * When spinning a delay is required both to avoid livelocks from |
837 | * token order reversals (a thread may be trying to acquire multiple | |
838 | * tokens), and also to reduce cpu cache management traffic. | |
839 | * | |
840 | * In order to scale to a large number of CPUs we use a time slot | |
841 | * resequencer to force contending cpus into non-contending | |
842 | * time-slots. The scheduler may still contend with the lock holder | |
843 | * but will not (generally) contend with all the other cpus trying | |
844 | * to get the same token. |
845 | * | |
846 | * The resequencer uses a FIFO counter mechanic. The owner of the | |
847 | * rindex at the head of the FIFO is allowed to pull itself off | |
848 | * the FIFO and fetchadd is used to enter into the FIFO. This bit | |
849 | * of code is VERY cache friendly and forces all spinning schedulers | |
850 | * into their own time slots. | |
c5724852 | 851 | * |
2a418930 MD |
852 | * This code has been tested to 48-cpus and caps the cache |
853 | * contention load at ~1uS intervals regardless of the number of | |
854 | * cpus. Scaling beyond 64 cpus might require additional smarts | |
855 | * (such as separate FIFOs for specific token cases). | |
856 | * | |
857 | * WARNING! We can't call splz_check() or anything else here as | |
858 | * it could cause a deadlock. | |
c5724852 | 859 | */ |
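As an aside, here is a userland model of the FIFO time-slot resequencer just described, using C11 atomics and POSIX threads (compile with -pthread). It is illustrative only; the reseq_* names, slot delay, and thread count are invented, and the kernel version additionally clamps the per-slot delay (lwkt.spin_delay) and can wait with MONITOR/MWAIT. Each contender takes a ticket with a fetch-add on the write index and spins until the read index reaches its ticket, so contenders pass through the contended region one at a time, in FIFO order:

```c
#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>

static atomic_int reseq_windex;     /* tickets handed out (fetchadd)   */
static atomic_int reseq_rindex;     /* ticket currently allowed to run */

static void
reseq_enter(void)
{
    int cseq = atomic_fetch_add(&reseq_windex, 1);

    while (atomic_load(&reseq_rindex) != cseq)
        ;                           /* kernel uses MWAIT or DELAY here */
}

static void
reseq_exit(void)
{
    usleep(1);                      /* stand-in for the ~1uS slot delay */
    atomic_fetch_add(&reseq_rindex, 1);
}

static void *
contender(void *arg)
{
    reseq_enter();
    printf("cpu %d gets its time slot\n", *(int *)arg);
    reseq_exit();
    return NULL;
}

int
main(void)
{
    pthread_t tid[4];
    int id[4];

    for (int i = 0; i < 4; ++i) {
        id[i] = i;
        pthread_create(&tid[i], NULL, contender, &id[i]);
    }
    for (int i = 0; i < 4; ++i)
        pthread_join(tid[i], NULL);
    return 0;
}
```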
bd52bedf | 860 | #if defined(INVARIANTS) && defined(__amd64__) |
d5b2d319 MD |
861 | if ((read_rflags() & PSL_I) == 0) { |
862 | cpu_enable_intr(); | |
863 | panic("lwkt_switch() called with interrupts disabled"); | |
864 | } | |
865 | #endif | |
2a418930 | 866 | cseq = atomic_fetchadd_int(&lwkt_cseq_windex, 1); |
d5b2d319 | 867 | fatal_count = lwkt_spin_fatal; |
0f0466c0 MD |
868 | while ((oseq = lwkt_cseq_rindex) != cseq) { |
869 | cpu_ccfence(); | |
8b402283 | 870 | #if !defined(_KERNEL_VIRTUAL) |
0f0466c0 MD |
871 | if (cpu_mi_feature & CPU_MI_MONITOR) { |
872 | cpu_mmw_pause_int(&lwkt_cseq_rindex, oseq); | |
8b402283 SW |
873 | } else |
874 | #endif | |
875 | { | |
0f0466c0 MD |
876 | DELAY(1); |
877 | cpu_lfence(); | |
878 | } | |
d5b2d319 MD |
879 | if (fatal_count && --fatal_count == 0) |
880 | panic("lwkt_switch: fatal spin wait"); | |
2a418930 MD |
881 | } |
882 | cseq = lwkt_spin_delay; /* don't trust the system operator */ | |
883 | cpu_ccfence(); | |
884 | if (cseq < 1) | |
885 | cseq = 1; | |
886 | if (cseq > 1000) | |
887 | cseq = 1000; | |
888 | DELAY(cseq); | |
889 | atomic_add_int(&lwkt_cseq_rindex, 1); | |
cfaeae2a | 890 | splz_check(); /* ok, we already checked that td is still scheduled */ |
2a418930 | 891 | /* highest level for(;;) loop */ |
f1d1c3fa | 892 | } |
8a8d5d85 | 893 | |
2a418930 | 894 | havethread: |
8a8d5d85 | 895 | /* |
f9235b6d MD |
896 | * We must always decrement td_fairq_accum on non-idle threads just |
897 | * in case a thread never gets a tick due to being in a continuous | |
2a418930 | 898 | * critical section. The page-zeroing code does this, for example. |
f9235b6d MD |
899 | * |
900 | * If the thread we came up with is a higher or equal priority verses | |
901 | * the thread at the head of the queue we move our thread to the | |
902 | * front. This way we can always check the front of the queue. | |
be71787b MD |
903 | * |
904 | * Clear gd_idle_repeat when doing a normal switch to a non-idle | |
905 | * thread. | |
f9235b6d | 906 | */ |
f9235b6d MD |
907 | ++gd->gd_cnt.v_swtch; |
908 | --ntd->td_fairq_accum; | |
9ac1ee6e | 909 | ntd->td_wmesg = NULL; |
f9235b6d MD |
910 | xtd = TAILQ_FIRST(&gd->gd_tdrunq); |
911 | if (ntd != xtd && ntd->td_pri >= xtd->td_pri) { | |
912 | TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq); | |
913 | TAILQ_INSERT_HEAD(&gd->gd_tdrunq, ntd, td_threadq); | |
914 | } | |
be71787b | 915 | gd->gd_idle_repeat = 0; |
2a418930 | 916 | |
f9235b6d | 917 | havethread_preempted: |
f9235b6d MD |
918 | /* |
919 | * If the new target does not need the MP lock and we are holding it, | |
920 | * release the MP lock. If the new target requires the MP lock we have | |
921 | * already acquired it for the target. | |
8a8d5d85 | 922 | */ |
2a418930 | 923 | ; |
f9235b6d MD |
924 | haveidle: |
925 | KASSERT(ntd->td_critcount, | |
b5d16701 MD |
926 | ("priority problem in lwkt_switch %d %d", |
927 | td->td_critcount, ntd->td_critcount)); | |
928 | ||
94f6d86e MD |
929 | if (td != ntd) { |
930 | ++switch_count; | |
a1f0fb66 | 931 | KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd); |
f1d1c3fa | 932 | td->td_switch(ntd); |
94f6d86e | 933 | } |
37af14fe MD |
934 | /* NOTE: current cpu may have changed after switch */ |
935 | crit_exit_quick(td); | |
8ad65e08 MD |
936 | } |
937 | ||
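The fairq behaviour of lwkt_switch() boils down to: run the head of the priority-sorted queue while its accumulator is non-negative, charge it for each pick, and once it is exhausted recharge it and rotate it to the tail so that equal-priority threads round-robin. The sketch below models that loop; the recharge rule (accum += pri) is an assumption chosen for illustration, since lwkt_fairq_accumulate() itself is defined outside this section:

```c
#include <stdio.h>

#define NTHREADS 2

struct toy_td {
    const char *name;
    int pri;
    int fairq_accum;
};

int
main(void)
{
    struct toy_td runq[NTHREADS] = { { "A", 3, 3 }, { "B", 3, 3 } };
    int head = 0;

    for (int tick = 0; tick < 10; ++tick) {
        struct toy_td *td = &runq[head];

        if (td->fairq_accum < 0) {      /* exhausted: recharge, rotate */
            td->fairq_accum += td->pri;
            head = (head + 1) % NTHREADS;
            td = &runq[head];
        }
        --td->fairq_accum;              /* charge the thread we picked */
        printf("tick %d runs %s\n", tick, td->name);
    }
    return 0;
}
```

With two equal-priority threads the output alternates in bursts of roughly pri+1 ticks each, which is the round-robin-with-priority-weighting effect the accumulator is after.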
b68b7282 | 938 | /* |
96728c05 MD |
939 | * Request that the target thread preempt the current thread. Preemption |
940 | * only works under a specific set of conditions: | |
b68b7282 | 941 | * |
96728c05 MD |
942 | * - We are not preempting ourselves |
943 | * - The target thread is owned by the current cpu | |
944 | * - We are not currently being preempted | |
945 | * - The target is not currently being preempted | |
d3d1cbc8 MD |
946 | * - We are not holding any spin locks |
947 | * - The target thread is not holding any tokens | |
96728c05 MD |
948 | * - We are able to satisfy the target's MP lock requirements (if any). |
949 | * | |
950 | * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION. Typically | |
951 | * this is called via lwkt_schedule() through the td_preemptable callback. | |
f9235b6d | 952 | * critcount is the managed critical priority that we should ignore in order |
96728c05 MD |
953 | * to determine whether preemption is possible (aka usually just the crit |
954 | * priority of lwkt_schedule() itself). | |
b68b7282 | 955 | * |
26a0694b MD |
956 | * XXX at the moment we run the target thread in a critical section during |
957 | * the preemption in order to prevent the target from taking interrupts | |
958 | * that *WE* can't. Preemption is strictly limited to interrupt threads | |
959 | * and interrupt-like threads, outside of a critical section, and the | |
960 | * preempted source thread will be resumed the instant the target blocks | |
961 | * whether or not the source is scheduled (i.e. preemption is supposed to | |
962 | * be as transparent as possible). | |
b68b7282 MD |
963 | */ |
964 | void | |
f9235b6d | 965 | lwkt_preempt(thread_t ntd, int critcount) |
b68b7282 | 966 | { |
46a3f46d | 967 | struct globaldata *gd = mycpu; |
0a3f9b47 | 968 | thread_t td; |
2d910aaf | 969 | int save_gd_intr_nesting_level; |
b68b7282 | 970 | |
26a0694b | 971 | /* |
96728c05 MD |
972 | * The caller has put us in a critical section. We can only preempt |
973 | * if the caller of the caller was not in a critical section (basically | |
f9235b6d | 974 | * a local interrupt), as determined by the 'critcount' parameter. We |
47737962 | 975 | * also can't preempt if the caller is holding any spinlocks (even if |
d666840a | 976 | * he isn't in a critical section). This also handles the tokens test. |
96728c05 MD |
977 | * |
978 | * YYY The target thread must be in a critical section (else it must | |
979 | * inherit our critical section? I dunno yet). | |
41a01a4d | 980 | * |
0a3f9b47 | 981 | * Set need_lwkt_resched() unconditionally for now YYY. |
26a0694b | 982 | */ |
f9235b6d | 983 | KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri)); |
26a0694b | 984 | |
fbc024e4 MD |
985 | if (preempt_enable == 0) { |
986 | ++preempt_miss; | |
987 | return; | |
988 | } | |
989 | ||
0a3f9b47 | 990 | td = gd->gd_curthread; |
f9235b6d | 991 | if (ntd->td_pri <= td->td_pri) { |
57c254db MD |
992 | ++preempt_miss; |
993 | return; | |
994 | } | |
f9235b6d | 995 | if (td->td_critcount > critcount) { |
96728c05 | 996 | ++preempt_miss; |
8ec60c3f | 997 | need_lwkt_resched(); |
96728c05 MD |
998 | return; |
999 | } | |
1000 | #ifdef SMP | |
46a3f46d | 1001 | if (ntd->td_gd != gd) { |
96728c05 | 1002 | ++preempt_miss; |
8ec60c3f | 1003 | need_lwkt_resched(); |
96728c05 MD |
1004 | return; |
1005 | } | |
1006 | #endif | |
41a01a4d | 1007 | /* |
77912481 MD |
1008 | * We don't have to check spinlocks here as they will also bump |
1009 | * td_critcount. | |
d3d1cbc8 MD |
1010 | * |
1011 | * Do not try to preempt if the target thread is holding any tokens. | |
1012 | * We could try to acquire the tokens but this case is so rare there | |
1013 | * is no need to support it. | |
41a01a4d | 1014 | */ |
77912481 MD |
1015 | KKASSERT(gd->gd_spinlocks_wr == 0); |
1016 | ||
3b998fa9 | 1017 | if (TD_TOKS_HELD(ntd)) { |
d3d1cbc8 MD |
1018 | ++preempt_miss; |
1019 | need_lwkt_resched(); | |
1020 | return; | |
1021 | } | |
26a0694b MD |
1022 | if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) { |
1023 | ++preempt_weird; | |
8ec60c3f | 1024 | need_lwkt_resched(); |
26a0694b MD |
1025 | return; |
1026 | } | |
1027 | if (ntd->td_preempted) { | |
4b5f931b | 1028 | ++preempt_hit; |
8ec60c3f | 1029 | need_lwkt_resched(); |
26a0694b | 1030 | return; |
b68b7282 | 1031 | } |
26a0694b | 1032 | |
8ec60c3f MD |
1033 | /* |
1034 | * Since we are able to preempt the current thread, there is no need to | |
1035 | * call need_lwkt_resched(). | |
2d910aaf MD |
1036 | * |
1037 | * We must temporarily clear gd_intr_nesting_level around the switch | |
1038 | * since switchouts from the target thread are allowed (they will just | |
1039 | * return to our thread), and since the target thread has its own stack. | |
8ec60c3f | 1040 | */ |
26a0694b MD |
1041 | ++preempt_hit; |
1042 | ntd->td_preempted = td; | |
1043 | td->td_flags |= TDF_PREEMPT_LOCK; | |
a1f0fb66 | 1044 | KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd); |
2d910aaf MD |
1045 | save_gd_intr_nesting_level = gd->gd_intr_nesting_level; |
1046 | gd->gd_intr_nesting_level = 0; | |
26a0694b | 1047 | td->td_switch(ntd); |
2d910aaf | 1048 | gd->gd_intr_nesting_level = save_gd_intr_nesting_level; |
b9eb1c19 | 1049 | |
26a0694b MD |
1050 | KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE)); |
1051 | ntd->td_preempted = NULL; | |
1052 | td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE); | |
b68b7282 MD |
1053 | } |
1054 | ||
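The eligibility tests above can be read as a single predicate; the sketch below condenses them (illustrative only; the toy_* types and flag values are invented, and the real function also updates the preempt_hit/miss/weird counters and requests a reschedule when it declines):

```c
#include <stdio.h>

#define TOY_PREEMPT_LOCK 0x01       /* thread is involved in a preemption */

struct toy_td {
    int pri;
    int critcount;
    int flags;
    int ntokens;                    /* tokens held                        */
    int cpu;                        /* owning cpu                         */
    struct toy_td *preempted;       /* thread it preempted, if any        */
};

static int
toy_can_preempt(const struct toy_td *cur, const struct toy_td *ntd,
                int critcount, int preempt_enable)
{
    if (!preempt_enable)                 return 0;
    if (ntd->pri <= cur->pri)            return 0; /* not higher priority  */
    if (cur->critcount > critcount)      return 0; /* caller truly critical */
    if (ntd->cpu != cur->cpu)            return 0; /* target owned elsewhere */
    if (ntd->ntokens != 0)               return 0; /* target holds tokens  */
    if (cur == ntd)                      return 0;
    if ((cur->flags | ntd->flags) & TOY_PREEMPT_LOCK) return 0;
    if (ntd->preempted != NULL)          return 0; /* already preempting   */
    return 1;
}

int
main(void)
{
    struct toy_td cur = { 10, 1, 0, 0, 0, NULL };
    struct toy_td irq = { 30, 1, 0, 0, 0, NULL };

    printf("can preempt: %d\n", toy_can_preempt(&cur, &irq, 1, 1));
    return 0;
}
```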
f1d1c3fa | 1055 | /* |
faaeffac | 1056 | * Conditionally call splz() if gd_reqflags indicates work is pending. |
4a28fe22 MD |
1057 | * This will work inside a critical section but not inside a hard code |
1058 | * section. | |
ef0fdad1 | 1059 | * |
f1d1c3fa MD |
1060 | * (self contained on a per cpu basis) |
1061 | */ | |
1062 | void | |
faaeffac | 1063 | splz_check(void) |
f1d1c3fa | 1064 | { |
7966cb69 MD |
1065 | globaldata_t gd = mycpu; |
1066 | thread_t td = gd->gd_curthread; | |
ef0fdad1 | 1067 | |
4a28fe22 MD |
1068 | if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && |
1069 | gd->gd_intr_nesting_level == 0 && | |
1070 | td->td_nest_count < 2) | |
1071 | { | |
f1d1c3fa | 1072 | splz(); |
4a28fe22 MD |
1073 | } |
1074 | } | |
1075 | ||
1076 | /* | |
1077 | * This version is integrated into crit_exit, reqflags has already | |
1078 | * been tested but td_critcount has not. | |
1079 | * | |
1080 | * We only want to execute the splz() on the 1->0 transition of | |
1081 | * critcount and not in a hard code section or if too deeply nested. | |
1082 | */ | |
1083 | void | |
1084 | lwkt_maybe_splz(thread_t td) | |
1085 | { | |
1086 | globaldata_t gd = td->td_gd; | |
1087 | ||
1088 | if (td->td_critcount == 0 && | |
1089 | gd->gd_intr_nesting_level == 0 && | |
1090 | td->td_nest_count < 2) | |
1091 | { | |
1092 | splz(); | |
1093 | } | |
f1d1c3fa MD |
1094 | } |
1095 | ||
8ad65e08 | 1096 | /* |
f9235b6d MD |
1097 | * This function is used to negotiate a passive release of the current |
1098 | * process/lwp designation with the user scheduler, allowing the user | |
1099 | * scheduler to schedule another user thread. The related kernel thread | |
1100 | * (curthread) continues running in the released state. | |
8ad65e08 MD |
1101 | */ |
1102 | void | |
f9235b6d | 1103 | lwkt_passive_release(struct thread *td) |
8ad65e08 | 1104 | { |
f9235b6d MD |
1105 | struct lwp *lp = td->td_lwp; |
1106 | ||
1107 | td->td_release = NULL; | |
1108 | lwkt_setpri_self(TDPRI_KERN_USER); | |
1109 | lp->lwp_proc->p_usched->release_curproc(lp); | |
f1d1c3fa MD |
1110 | } |
1111 | ||
f9235b6d | 1112 | |
3824f392 | 1113 | /* |
f9235b6d MD |
1114 | * This implements a normal yield. This routine is virtually a nop if |
1115 | * there is nothing to yield to but it will always run any pending interrupts | |
1116 | * if called from a critical section. | |
1117 | * | |
1118 | * This yield is designed for kernel threads without a user context. | |
1119 | * | |
1120 | * (self contained on a per cpu basis) | |
3824f392 MD |
1121 | */ |
1122 | void | |
f9235b6d | 1123 | lwkt_yield(void) |
3824f392 | 1124 | { |
f9235b6d MD |
1125 | globaldata_t gd = mycpu; |
1126 | thread_t td = gd->gd_curthread; | |
1127 | thread_t xtd; | |
3824f392 | 1128 | |
f9235b6d MD |
1129 | if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) |
1130 | splz(); | |
1131 | if (td->td_fairq_accum < 0) { | |
1132 | lwkt_schedule_self(curthread); | |
1133 | lwkt_switch(); | |
1134 | } else { | |
1135 | xtd = TAILQ_FIRST(&gd->gd_tdrunq); | |
1136 | if (xtd && xtd->td_pri > td->td_pri) { | |
1137 | lwkt_schedule_self(curthread); | |
1138 | lwkt_switch(); | |
1139 | } | |
1140 | } | |
3824f392 MD |
1141 | } |
1142 | ||
1143 | /* | |
f9235b6d MD |
1144 | * This yield is designed for kernel threads with a user context. |
1145 | * | |
1146 | * The kernel acting on behalf of the user is potentially cpu-bound, | |
1147 | * this function will efficiently allow other threads to run and also | |
1148 | * switch to other processes by releasing. | |
3824f392 MD |
1149 | * |
1150 | * The lwkt_user_yield() function is designed to have very low overhead | |
1151 | * if no yield is determined to be needed. | |
1152 | */ | |
1153 | void | |
1154 | lwkt_user_yield(void) | |
1155 | { | |
f9235b6d MD |
1156 | globaldata_t gd = mycpu; |
1157 | thread_t td = gd->gd_curthread; | |
1158 | ||
1159 | /* | |
1160 | * Always run any pending interrupts in case we are in a critical | |
1161 | * section. | |
1162 | */ | |
1163 | if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2) | |
1164 | splz(); | |
3824f392 | 1165 | |
3824f392 | 1166 | /* |
f9235b6d MD |
1167 | * Switch (which forces a release) if another kernel thread needs |
1168 | * the cpu, if userland wants us to resched, or if our kernel | |
1169 | * quantum has run out. | |
3824f392 | 1170 | */ |
f9235b6d MD |
1171 | if (lwkt_resched_wanted() || |
1172 | user_resched_wanted() || | |
1173 | td->td_fairq_accum < 0) | |
1174 | { | |
3824f392 | 1175 | lwkt_switch(); |
3824f392 MD |
1176 | } |
1177 | ||
f9235b6d | 1178 | #if 0 |
3824f392 | 1179 | /* |
f9235b6d MD |
1180 | * Reacquire the current process if we are released. |
1181 | * | |
1182 | * XXX not implemented atm. The kernel may be holding locks and such, | |
1183 | * so we want the thread to continue to receive cpu. | |
3824f392 | 1184 | */ |
f9235b6d MD |
1185 | if (td->td_release == NULL && lp) { |
1186 | lp->lwp_proc->p_usched->acquire_curproc(lp); | |
1187 | td->td_release = lwkt_passive_release; | |
1188 | lwkt_setpri_self(TDPRI_USER_NORM); | |
3824f392 | 1189 | } |
f9235b6d | 1190 | #endif |
b9eb1c19 MD |
1191 | } |
1192 | ||
8ad65e08 | 1193 | /* |
f1d1c3fa MD |
1194 | * Generic schedule. Possibly schedule threads belonging to other cpus and |
1195 | * deal with threads that might be blocked on a wait queue. | |
1196 | * | |
0a3f9b47 MD |
1197 | * We have a little helper inline function which does additional work after |
1198 | * the thread has been enqueued, including dealing with preemption and | |
1199 | * setting need_lwkt_resched() (which prevents the kernel from returning | |
1200 | * to userland until it has processed higher priority threads). | |
6330a558 MD |
1201 | * |
1202 | * It is possible for this routine to be called after a failed _enqueue | |
1203 | * (due to the target thread migrating, sleeping, or otherwise blocked). | |
1204 | * We have to check that the thread is actually on the run queue! | |
361d01dd MD |
1205 | * |
1206 | * reschedok is an optimized constant propagated from lwkt_schedule() or | |
1207 | * lwkt_schedule_noresched(). By default it is non-zero, causing a | |
1208 | * reschedule to be requested if the target thread has a higher priority. | |
1209 | * The port messaging code will set MSG_NORESCHED and cause reschedok to | |
1210 | * be 0, preventing undesired reschedules. |
8ad65e08 | 1211 | */ |
0a3f9b47 MD |
1212 | static __inline |
1213 | void | |
f9235b6d | 1214 | _lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount, int reschedok) |
0a3f9b47 | 1215 | { |
b9eb1c19 | 1216 | thread_t otd; |
c730be20 | 1217 | |
6330a558 | 1218 | if (ntd->td_flags & TDF_RUNQ) { |
361d01dd | 1219 | if (ntd->td_preemptable && reschedok) { |
f9235b6d | 1220 | ntd->td_preemptable(ntd, ccount); /* YYY +token */ |
361d01dd | 1221 | } else if (reschedok) { |
b9eb1c19 | 1222 | otd = curthread; |
f9235b6d | 1223 | if (ntd->td_pri > otd->td_pri) |
c730be20 | 1224 | need_lwkt_resched(); |
6330a558 | 1225 | } |
f9235b6d MD |
1226 | |
1227 | /* | |
1228 | * Give the thread a little fair share scheduler bump if it | |
1229 | * has been asleep for a while. This is primarily to avoid | |
1230 | * a degenerate case for interrupt threads where accumulator | |
1231 | * crosses into negative territory unnecessarily. | |
1232 | */ | |
1233 | if (ntd->td_fairq_lticks != ticks) { | |
1234 | ntd->td_fairq_lticks = ticks; | |
1235 | ntd->td_fairq_accum += gd->gd_fairq_total_pri; | |
1236 | if (ntd->td_fairq_accum > TDFAIRQ_MAX(gd)) | |
1237 | ntd->td_fairq_accum = TDFAIRQ_MAX(gd); | |
1238 | } | |
0a3f9b47 MD |
1239 | } |
1240 | } | |
1241 | ||
361d01dd | 1242 | static __inline |
8ad65e08 | 1243 | void |
361d01dd | 1244 | _lwkt_schedule(thread_t td, int reschedok) |
8ad65e08 | 1245 | { |
37af14fe MD |
1246 | globaldata_t mygd = mycpu; |
1247 | ||
cf709dd2 MD |
1248 | KASSERT(td != &td->td_gd->gd_idlethread, |
1249 | ("lwkt_schedule(): scheduling gd_idlethread is illegal!")); | |
cfaeae2a | 1250 | KKASSERT((td->td_flags & TDF_MIGRATING) == 0); |
37af14fe | 1251 | crit_enter_gd(mygd); |
9388413d | 1252 | KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); |
37af14fe | 1253 | if (td == mygd->gd_curthread) { |
f1d1c3fa MD |
1254 | _lwkt_enqueue(td); |
1255 | } else { | |
f1d1c3fa | 1256 | /* |
7cd8d145 MD |
1257 | * If we own the thread, there is no race (since we are in a |
1258 | * critical section). If we do not own the thread there might | |
1259 | * be a race but the target cpu will deal with it. | |
f1d1c3fa | 1260 | */ |
0f7a3396 | 1261 | #ifdef SMP |
7cd8d145 | 1262 | if (td->td_gd == mygd) { |
9d265729 | 1263 | _lwkt_enqueue(td); |
f9235b6d | 1264 | _lwkt_schedule_post(mygd, td, 1, reschedok); |
f1d1c3fa | 1265 | } else { |
e381e77c | 1266 | lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0); |
7cd8d145 | 1267 | } |
0f7a3396 | 1268 | #else |
7cd8d145 | 1269 | _lwkt_enqueue(td); |
f9235b6d | 1270 | _lwkt_schedule_post(mygd, td, 1, reschedok); |
0f7a3396 | 1271 | #endif |
8ad65e08 | 1272 | } |
37af14fe | 1273 | crit_exit_gd(mygd); |
8ad65e08 MD |
1274 | } |
1275 | ||
361d01dd MD |
1276 | void |
1277 | lwkt_schedule(thread_t td) | |
1278 | { | |
1279 | _lwkt_schedule(td, 1); | |
1280 | } | |
1281 | ||
1282 | void | |
1283 | lwkt_schedule_noresched(thread_t td) | |
1284 | { | |
1285 | _lwkt_schedule(td, 0); | |
1286 | } | |
1287 | ||
88ebb169 SW |
1288 | #ifdef SMP |
1289 | ||
e381e77c MD |
1290 | /* |
1291 | * When scheduled remotely, if frame != NULL the IPIQ is being |
1292 | * run via doreti or an interrupt, and preemption can be allowed. |
1293 | * | |
1294 | * To allow preemption we have to drop the critical section so only | |
1295 | * one is present in _lwkt_schedule_post. | |
1296 | */ | |
1297 | static void | |
1298 | lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame) | |
1299 | { | |
1300 | thread_t td = curthread; | |
1301 | thread_t ntd = arg; | |
1302 | ||
1303 | if (frame && ntd->td_preemptable) { | |
1304 | crit_exit_noyield(td); | |
1305 | _lwkt_schedule(ntd, 1); | |
1306 | crit_enter_quick(td); | |
1307 | } else { | |
1308 | _lwkt_schedule(ntd, 1); | |
1309 | } | |
1310 | } | |
1311 | ||
d9eea1a5 | 1312 | /* |
52eedfb5 MD |
1313 | * Thread migration using a 'Pull' method. The thread may or may not be |
1314 | * the current thread. It MUST be descheduled and in a stable state. | |
1315 | * lwkt_giveaway() must be called on the cpu owning the thread. | |
1316 | * | |
1317 | * At any point after lwkt_giveaway() is called, the target cpu may | |
1318 | * 'pull' the thread by calling lwkt_acquire(). | |
1319 | * | |
ae8e83e6 MD |
1320 | * We have to make sure the thread is not sitting on a per-cpu tsleep |
1321 | * queue or it will blow up when it moves to another cpu. | |
1322 | * | |
52eedfb5 | 1323 | * MPSAFE - must be called under very specific conditions. |
d9eea1a5 | 1324 | */ |
52eedfb5 MD |
1325 | void |
1326 | lwkt_giveaway(thread_t td) | |
1327 | { | |
3b4192fb | 1328 | globaldata_t gd = mycpu; |
52eedfb5 | 1329 | |
3b4192fb MD |
1330 | crit_enter_gd(gd); |
1331 | if (td->td_flags & TDF_TSLEEPQ) | |
1332 | tsleep_remove(td); | |
1333 | KKASSERT(td->td_gd == gd); | |
1334 | TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq); | |
1335 | td->td_flags |= TDF_MIGRATING; | |
1336 | crit_exit_gd(gd); | |
52eedfb5 MD |
1337 | } |
1338 | ||
a2a5ad0d MD |
1339 | void |
1340 | lwkt_acquire(thread_t td) | |
1341 | { | |
37af14fe MD |
1342 | globaldata_t gd; |
1343 | globaldata_t mygd; | |
a2a5ad0d | 1344 | |
52eedfb5 | 1345 | KKASSERT(td->td_flags & TDF_MIGRATING); |
a2a5ad0d | 1346 | gd = td->td_gd; |
37af14fe | 1347 | mygd = mycpu; |
52eedfb5 | 1348 | if (gd != mycpu) { |
35238fa5 | 1349 | cpu_lfence(); |
52eedfb5 | 1350 | KKASSERT((td->td_flags & TDF_RUNQ) == 0); |
37af14fe | 1351 | crit_enter_gd(mygd); |
cfaeae2a | 1352 | DEBUG_PUSH_INFO("lwkt_acquire"); |
df910c23 MD |
1353 | while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) { |
1354 | #ifdef SMP | |
1355 | lwkt_process_ipiq(); | |
1356 | #endif | |
52eedfb5 | 1357 | cpu_lfence(); |
df910c23 | 1358 | } |
cfaeae2a | 1359 | DEBUG_POP_INFO(); |
562273ea | 1360 | cpu_mfence(); |
37af14fe | 1361 | td->td_gd = mygd; |
52eedfb5 MD |
1362 | TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq); |
1363 | td->td_flags &= ~TDF_MIGRATING; | |
1364 | crit_exit_gd(mygd); | |
1365 | } else { | |
1366 | crit_enter_gd(mygd); | |
1367 | TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq); | |
1368 | td->td_flags &= ~TDF_MIGRATING; | |
37af14fe | 1369 | crit_exit_gd(mygd); |
a2a5ad0d MD |
1370 | } |
1371 | } | |
1372 | ||
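/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the 'pull' migration handshake described above.  give_thread_away() runs
 * on the cpu that currently owns td; pull_thread_remote() is a hypothetical
 * IPI handler that runs on the target cpu.
 */
static void
pull_thread_remote(void *arg)
{
	thread_t td = arg;

	lwkt_acquire(td);	/* waits until td is fully switched out */
	lwkt_schedule(td);	/* now safe to run it on this cpu */
}

static void
give_thread_away(thread_t td, globaldata_t target_gd)
{
	/* td must be descheduled and stable, and we must own it */
	lwkt_giveaway(td);
	lwkt_send_ipiq(target_gd, pull_thread_remote, td);
}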
52eedfb5 MD |
1373 | #endif |
1374 | ||
f1d1c3fa MD |
1375 | /* |
1376 | * Generic deschedule. Descheduling threads other than your own should be | |
1377 | * done only in carefully controlled circumstances. Descheduling is | |
1378 | * asynchronous. | |
1379 | * | |
1380 | * This function may block if the cpu has run out of messages. | |
8ad65e08 MD |
1381 | */ |
1382 | void | |
1383 | lwkt_deschedule(thread_t td) | |
1384 | { | |
f1d1c3fa | 1385 | crit_enter(); |
b8a98473 | 1386 | #ifdef SMP |
f1d1c3fa MD |
1387 | if (td == curthread) { |
1388 | _lwkt_dequeue(td); | |
1389 | } else { | |
a72187e9 | 1390 | if (td->td_gd == mycpu) { |
f1d1c3fa MD |
1391 | _lwkt_dequeue(td); |
1392 | } else { | |
b8a98473 | 1393 | lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td); |
f1d1c3fa MD |
1394 | } |
1395 | } | |
b8a98473 MD |
1396 | #else |
1397 | _lwkt_dequeue(td); | |
1398 | #endif | |
f1d1c3fa MD |
1399 | crit_exit(); |
1400 | } | |
1401 | ||
4b5f931b MD |
1402 | /* |
1403 | * Set the target thread's priority. This routine does not automatically | |
1404 | * switch to a higher priority thread; LWKT threads are not designed for | |
1405 | * continuous priority changes. Yield if you want to switch. | |
4b5f931b MD |
1406 | */ |
1407 | void | |
1408 | lwkt_setpri(thread_t td, int pri) | |
1409 | { | |
a72187e9 | 1410 | KKASSERT(td->td_gd == mycpu); |
f9235b6d MD |
1411 | if (td->td_pri != pri) { |
1412 | KKASSERT(pri >= 0); | |
1413 | crit_enter(); | |
1414 | if (td->td_flags & TDF_RUNQ) { | |
1415 | _lwkt_dequeue(td); | |
1416 | td->td_pri = pri; | |
1417 | _lwkt_enqueue(td); | |
1418 | } else { | |
1419 | td->td_pri = pri; | |
1420 | } | |
1421 | crit_exit(); | |
26a0694b | 1422 | } |
26a0694b MD |
1423 | } |
1424 | ||
03bd0a5e MD |
1425 | /* |
1426 | * Set the initial priority for a thread prior to it being scheduled for | |
1427 | * the first time. The thread MUST NOT be scheduled before or during | |
1428 | * this call. The thread may be assigned to a cpu other then the current | |
1429 | * cpu. | |
1430 | * | |
1431 | * Typically used after a thread has been created with TDF_STOPPREQ, | |
1432 | * and before the thread is initially scheduled. | |
1433 | */ | |
1434 | void | |
1435 | lwkt_setpri_initial(thread_t td, int pri) | |
1436 | { | |
1437 | KKASSERT(pri >= 0); | |
1438 | KKASSERT((td->td_flags & TDF_RUNQ) == 0); | |
f9235b6d | 1439 | td->td_pri = pri; |
03bd0a5e MD |
1440 | } |
1441 | ||
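/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the pattern described above -- create a thread stopped (TDF_STOPREQ),
 * give it its initial priority, then schedule it.  helper_main() and the
 * priority argument are hypothetical.
 */
static void helper_main(void *arg);

static void
start_helper(int cpu, int pri)
{
	thread_t td;

	lwkt_create(helper_main, NULL, &td, NULL, TDF_STOPREQ, cpu,
		    "helper %d", cpu);
	lwkt_setpri_initial(td, pri);	/* thread has never been scheduled */
	lwkt_schedule(td);		/* now let it run */
}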
26a0694b MD |
1442 | void |
1443 | lwkt_setpri_self(int pri) | |
1444 | { | |
1445 | thread_t td = curthread; | |
1446 | ||
4b5f931b MD |
1447 | KKASSERT(pri >= 0 && pri <= TDPRI_MAX); |
1448 | crit_enter(); | |
1449 | if (td->td_flags & TDF_RUNQ) { | |
1450 | _lwkt_dequeue(td); | |
f9235b6d | 1451 | td->td_pri = pri; |
4b5f931b MD |
1452 | _lwkt_enqueue(td); |
1453 | } else { | |
f9235b6d | 1454 | td->td_pri = pri; |
4b5f931b MD |
1455 | } |
1456 | crit_exit(); | |
1457 | } | |
1458 | ||
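/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * since lwkt_setpri*() never switches threads by itself, a thread that
 * lowers its own priority and wants the change to take effect immediately
 * yields afterwards.  The priority value is whatever the caller chooses.
 */
static void
drop_priority_and_yield(int newpri)
{
	lwkt_setpri_self(newpri);
	lwkt_yield();		/* let a now-higher-priority thread run */
}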
f9235b6d MD |
1459 | /* |
1460 | * 1/hz tick (typically 10ms) x TDFAIRQ_SCALE (typ 8) = 80ms full cycle. | |
1461 | * | |
1462 | * Example: two competing threads at the same priority N: each tick | |
1463 | * decrements by (2*N), each requeue adds N*8, so each gets 4 ticks. | |
1464 | */ | |
1465 | void | |
1466 | lwkt_fairq_schedulerclock(thread_t td) | |
1467 | { | |
2a418930 MD |
1468 | globaldata_t gd; |
1469 | ||
f9235b6d MD |
1470 | if (fairq_enable) { |
1471 | while (td) { | |
2a418930 MD |
1472 | gd = td->td_gd; |
1473 | if (td != &gd->gd_idlethread) { | |
1474 | td->td_fairq_accum -= gd->gd_fairq_total_pri; | |
1475 | if (td->td_fairq_accum < -TDFAIRQ_MAX(gd)) | |
1476 | td->td_fairq_accum = -TDFAIRQ_MAX(gd); | |
f9235b6d MD |
1477 | if (td->td_fairq_accum < 0) |
1478 | need_lwkt_resched(); | |
1479 | td->td_fairq_lticks = ticks; | |
1480 | } | |
1481 | td = td->td_preempted; | |
1482 | } | |
1483 | } | |
1484 | } | |
1485 | ||
1486 | static void | |
1487 | lwkt_fairq_accumulate(globaldata_t gd, thread_t td) | |
1488 | { | |
1489 | td->td_fairq_accum += td->td_pri * TDFAIRQ_SCALE; | |
1490 | if (td->td_fairq_accum > TDFAIRQ_MAX(td->td_gd)) | |
1491 | td->td_fairq_accum = TDFAIRQ_MAX(td->td_gd); | |
1492 | } | |
1493 | ||
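/*
 * Worked example (editorial note, not part of the original source): with
 * hz = 100 (10ms ticks) and two runnable threads at priority N,
 * gd_fairq_total_pri == 2*N.  Each requeue above credits the thread with
 * N * TDFAIRQ_SCALE == 8*N, so it survives (8*N) / (2*N) == 4 ticks (~40ms)
 * before its accumulator goes negative and need_lwkt_resched() hands the
 * cpu over, giving the 80ms full cycle mentioned above.
 */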
5d21b981 | 1494 | /* |
52eedfb5 MD |
1495 | * Migrate the current thread to the specified cpu. |
1496 | * | |
1497 | * This is accomplished by descheduling ourselves from the current cpu, | |
1498 | * moving our thread to the tdallq of the target cpu, IPI messaging the | |
1499 | * target cpu, and switching out. TDF_MIGRATING prevents scheduling | |
1500 | * races while the thread is being migrated. | |
ae8e83e6 MD |
1501 | * |
1502 | * We must be sure to remove ourselves from the current cpu's tsleepq | |
1503 | * before potentially moving to another queue. The thread can be on | |
1504 | * a tsleepq due to a left-over tsleep_interlock(). | |
95858b91 MD |
1505 | * |
1506 | * We also have to make sure that the switch code doesn't allow an IPI | |
1507 | * processing operation to leak in between our send and our switch, or | |
1508 | * trigger any other potential livelock that might occur when we release | |
1509 | * the current process designation, so we do that first. | |
5d21b981 | 1510 | */ |
3d28ff59 | 1511 | #ifdef SMP |
5d21b981 | 1512 | static void lwkt_setcpu_remote(void *arg); |
3d28ff59 | 1513 | #endif |
5d21b981 MD |
1514 | |
1515 | void | |
1516 | lwkt_setcpu_self(globaldata_t rgd) | |
1517 | { | |
1518 | #ifdef SMP | |
1519 | thread_t td = curthread; | |
1520 | ||
1521 | if (td->td_gd != rgd) { | |
1522 | crit_enter_quick(td); | |
95858b91 MD |
1523 | if (td->td_release) |
1524 | td->td_release(td); | |
ae8e83e6 | 1525 | if (td->td_flags & TDF_TSLEEPQ) |
3b4192fb | 1526 | tsleep_remove(td); |
5d21b981 MD |
1527 | td->td_flags |= TDF_MIGRATING; |
1528 | lwkt_deschedule_self(td); | |
52eedfb5 | 1529 | TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq); |
b8a98473 | 1530 | lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td); |
5d21b981 MD |
1531 | lwkt_switch(); |
1532 | /* we are now on the target cpu */ | |
52eedfb5 | 1533 | TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq); |
5d21b981 MD |
1534 | crit_exit_quick(td); |
1535 | } | |
1536 | #endif | |
1537 | } | |
1538 | ||
ecdefdda MD |
1539 | void |
1540 | lwkt_migratecpu(int cpuid) | |
1541 | { | |
1542 | #ifdef SMP | |
1543 | globaldata_t rgd; | |
1544 | ||
1545 | rgd = globaldata_find(cpuid); | |
1546 | lwkt_setcpu_self(rgd); | |
1547 | #endif | |
1548 | } | |
1549 | ||
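/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * a thread walking every cpu via lwkt_migratecpu(), e.g. to touch per-cpu
 * state locally.  do_percpu_work() is hypothetical.
 */
static void do_percpu_work(int cpu);

static void
visit_all_cpus(void)
{
	int origcpu = mycpuid;
	int i;

	for (i = 0; i < ncpus; ++i) {
		lwkt_migratecpu(i);	/* we resume execution on cpu i */
		do_percpu_work(i);
	}
	lwkt_migratecpu(origcpu);	/* and migrate back home */
}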
5d21b981 MD |
1550 | /* |
1551 | * Remote IPI for cpu migration (called while in a critical section so we | |
1552 | * do not have to enter another one). The thread has already been moved to | |
1553 | * our cpu's allq, but we must wait for the thread to be completely switched | |
1554 | * out on the originating cpu before we schedule it on ours or the stack | |
1555 | * state may be corrupt. We clear TDF_MIGRATING after flushing the GD | |
1556 | * change to main memory. | |
1557 | * | |
1558 | * XXX The use of TDF_MIGRATING might not be sufficient to avoid races | |
1559 | * against wakeups. It is best if this interface is used only when there | |
1560 | * are no pending events that might try to schedule the thread. | |
1561 | */ | |
3d28ff59 | 1562 | #ifdef SMP |
5d21b981 MD |
1563 | static void |
1564 | lwkt_setcpu_remote(void *arg) | |
1565 | { | |
1566 | thread_t td = arg; | |
1567 | globaldata_t gd = mycpu; | |
cfaeae2a | 1568 | int retry = 10000000; |
5d21b981 | 1569 | |
cfaeae2a | 1570 | DEBUG_PUSH_INFO("lwkt_setcpu_remote"); |
df910c23 MD |
1571 | while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) { |
1572 | #ifdef SMP | |
1573 | lwkt_process_ipiq(); | |
1574 | #endif | |
35238fa5 | 1575 | cpu_lfence(); |
562273ea | 1576 | cpu_pause(); |
cfaeae2a MD |
1577 | if (--retry == 0) { |
1578 | kprintf("lwkt_setcpu_remote: td->td_flags %08x\n", | |
1579 | td->td_flags); | |
1580 | retry = 10000000; | |
1581 | } | |
df910c23 | 1582 | } |
cfaeae2a | 1583 | DEBUG_POP_INFO(); |
5d21b981 | 1584 | td->td_gd = gd; |
562273ea | 1585 | cpu_mfence(); |
5d21b981 | 1586 | td->td_flags &= ~TDF_MIGRATING; |
9388413d | 1587 | KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0); |
5d21b981 MD |
1588 | _lwkt_enqueue(td); |
1589 | } | |
3d28ff59 | 1590 | #endif |
5d21b981 | 1591 | |
553ea3c8 | 1592 | struct lwp * |
4b5f931b MD |
1593 | lwkt_preempted_proc(void) |
1594 | { | |
73e4f7b9 | 1595 | thread_t td = curthread; |
4b5f931b MD |
1596 | while (td->td_preempted) |
1597 | td = td->td_preempted; | |
553ea3c8 | 1598 | return(td->td_lwp); |
4b5f931b MD |
1599 | } |
1600 | ||
99df837e MD |
1601 | /* |
1602 | * Create a kernel process/thread/whatever. It shares its address space | |
1603 | * with proc0 - ie: kernel only. | |
1604 | * | |
365fa13f MD |
1605 | * NOTE! By default new threads are created with the MP lock held. A |
1606 | * thread which does not require the MP lock should release it by calling | |
1607 | * rel_mplock() at the start of the new thread. | |
99df837e MD |
1608 | */ |
1609 | int | |
c9e9fb21 MD |
1610 | lwkt_create(void (*func)(void *), void *arg, struct thread **tdp, |
1611 | thread_t template, int tdflags, int cpu, const char *fmt, ...) | |
99df837e | 1612 | { |
73e4f7b9 | 1613 | thread_t td; |
e2565a42 | 1614 | __va_list ap; |
99df837e | 1615 | |
d3d32139 | 1616 | td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu, |
dbcd0c9b | 1617 | tdflags); |
a2a5ad0d MD |
1618 | if (tdp) |
1619 | *tdp = td; | |
709799ea | 1620 | cpu_set_thread_handler(td, lwkt_exit, func, arg); |
99df837e MD |
1621 | |
1622 | /* | |
1623 | * Set up arg0 for 'ps' etc | |
1624 | */ | |
e2565a42 | 1625 | __va_start(ap, fmt); |
379210cb | 1626 | kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap); |
e2565a42 | 1627 | __va_end(ap); |
99df837e MD |
1628 | |
1629 | /* | |
1630 | * Schedule the thread to run | |
1631 | */ | |
ef0fdad1 MD |
1632 | if ((td->td_flags & TDF_STOPREQ) == 0) |
1633 | lwkt_schedule(td); | |
1634 | else | |
1635 | td->td_flags &= ~TDF_STOPREQ; | |
99df837e MD |
1636 | return 0; |
1637 | } | |
1638 | ||
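/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * a minimal kernel thread started with lwkt_create().  example_td,
 * example_thread() and the wait message are hypothetical; the cpu argument
 * of -1 is used here to mean "no particular cpu" (an assumption based on
 * common in-tree usage, not stated in this file).
 */
static thread_t example_td;

static void
example_thread(void *arg)
{
	rel_mplock();		/* per the NOTE above: we don't need the MP lock */
	for (;;) {
		/* ... do periodic work ... */
		tsleep(&example_td, 0, "exwait", hz);
	}
	/* never reached; lwkt_exit() runs if the function returns */
}

static void
example_start(void)
{
	lwkt_create(example_thread, NULL, &example_td, NULL,
		    0, -1, "example");
}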
1639 | /* | |
1640 | * Destroy an LWKT thread. Warning! This function is not called when | |
1641 | * a process exits; cpu_proc_exit() directly calls cpu_thread_exit() and | |
1642 | * uses a different reaping mechanism. | |
1643 | */ | |
1644 | void | |
1645 | lwkt_exit(void) | |
1646 | { | |
1647 | thread_t td = curthread; | |
c070746a | 1648 | thread_t std; |
8826f33a | 1649 | globaldata_t gd; |
99df837e | 1650 | |
2883d2d8 MD |
1651 | /* |
1652 | * Do any cleanup that might block here | |
1653 | */ | |
99df837e | 1654 | if (td->td_flags & TDF_VERBOSE) |
6ea70f76 | 1655 | kprintf("kthread %p %s has exited\n", td, td->td_comm); |
f6bf3af1 | 1656 | caps_exit(td); |
2883d2d8 MD |
1657 | biosched_done(td); |
1658 | dsched_exit_thread(td); | |
c070746a MD |
1659 | |
1660 | /* | |
1661 | * Get us into a critical section to interlock gd_freetd and loop | |
1662 | * until we can get it freed. | |
1663 | * | |
1664 | * We have to cache the current td in gd_freetd because objcache_put()ing | |
1665 | * it would rip it out from under us while our thread is still active. | |
1666 | */ | |
1667 | gd = mycpu; | |
37af14fe | 1668 | crit_enter_quick(td); |
c070746a | 1669 | while ((std = gd->gd_freetd) != NULL) { |
cf709dd2 | 1670 | KKASSERT((std->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) == 0); |
c070746a MD |
1671 | gd->gd_freetd = NULL; |
1672 | objcache_put(thread_cache, std); | |
1673 | } | |
3b4192fb MD |
1674 | |
1675 | /* | |
1676 | * Remove thread resources from kernel lists and deschedule us for | |
2883d2d8 MD |
1677 | * the last time. We cannot block after this point or we may end |
1678 | * up with a stale td on the tsleepq. | |
3b4192fb MD |
1679 | */ |
1680 | if (td->td_flags & TDF_TSLEEPQ) | |
1681 | tsleep_remove(td); | |
37af14fe | 1682 | lwkt_deschedule_self(td); |
e56e4dea | 1683 | lwkt_remove_tdallq(td); |
74c9628e | 1684 | KKASSERT(td->td_refs == 0); |
2883d2d8 MD |
1685 | |
1686 | /* | |
1687 | * Final cleanup | |
1688 | */ | |
1689 | KKASSERT(gd->gd_freetd == NULL); | |
c070746a MD |
1690 | if (td->td_flags & TDF_ALLOCATED_THREAD) |
1691 | gd->gd_freetd = td; | |
99df837e MD |
1692 | cpu_thread_exit(); |
1693 | } | |
1694 | ||
e56e4dea MD |
1695 | void |
1696 | lwkt_remove_tdallq(thread_t td) | |
1697 | { | |
1698 | KKASSERT(td->td_gd == mycpu); | |
1699 | TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq); | |
1700 | } | |
1701 | ||
9cf43f91 MD |
1702 | /* |
1703 | * Code reduction and branch prediction improvements. Call/return | |
1704 | * overhead on modern cpus often degenerates into 0 cycles due to | |
1705 | * the cpu's branch prediction hardware and return pc cache. We | |
1706 | * can take advantage of this by not inlining medium-complexity | |
1707 | * functions and we can also reduce the branch prediction impact | |
1708 | * by collapsing perfectly predictable branches into a single | |
1709 | * procedure instead of duplicating it. | |
1710 | * | |
1711 | * Is any of this noticeable? Probably not, so I'll take the | |
1712 | * smaller code size. | |
1713 | */ | |
1714 | void | |
b6468f56 | 1715 | crit_exit_wrapper(__DEBUG_CRIT_ARG__) |
9cf43f91 | 1716 | { |
b6468f56 | 1717 | _crit_exit(mycpu __DEBUG_CRIT_PASS_ARG__); |
9cf43f91 MD |
1718 | } |
1719 | ||
2d93b37a MD |
1720 | void |
1721 | crit_panic(void) | |
1722 | { | |
1723 | thread_t td = curthread; | |
850634cc | 1724 | int lcrit = td->td_critcount; |
2d93b37a | 1725 | |
850634cc AH |
1726 | td->td_critcount = 0; |
1727 | panic("td_critcount is/would-go negative! %p %d", td, lcrit); | |
4a28fe22 | 1728 | /* NOT REACHED */ |
2d93b37a MD |
1729 | } |
1730 | ||
d165e668 MD |
1731 | #ifdef SMP |
1732 | ||
bd8015ca MD |
1733 | /* |
1734 | * Called from debugger/panic on cpus which have been stopped. We must still | |
1735 | * process the IPIQ while stopped, even if we were stopped while in a critical | |
1736 | * section (XXX). | |
1737 | * | |
1738 | * If we are dumping also try to process any pending interrupts. This may | |
1739 | * or may not work depending on the state of the cpu at the point it was | |
1740 | * stopped. | |
1741 | */ | |
1742 | void | |
1743 | lwkt_smp_stopped(void) | |
1744 | { | |
1745 | globaldata_t gd = mycpu; | |
1746 | ||
1747 | crit_enter_gd(gd); | |
1748 | if (dumping) { | |
1749 | lwkt_process_ipiq(); | |
1750 | splz(); | |
1751 | } else { | |
1752 | lwkt_process_ipiq(); | |
1753 | } | |
1754 | crit_exit_gd(gd); | |
1755 | } | |
1756 | ||
d165e668 | 1757 | #endif |