kernel - Fix races in disk iteration and diskctx handling
[dragonfly.git] / sys / kern / dsched / bfq / bfq_helper_thread.c
CommitLineData
aabeb187
BP
1/*
2 * Copyright (c) 2011 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Brills Peng <brillsp@gmail.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35
36/*
37 * bfq_helper_thread.c:
38 * Thread function of the helper thread and
39 * message sending routines.
40 *
41 * XXX: The current approach of serializing using lwkt messages is suboptimal.
42 * The idea is to replace it with way more fine-grained and lockless
43 * accesses spread all over the place. It makes things more complicated,
44 * but it will also improve performance significantly.
45 *
46 * The sysctl node of bfq is also initialized
47 * here.
48 */
49
50#include <sys/systm.h>
51#include <sys/kernel.h>
52#include <sys/proc.h>
53#include <sys/sysctl.h>
54#include <sys/buf.h>
55#include <sys/conf.h>
56#include <sys/diskslice.h>
57#include <sys/disk.h>
58#include <sys/malloc.h>
59#include <machine/md_var.h>
60#include <sys/ctype.h>
61#include <sys/syslog.h>
62#include <sys/device.h>
63#include <sys/msgport.h>
64#include <sys/msgport2.h>
65#include <sys/mplock2.h>
66#include <sys/buf2.h>
67#include <sys/dsched.h>
68#include <sys/fcntl.h>
69#include <machine/varargs.h>
70
71#include <kern/dsched/bfq/bfq.h>
72#include <kern/dsched/bfq/bfq_helper_thread.h>
73
74extern struct sysctl_oid *bfq_mod_oid;
75extern struct dsched_policy dsched_bfq_policy;
76
77static void helper_thread(struct bfq_disk_ctx *bfq_diskctx);
78static int helper_msg_exec(helper_msg_t msg);
79static void helper_sysctl_init(struct bfq_disk_ctx *bfq_diskctx);
80
81MALLOC_DEFINE(M_HELPER, "bfq", "BFQ helper thread message allocations");
82
83/*
84 * All threads share one dispose port
85 */
86static struct lwkt_port helper_dispose_port;
87
88/* XXX: should be an mpipe */
89static struct objcache_malloc_args helper_msg_malloc_args = {
90 sizeof(struct helper_msg), M_HELPER };
91
92
93static helper_msg_t
94helper_msg_get(struct bfq_disk_ctx *bfq_diskctx)
95{
96 /*
97 * XXX: wait is OK?
98 */
99 return objcache_get(bfq_diskctx->helper_msg_cache, M_WAITOK);
100}
101
102static int
103helper_msg_put(struct bfq_disk_ctx *bfq_diskctx, helper_msg_t msg)
104{
105 objcache_put(bfq_diskctx->helper_msg_cache, msg);
106 return 0;
107}
108
109static void
110helper_msg_autofree_reply(lwkt_port_t port, lwkt_msg_t msg)
111{
112 helper_msg_t hm = (helper_msg_t)msg;
113 helper_msg_put(hm->bfq_diskctx, (helper_msg_t)msg);
114}
115
116/*
117 * Initialize the dispose port. All helper threads share this port.
118 * Must be called only once, and before any helper thread being created.
119 *
120 * Called by bfq.c: bfq_moc_handler()
121 */
122void
123helper_init_global(void)
124{
125 lwkt_initport_replyonly(&helper_dispose_port, helper_msg_autofree_reply);
126}
127
128/*
129 * Helper thread initialization function:
130 * initialize the per-disk objcache and create the
131 * helper thread.
132 *
133 * Called by bfq.c:bfq_prepare()
134 */
135void
136helper_init(struct bfq_disk_ctx *bfq_diskctx)
137{
138 struct thread *phelper_thread;
139
140 bfq_diskctx->helper_msg_cache = objcache_create("bfq-helper-msg-cache", 0, 0,
141 NULL, NULL, NULL,
142 objcache_malloc_alloc,
143 objcache_malloc_free,
144 &helper_msg_malloc_args);
145
146 lwkt_create((void (*) (void *)) helper_thread, bfq_diskctx,
147 &phelper_thread, NULL, 0, -1,
148 "bfq_helper_td_%s", bfq_diskctx->head.dp->d_cdev->si_name);
149
150 bfq_diskctx->helper_thread = phelper_thread;
151}
152
153static void
154helper_msg_send(struct bfq_disk_ctx *bfq_diskctx, uint32_t cmd, helper_msg_t helper_msg)
155{
156 lwkt_port_t port = &bfq_diskctx->helper_msg_port;
157
158 lwkt_initmsg(&helper_msg->hdr, &helper_dispose_port, 0);
159 helper_msg->bfq_diskctx = bfq_diskctx;
160 helper_msg->hdr.u.ms_result = cmd;
161
162 if (port->mpu_td == curthread){
163 helper_msg_exec(helper_msg);
164 lwkt_replymsg(&helper_msg->hdr, 0);
165 } else {
166 lwkt_sendmsg(port, (lwkt_msg_t)helper_msg);
167 }
168}
169
170/*
171 * Deallocate the objcache.
172 * Called by bfq.c: bfq_teardown()
173 */
174void
175helper_uninit(struct bfq_disk_ctx *bfq_diskctx)
176{
177 objcache_destroy(bfq_diskctx->helper_msg_cache);
178}
179
180static void
181helper_sysctl_init(struct bfq_disk_ctx *bfq_diskctx)
182{
183 struct sysctl_oid *oid;
184
185 sysctl_ctx_init(&bfq_diskctx->bfq_sysctl_ctx);
186
187 if (!bfq_mod_oid){
188 kprintf("Failed to create BFQ dev sysctl node!\n");
189 return;
190 }
191
192 oid = SYSCTL_ADD_NODE(&bfq_diskctx->bfq_sysctl_ctx,
193 SYSCTL_CHILDREN(bfq_mod_oid),
194 OID_AUTO,
195 bfq_diskctx->head.dp->d_cdev->si_name,
196 CTLFLAG_RD, 0, "");
197
198 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
199 SYSCTL_CHILDREN(oid),
200 OID_AUTO,
201 "max_budget",
202 CTLFLAG_RW,
203 &bfq_diskctx->bfq_max_budget,
204 0,
205 "BFQ max budget");
206
207 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
208 SYSCTL_CHILDREN(oid),
209 OID_AUTO,
210 "peak_rate",
211 CTLFLAG_RD,
212 &bfq_diskctx->bfq_peak_rate,
213 0,
214 "BFQ estimated peak rate");
215
216 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
217 SYSCTL_CHILDREN(oid),
218 OID_AUTO,
219 "peak_samples",
220 CTLFLAG_RD,
221 &bfq_diskctx->bfq_peak_rate_samples,
222 0,
223 "BFQ estimated peak rate samples");
224
225 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
226 SYSCTL_CHILDREN(oid),
227 OID_AUTO,
228 "as_miss",
229 CTLFLAG_RD,
230 &bfq_diskctx->bfq_as_miss,
231 0,
232 "BFQ AS miss");
233
234 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
235 SYSCTL_CHILDREN(oid),
236 OID_AUTO,
237 "as_hit",
238 CTLFLAG_RD,
239 &bfq_diskctx->bfq_as_hit,
240 0,
241 "BFQ AS hit");
242
243 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
244 SYSCTL_CHILDREN(oid),
245 OID_AUTO,
246 "as_wait_avg_all",
247 CTLFLAG_RD,
248 &bfq_diskctx->bfq_as_avg_wait_all,
249 0,
250 "BFQ AS waitall");
251
252 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
253 SYSCTL_CHILDREN(oid),
254 OID_AUTO,
255 "as_wait_avg_miss",
256 CTLFLAG_RD,
257 &bfq_diskctx->bfq_as_avg_wait_miss,
258 0,
259 "BFQ AS waitmiss");
260
261 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
262 SYSCTL_CHILDREN(oid),
263 OID_AUTO,
264 "as_wait_max",
265 CTLFLAG_RD,
266 &bfq_diskctx->bfq_as_max_wait,
267 0,
268 "BFQ AS waitmax");
269
270 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
271 SYSCTL_CHILDREN(oid),
272 OID_AUTO,
273 "as_wait_max2",
274 CTLFLAG_RD,
275 &bfq_diskctx->bfq_as_max_wait2,
276 0,
277 "BFQ AS waitmax2");
278
279 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
280 SYSCTL_CHILDREN(oid),
281 OID_AUTO,
282 "as_high_wait_count",
283 CTLFLAG_RD,
284 &bfq_diskctx->bfq_as_high_wait_count,
285 0,
286 "BFQ AS high count");
287
288 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
289 SYSCTL_CHILDREN(oid),
290 OID_AUTO,
291 "as_high_wait_count2",
292 CTLFLAG_RD,
293 &bfq_diskctx->bfq_as_high_wait_count2,
294 0,
295 "BFQ AS high count2");
296
297 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
298 SYSCTL_CHILDREN(oid),
299 OID_AUTO,
300 "avg_time_slice",
301 CTLFLAG_RD,
302 &bfq_diskctx->bfq_avg_time_slice,
303 0,
304 "BFQ average time slice");
305
306 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
307 SYSCTL_CHILDREN(oid),
308 OID_AUTO,
309 "max_time_slice",
310 CTLFLAG_RD,
311 &bfq_diskctx->bfq_max_time_slice,
312 0,
313 "BFQ max time slice");
314
315 SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
316 SYSCTL_CHILDREN(oid),
317 OID_AUTO,
318 "high_time_slice_count",
319 CTLFLAG_RD,
320 &bfq_diskctx->bfq_high_time_slice_count,
321 0,
322 "BFQ high time slice count");
323
324 SYSCTL_ADD_PROC(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
325 OID_AUTO, "as_switch", CTLTYPE_INT|CTLFLAG_RW,
326 bfq_diskctx, 0, bfq_sysctl_as_switch_handler, "I", "as_switch");
327
328 SYSCTL_ADD_PROC(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
329 OID_AUTO, "auto_max_budget_switch", CTLTYPE_INT|CTLFLAG_RW,
330 bfq_diskctx, 0, bfq_sysctl_auto_max_budget_handler, "I", "amb_switch");
331}
332
333static void
334helper_thread(struct bfq_disk_ctx *bfq_diskctx)
335{
aabeb187
BP
336 int r;
337 helper_msg_t msg;
338
5374d04f 339 dsched_new_policy_thread_tdio(&bfq_diskctx->head, &dsched_bfq_policy);
aabeb187
BP
340
341 lwkt_initport_thread(&bfq_diskctx->helper_msg_port, curthread);
342 dsched_disk_ctx_ref(&bfq_diskctx->head);
343 helper_sysctl_init(bfq_diskctx);
344
345 dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: helper thread created\n");
346#if 0
347 /* XXX: why mplock?! */
348 get_mplock();
349#endif
350
351 for(;;) {
352 msg = (helper_msg_t)lwkt_waitport(&bfq_diskctx->helper_msg_port, 0);
353 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: helper: msg recv: %d\n", msg->hdr.u.ms_result);
354 r = helper_msg_exec(msg);
355 lwkt_replymsg(&msg->hdr, 0);
356 /*
357 * received BFQ_MSG_KILL
358 */
359 if (r == -1)
360 break;
361 }
362
363#if 0
364 rel_mplock();
365#endif
366
367 sysctl_ctx_free(&bfq_diskctx->bfq_sysctl_ctx);
368 dsched_disk_ctx_unref(&bfq_diskctx->head);
369 dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: helper: die peacefully\n");
370 lwkt_exit();
371}
372
373static int
374helper_msg_exec(helper_msg_t msg)
375{
376 struct bfq_disk_ctx *bfq_diskctx;
377
378 bfq_diskctx = msg->bfq_diskctx;
379
380
381 switch (msg->hdr.u.ms_result)
382 {
383 case BFQ_MSG_DEQUEUE:
384 if (atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 0, 1))
385 bfq_dequeue((struct dsched_disk_ctx *)bfq_diskctx);
386 break;
387 case BFQ_MSG_AS_TIMEOUT:
388 bfq_timeout(bfq_diskctx);
389 break;
390
391 case BFQ_MSG_DESTROY_TDIO:
392 bfq_helper_destroy_tdio(msg->tdio, bfq_diskctx);
393 break;
394
395 case BFQ_MSG_KILL:
396 return -1;
397
398 default:
399 break;
400 }
401 return 0;
402}
403
404void
405helper_msg_dequeue(struct bfq_disk_ctx *bfq_diskctx)
406{
407 helper_msg_t helper_msg = helper_msg_get(bfq_diskctx);
408
409 helper_msg_send(bfq_diskctx, BFQ_MSG_DEQUEUE, helper_msg);
410}
411
412void
413helper_msg_as_timeout(struct bfq_disk_ctx *bfq_diskctx)
414{
415 helper_msg_t helper_msg = helper_msg_get(bfq_diskctx);
416 /**
417 * For statisticsal use, temporary
418 * ------------------------------
419 */
420 struct bfq_thread_io *bfq_tdio;
421 struct timeval tv;
422 uint32_t msec;
423
424
425 bfq_tdio = bfq_diskctx->bfq_blockon;
426 if (bfq_tdio) {
427 getmicrotime(&tv);
428 timevalsub(&tv, &bfq_tdio->as_start_time);
429 msec = ((uint64_t)(1000000*tv.tv_sec + tv.tv_usec)) >> 10;
430 if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz))
431 atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count2, 1);
432 if (msec > bfq_diskctx->bfq_as_max_wait2)
433 bfq_diskctx->bfq_as_max_wait2 = msec;
434 }
435 /* ----------------------------- */
436
437 helper_msg_send(bfq_diskctx, BFQ_MSG_AS_TIMEOUT, helper_msg);
438}
439
440void
441helper_msg_destroy_tdio(struct bfq_disk_ctx *bfq_diskctx, struct dsched_thread_io *tdio)
442{
443 helper_msg_t helper_msg = helper_msg_get(bfq_diskctx);
444
445 helper_msg->tdio = tdio;
446 helper_msg_send(bfq_diskctx, BFQ_MSG_DESTROY_TDIO, helper_msg);
447}
448
449void
450helper_msg_kill(struct bfq_disk_ctx *bfq_diskctx)
451{
452 helper_msg_t helper_msg = helper_msg_get(bfq_diskctx);
453
454 helper_msg_send(bfq_diskctx, BFQ_MSG_KILL, helper_msg);
455}