/*
 * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Brills Peng <brillsp@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * BFQ disk scheduler: the algorithm routines and the interfaces with the
 * dsched framework.
 */
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/diskslice.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/msgport.h>
#include <sys/msgport2.h>
#include <sys/dsched.h>
#include <sys/fcntl.h>
#include <machine/varargs.h>

#include <kern/dsched/bfq/bfq.h>
#include <kern/dsched/bfq/bfq_helper_thread.h>

#define _DSCHED_BFQ_BFQ_C_
#include <kern/dsched/bfq/bfq_ktr.h>
/* Make sure our structs fit */
CTASSERT(sizeof(struct bfq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ);
CTASSERT(sizeof(struct bfq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ);
static dsched_prepare_t bfq_prepare;
static dsched_teardown_t bfq_teardown;
static dsched_cancel_t bfq_cancel_all;
static dsched_queue_t bfq_queue;
static dsched_new_tdio_t bfq_new_tdio;
static dsched_destroy_tdio_t bfq_destroy_tdio;
static dsched_bio_done_t bfq_bio_done;

static void bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
static int bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
static void bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason);
static void bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp);
static void bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio);
static void bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag);
static void bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv);
struct dsched_policy dsched_bfq_policy = {
	.prepare = bfq_prepare,
	.teardown = bfq_teardown,
	.cancel_all = bfq_cancel_all,
	.bio_queue = bfq_queue,
	.new_tdio = bfq_new_tdio,
	.destroy_tdio = bfq_destroy_tdio,
	.bio_done = bfq_bio_done,
	.polling_func = (void (*)(struct dsched_disk_ctx *))helper_msg_dequeue,
};
struct sysctl_oid *bfq_mod_oid;

struct dsched_bfq_stats bfq_stats;

static int dsched_bfq_version_maj = 1;
static int dsched_bfq_version_min = 0;
/*
 * bfq_prepare(): the .prepare callback of the bfq policy. Initialize
 * all fields in bfq_diskctx and initialize the corresponding helper
 * thread.
 */
static int
bfq_prepare(struct dsched_disk_ctx *diskctx)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;

	BFQ_LOCKINIT(bfq_diskctx);

	bfq_diskctx->pending_dequeue = 0;

	wf2q_init(&bfq_diskctx->bfq_wf2q);

	callout_init_mp(&bfq_diskctx->bfq_callout);

	bfq_diskctx->bfq_blockon = NULL;
	bfq_diskctx->bfq_active_tdio = NULL;
	bfq_diskctx->bfq_remaining_budget = 0;

	bfq_diskctx->bfq_max_budget = BFQ_DEFAULT_MAX_BUDGET;
	bfq_diskctx->bfq_peak_rate_samples = 0;
	bfq_diskctx->bfq_peak_rate = 0;

#if 0
	bfq_diskctx->bfq_flag = BFQ_FLAG_AS | BFQ_FLAG_AUTO_MAX_BUDGET;
#endif
	bfq_diskctx->bfq_flag = BFQ_FLAG_AS;

	bfq_diskctx->bfq_as_miss = 0;
	bfq_diskctx->bfq_as_hit = 0;

	bfq_diskctx->bfq_as_avg_wait_miss = 0;
	bfq_diskctx->bfq_as_avg_wait_all = 0;
	bfq_diskctx->bfq_as_max_wait = 0;
	bfq_diskctx->bfq_as_max_wait2 = 0;
	bfq_diskctx->bfq_as_high_wait_count = 0;
	bfq_diskctx->bfq_as_high_wait_count2 = 0;

	bfq_diskctx->bfq_avg_time_slice = 0;
	bfq_diskctx->bfq_max_time_slice = 0;
	bfq_diskctx->bfq_high_time_slice_count = 0;

	/* initialize the helper thread */
	helper_init(bfq_diskctx);

	dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: initialized!\n");
	return 0;
}
/*
 * bfq_teardown(): .teardown callback of the bfq policy. Send a kill
 * message to the helper thread and deallocate the resources used by
 * the helper thread (currently the objcache).
 *
 * XXX: may deadlock when the caller of bfq_teardown() and the
 * helper thread run on the same CPU.
 */
static void
bfq_teardown(struct dsched_disk_ctx *diskctx)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;

	helper_msg_kill(bfq_diskctx);

	tsleep(diskctx, 0, "teardn", hz * 3 / 2);

	helper_uninit(bfq_diskctx);
}
/*
 * bfq_cancel_all(): .cancel_all callback of the bfq policy. Cancel
 * all bios that are queued in each bfq_thread_io structure in the
 * wf2q aug-tree.
 *
 * lock:
 *	BFQ_LOCK: protect from the wf2q_insert operations in bfq_queue()
 *	and bfq_dequeue(), and the wf2q_get_next operation in bfq_dequeue().
 *	THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue() and
 *	queue insertion in bfq_queue().
 *
 * refcount:
 *	unref the thread_io structures; they are referenced in bfq_queue()
 *	when a bio is queued. The refcount may decrease to zero.
 */
static void
bfq_cancel_all(struct dsched_disk_ctx *diskctx)
{
	struct bio *bio;
	struct bfq_thread_io *bfq_tdio;
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;

	BFQ_LOCK(bfq_diskctx);

	while ((bfq_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q))) {
		DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);
		KKASSERT(lockstatus(&bfq_tdio->head.lock, curthread) == LK_EXCLUSIVE);

		while ((bio = TAILQ_FIRST(&bfq_tdio->head.queue))) {
			bfq_tdio->head.qlength--;
			TAILQ_REMOVE(&bfq_tdio->head.queue, bio, link);
			dsched_cancel_bio(bio);
			dsched_thread_io_unref(&bfq_tdio->head);
		}

		KKASSERT(bfq_tdio->head.qlength == 0);
		DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);
	}

	BFQ_UNLOCK(bfq_diskctx);
}
/*
 * bfq_new_tdio(): .new_tdio callback of the bfq policy. Initialize
 * the bfq_thread_io structure.
 */
static void
bfq_new_tdio(struct dsched_thread_io *tdio)
{
	struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;

	/* the queue has to be initialized somewhere else */

	tdio->debug_priv = 0xF00FF00F;

	bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
	bfq_tdio->weight = BFQ_DEFAULT_WEIGHT;

	bfq_tdio->tdio_as_switch = 1;
	bfq_tdio->maybe_timeout = 0;

	bfq_tdio->seek_samples = 0;
	bfq_tdio->seek_avg = 0;
	bfq_tdio->seek_total = 0;
	bfq_tdio->ttime_samples = 0;
	bfq_tdio->ttime_avg = 0;
	bfq_tdio->service_received = 0;
	bfq_tdio->bio_dispatched = 0;
	bfq_tdio->bio_completed = 0;

	KTR_LOG(dsched_bfq_thread_created, bfq_tdio);
}
/*
 * bfq_helper_destroy_tdio(): called after a thread_io struct is destroyed.
 * If the scheduler is anticipatorily waiting on a destroyed tdio, this
 * function resumes the scheduler.
 *
 * lock:
 *	BFQ_LOCK: protect the nullification of bfq_diskctx->bfq_blockon
 *	and bfq_diskctx->bfq_active_tdio.
 *
 * Calling path: bfq_destroy_tdio --lwkt_msg--> helper_thread --call--> me
 */
void
bfq_helper_destroy_tdio(struct dsched_thread_io *tdio, struct bfq_disk_ctx *bfq_diskctx)
{
	KKASSERT(bfq_diskctx);

	BFQ_LOCK(bfq_diskctx);

	/*
	 * Test whether the scheduler is pending on the tdio that is
	 * being destroyed.
	 */
	if (((struct dsched_thread_io *)bfq_diskctx->bfq_blockon == tdio) &&
	    callout_pending(&bfq_diskctx->bfq_callout)) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: pending on a thread being destroyed!\n");

		callout_stop(&bfq_diskctx->bfq_callout);

		bfq_diskctx->bfq_blockon = NULL;
		bfq_diskctx->bfq_active_tdio = NULL;

		BFQ_UNLOCK(bfq_diskctx);

		helper_msg_dequeue(bfq_diskctx);
		return;
	}

	BFQ_UNLOCK(bfq_diskctx);
}
/*
 * bfq_destroy_tdio(): .destroy_tdio callback of the bfq policy
 *
 * Called immediately after a dsched_thread_io struct's refcount
 * decreases to zero. This function records the seek_avg and ttime_avg
 * of the destroyed thread with the KTR facility.
 *
 * refcount:
 *	the tdio's refcount should be zero. It may be nuked, and
 *	any read/write to the tdio is not safe by then.
 */
static void
bfq_destroy_tdio(struct dsched_thread_io *tdio)
{
	struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;

	/*
	 * do not log threads without I/O
	 */
	if (bfq_tdio->seek_samples != 0 || bfq_tdio->ttime_samples != 0) {
		KTR_LOG(dsched_bfq_thread_seek_avg, bfq_tdio, bfq_tdio->seek_avg);
		KTR_LOG(dsched_bfq_thread_ttime_avg, bfq_tdio, bfq_tdio->ttime_avg);
	}

	helper_msg_destroy_tdio((struct bfq_disk_ctx *)tdio->diskctx, tdio);
}
/*
 * bfq_bio_done(): .bio_done callback of the bfq policy
 *
 * Called after a bio is done (by request_polling_biodone() of dsched).
 * This function judges whether a thread has used up its time slice,
 * and if so it sets the maybe_timeout flag in the bfq_tdio structure.
 * Any further action of that thread or of the bfq scheduler will cause
 * the thread to be expired (in bfq_queue() or in bfq_dequeue()).
 *
 * This function requires the bfq_tdio pointer of the thread that pushed
 * bp to have been stored with dsched_set_bio_priv() earlier. Currently
 * it is stored when bfq_queue() is called.
 *
 * lock: none. This function CANNOT be blocked by any lock.
 *
 * refcount:
 *	the corresponding tdio's refcount decreases by 1 after
 *	this function call. The counterpart increase is in bfq_queue().
 *	For each bio pushed down, we increase the refcount of the
 *	pushing tdio.
 */
static void
bfq_bio_done(struct bio *bp)
{
	struct disk *dp = dsched_get_bio_dp(bp);
	struct bfq_thread_io *bfq_tdio = dsched_get_bio_priv(bp);
	struct bfq_disk_ctx *bfq_diskctx = dsched_get_disk_priv(dp);
	struct timeval tv;
	int ticks_expired;

	dsched_thread_io_ref(&bfq_tdio->head);

	atomic_add_int(&bfq_tdio->bio_completed, 1);

	/* the tdio has already expired */
	if (bfq_tdio != bfq_diskctx->bfq_active_tdio)
		goto rtn;

	atomic_add_int(&bfq_tdio->service_received, BIO_SIZE(bp));

	getmicrotime(&tv);
	bfq_tdio->last_request_done_time = tv;
	timevalsub(&tv, &bfq_tdio->service_start_time);
	ticks_expired = tvtohz_high(&tv);

	/* the thread has run out of its time slice */
	if ((ticks_expired != 0x7fffffff) &&
	    (ticks_expired >= BFQ_SLICE_TIMEOUT)) {
		/*
		 * we cannot block here, so just set a flag
		 */
#if 0
		bfq_tdio->maybe_timeout = 1;
#endif
		if (atomic_cmpset_int(&bfq_tdio->maybe_timeout, 0, 1)) {
			bfq_update_avg_time_slice(bfq_diskctx, tv);
			dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p may time out\n", bfq_tdio);
		}
	}

rtn:
	dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in this function */
	dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in queue() */
}
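/*
 * Worked example for the time slice check above (my numbers, assuming
 * hz = 100 and BFQ_SLICE_TIMEOUT = 20; the real constants live in the
 * bfq headers): one tick is 10ms, so a tdio whose service_start_time
 * lies more than 200ms in the past yields ticks_expired >= 20 and is
 * flagged maybe_timeout. The comparison against 0x7fffffff skips
 * intervals too large for the timeval-to-ticks conversion.
 */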
/*
 * bfq_timeout(): called when the callout alarm fires.
 *
 * Getting called means that after waiting for BFQ_T_WAIT or
 * BFQ_T_WAIT_MIN ticks, the thread that "active_tdio" represents
 * has not pushed any further bios. This tdio should be expired
 * with the reason BFQ_REASON_TOO_IDLE, but if the tdio was already
 * marked as timed out (in bfq_bio_done()), we expire it with
 * BFQ_REASON_TIMEOUT instead. The bfq scheduler then resumes working
 * (and picks another thread to serve).
 *
 * It is possible that this function gets called a little after
 * the thread pushed a bio with bfq_queue(), and thus a "fake timeout"
 * happens. We treat it as if the callout had not fired, and continue
 * to serve the active_tdio.
 *
 * lock:
 *	BFQ_LOCK: protect bfq_diskctx->blockon and bfq_diskctx->active_tdio;
 *	they are only changed in bfq_queue() or in this function.
 *	TDIO_LOCK: protect from bfq_dequeue() updating the budget in the
 *	maybe_timeout branch. (Not strictly necessary, because we already
 *	hold the BFQ_LOCK, and no one else can change the budget of the
 *	tdio.)
 *
 * refcount:
 *	the refcount of bfq_diskctx->bfq_active_tdio decreases by one
 *	after this function. (The counterpart increase is in bfq_dequeue(),
 *	before resetting the callout alarm.)
 *
 * AS timeout: during the waiting period, no bio was pushed by the tdio
 * being waited on. This means there is no deceptive idleness, so
 * unblock dispatching.
 *
 * Calling path:
 *	callout facility --> helper_msg_timeout --lwkt_msg--> helper thread
 *	--> me
 */
void
bfq_timeout(void *p)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)p;
	struct bfq_thread_io *bfq_tdio;

	BFQ_LOCK(bfq_diskctx);

	/*
	 * A fake timeout: the timeout occurred after the thread
	 * pushed one more bio.
	 */
	if (bfq_diskctx->bfq_blockon == NULL) {
		dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: fake AS timeout\n");
		BFQ_UNLOCK(bfq_diskctx);
		return;
	}

	bfq_diskctx->bfq_as_miss++;

	KKASSERT(bfq_diskctx->bfq_active_tdio);
	bfq_tdio = bfq_diskctx->bfq_active_tdio;

	DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);

	bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL|BFQ_AS_STAT_ONLY_MISS);

	bfq_diskctx->bfq_blockon = NULL;
	bfq_diskctx->bfq_active_tdio = NULL;
	dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: unblocked %p\n", bfq_tdio);

	wf2q_update_vd(bfq_tdio, bfq_tdio->budget - bfq_diskctx->bfq_remaining_budget);

	/*
	 * The time slice expired before the AS timeout;
	 * the reason should then be REASON_TIMEOUT.
	 */
	if (bfq_tdio->maybe_timeout) {
		bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TIMEOUT);
		dsched_debug(BFQ_DEBUG_VERBOSE, "%p time out in timeout()\n", bfq_tdio);
	} else {
		bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TOO_IDLE);
		dsched_debug(BFQ_DEBUG_VERBOSE, "%p too idle\n", bfq_tdio);
	}

	DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);

	/* ref'ed in dequeue(), before resetting callout */
	dsched_thread_io_unref(&bfq_tdio->head);

	BFQ_UNLOCK(bfq_diskctx);
	helper_msg_dequeue(bfq_diskctx);
}
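/*
 * Timeline sketch of the anticipation (AS) logic (illustrative, with an
 * assumed wait of ~50ms for BFQ_T_WAIT; the real value lives in the bfq
 * headers):
 *
 *	t = 0     the active tdio's queue drains; bfq_dequeue() arms the
 *	          callout and blocks on the tdio (bfq_blockon = tdio)
 *	t = 5ms   the tdio pushes another bio: bfq_queue() stops the
 *	          callout and counts an AS hit; service continues
 *	   -or-
 *	t = 50ms  the callout fires: bfq_timeout() counts an AS miss and
 *	          expires the tdio with BFQ_REASON_TOO_IDLE (or
 *	          BFQ_REASON_TIMEOUT if its time slice had already run out)
 */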
/*
 * bfq_queue(): .queue callback of the bfq policy.
 *
 * A thread calls this function to hand in its I/O requests (bios).
 * The bios are stored in the per-thread queue, in the tdio structure.
 * Currently the sync/async bios are queued together, which may cause
 * some performance issues.
 *
 * Besides queueing bios, this function also calculates the average
 * thinking time and average seek distance of a thread, using the
 * information in the bio structure.
 *
 * If the calling thread is being waited on by the scheduler due to
 * the AS feature, this function cancels the callout alarm
 * and resumes the scheduler to continue serving this thread.
 *
 * lock:
 *	THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue()
 *	BFQ_LOCK: protect from other insertions/deletions in wf2q_augtree
 *	in bfq_queue() or bfq_dequeue().
 *
 * refcount:
 *	If the calling thread is being waited on by the scheduler, the
 *	refcount of the related tdio decreases by 1 after this function.
 *	The counterpart increase is in bfq_dequeue(), before resetting
 *	the callout alarm.
 *
 * return value:
 *	EINVAL: if bio->bio_buf->b_cmd == BUF_CMD_FLUSH
 *	0: bio is queued successfully.
 */
static int
bfq_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
    struct bio *bio)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
	struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
	int original_qlength;

	/* we do not handle flush requests; push them down to dsched */
	if (__predict_false(bio->bio_buf->b_cmd == BUF_CMD_FLUSH))
		return (EINVAL);

	DSCHED_THREAD_IO_LOCK(tdio);
	KKASSERT(tdio->debug_priv == 0xF00FF00F);
	dsched_debug(BFQ_DEBUG_NORMAL, "bfq: tdio %p pushes bio %p\n", bfq_tdio, bio);

	dsched_set_bio_priv(bio, tdio);
	dsched_thread_io_ref(tdio);

	if ((bio->bio_buf->b_cmd == BUF_CMD_READ) ||
	    (bio->bio_buf->b_cmd == BUF_CMD_WRITE)) {
		bfq_update_tdio_seek_avg(bfq_tdio, bio);
	}

	bfq_update_tdio_ttime_avg(bfq_tdio);

	/* update last_bio_pushed_time */
	getmicrotime(&bfq_tdio->last_bio_pushed_time);

	if ((bfq_tdio->seek_samples > BFQ_VALID_MIN_SAMPLES) &&
	    BFQ_TDIO_SEEKY(bfq_tdio))
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p is seeky\n", bfq_tdio);

	/*
	 * If a tdio takes too long to think, we disable its AS feature.
	 */
	if ((bfq_tdio->ttime_samples > BFQ_VALID_MIN_SAMPLES) &&
	    (bfq_tdio->ttime_avg > BFQ_T_WAIT * (1000 / hz) * 1000) &&
	    (bfq_tdio->service_received > bfq_tdio->budget / 8)) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p takes too long to think\n", bfq_tdio);
		bfq_tdio->tdio_as_switch = 0;
	} else {
		bfq_tdio->tdio_as_switch = 1;
	}

	/* insert the bio into the tdio's own queue */
	KKASSERT(lockstatus(&tdio->lock, curthread) == LK_EXCLUSIVE);
	TAILQ_INSERT_TAIL(&tdio->queue, bio, link);

	original_qlength = atomic_fetchadd_int(&tdio->qlength, 1);
	DSCHED_THREAD_IO_UNLOCK(tdio);
	/*
	 * In the dequeue function, we remove the thread
	 * from the aug-tree if it has no further bios.
	 * Therefore "new" means a really new thread (a
	 * newly created thread or a thread that pushed no more
	 * bios when the scheduler was waiting for it) or
	 * one that was removed from the aug-tree earlier.
	 */
	if (original_qlength == 0) {
		/*
		 * a really new thread
		 */
		BFQ_LOCK(bfq_diskctx);
		if (bfq_tdio != bfq_diskctx->bfq_active_tdio) {
			/* insert the tdio into the wf2q queue */
			wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, bfq_tdio);
		} else {
			/*
			 * the thread the scheduler is waiting for
			 */
			if (bfq_diskctx->bfq_blockon == bfq_tdio) {
				/*
				 * XXX: possible race condition here:
				 * if the callout function is triggered when
				 * the following code is executed, then after
				 * releasing the TDIO lock, the callout function
				 * will set the thread inactive and it will never
				 * be inserted into the aug-tree (so its bio pushed
				 * this time will not be dispatched) until it pushes
				 * further bios.
				 */
				bfq_diskctx->bfq_as_hit++;
				bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL);

				if (callout_pending(&bfq_diskctx->bfq_callout))
					callout_stop(&bfq_diskctx->bfq_callout);
				bfq_diskctx->bfq_blockon = NULL;

				/* ref'ed in dequeue(), before resetting callout */
				dsched_thread_io_unref(&bfq_tdio->head);

				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p pushes a new bio when AS\n", bfq_tdio);
			}
		}
		BFQ_UNLOCK(bfq_diskctx);
	}

	helper_msg_dequeue(bfq_diskctx);

	return 0;
}
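/*
 * Illustrative sketch (not part of this file) of the WF2Q+ accounting
 * that wf2q_update_vd() is assumed to perform: a tdio's virtual
 * deadline advances in proportion to the service it consumed divided
 * by its weight, which is what makes budget allocation fair across
 * tdios of different weights. The struct and field names below are
 * hypothetical.
 */
#if 0
struct toy_tdio {
	uint64_t	vd;	/* virtual deadline in the aug-tree */
	int		weight;	/* share of the disk */
};

/* charge `served` bytes of service: higher weight => slower vd growth */
static void
toy_update_vd(struct toy_tdio *tdio, int served)
{
	tdio->vd += (uint64_t)served / tdio->weight;
}
#endif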
/*
 * bfq_dequeue(): dispatch bios to the disk driver.
 *
 * This function will push as many bios as the number of free slots
 * in the tag queue allows.
 *
 * During dispatching, the following events may happen:
 * - Current thread times out: Expire the current thread for
 *   BFQ_REASON_TIMEOUT, and select a new thread to serve from the
 *   aug-tree.
 * - Current thread runs out of its budget: Expire the current thread
 *   for BFQ_REASON_OUT_OF_BUDGET, and select a new thread to serve.
 * - Current thread has no further bios in its queue: if the AS feature
 *   is turned on, the bfq scheduler sets an alarm and starts to suspend.
 *   The bfq_timeout() or bfq_queue() calls may resume the scheduler.
 *
 * Implementation note: The bios selected for dispatch are first
 * stored in the array bio_to_dispatch. After this function releases
 * all the locks it holds, it calls dsched_strategy_request_polling()
 * for each bio stored.
 *
 * With the help of bfq_disk_ctx->pending_dequeue,
 * there will be only one bfq_dequeue() pending on the BFQ_LOCK.
 *
 * lock:
 *	BFQ_LOCK: protect from wf2q_augtree operations in bfq_queue()
 *	THREAD_IO_LOCK: locks the active_tdio. Protect from queue
 *	insertions in bfq_queue(); protect the active_tdio->budget.
 *
 * refcount:
 *	If the scheduler decides to suspend, the refcount of active_tdio
 *	increases by 1. The counterpart decrease is in bfq_queue() and
 *	bfq_timeout().
 *
 * blocking: may block on the disk driver lock; it depends on the driver.
 *
 * Calling path:
 *	The callers could be:
 *	bfq_queue(), bfq_timeout() and the registered polling function.
 *
 *	caller --> helper_msg_dequeue --lwkt_msg--> helper_thread --> me
 */
void
bfq_dequeue(struct dsched_disk_ctx *diskctx)
{
	int free_slots, bio_index = 0, i,
	    remaining_budget = 0;	/* remaining budget of the current active tdio */

	struct bio *bio, *bio_to_dispatch[33];
	struct bfq_thread_io *active_tdio = NULL;
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;

	BFQ_LOCK(bfq_diskctx);
	atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 1, 0);

	/*
	 * The whole scheduler is waiting for further bios
	 * from the process currently being served
	 */
	if (bfq_diskctx->bfq_blockon != NULL)
		goto rtn;

	remaining_budget = bfq_diskctx->bfq_remaining_budget;
	active_tdio = bfq_diskctx->bfq_active_tdio;
	dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: dequeue: Im in. active_tdio = %p\n", active_tdio);

	free_slots = diskctx->max_tag_queue_depth - diskctx->current_tag_queue_depth;
	KKASSERT(free_slots >= 0 && free_slots <= 32);

	if (active_tdio)
		DSCHED_THREAD_IO_LOCK(&active_tdio->head);
	while (free_slots) {
		/* Here active_tdio must be locked! */
		if (active_tdio) {
			/*
			 * the bio_done function has marked the current
			 * tdio as maybe_timeout
			 */
			if (active_tdio->maybe_timeout) {
				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p time out in dequeue()\n", active_tdio);
				wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
				bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_TIMEOUT);

				/*
				 * There still exist bios not dispatched;
				 * reinsert the tdio into the aug-tree
				 */
				if (active_tdio->head.qlength > 0) {
					wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
					KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count);
				}

				active_tdio->maybe_timeout = 0;
				DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
				active_tdio = NULL;
				continue;
			}
			/* select the next bio to dispatch */
			/* TODO: a wiser selection */
			KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
			bio = TAILQ_FIRST(&active_tdio->head.queue);
			dsched_debug(BFQ_DEBUG_NORMAL, "bfq: the first bio in queue of active_tdio %p is %p\n", active_tdio, bio);

			dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p exists, remaining budget = %d, tdio budget = %d, qlength = %d, first bio = %p, first bio cmd = %d, first bio size = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength, bio, bio ? bio->bio_buf->b_cmd : -1, bio ? bio->bio_buf->b_bcount : -1);

			/*
			 * The bio is not a read or write; just
			 * dispatch it directly, without charging any budget
			 */
			if (bio && (bio->bio_buf->b_cmd != BUF_CMD_READ) &&
			    (bio->bio_buf->b_cmd != BUF_CMD_WRITE)) {
				dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
				KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
				TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
				active_tdio->head.qlength--;
				free_slots--;
#if 0
				dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
#endif
				bio_to_dispatch[bio_index++] = bio;
				KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
				continue;
			}
			/*
			 * The remaining budget cannot cover the next bio.
			 * But this is not because the size of the bio is
			 * larger than the complete budget.
			 * If the size of the bio is larger than the complete
			 * budget, then use a complete budget to cover it.
			 */
			if (bio && (remaining_budget < BIO_SIZE(bio)) &&
			    (remaining_budget != active_tdio->budget)) {
				/* charge the budget used */
				wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
				bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
				wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: thread %p ran out of budget\n", active_tdio);
				DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
				active_tdio = NULL;
			} else { /* if (bio && remaining_budget < BIO_SIZE(bio) && remaining_budget != active_tdio->budget) */
				/*
				 * Having enough budget,
				 * or having a complete budget and the size
				 * of the bio is larger than that.
				 */
				if (bio) {
					remaining_budget -= BIO_SIZE(bio);
					/*
					 * The size of the first bio is larger
					 * than the whole budget; we should
					 * charge the extra part
					 */
					if (remaining_budget < 0)
						wf2q_update_vd(active_tdio, -remaining_budget);
					/*
					 * remaining_budget may be < 0,
					 * but to prevent the budget of the
					 * current tdio from having a negative
					 * number subtracted, the
					 * remaining_budget has to be >= 0
					 */
					remaining_budget = MAX(0, remaining_budget);
					dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
					KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
					TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
					free_slots--;
					active_tdio->head.qlength--;
					active_tdio->bio_dispatched++;
					wf2q_inc_tot_service(&bfq_diskctx->bfq_wf2q, BIO_SIZE(bio));
					dsched_debug(BFQ_DEBUG_VERBOSE,
					    "BFQ: %p's bio dispatched, size=%d, remaining_budget = %d\n",
					    active_tdio, BIO_SIZE(bio), remaining_budget);
#if 0
					dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
#endif
					bio_to_dispatch[bio_index++] = bio;
					KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
				} else { /* if (bio) */
					KKASSERT(active_tdio);
					/*
					 * If the AS feature is switched off,
					 * expire the tdio as well
					 */
					if ((remaining_budget <= 0) ||
					    !(bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ||
					    !active_tdio->tdio_as_switch) {
						active_tdio->budget -= remaining_budget;
						wf2q_update_vd(active_tdio, active_tdio->budget);
						bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
						DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
						active_tdio = NULL;
					} else {
						/* no further bios; wait for a while */
						bfq_diskctx->bfq_blockon = active_tdio;

						/*
						 * Increase the ref count to ensure that
						 * the tdio will not be destroyed during waiting.
						 */
						dsched_thread_io_ref(&active_tdio->head);

						/*
						 * If the tdio is seeky but not thinking for
						 * too long, we wait for it a little shorter
						 */
						if (active_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES && BFQ_TDIO_SEEKY(active_tdio))
							callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT_MIN, (void (*)(void *))helper_msg_as_timeout, bfq_diskctx);
						else
							callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT, (void (*)(void *))helper_msg_as_timeout, bfq_diskctx);

						/* save the start time of blocking */
						getmicrotime(&active_tdio->as_start_time);

						dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: blocked on %p, remaining_budget = %d\n", active_tdio, remaining_budget);
						DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
						goto save;
					}
				}
			}
		} else { /* if (active_tdio) */
			/* there is no active tdio */

			/* no pending bios at all */
			active_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q);

			if (active_tdio == NULL) {
				KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count == 0);
				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: no more eligible tdio!\n");
				break;
			}

			/*
			 * A new tdio is picked;
			 * initialize the service related statistic data
			 */
			DSCHED_THREAD_IO_LOCK(&active_tdio->head);
			active_tdio->service_received = 0;

			/*
			 * Reset the maybe_timeout flag, which
			 * may have been set by a biodone after the service
			 * was done
			 */
			getmicrotime(&active_tdio->service_start_time);
			active_tdio->maybe_timeout = 0;

			remaining_budget = active_tdio->budget;
			dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p selected, remaining budget = %d, tdio budget = %d, qlength = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength);
		}
	} /* while (free_slots) */
	/* reached when free_slots == 0 or when nothing is left to serve */
	if (active_tdio) /* && lockcount(&active_tdio->head.lock) > 0 */
		DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);

save:
	/* save the remaining budget */
	bfq_diskctx->bfq_remaining_budget = remaining_budget;
	bfq_diskctx->bfq_active_tdio = active_tdio;

rtn:
	BFQ_UNLOCK(bfq_diskctx);

	/* dispatch the planned bios */
	for (i = 0; i < bio_index; i++)
		dsched_strategy_request_polling(diskctx->dp, bio_to_dispatch[i], diskctx);
}
/*
 * bfq_slow_tdio(): decide whether a tdio is slow
 *
 * This function decides whether a tdio is slow by the speed
 * estimated from the current time slice start time: if the
 * tdio is not fast enough to consume its budget (or 2/3 of
 * its budget) within the time slice, it is judged slow.
 *
 * Called by bfq_expire()
 *
 * lock:
 *	THREAD_IO_LOCK is expected to be held.
 *
 * Returns 1 if the tdio is judged slow, 0 otherwise.
 */
static int
bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
{
	/*
	 * A tdio is considered slow if it cannot finish its budget
	 * at its current average speed
	 */
	uint64_t usec_elapsed, service_received, speed;
	int expect;
	struct timeval tv = bfq_tdio->last_request_done_time;

	timevalsub(&tv, &bfq_tdio->service_start_time);
	usec_elapsed = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);

	/* discard absurd values */
	if (usec_elapsed < 20000)
		return 0;

	service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
	speed = service_received / usec_elapsed;
	expect = (speed * BFQ_SLICE_TIMEOUT * (1000 * 1000 / hz)) >> BFQ_FIXPOINT_SHIFT;

	if (expect < 0) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: overflow on calculating slow_tdio\n");
		return 0;
	}

	if (expect < bfq_tdio->budget * 2 / 3) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: %p is judged slow\n", bfq_tdio);
		return 1;
	}

	return 0;
}
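/*
 * Worked example for the fixed-point estimate above (my numbers,
 * assuming hz = 100, BFQ_FIXPOINT_SHIFT = 10 and BFQ_SLICE_TIMEOUT = 20;
 * the real constants live in the bfq headers). A tdio that received
 * 64KB of service over 100ms:
 *
 *	usec_elapsed     = 100000
 *	service_received = 65536 << 10       = 67108864
 *	speed            = 67108864 / 100000 = 671	(bytes/usec, << 10)
 *	expect           = (671 * 20 * 10000) >> 10 = 131054	(~128KB)
 *
 * The tdio is then judged slow only when 131054 < budget * 2 / 3,
 * i.e. when its budget exceeds roughly 192KB.
 */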
/*
 * bfq_expire(): expire a tdio for a given reason.
 *
 * A different amount of new budget is assigned to the expired tdio
 * depending on the reason:
 *
 * BFQ_REASON_TIMEOUT:
 *	The tdio did not use up its budget within BFQ_SLICE_TIMEOUT ticks.
 *	We update the disk peak rate if the tdio is not seeky. The new
 *	budget is the budget it actually consumed during this time
 *	slice.
 *
 * BFQ_REASON_TOO_IDLE:
 *	The tdio pushed no further bios while the scheduler was
 *	suspending. To ensure low global latency, this tdio should be
 *	punished by assigning it the minimum budget. But if the tdio
 *	pushed no bio only because it was waiting for its dispatched
 *	bios to be done, we keep the budget unchanged.
 *
 * BFQ_REASON_OUT_OF_BUDGET:
 *	The tdio ran out of its budget within the time slice. This
 *	usually indicates that the tdio is doing well. We increase its
 *	budget.
 *
 * lock:
 *	THREAD_IO_LOCK is expected to be held.
 *	BFQ_LOCK is expected to be held (needed by bfq_update_peak_rate()).
 *
 * Callers: bfq_timeout(), bfq_dequeue()
 */
static void
bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason)
{
	int max_budget = bfq_diskctx->bfq_max_budget,
	    budget_left, service_received, bio_in_flight;

	service_received = bfq_tdio->service_received;
	budget_left = bfq_tdio->budget - bfq_tdio->service_received;

	if (budget_left < 0) {
		dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: budget underflow: %d, %d\n", bfq_tdio->budget, bfq_tdio->service_received);
		budget_left = 0;
	}

	KKASSERT(budget_left >= 0);

	switch (reason) {
	case BFQ_REASON_TIMEOUT:
		/*
		 * the tdio is not seeky, so we can update
		 * the disk peak rate based on the service received
		 */
		if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
		    (!BFQ_TDIO_SEEKY(bfq_tdio)))
			bfq_update_peak_rate(bfq_diskctx, bfq_tdio);

		/* max_budget may have been updated */
		max_budget = bfq_diskctx->bfq_max_budget;

		/* update the budget to the service received */
		bfq_tdio->budget = MAX(service_received, BFQ_DEFAULT_MIN_BUDGET);
		break;

	case BFQ_REASON_TOO_IDLE:
		/*
		 * if the tdio is too slow, charge the full budget
		 */
		if (bfq_slow_tdio(bfq_diskctx, bfq_tdio))
			wf2q_update_vd(bfq_tdio, budget_left);

		bio_in_flight = bfq_tdio->bio_dispatched - bfq_tdio->bio_completed;
		KKASSERT(bio_in_flight >= 0);
		/*
		 * Maybe the tdio pushed no bio
		 * because it was waiting for some dispatched bios
		 * to be done; in this case
		 * we do not reduce the budget too harshly
		 */
		if (bio_in_flight > 0) {
			bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, service_received);
		} else {
#if 0
			bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, bfq_diskctx->bfq_max_budget / BFQ_MIN_BUDGET_FACTOR);
#endif
			bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
		}
		break;

	case BFQ_REASON_OUT_OF_BUDGET:
		if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
		    (!BFQ_TDIO_SEEKY(bfq_tdio)))
			bfq_update_peak_rate(bfq_diskctx, bfq_tdio);

		/* increase the budget */
		if (bfq_tdio->budget < BFQ_BUDGET_MULTIPLE_THRESHOLD)
			bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget * 2);
		else
			bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget + BFQ_BUDG_INC_STEP);
		break;

	default:
		break;
	}
}
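/*
 * Illustrative budget trajectory for BFQ_REASON_OUT_OF_BUDGET (my
 * numbers, assuming BFQ_BUDGET_MULTIPLE_THRESHOLD = 128KB,
 * BFQ_BUDG_INC_STEP = 128KB and max_budget = 512KB; the real constants
 * live in the bfq headers): a tdio that keeps using up its budget grows
 * 32K -> 64K -> 128K -> 256K -> 384K -> 512K, i.e. exponentially below
 * the threshold, linearly above it, and capped at max_budget.
 */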
/*
 * bfq_update_peak_rate(): update the peak disk speed by sampling
 * the throughput within a time slice.
 *
 * lock:
 *	BFQ_LOCK is expected to be held.
 *
 * Caller: bfq_expire()
 */
static void
bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
{
	struct timeval tv = bfq_tdio->last_request_done_time;
	uint64_t usec, service_received, peak_rate;

	timevalsub(&tv, &bfq_tdio->service_start_time);
	usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);

	/* discard absurd values */
	if (usec < 2000 || usec > (BFQ_SLICE_TIMEOUT * (1000 / hz) * 1000)) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: absurd interval for peak rate\n");
		return;
	}

	service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
	peak_rate = service_received / usec;
	bfq_diskctx->bfq_peak_rate = (peak_rate + 7 * bfq_diskctx->bfq_peak_rate) / 8;
	bfq_diskctx->bfq_peak_rate_samples++;

	/* update the max_budget according to the peak rate */
	if (bfq_diskctx->bfq_peak_rate_samples > BFQ_VALID_MIN_SAMPLES) {
		bfq_diskctx->bfq_peak_rate_samples = BFQ_VALID_MIN_SAMPLES;
		/*
		 * If the auto max budget adjustment is disabled,
		 * bfq_max_budget will always be BFQ_DEFAULT_MAX_BUDGET
		 */
		if (bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) {
			bfq_diskctx->bfq_max_budget =
			    (uint32_t)((BFQ_SLICE_TIMEOUT * (1000 / hz) * bfq_diskctx->bfq_peak_rate * 1000) >> BFQ_FIXPOINT_SHIFT);
			dsched_debug(BFQ_DEBUG_NORMAL, "max budget updated to %d\n", bfq_diskctx->bfq_max_budget);
		}
	}
}
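/*
 * Worked example for the peak rate bookkeeping above (my numbers,
 * assuming hz = 100, BFQ_SLICE_TIMEOUT = 20 and BFQ_FIXPOINT_SHIFT = 10;
 * the real constants live in the bfq headers): a disk sustaining
 * 50 bytes/usec (~50MB/s) has a fixed-point peak_rate of about
 * 50 << 10 = 51200; with a 200ms slice the auto-tuned budget becomes
 *
 *	max_budget = (20 * 10 * 51200 * 1000) >> 10 = 10000000
 *
 * i.e. roughly 10MB of service per time slice. Each sample moves the
 * stored rate by 1/8 of its deviation: new = (sample + 7 * old) / 8.
 */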
/*
 * bfq_update_tdio_seek_avg(): update the average seek distance of a
 * tdio.
 *
 * lock:
 *	THREAD_IO_LOCK is expected to be held.
 *
 * Caller: bfq_queue()
 */
static void
bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp)
{
	off_t seek;

	/*
	 * On the first bio it dispatches,
	 * we do not calculate the seek_avg;
	 * just update last_seek_end
	 */
	if (bfq_tdio->seek_samples == 0) {
		++bfq_tdio->seek_samples;
		goto rtn;
	}

	seek = ABS(bp->bio_offset - bfq_tdio->last_seek_end);

	/*
	 * Note: we do not simply do seek_samples++ here,
	 * because seek_total could overflow if we did seek_total += seek;
	 * instead, both the sample count and the total decay with a
	 * 7/8 weight.
	 */
	bfq_tdio->seek_samples = (7 * bfq_tdio->seek_samples + 256) / 8;
	bfq_tdio->seek_total = (7 * bfq_tdio->seek_total + 256 * seek) / 8;
	bfq_tdio->seek_avg = (bfq_tdio->seek_total + bfq_tdio->seek_samples / 2) / bfq_tdio->seek_samples;

	dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: tdio %p seek_avg updated to %" PRIu64 "\n", bfq_tdio, bfq_tdio->seek_avg);

rtn:
	bfq_tdio->last_seek_end = bp->bio_offset + BIO_SIZE(bp);
}
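/*
 * Standalone sketch of the decayed averaging used above (mine, not part
 * of the driver); it can be built in userland to observe the behavior.
 * The fixed point of s = (7*s + 256) / 8 is s = 256, so seek_samples
 * converges to 256, and each new distance then shifts seek_avg by about
 * 1/8 of its deviation while older samples decay geometrically.
 */
#if 0
#include <stdint.h>

static void
decayed_avg_step(uint64_t *samples, uint64_t *total, uint64_t *avg,
    uint64_t seek)
{
	*samples = (7 * *samples + 256) / 8;
	*total = (7 * *total + 256 * seek) / 8;
	*avg = (*total + *samples / 2) / *samples;	/* rounded division */
}
#endif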
/*
 * bfq_update_tdio_ttime_avg(): update the average thinking time
 * of a tdio.
 *
 * The thinking time is used to switch the tdio's AS feature on / off.
 *
 * lock:
 *	THREAD_IO_LOCK is expected to be held.
 *
 * Caller: bfq_queue()
 */
static void
bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio)
{
	struct timeval tv, after_start;
	uint64_t usec;

	if (bfq_tdio->ttime_samples == 0) {
		++bfq_tdio->ttime_samples;
		return;
	}

	getmicrotime(&tv);
	after_start = bfq_tdio->last_request_done_time;

#if 0
	timevalsub(&tv, &bfq_tdio->last_request_done_time);
#endif

	/*
	 * Use the interval between two bios being pushed,
	 * instead of the one between last_request_done_time and
	 * the current time.
	 */
	timevalsub(&tv, &bfq_tdio->last_bio_pushed_time);

	timevalsub(&after_start, &bfq_tdio->service_start_time);

	/*
	 * tv.tv_sec < 0 means the last request done time is
	 * after the current time;
	 * this may happen because the biodone function is not blocked.
	 *
	 * after_start.tv_sec < 0 means that the last bio was done
	 * before the current service slice, and we should drop that
	 * value.
	 */
	if (tv.tv_sec < 0 || after_start.tv_sec < 0)
		return;

	usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);

	bfq_tdio->ttime_samples = (7 * bfq_tdio->ttime_samples + 256) / 8;
	bfq_tdio->ttime_total = (7 * bfq_tdio->ttime_total + 256 * usec) / 8;
	bfq_tdio->ttime_avg = (bfq_tdio->ttime_total + 128) / bfq_tdio->ttime_samples;
}
/*
 * bfq_update_avg_time_slice(): update the average time slice length
 * of the disk.
 *
 * This function also updates the bfq_max_time_slice field.
 *
 * tv: the timeval structure representing the length of the time slice
 */
static void
bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv)
{
	uint64_t msec;

	/* approximate the division by 1000 with a right shift by 10 */
	msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10);

	if (msec > 3 * BFQ_SLICE_TIMEOUT * (1000 / hz))
		atomic_add_int(&bfq_diskctx->bfq_high_time_slice_count, 1);

	bfq_diskctx->bfq_avg_time_slice =
	    (7 * bfq_diskctx->bfq_avg_time_slice + msec) / 8;

	if (bfq_diskctx->bfq_max_time_slice < msec)
		bfq_diskctx->bfq_max_time_slice = msec;
}
/*
 * bfq_update_as_avg_wait(): update the average AS waiting time
 * of the disk.
 *
 * This function also updates the bfq_as_max_wait field.
 *
 * flag: BFQ_AS_STAT_ALL, BFQ_AS_STAT_ONLY_MISS
 */
static void
bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag)
{
	struct timeval tv;
	uint64_t msec;

	getmicrotime(&tv);
	timevalsub(&tv, &bfq_tdio->as_start_time);

	/* approximate the division by 1000 with a right shift by 10 */
	msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10);
	/* discard a ridiculous value */
	if (msec > 10000) {
		dsched_debug(BFQ_DEBUG_NORMAL, "bfq: ridiculous as wait time!\n");
		return;
	}

	if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz))
		atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count, 1);
	if (flag & BFQ_AS_STAT_ALL) {
		bfq_diskctx->bfq_as_avg_wait_all =
		    (7 * bfq_diskctx->bfq_as_avg_wait_all + msec) / 8;
	}

	if (flag & BFQ_AS_STAT_ONLY_MISS) {
		bfq_diskctx->bfq_as_avg_wait_miss =
		    (7 * bfq_diskctx->bfq_as_avg_wait_miss + msec) / 8;
	}

	/* update the maximum waiting time */
	if (bfq_diskctx->bfq_as_max_wait < msec)
		bfq_diskctx->bfq_as_max_wait = msec;
}
static int
bfq_mod_handler(module_t mod, int type, void *unused)
{
	static struct sysctl_ctx_list sysctl_ctx;
	static struct sysctl_oid *oid;
	static char version[16];
	int error;

	ksnprintf(version, sizeof(version), "%d.%d",
	    dsched_bfq_version_maj, dsched_bfq_version_min);

	switch (type) {
	case MOD_LOAD:
		bzero(&bfq_stats, sizeof(struct dsched_bfq_stats));
		if ((error = dsched_register(&dsched_bfq_policy)))
			return (error);

		sysctl_ctx_init(&sysctl_ctx);
		oid = SYSCTL_ADD_NODE(&sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_dsched),
		    OID_AUTO, "bfq", CTLFLAG_RD, 0, "");

		SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
		    OID_AUTO, "version", CTLFLAG_RD, version, 0, "bfq version");

		helper_init_global();

		kprintf("BFQ scheduler policy version %d.%d loaded. sizeof(bfq_thread_io) = %zu\n",
		    dsched_bfq_version_maj, dsched_bfq_version_min, sizeof(struct bfq_thread_io));
		break;

	case MOD_UNLOAD:
		if ((error = dsched_unregister(&dsched_bfq_policy)))
			return (error);
		sysctl_ctx_free(&sysctl_ctx);
		kprintf("BFQ scheduler policy unloaded\n");
		break;

	default:
		break;
	}

	return 0;
}
static int
bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS)
{
	struct bfq_disk_ctx *bfq_diskctx = arg1;
	int as_switch, error;

	as_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ? 1 : 0);
	error = sysctl_handle_int(oidp, &as_switch, 0, req);
	if (error || !req->newptr)
		return (error);

	if (as_switch == 1)
		bfq_diskctx->bfq_flag |= BFQ_FLAG_AS;
	else if (as_switch == 0)
		bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AS);

	return 0;
}
static int
bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS)
{
	struct bfq_disk_ctx *bfq_diskctx = arg1;
	int auto_max_budget_switch, error;

	auto_max_budget_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) ? 1 : 0);
	error = sysctl_handle_int(oidp, &auto_max_budget_switch, 0, req);
	if (error || !req->newptr)
		return (error);

	if (auto_max_budget_switch == 1)
		bfq_diskctx->bfq_flag |= BFQ_FLAG_AUTO_MAX_BUDGET;
	else if (auto_max_budget_switch == 0)
		bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AUTO_MAX_BUDGET);

	return 0;
}
DSCHED_POLICY_MODULE(dsched_bfq, bfq_mod_handler);