/* * Copyright (c) 2011 The DragonFly Project. All rights reserved. * * This code is derived from software contributed to The DragonFly Project * by Brills Peng * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * 3. Neither the name of The DragonFly Project nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific, prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * BFQ disk scheduler, the algorithm routines and the interfaces with the * dsched framework. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define _DSCHED_BFQ_BFQ_C_ #include /* Make sure our structs fit */ CTASSERT(sizeof(struct bfq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ); CTASSERT(sizeof(struct bfq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ); static dsched_prepare_t bfq_prepare; static dsched_teardown_t bfq_teardown; static dsched_cancel_t bfq_cancel_all; static dsched_queue_t bfq_queue; static dsched_new_tdio_t bfq_new_tdio; static dsched_destroy_tdio_t bfq_destroy_tdio; static dsched_bio_done_t bfq_bio_done; static void bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio); static int bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio); static void bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason); static void bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp); static void bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio); static void bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag); static void bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv); struct dsched_policy dsched_bfq_policy = { .name = "bfq", .prepare = bfq_prepare, .teardown = bfq_teardown, .cancel_all = bfq_cancel_all, .bio_queue = bfq_queue, .new_tdio = bfq_new_tdio, .destroy_tdio = bfq_destroy_tdio, .bio_done = bfq_bio_done, .polling_func = (void (*)(struct dsched_disk_ctx *))helper_msg_dequeue, }; struct sysctl_oid *bfq_mod_oid; struct dsched_bfq_stats bfq_stats; static int dsched_bfq_version_maj = 1; static int dsched_bfq_version_min 
= 0; /* * bfq_prepare(): the .prepare callback of the bfq policy. Initialize * all fields in bfq_diskctx and initialize the corresponding helper * thread. * * lock: none * refcount: none * * Returns 0 */ static int bfq_prepare(struct dsched_disk_ctx *diskctx) { struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx; BFQ_LOCKINIT(bfq_diskctx); bfq_diskctx->pending_dequeue = 0; wf2q_init(&bfq_diskctx->bfq_wf2q); callout_init_mp(&bfq_diskctx->bfq_callout); bfq_diskctx->bfq_blockon = NULL; bfq_diskctx->bfq_active_tdio = NULL; bfq_diskctx->bfq_remaining_budget = 0; bfq_diskctx->bfq_max_budget = BFQ_DEFAULT_MAX_BUDGET; bfq_diskctx->bfq_peak_rate_samples = 0; bfq_diskctx->bfq_peak_rate = 0; #if 0 bfq_diskctx->bfq_flag = BFQ_FLAG_AS | BFQ_FLAG_AUTO_MAX_BUDGET; #endif bfq_diskctx->bfq_flag = BFQ_FLAG_AS; bfq_diskctx->bfq_as_miss = 0; bfq_diskctx->bfq_as_hit = 0; bfq_diskctx->bfq_as_avg_wait_miss = 0; bfq_diskctx->bfq_as_avg_wait_all = 0; bfq_diskctx->bfq_as_max_wait = 0; bfq_diskctx->bfq_as_max_wait2 = 0; bfq_diskctx->bfq_as_high_wait_count = 0; bfq_diskctx->bfq_as_high_wait_count2 = 0; bfq_diskctx->bfq_avg_time_slice = 0; bfq_diskctx->bfq_max_time_slice = 0; bfq_diskctx->bfq_high_time_slice_count = 0; /* initiailize the helper thread */ helper_init(bfq_diskctx); dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: initialized!\n"); return 0; } /* * bfq_teardown(): .teardown callback of the bfq policy. Send message * of killing to the helper thread and deallocate resources used by * the helper thread (currently the objcache) * * XXX: deadlock causing when the caller of bfq_teardown() and the * helper thread are on the same CPU. * * lock: none * refcount: none * */ static void bfq_teardown(struct dsched_disk_ctx *diskctx) { struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx; KKASSERT(diskctx); helper_msg_kill(bfq_diskctx); tsleep(diskctx, 0, "teardn", hz * 3 / 2); helper_uninit(bfq_diskctx); } /* * bfq_cancel_all(): .cancel_all callback of the bfq policy. Cancel * all bios that queue in each bfq_thread_io structure in the * wf2q tree. * * lock: * BFQ_LOCK: protect from wf2q_insert operation in bfq_queue() and * bfq_dequeue(); wf2q_get_next operation in bfq_dequeue() * THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue() and * queue insertion in bfq_queue() * * refcount: * unref thread_io structures; they are referenced in queue(), * when a bio is queued. The refcount may decrease to zero. * */ static void bfq_cancel_all(struct dsched_disk_ctx *diskctx) { struct bio *bio; struct bfq_thread_io *bfq_tdio; struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx; BFQ_LOCK(bfq_diskctx); while ((bfq_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q))) { DSCHED_THREAD_IO_LOCK(&bfq_tdio->head); KKASSERT(lockstatus(&bfq_tdio->head.lock, curthread) == LK_EXCLUSIVE); while ((bio = TAILQ_FIRST(&bfq_tdio->head.queue))) { bfq_tdio->head.qlength--; TAILQ_REMOVE(&bfq_tdio->head.queue, bio, link); dsched_cancel_bio(bio); dsched_thread_io_unref(&bfq_tdio->head); } KKASSERT(bfq_tdio->head.qlength == 0); DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head); } BFQ_UNLOCK(bfq_diskctx); } /* * bfq_new_tdio(): .new_tdio callback of the bfq policy. Initialize * the bfq_thread_io structure. 
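 *
 * A new tdio starts with the minimum budget (BFQ_DEFAULT_MIN_BUDGET) and
 * the default weight (BFQ_DEFAULT_WEIGHT); bfq_expire() later adapts the
 * budget to the thread's observed behaviour.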
 *
 * lock: none
 * refcount: none
 */
static void
bfq_new_tdio(struct dsched_thread_io *tdio)
{
        struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;

        /* the queue has to be initialized somewhere else */
        tdio->qlength = 0;
        tdio->debug_priv = 0xF00FF00F;

        bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
        bfq_tdio->weight = BFQ_DEFAULT_WEIGHT;

        bfq_tdio->tdio_as_switch = 1;
        bfq_tdio->maybe_timeout = 0;

        bfq_tdio->seek_samples = 0;
        bfq_tdio->seek_avg = 0;
        bfq_tdio->seek_total = 0;
        bfq_tdio->ttime_samples = 0;
        bfq_tdio->ttime_avg = 0;
        bfq_tdio->service_received = 0;
        bfq_tdio->bio_dispatched = 0;
        bfq_tdio->bio_completed = 0;

        KTR_LOG(dsched_bfq_thread_created, bfq_tdio);
}

/*
 * bfq_helper_destroy_tdio(): called after a thread_io struct is destroyed.
 * If the scheduler is AS-waiting for the destroyed tdio, this function
 * resumes the scheduler.
 *
 * lock:
 *  BFQ_LOCK: protect from nullifying bfq_diskctx->bfq_blockon /
 *  bfq_active_tdio in bfq_timeout()
 *
 * refcount: none
 *
 * Calling path: bfq_destroy_tdio --lwkt_msg--> helper_thread --call--> me
 *
 */
void
bfq_helper_destroy_tdio(struct dsched_thread_io *tdio, struct bfq_disk_ctx *bfq_diskctx)
{
        KKASSERT(bfq_diskctx);

        BFQ_LOCK(bfq_diskctx);

        /*
         * Test whether the scheduler is pending on the tdio to
         * be destroyed.
         */
        if (((struct dsched_thread_io *)bfq_diskctx->bfq_blockon == tdio) &&
            callout_pending(&bfq_diskctx->bfq_callout)) {
                dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: pending on a being destroyed thread!\n");

                callout_stop(&bfq_diskctx->bfq_callout);

                bfq_diskctx->bfq_blockon = NULL;
                bfq_diskctx->bfq_active_tdio = NULL;

                BFQ_UNLOCK(bfq_diskctx);

                helper_msg_dequeue(bfq_diskctx);
                return;
        }
        BFQ_UNLOCK(bfq_diskctx);
}

/*
 * bfq_destroy_tdio(): .destroy_tdio callback of the bfq policy
 *
 * Called immediately after a dsched_thread_io struct's refcount decreases
 * to zero. This function records the seek_avg and ttime_avg of the
 * destroyed thread with the KTR facility.
 *
 * lock: none
 *
 * refcount: the tdio's refcount should be zero. It may be nuked, and
 * any read/write to the tdio is not safe by then.
 */
static void
bfq_destroy_tdio(struct dsched_thread_io *tdio)
{
        struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;

        /*
         * do not log threads without I/O
         */
        if (bfq_tdio->seek_samples != 0 || bfq_tdio->ttime_samples != 0) {
                KTR_LOG(dsched_bfq_thread_seek_avg, bfq_tdio, bfq_tdio->seek_avg);
                KTR_LOG(dsched_bfq_thread_ttime_avg, bfq_tdio, bfq_tdio->ttime_avg);
        }

        helper_msg_destroy_tdio((struct bfq_disk_ctx *)tdio->diskctx, tdio);
}

/*
 * bfq_bio_done(): .bio_done callback of the bfq policy
 *
 * Called after a bio is done (by request_polling_biodone of dsched).
 * This function judges whether a thread has used up its time slice; if
 * so, it sets the maybe_timeout flag in the bfq_tdio structure. Any
 * further action of that thread or the bfq scheduler will cause the
 * thread to be expired (in bfq_queue() or in bfq_dequeue()).
 *
 * This function requires the bfq_tdio pointer of the thread that pushes
 * bp to be stored by dsched_set_bio_priv() earlier. Currently it is
 * stored when bfq_queue() is called.
 *
 * lock: none. This function CANNOT be blocked by any lock
 *
 * refcount:
 *  the corresponding tdio's refcount should decrease by 1 after
 *  this function call. The counterpart increasing is in bfq_queue().
 *  For each bio pushed down, we increase the refcount of the pushing
 *  tdio.
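 *
 *  To summarize the reference flow per bio as implemented below:
 *  bfq_queue() takes one ref when the bio is queued; bfq_bio_done()
 *  takes a temporary ref on entry and drops both that ref and the
 *  queue-time ref before returning.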
*/ static void bfq_bio_done(struct bio *bp) { struct disk *dp = dsched_get_bio_dp(bp); struct bfq_thread_io *bfq_tdio = dsched_get_bio_priv(bp); struct bfq_disk_ctx *bfq_diskctx = dsched_get_disk_priv(dp); struct timeval tv; int ticks_expired; KKASSERT(bfq_tdio); dsched_thread_io_ref(&bfq_tdio->head); atomic_add_int(&bfq_tdio->bio_completed, 1); /* the tdio has already expired */ if (bfq_tdio != bfq_diskctx->bfq_active_tdio) goto rtn; atomic_add_int(&bfq_tdio->service_received, BIO_SIZE(bp)); /* current time */ getmicrotime(&tv); bfq_tdio->last_request_done_time = tv; timevalsub (&tv, &bfq_tdio->service_start_time); ticks_expired = tvtohz_high(&tv); /* the thread has run out its time slice */ if ((ticks_expired != 0x7fffffff) && (ticks_expired >= BFQ_SLICE_TIMEOUT)) { /* * we cannot block here, so just set a flag */ #if 0 bfq_tdio->maybe_timeout = 1; #endif if (atomic_cmpset_int(&bfq_tdio->maybe_timeout, 0, 1)) { bfq_update_avg_time_slice(bfq_diskctx, tv); dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p may time out\n", bfq_tdio); } } rtn: dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in this function */ dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in queue() */ } /* * bfq_timeout(): called after the callout alarm strikes. * * This function getting called indicates that after waiting for * BFQ_T_WAIT / BFQ_T_WAIT_MIN ticks, the thread "active_tdio" * represents does not push any further bios. This tdio should * be expired with the reason BFQ_REASON_TOO_IDLE, but if the tdio * is marked as timeout (in bfq_biodone()) first, we expire it * for BFQ_REASON_TIMEOUT. The bfq scheduler should resume working * (and pick another thread to serve). * * It is possible that this function gets called a litter after * the thread pushes a bio with bfq_queue(), and thus a "fake timeout" * happens. We treat it as the callout does not strike, and continue * to serve the active_tdio. * * lock: * BFQ_LOCK: protect bfq_diskctx->blockon and bfq_diskctx->active_tdio * they should either changed in bfq_queue() or in this function, * atomically. * TDIO_LOCK: protect from dequeue() updateing the budget by the * maybe_timeout branch. (Not necessary, because we already hold the * BFQ_LOCK, and no one else could change the budget of the tdio) * * refcount: * the refcount of bfq_diskctx->bfq_active_tdio will decrease one * after this function. (The counterpart increasing is in bfq_dequeue(), * before resetting the callout alarm.) 
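 *
 * The callout that ends up here is armed in bfq_dequeue(): it waits
 * BFQ_T_WAIT ticks for an ordinary tdio, or only BFQ_T_WAIT_MIN ticks
 * for a tdio judged seeky.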
* * AS timeout: * during the waiting period, no bio is pushed by the being * waited tdio * * Calling path: * callout facility --> helper_msg_timeout --lwkt_msg--> helper thread * --> me */ void bfq_timeout(void *p) { /* waiting time out: * no deceptive idleness, and unblock dispatching */ struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)p; struct bfq_thread_io *bfq_tdio; BFQ_LOCK(bfq_diskctx); /* * the timeout occurs after the thread * pushing one more bio */ if (bfq_diskctx->bfq_blockon == NULL) { dsched_debug(BFQ_DEBUG_VERBOSE , "BFQ: fake AS timeout \n"); goto rtn; } bfq_diskctx->bfq_as_miss++; KKASSERT(bfq_diskctx->bfq_active_tdio); bfq_tdio = bfq_diskctx->bfq_active_tdio; DSCHED_THREAD_IO_LOCK(&bfq_tdio->head); bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL|BFQ_AS_STAT_ONLY_MISS); bfq_diskctx->bfq_blockon = NULL; bfq_diskctx->bfq_active_tdio = NULL; dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: unblocked %p\n", bfq_tdio); wf2q_update_vd(bfq_tdio, bfq_tdio->budget - bfq_diskctx->bfq_remaining_budget); /* * the time slice expired before as timeout * this should be REASON_TIMEOUT */ if (bfq_tdio->maybe_timeout) { bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TIMEOUT); dsched_debug(BFQ_DEBUG_VERBOSE, "%p time out in timeout()\n", bfq_tdio); } else { bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TOO_IDLE); dsched_debug(BFQ_DEBUG_VERBOSE, "%p too idle\n", bfq_tdio); } DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head); /* ref'ed in dequeue(), before resetting callout */ dsched_thread_io_unref(&bfq_tdio->head); rtn: BFQ_UNLOCK(bfq_diskctx); helper_msg_dequeue(bfq_diskctx); } /* * bfq_queue(): .queue callback of the bfq policy. * * A thread calls this function to hand in its I/O requests (bio). * Their bios are stored in the per-thread queue, in tdio structure. * Currently, the sync/async bios are queued together, which may cause * some issues on performance. * * Besides queueing bios, this function also calculates the average * thinking time and average seek distance of a thread, using the * information in bio structure. * * If the calling thread is waiting by the bfq scheduler due to * the AS feature, this function will cancel the callout alarm * and resume the scheduler to continue serving this thread. * * lock: * THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue() * BFQ_LOCK: protect from other insertions/deletions in wf2q_augtree * in bfq_queue() or bfq_dequeue(). * * refcount: * If the calling thread is waited by the scheduler, the refcount * of the related tdio will decrease by 1 after this function. The * counterpart increasing is in bfq_dequeue(), before resetting the * callout alarm. * * Return value: * EINVAL: if bio->bio_buf->b_cmd == BUF_CMD_FLUSH * 0: bio is queued successfully. */ static int bfq_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio, struct bio *bio) { struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx; struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio; int original_qlength; /* we do not handle flush requests. 
push it down to dsched */ if (__predict_false(bio->bio_buf->b_cmd == BUF_CMD_FLUSH)) return (EINVAL); DSCHED_THREAD_IO_LOCK(tdio); KKASSERT(tdio->debug_priv == 0xF00FF00F); dsched_debug(BFQ_DEBUG_NORMAL, "bfq: tdio %p pushes bio %p\n", bfq_tdio, bio); dsched_set_bio_priv(bio, tdio); dsched_thread_io_ref(tdio); if ((bio->bio_buf->b_cmd == BUF_CMD_READ) || (bio->bio_buf->b_cmd == BUF_CMD_WRITE)) { bfq_update_tdio_seek_avg(bfq_tdio, bio); } bfq_update_tdio_ttime_avg(bfq_tdio); /* update last_bio_pushed_time */ getmicrotime(&bfq_tdio->last_bio_pushed_time); if ((bfq_tdio->seek_samples > BFQ_VALID_MIN_SAMPLES) && BFQ_TDIO_SEEKY(bfq_tdio)) dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p is seeky\n", bfq_tdio); /* * If a tdio taks too long to think, we disable the AS feature of it. */ if ((bfq_tdio->ttime_samples > BFQ_VALID_MIN_SAMPLES) && (bfq_tdio->ttime_avg > BFQ_T_WAIT * (1000 / hz) * 1000) && (bfq_tdio->service_received > bfq_tdio->budget / 8)) { dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p takes too long time to think\n", bfq_tdio); bfq_tdio->tdio_as_switch = 0; } else { bfq_tdio->tdio_as_switch = 1; } /* insert the bio into the tdio's own queue */ KKASSERT(lockstatus(&tdio->lock, curthread) == LK_EXCLUSIVE); TAILQ_INSERT_TAIL(&tdio->queue, bio, link); #if 0 tdio->qlength++; #endif original_qlength = atomic_fetchadd_int(&tdio->qlength, 1); DSCHED_THREAD_IO_UNLOCK(tdio); /* * A new thread: * In dequeue function, we remove the thread * from the aug-tree if it has no further bios. * Therefore "new" means a really new thread (a * newly created thread or a thread that pushed no more * bios when the scheduler was waiting for it) or * one that was removed from the aug-tree earlier. */ if (original_qlength == 0) { /* * a really new thread */ BFQ_LOCK(bfq_diskctx); if (bfq_tdio != bfq_diskctx->bfq_active_tdio) { /* insert the tdio into the wf2q queue */ wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, bfq_tdio); } else { /* * the thread being waited by the scheduler */ if (bfq_diskctx->bfq_blockon == bfq_tdio) { /* * XXX: possible race condition here: * if the callout function is triggered when * the following code is executed, then after * releasing the TDIO lock, the callout function * will set the thread inactive and it will never * be inserted into the aug-tree (so its bio pushed * this time will not be dispatched) until it pushes * further bios */ bfq_diskctx->bfq_as_hit++; bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL); if (callout_pending(&bfq_diskctx->bfq_callout)) callout_stop(&bfq_diskctx->bfq_callout); bfq_diskctx->bfq_blockon = NULL; /* ref'ed in dequeue(), before resetting callout */ dsched_thread_io_unref(&bfq_tdio->head); dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p pushes a new bio when AS\n", bfq_tdio); } } BFQ_UNLOCK(bfq_diskctx); } helper_msg_dequeue(bfq_diskctx); return 0; } /* * bfq_dequeue(): dispatch bios to the disk driver. * * This function will push as many bios as the number of free slots * in the tag queue. * * In the progress of dispatching, the following events may happen: * - Current thread is timeout: Expire the current thread for * BFQ_REASON_TIMEOUT, and select a new thread to serve in the * wf2q tree. * * - Current thread runs out of its budget: Expire the current thread * for BFQ_REASON_OUT_OF_BUDGET, and select a new thread to serve * * - Current thread has no further bios in its queue: if the AS feature * is turned on, the bfq scheduler sets an alarm and starts to suspend. * The bfq_timeout() or bfq_queue() calls may resume the scheduler. 
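 *   This suspension is the anticipatory ("AS") part of the scheduler:
 *   instead of immediately seeking away to another thread, it idles
 *   briefly in the hope that the served thread pushes another request,
 *   thereby avoiding deceptive idleness.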
* * Implementation note: The bios selected to be dispatched will first * be stored in an array bio_do_dispatch. After this function releases * all the locks it holds, it will call dsched_strategy_request_polling() * for each bio stored. * * With the help of bfq_disk_ctx->pending_dequeue, * there will be only one bfq_dequeue pending on the BFQ_LOCK. * * lock: * BFQ_LOCK: protect from wf2q_augtree operations in bfq_queue() * THREAD_IO_LOCK: locks the active_tdio. Protect from queue insertions * in bfq_queue; Protect the active_tdio->budget * * refcount: * If the scheduler decides to suspend, the refcount of active_tdio * increases by 1. The counterpart decreasing is in bfq_queue() and * bfq_timeout() * blocking: * May be blocking on the disk driver lock. It depends on drivers. * * Calling path: * The callers could be: * bfq_queue(), bfq_timeout() and the registered polling function. * * caller --> helper_msg_dequeue --lwkt_msg--> helper_thread-> me * */ void bfq_dequeue(struct dsched_disk_ctx *diskctx) { int free_slots, bio_index = 0, i, remaining_budget = 0;/* remaining budget of current active process */ struct bio *bio, *bio_to_dispatch[33]; struct bfq_thread_io *active_tdio = NULL; struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx; BFQ_LOCK(bfq_diskctx); atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 1, 0); /* * The whole scheduler is waiting for further bios * from process currently being served */ if (bfq_diskctx->bfq_blockon != NULL) goto rtn; remaining_budget = bfq_diskctx->bfq_remaining_budget; active_tdio = bfq_diskctx->bfq_active_tdio; dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: dequeue: Im in. active_tdio = %p\n", active_tdio); free_slots = diskctx->max_tag_queue_depth - diskctx->current_tag_queue_depth; KKASSERT(free_slots >= 0 && free_slots <= 32); if (active_tdio) DSCHED_THREAD_IO_LOCK(&active_tdio->head); while (free_slots) { /* Here active_tdio must be locked ! */ if (active_tdio) { /* * the bio_done function has marked the current * tdio timeout */ if (active_tdio->maybe_timeout) { dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p time out in dequeue()\n", active_tdio); wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget); bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_TIMEOUT); /* there still exist bios not dispatched, * reinsert the tdio into aug-tree*/ if (active_tdio->head.qlength > 0) { wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio); KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count); } active_tdio->maybe_timeout = 0; DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); active_tdio = NULL; continue; } /* select next bio to dispatch */ /* TODO: a wiser slection */ KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE); bio = TAILQ_FIRST(&active_tdio->head.queue); dsched_debug(BFQ_DEBUG_NORMAL, "bfq: the first bio in queue of active_tdio %p is %p\n", active_tdio, bio); dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p exists, remaining budget = %d, tdio budget = %d\n, qlength = %d, first bio = %p, first bio cmd = %d, first bio size = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength, bio, bio?bio->bio_buf->b_cmd:-1, bio?bio->bio_buf->b_bcount:-1); /* * The bio is not read or write, just * push it down. 
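 * Such a bio is pushed down as-is and is not charged to the
 * thread's budget.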
*/ if (bio && (bio->bio_buf->b_cmd != BUF_CMD_READ) && (bio->bio_buf->b_cmd != BUF_CMD_WRITE)) { dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio); KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE); TAILQ_REMOVE(&active_tdio->head.queue, bio, link); active_tdio->head.qlength--; free_slots--; #if 0 dsched_strategy_request_polling(diskctx->dp, bio, diskctx); #endif bio_to_dispatch[bio_index++] = bio; KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth); continue; } /* * Run out of budget * But this is not because the size of bio is larger * than the complete budget. * If the size of bio is larger than the complete * budget, then use a complete budget to cover it. */ if (bio && (remaining_budget < BIO_SIZE(bio)) && (remaining_budget != active_tdio->budget)) { /* charge budget used */ wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget); bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET); wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio); dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: thread %p ran out of budget\n", active_tdio); DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); active_tdio = NULL; } else { /* if (bio && remaining_budget < BIO_SIZE(bio) && remaining_budget != active_tdio->budget) */ /* * Having enough budget, * or having a complete budget and the size of bio * is larger than that. */ if (bio) { /* dispatch */ remaining_budget -= BIO_SIZE(bio); /* * The size of the first bio is larger * than the whole budget, we should * charge the extra part */ if (remaining_budget < 0) wf2q_update_vd(active_tdio, -remaining_budget); /* compensate */ wf2q_update_vd(active_tdio, -remaining_budget); /* * remaining_budget may be < 0, * but to prevent the budget of current tdio * to substract a negative number, * the remaining_budget has to be >= 0 */ remaining_budget = MAX(0, remaining_budget); dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio); KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE); TAILQ_REMOVE(&active_tdio->head.queue, bio, link); free_slots--; active_tdio->head.qlength--; active_tdio->bio_dispatched++; wf2q_inc_tot_service(&bfq_diskctx->bfq_wf2q, BIO_SIZE(bio)); dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p's bio dispatched, size=%d, remaining_budget = %d\n", active_tdio, BIO_SIZE(bio), remaining_budget); #if 0 dsched_strategy_request_polling(diskctx->dp, bio, diskctx); #endif bio_to_dispatch[bio_index++] = bio; KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth); } else { /* if (bio) */ KKASSERT(active_tdio); /* * If AS feature is switched off, * expire the tdio as well */ if ((remaining_budget <= 0) || !(bfq_diskctx->bfq_flag & BFQ_FLAG_AS) || !active_tdio->tdio_as_switch) { active_tdio->budget -= remaining_budget; wf2q_update_vd(active_tdio, active_tdio->budget); bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET); DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); active_tdio = NULL; } else { /* no further bio, wait for a while */ bfq_diskctx->bfq_blockon = active_tdio; /* * Increase ref count to ensure that * tdio will not be destroyed during waiting. 
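 * The matching unref happens in bfq_queue() if the thread pushes
 * another bio while we wait, or in bfq_timeout() if the wait expires.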
*/ dsched_thread_io_ref(&active_tdio->head); /* * If the tdio is seeky but not thingking for * too long, we wait for it a little shorter */ if (active_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES && BFQ_TDIO_SEEKY(active_tdio)) callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT_MIN, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx); else callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx); /* save the start time of blocking */ getmicrotime(&active_tdio->as_start_time); dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: blocked on %p, remaining_budget = %d\n", active_tdio, remaining_budget); DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); goto save_and_rtn; } } } } else { /* if (active_tdio) */ /* there is no active tdio */ /* no pending bios at all */ active_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q); if (!active_tdio) { KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count == 0); dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: no more eligible tdio!\n"); goto save_and_rtn; } /* * A new tdio is picked, * initialize the service related statistic data */ DSCHED_THREAD_IO_LOCK(&active_tdio->head); active_tdio->service_received = 0; /* * Reset the maybe_timeout flag, which * may be set by a biodone after the the service is done */ getmicrotime(&active_tdio->service_start_time); active_tdio->maybe_timeout = 0; remaining_budget = active_tdio->budget; dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p selected, remaining budget = %d, tdio budget = %d\n, qlength = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength); } }/* while (free_slots) */ /* reach here only when free_slots == 0 */ if (active_tdio) /* && lockcount(&active_tdio->head.lock) > 0) */ DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); save_and_rtn: /* save the remaining budget */ bfq_diskctx->bfq_remaining_budget = remaining_budget; bfq_diskctx->bfq_active_tdio = active_tdio; rtn: BFQ_UNLOCK(bfq_diskctx); /*dispatch the planned bios*/ for (i = 0; i < bio_index; i++) dsched_strategy_request_polling(diskctx->dp, bio_to_dispatch[i], diskctx); } /* * bfq_slow_tdio(): decide whether a tdio is slow * * This function decides whether a tdio is slow by the speed * estimated from the current time slice start time: if the * tdio is not fast enough to consume its budget (or 2/3 * its budget) within the time slice, it is judged slow. * * Called by bfq_expire() * * lock: * THREAD_IO_LOCK is expected to be held. * refcount: * none * */ static int bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio) { /** * A tdio is considered slow if it can not finish its budget * at its current average speed */ uint64_t usec_elapsed, service_received, speed; int expect; struct timeval tv = bfq_tdio->last_request_done_time; timevalsub (&tv, &bfq_tdio->service_start_time); usec_elapsed = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec); /* discard absurd value */ if (usec_elapsed < 20000) return 0; service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT; speed = service_received / usec_elapsed; expect = (speed * BFQ_SLICE_TIMEOUT * (1000 * 1000 / hz)) >> BFQ_FIXPOINT_SHIFT; if (expect < 0) { dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: overflow on calculating slow_tdio\n"); return 0; } if (expect < bfq_tdio->budget * 2 / 3) { dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: %p is judged slow\n", bfq_tdio); return 1; } return 0; } /* * bfq_expire(): expire a tdio for a given reason. 
 *
 * A different amount of new budget is assigned to the expired
 * tdio according to the reason it expired:
 *
 * BFQ_REASON_TIMEOUT:
 *  The tdio did not use up its budget within BFQ_SLICE_TIMEOUT ticks.
 *  We shall update the disk peak rate if the tdio is not seeky. The new
 *  budget will be the budget it actually consumed during this time
 *  slice.
 *
 * BFQ_REASON_TOO_IDLE:
 *  The tdio did not push any further bios while the scheduler was
 *  suspended. To ensure low global latency, this tdio should be
 *  punished by assigning it the minimum budget. But if the tdio pushed
 *  no bio only because it is waiting for its dispatched bios to be
 *  done, we keep the budget unchanged.
 *
 * BFQ_REASON_OUT_OF_BUDGET:
 *  The tdio ran out of its budget within the time slice. This usually
 *  indicates that the tdio is doing well. We increase its budget.
 *
 * lock:
 *  THREAD_IO_LOCK is expected to be held.
 *  BFQ_LOCK is expected to be held (needed by bfq_update_peak_rate()).
 *
 * refcount: none
 *
 * Callers: bfq_timeout(), bfq_dequeue()
 *
 */
static void
bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio,
    enum bfq_expire_reason reason)
{
        int max_budget = bfq_diskctx->bfq_max_budget,
            budget_left, bio_in_flight, service_received;

        service_received = bfq_tdio->service_received;
        budget_left = bfq_tdio->budget - bfq_tdio->service_received;

        if (budget_left < 0) {
                dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: budget down flow: %d, %d\n",
                    bfq_tdio->budget, bfq_tdio->service_received);
                budget_left = 0;
        }

        KKASSERT(budget_left >= 0);

        switch (reason) {
        case BFQ_REASON_TIMEOUT:
                /*
                 * The tdio is not seeky, so we may update the disk peak
                 * rate based on the service received by the tdio.
                 */
                if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
                    (!BFQ_TDIO_SEEKY(bfq_tdio)))
                        bfq_update_peak_rate(bfq_diskctx, bfq_tdio);

                /* max_budget may be updated */
                max_budget = bfq_diskctx->bfq_max_budget;

                /* update the budget to service_received */
                bfq_tdio->budget = MAX(service_received, BFQ_DEFAULT_MIN_BUDGET);
                break;

        case BFQ_REASON_TOO_IDLE:
                /*
                 * if the tdio is too slow, charge it the full budget
                 */
                if (bfq_slow_tdio(bfq_diskctx, bfq_tdio))
                        wf2q_update_vd(bfq_tdio, budget_left);

                bio_in_flight = bfq_tdio->bio_dispatched - bfq_tdio->bio_completed;
                KKASSERT(bio_in_flight >= 0);

                /*
                 * Maybe the tdio pushed no bio because it is waiting for
                 * some dispatched bios to be done; in this case we do not
                 * reduce the budget too harshly.
                 */
                if (bio_in_flight > 0) {
                        bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, service_received);
                } else {
#if 0
                        bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET,
                            bfq_diskctx->bfq_max_budget / BFQ_MIN_BUDGET_FACTOR);
#endif
                        bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
                }
                break;

        case BFQ_REASON_OUT_OF_BUDGET:
                if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
                    (!BFQ_TDIO_SEEKY(bfq_tdio)))
                        bfq_update_peak_rate(bfq_diskctx, bfq_tdio);

                /* increase the budget */
                if (bfq_tdio->budget < BFQ_BUDGET_MULTIPLE_THRESHOLD)
                        bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget * 2);
                else
                        bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget + BFQ_BUDG_INC_STEP);
                break;
        default:
                break;
        }
}

/*
 * bfq_update_peak_rate(): update the peak disk speed by sampling
 * the throughput within a time slice.
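 *
 * The rate is kept in fixed point (service bytes << BFQ_FIXPOINT_SHIFT
 * per microsecond) and smoothed with an exponential moving average in
 * which each new sample contributes one eighth of the result:
 *      peak_rate = (sample + 7 * old_peak_rate) / 8
 * e.g. with an old value of 800 and a sample of 1600 the new value is
 * (1600 + 7 * 800) / 8 = 900.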
 *
 * lock:
 *  BFQ_LOCK is expected to be held.
 *
 * refcount:
 *  none
 *
 * Caller: bfq_expire()
 */
static void
bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
{
        struct timeval tv = bfq_tdio->last_request_done_time;
        uint64_t usec, service_received, peak_rate;

        timevalsub(&tv, &bfq_tdio->service_start_time);
        usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);

        /* discard absurd values */
        if (usec < 2000 || usec > (BFQ_SLICE_TIMEOUT * (1000 / hz) * 1000)) {
                dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: absurd interval for peak rate\n");
                return;
        }

        service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
        peak_rate = service_received / usec;
        bfq_diskctx->bfq_peak_rate = (peak_rate + 7 * bfq_diskctx->bfq_peak_rate) / 8;
        bfq_diskctx->bfq_peak_rate_samples++;

        /* update the max_budget according to the peak rate */
        if (bfq_diskctx->bfq_peak_rate_samples > BFQ_VALID_MIN_SAMPLES) {
                bfq_diskctx->bfq_peak_rate_samples = BFQ_VALID_MIN_SAMPLES;
                /*
                 * if automatic max budget adjustment is disabled,
                 * bfq_max_budget stays at BFQ_DEFAULT_MAX_BUDGET
                 */
                if (bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) {
                        bfq_diskctx->bfq_max_budget =
                            (uint32_t)((BFQ_SLICE_TIMEOUT * (1000 / hz) * bfq_diskctx->bfq_peak_rate * 1000) >> BFQ_FIXPOINT_SHIFT);
                        dsched_debug(BFQ_DEBUG_NORMAL, "max budget updated to %d\n",
                            bfq_diskctx->bfq_max_budget);
                }
        }
}

/*
 * bfq_update_tdio_seek_avg(): update the average seek distance of a
 * tdio.
 *
 * lock:
 *  THREAD_IO_LOCK is expected to be held.
 *
 * refcount:
 *  none
 *
 * Caller: bfq_queue()
 */
static void
bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp)
{
        off_t seek;

        /*
         * For the first bio the tdio dispatches we do not calculate the
         * seek_avg; we just update last_seek_end.
         */
        if (bfq_tdio->seek_samples == 0) {
                ++bfq_tdio->seek_samples;
                goto rtn;
        }

        seek = ABS(bp->bio_offset - bfq_tdio->last_seek_end);

        /*
         * We do not do seek_samples++, because seek_total may overflow
         * if seek_total += seek.
         */
        bfq_tdio->seek_samples = (7 * bfq_tdio->seek_samples + 256) / 8;
        bfq_tdio->seek_total = (7 * bfq_tdio->seek_total + 256 * seek) / 8;
        bfq_tdio->seek_avg = (bfq_tdio->seek_total + bfq_tdio->seek_samples / 2) / bfq_tdio->seek_samples;

        dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: tdio %p seek_avg updated to %" PRIu64 "\n",
            bfq_tdio, bfq_tdio->seek_avg);

rtn:
        bfq_tdio->last_seek_end = bp->bio_offset + BIO_SIZE(bp);
}

/*
 * bfq_update_tdio_ttime_avg(): update the average thinking time
 * of a tdio.
 *
 * The thinking time is used to switch the tdio's AS feature on / off.
 *
 * lock:
 *  THREAD_IO_LOCK is expected to be held.
 *
 * refcount:
 *  none
 *
 * Caller:
 *  bfq_queue()
 */
static void
bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio)
{
        struct timeval tv, after_start;
        uint64_t usec;

        if (bfq_tdio->ttime_samples == 0) {
                ++bfq_tdio->ttime_samples;
                return;
        }

        getmicrotime(&tv);
        after_start = bfq_tdio->last_request_done_time;

#if 0
        timevalsub(&tv, &bfq_tdio->last_request_done_time);
#endif
        /*
         * Use the interval between two pushed bios, instead of the
         * interval between last_request_done_time and the current time.
         */
        timevalsub(&tv, &bfq_tdio->last_bio_pushed_time);
        timevalsub(&after_start, &bfq_tdio->service_start_time);

        /*
         * tv.tv_sec < 0 means the last request was done after the
         * current time; this may happen because the biodone function
         * is not blocked.
         *
         * after_start.tv_sec < 0 means that the last bio was done
         * before the current service slice, and we should drop this value.
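 *
 * The ttime statistics below use the same fixed-point trick as the
 * seek statistics: ttime_samples is smoothed toward 256 instead of
 * being incremented, so
 *      ttime_avg = (ttime_total + 128) / ttime_samples
 * is a weighted average in which recent samples dominate and the
 * totals cannot grow without bound.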
*/ if (tv.tv_sec < 0 || after_start.tv_sec < 0) return; usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec); bfq_tdio->ttime_samples = (7 * bfq_tdio->ttime_samples + 256) / 8; bfq_tdio->ttime_total = (7 * bfq_tdio->ttime_total + 256 * usec) / 8; bfq_tdio->ttime_avg = (bfq_tdio->ttime_total + 128) / bfq_tdio->ttime_samples; } /* * This function will also update the bfq_max_time_slice field * * tv: the timeval structure representing the length of time slice */ static void bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv) { uint32_t msec; msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 ); if (msec > 3 * BFQ_SLICE_TIMEOUT * (1000 / hz)) atomic_add_int(&bfq_diskctx->bfq_high_time_slice_count, 1); bfq_diskctx->bfq_avg_time_slice = (7 * bfq_diskctx->bfq_avg_time_slice + msec) / 8; if (bfq_diskctx->bfq_max_time_slice < msec) bfq_diskctx->bfq_max_time_slice = msec; } /* * This function will also update the bfq_as_max_wait field * flag: BFQ_AS_STAT_ALL, BFQ_AS_STAT_ONLY_MISS * */ static void bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag) { struct timeval tv; uint32_t msec; getmicrotime(&tv); timevalsub (&tv, &bfq_tdio->as_start_time); /* approximately divide 1000 by left shift 10 */ msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 ); /* ridiculous value */ if (msec > 10000) { dsched_debug(BFQ_DEBUG_NORMAL, "bfq: ridiculous as wait time!\n"); return; } if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz)) atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count, 1); if (flag & BFQ_AS_STAT_ALL) { bfq_diskctx->bfq_as_avg_wait_all = (7 * bfq_diskctx->bfq_as_avg_wait_all + msec) / 8; } if (flag & BFQ_AS_STAT_ONLY_MISS) { bfq_diskctx->bfq_as_avg_wait_miss = (7 * bfq_diskctx->bfq_as_avg_wait_miss + msec) / 8; } /* update the maximum waiting time */ if (bfq_diskctx->bfq_as_max_wait < msec) bfq_diskctx->bfq_as_max_wait = msec; return; } static int bfq_mod_handler(module_t mod, int type, void *unused) { static struct sysctl_ctx_list sysctl_ctx; static struct sysctl_oid *oid; static char version[16]; int error; ksnprintf(version, sizeof(version), "%d.%d", dsched_bfq_version_maj, dsched_bfq_version_min); switch (type) { case MOD_LOAD: bzero(&bfq_stats, sizeof(struct dsched_bfq_stats)); if ((error = dsched_register(&dsched_bfq_policy))) return (error); sysctl_ctx_init(&sysctl_ctx); oid = SYSCTL_ADD_NODE(&sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched), OID_AUTO, "bfq", CTLFLAG_RD, 0, ""); bfq_mod_oid = oid; SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid), OID_AUTO, "version", CTLFLAG_RD, version, 0, "bfq version"); helper_init_global(); kprintf("BFQ scheduler policy version %d.%d loaded. sizeof(bfq_thread_io) = %zu\n", dsched_bfq_version_maj, dsched_bfq_version_min, sizeof(struct bfq_thread_io)); break; case MOD_UNLOAD: if ((error = dsched_unregister(&dsched_bfq_policy))) return (error); sysctl_ctx_free(&sysctl_ctx); kprintf("BFQ scheduler policy unloaded\n"); break; default: break; } return 0; } int bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS) { struct bfq_disk_ctx *bfq_diskctx = arg1; int as_switch, error; as_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ? 
1 : 0); error = sysctl_handle_int(oidp, &as_switch, 0, req); if (error || !req->newptr) return error; if (as_switch == 1) bfq_diskctx->bfq_flag |= BFQ_FLAG_AS; else if (as_switch == 0) bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AS); else return 0; return error; } int bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS) { struct bfq_disk_ctx *bfq_diskctx = arg1; int auto_max_budget_switch, error; auto_max_budget_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) ? 1 : 0); error = sysctl_handle_int(oidp, &auto_max_budget_switch, 0, req); if (error || !req->newptr) return error; if (auto_max_budget_switch == 1) bfq_diskctx->bfq_flag |= BFQ_FLAG_AUTO_MAX_BUDGET; else if (auto_max_budget_switch == 0) bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AUTO_MAX_BUDGET); else return 0; return error; } DSCHED_POLICY_MODULE(dsched_bfq, bfq_mod_handler);
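
/*
 * Note: once the module is loaded, bfq_mod_handler() above exposes the
 * policy version under the dsched sysctl tree, so e.g.
 * `sysctl dsched.bfq.version` should report "1.0".  The two per-disk
 * sysctl handlers defined above are exported for use outside this file
 * and are assumed to be attached to per-disk sysctl nodes elsewhere.
 */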