From aabeb1879a283dd102229d575b6239e9e72e887c Mon Sep 17 00:00:00 2001 From: Brills Peng Date: Sat, 27 Aug 2011 18:19:49 +0000 Subject: [PATCH] dsched_bfq - A budget fair-queuing dsched policy * dsched_bfq is a budget fair queuing scheduling policy for the dsched framework. * NOTE: this scheduler is still highly experimental and work-in-progress, it's not recommended for widespread use (yet). There are several well-known issues, such as a possible deadlock on unloading the module. Sponsored-by: Google Summer of Code --- sys/conf/files | 5 +- sys/conf/options | 1 + sys/config/LINT | 2 + sys/kern/dsched/Makefile | 2 +- sys/kern/dsched/bfq/Makefile | 5 + sys/kern/dsched/bfq/bfq.c | 1382 +++++++++++++++++++++++++++++++ sys/kern/dsched/bfq/bfq.h | 238 ++++++ sys/kern/dsched/bfq/bfq_helper_thread.c | 457 ++++++++++ sys/kern/dsched/bfq/bfq_helper_thread.h | 62 ++ sys/kern/dsched/bfq/bfq_ktr.h | 64 ++ sys/kern/dsched/bfq/wf2q.c | 231 +++++ sys/kern/dsched/bfq/wf2q.h | 68 ++ 12 files changed, 2515 insertions(+), 2 deletions(-) create mode 100644 sys/kern/dsched/bfq/Makefile create mode 100644 sys/kern/dsched/bfq/bfq.c create mode 100644 sys/kern/dsched/bfq/bfq.h create mode 100644 sys/kern/dsched/bfq/bfq_helper_thread.c create mode 100644 sys/kern/dsched/bfq/bfq_helper_thread.h create mode 100644 sys/kern/dsched/bfq/bfq_ktr.h create mode 100644 sys/kern/dsched/bfq/wf2q.c create mode 100644 sys/kern/dsched/bfq/wf2q.h diff --git a/sys/conf/files b/sys/conf/files index 44f9272..e1968cd 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2044,9 +2044,12 @@ ${OSACPI_MI_DIR}/acpi_toshiba/acpi_toshiba.c optional acpi_toshiba acpi ${OSACPI_MI_DIR}/acpi_video/acpi_video.c optional acpi_video acpi ${OSACPI_MI_DIR}/aibs/atk0110.c optional aibs acpi -#dsched stuff +# dsched stuff kern/dsched/fq/fq_core.c optional dsched_fq kern/dsched/fq/fq_diskops.c optional dsched_fq +kern/dsched/bfq/bfq.c optional dsched_bfq +kern/dsched/bfq/wf2q.c optional dsched_bfq +kern/dsched/bfq/bfq_helper_thread.c optional dsched_bfq # ACPICA code ${ACPICA_DIR}/debugger/dbcmds.c optional acpi acpi_debug diff --git a/sys/conf/options b/sys/conf/options index 75eb0dd..884e4cb 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -581,6 +581,7 @@ KTR_ENTRIES opt_global.h KTR_ALL opt_ktr.h KTR_CTXSW opt_ktr.h KTR_DMCRYPT opt_ktr.h +KTR_DSCHED_BFQ opt_ktr.h KTR_ETHERNET opt_ktr.h KTR_HAMMER opt_ktr.h KTR_IFQ opt_ktr.h diff --git a/sys/config/LINT b/sys/config/LINT index 51b6dfa..6431306 100644 --- a/sys/config/LINT +++ b/sys/config/LINT @@ -2773,6 +2773,7 @@ options KTR_ENTRIES=1024 options KTR_VERBOSE=1 #options KTR_CTXSW #options KTR_DMCRYPT +#options KTR_DSCHED_BFQ #options KTR_ETHERNET #options KTR_HAMMER #options KTR_IFQ @@ -2819,6 +2820,7 @@ options SCTP_MAP_LOGGING # DSCHED stuff options DSCHED_FQ +options DSCHED_BFQ # WATCHDOG options WATCHDOG_ENABLE # Enable watchdog support framework diff --git a/sys/kern/dsched/Makefile b/sys/kern/dsched/Makefile index 95e95a7..a14d938 100644 --- a/sys/kern/dsched/Makefile +++ b/sys/kern/dsched/Makefile @@ -1,5 +1,5 @@ # $DragonFly: src/sys/dev/Makefile,v 1.12 2007/01/30 14:50:10 corecode Exp $ -SUBDIR= fq as +SUBDIR= fq as bfq .include diff --git a/sys/kern/dsched/bfq/Makefile b/sys/kern/dsched/bfq/Makefile new file mode 100644 index 0000000..ed9b4c9 --- /dev/null +++ b/sys/kern/dsched/bfq/Makefile @@ -0,0 +1,5 @@ +KMOD= dsched_bfq +SRCS= opt_ktr.h +SRCS+= bfq.c wf2q.c bfq_helper_thread.c + +.include diff --git a/sys/kern/dsched/bfq/bfq.c b/sys/kern/dsched/bfq/bfq.c new file mode 100644 index 0000000..d9d29b6 --- /dev/null +++ b/sys/kern/dsched/bfq/bfq.c @@ -0,0 +1,1382 @@ +/* + * Copyright (c) 2011 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Brills Peng + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +/* + * BFQ disk scheduler, the algorithm routines and the interfaces with the + * dsched framework. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define _DSCHED_BFQ_BFQ_C_ +#include + +/* Make sure our structs fit */ +CTASSERT(sizeof(struct bfq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ); +CTASSERT(sizeof(struct bfq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ); + + +static dsched_prepare_t bfq_prepare; +static dsched_teardown_t bfq_teardown; +static dsched_cancel_t bfq_cancel_all; +static dsched_queue_t bfq_queue; +static dsched_new_tdio_t bfq_new_tdio; +static dsched_destroy_tdio_t bfq_destroy_tdio; +static dsched_bio_done_t bfq_bio_done; + + +static void bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio); +static int bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio); +static void bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason); +static void bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp); +static void bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio); +static void bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag); +static void bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv); + + + +struct dsched_policy dsched_bfq_policy = { + .name = "bfq", + .prepare = bfq_prepare, + .teardown = bfq_teardown, + .cancel_all = bfq_cancel_all, + .bio_queue = bfq_queue, + .new_tdio = bfq_new_tdio, + .destroy_tdio = bfq_destroy_tdio, + .bio_done = bfq_bio_done, + .polling_func = (void (*)(struct dsched_disk_ctx *))helper_msg_dequeue, +}; + + +struct sysctl_oid *bfq_mod_oid; + +struct dsched_bfq_stats bfq_stats; + +static int dsched_bfq_version_maj = 1; +static int dsched_bfq_version_min = 0; + +/* + * bfq_prepare(): the .prepare callback of the bfq policy. Initialize + * all fields in bfq_diskctx and initialize the corresponding helper + * thread. + * + * lock: none + * refcount: none + * + * Returns 0 + */ +static int +bfq_prepare(struct dsched_disk_ctx *diskctx) +{ + struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx; + + BFQ_LOCKINIT(bfq_diskctx); + + bfq_diskctx->pending_dequeue = 0; + + wf2q_init(&bfq_diskctx->bfq_wf2q); + + callout_init_mp(&bfq_diskctx->bfq_callout); + + bfq_diskctx->bfq_blockon = NULL; + bfq_diskctx->bfq_active_tdio = NULL; + bfq_diskctx->bfq_remaining_budget = 0; + + bfq_diskctx->bfq_max_budget = BFQ_DEFAULT_MAX_BUDGET; + bfq_diskctx->bfq_peak_rate_samples = 0; + bfq_diskctx->bfq_peak_rate = 0; + +#if 0 + bfq_diskctx->bfq_flag = BFQ_FLAG_AS | BFQ_FLAG_AUTO_MAX_BUDGET; +#endif + bfq_diskctx->bfq_flag = BFQ_FLAG_AS; + + bfq_diskctx->bfq_as_miss = 0; + bfq_diskctx->bfq_as_hit = 0; + + bfq_diskctx->bfq_as_avg_wait_miss = 0; + bfq_diskctx->bfq_as_avg_wait_all = 0; + bfq_diskctx->bfq_as_max_wait = 0; + bfq_diskctx->bfq_as_max_wait2 = 0; + bfq_diskctx->bfq_as_high_wait_count = 0; + bfq_diskctx->bfq_as_high_wait_count2 = 0; + + bfq_diskctx->bfq_avg_time_slice = 0; + bfq_diskctx->bfq_max_time_slice = 0; + bfq_diskctx->bfq_high_time_slice_count = 0; + + /* initiailize the helper thread */ + helper_init(bfq_diskctx); + + dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: initialized!\n"); + return 0; +} + +/* + * bfq_teardown(): .teardown callback of the bfq policy. Send message + * of killing to the helper thread and deallocate resources used by + * the helper thread (currently the objcache) + * + * XXX: deadlock causing when the caller of bfq_teardown() and the + * helper thread are on the same CPU. + * + * lock: none + * refcount: none + * + */ + +static void +bfq_teardown(struct dsched_disk_ctx *diskctx) +{ + struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx; + KKASSERT(diskctx); + + helper_msg_kill(bfq_diskctx); + + tsleep(diskctx, 0, "teardn", hz * 3 / 2); + + helper_uninit(bfq_diskctx); +} + +/* + * bfq_cancel_all(): .cancel_all callback of the bfq policy. Cancel + * all bios that queue in each bfq_thread_io structure in the + * wf2q tree. + * + * lock: + * BFQ_LOCK: protect from wf2q_insert operation in bfq_queue() and + * bfq_dequeue(); wf2q_get_next operation in bfq_dequeue() + * THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue() and + * queue insertion in bfq_queue() + * + * refcount: + * unref thread_io structures; they are referenced in queue(), + * when a bio is queued. The refcount may decrease to zero. + * + */ +static void +bfq_cancel_all(struct dsched_disk_ctx *diskctx) +{ + struct bio *bio; + struct bfq_thread_io *bfq_tdio; + struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx; + + BFQ_LOCK(bfq_diskctx); + + while ((bfq_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q))) { + DSCHED_THREAD_IO_LOCK(&bfq_tdio->head); + KKASSERT(lockstatus(&bfq_tdio->head.lock, curthread) == LK_EXCLUSIVE); + + while ((bio = TAILQ_FIRST(&bfq_tdio->head.queue))) { + bfq_tdio->head.qlength--; + TAILQ_REMOVE(&bfq_tdio->head.queue, bio, link); + dsched_cancel_bio(bio); + dsched_thread_io_unref(&bfq_tdio->head); + } + + KKASSERT(bfq_tdio->head.qlength == 0); + DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head); + } + + BFQ_UNLOCK(bfq_diskctx); +} + +/* + * bfq_new_tdio(): .new_tdio callback of the bfq policy. Initialize + * the bfq_thread_io structure. + * + * lock: none + * refcount: none + */ +static void +bfq_new_tdio(struct dsched_thread_io *tdio) +{ + struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *) tdio; + + /* the queue has to be initialized some where else */ + tdio->qlength = 0; + + tdio->debug_priv = 0xF00FF00F; + + bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET; + bfq_tdio->weight = BFQ_DEFAULT_WEIGHT; + + bfq_tdio->tdio_as_switch = 1; + bfq_tdio->maybe_timeout = 0; + + bfq_tdio->seek_samples = 0; + bfq_tdio->seek_avg = 0; + bfq_tdio->seek_total = 0; + bfq_tdio->ttime_samples = 0; + bfq_tdio->ttime_avg = 0; + bfq_tdio->service_received = 0; + bfq_tdio->bio_dispatched = 0; + bfq_tdio->bio_completed = 0; + + KTR_LOG(dsched_bfq_thread_created, bfq_tdio); +} + +/* + * bfq_helper_destroy_tdio(): called after a thread_io struct is destroyed. + * if the scheduler is AS waiting for a destroyed tdio, this function resumes + * the scheduler. + * + * lock: + * BFQ_LOCK: protect from nullify bfq_diskctx->bfq_blockon/bfq_active_tdio + * in bfq_timeout() + * + * refcount: none + * + * Calling path: bfq_destroy_tdio --lwkt_msg--> helper_thread --call--> me + * + */ +void +bfq_helper_destroy_tdio(struct dsched_thread_io *tdio, struct bfq_disk_ctx *bfq_diskctx) +{ + KKASSERT(bfq_diskctx); + + BFQ_LOCK(bfq_diskctx); + + /* + * Test whether the scheduler is pending on the tdio to + * be destroyed. + */ + if (((struct dsched_thread_io *)bfq_diskctx->bfq_blockon == tdio) && + callout_pending(&bfq_diskctx->bfq_callout)) { + dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: pending on a being destroyed thread!\n"); + + callout_stop(&bfq_diskctx->bfq_callout); + + bfq_diskctx->bfq_blockon = NULL; + bfq_diskctx->bfq_active_tdio = NULL; + + BFQ_UNLOCK(bfq_diskctx); + + helper_msg_dequeue(bfq_diskctx); + return; + } + BFQ_UNLOCK(bfq_diskctx); + +} + +/* + * bfq_destroy_tdio(): .destroy_tdio callback of the bfq policy + * + * Called immediate after a dsched_thread_io struct's refcount decreases + * to zero. This function will record the seek_avg and ttime_avg of the + * destroyed thread with the KTR facility. + * + * lock: none + * + * refcount: the tdio's refcount should be zero. It may be nuked, and + * any read/write to the tdio is not safe by then. + */ +static void +bfq_destroy_tdio(struct dsched_thread_io *tdio) +{ + struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio; + + /* + * do not log threads without I/O + */ + if (bfq_tdio->seek_samples != 0 || bfq_tdio->ttime_samples != 0) { + KTR_LOG(dsched_bfq_thread_seek_avg, bfq_tdio, bfq_tdio->seek_avg ); + KTR_LOG(dsched_bfq_thread_ttime_avg, bfq_tdio, bfq_tdio->ttime_avg); + } + + helper_msg_destroy_tdio((struct bfq_disk_ctx *)tdio->diskctx, tdio); +} + +/* + * bfq_bio_done(): .bio_done callback of the bfq policy + * + * Called after a bio is done, (by request_polling_biodone of dsched). + * This function judges whet her a thread consumes up its time slice, and + * if so, it will set the maybe_timeout flag in bfq_tdio structure. Any + * further action of that thread or the bfq scheduler will cause the + * thread to be expired. (in bfq_queue() or in bfq_dequeue()) + * + * This function requires the bfq_tdio pointer of the thread that pushes + * bp to be stored by dsched_set_bio_priv() earlier. Currently it is + * stored when bfq_queue() is called. + * + * lock: none. This function CANNOT be blocked by any lock + * + * refcount: + * the corresponding tdio's refcount should decrease by 1 after + * this function call. The counterpart increasing is in bfq_queue(). + * For each bio pushed down, we increase the refcount of the pushing + * tdio. + */ +static void +bfq_bio_done(struct bio *bp) +{ + struct disk *dp = dsched_get_bio_dp(bp); + struct bfq_thread_io *bfq_tdio = dsched_get_bio_priv(bp); + struct bfq_disk_ctx *bfq_diskctx = dsched_get_disk_priv(dp); + struct timeval tv; + int ticks_expired; + + KKASSERT(bfq_tdio); + + dsched_thread_io_ref(&bfq_tdio->head); + + atomic_add_int(&bfq_tdio->bio_completed, 1); + + /* the tdio has already expired */ + if (bfq_tdio != bfq_diskctx->bfq_active_tdio) + goto rtn; + atomic_add_int(&bfq_tdio->service_received, BIO_SIZE(bp)); + + /* current time */ + getmicrotime(&tv); + bfq_tdio->last_request_done_time = tv; + timevalsub (&tv, &bfq_tdio->service_start_time); + ticks_expired = tvtohz_high(&tv); + + /* the thread has run out its time slice */ + if ((ticks_expired != 0x7fffffff) && + (ticks_expired >= BFQ_SLICE_TIMEOUT)) { + /* + * we cannot block here, so just set a flag + */ +#if 0 + bfq_tdio->maybe_timeout = 1; +#endif + if (atomic_cmpset_int(&bfq_tdio->maybe_timeout, 0, 1)) { + bfq_update_avg_time_slice(bfq_diskctx, tv); + dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p may time out\n", bfq_tdio); + } + } +rtn: + dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in this function */ + dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in queue() */ + +} + +/* + * bfq_timeout(): called after the callout alarm strikes. + * + * This function getting called indicates that after waiting for + * BFQ_T_WAIT / BFQ_T_WAIT_MIN ticks, the thread "active_tdio" + * represents does not push any further bios. This tdio should + * be expired with the reason BFQ_REASON_TOO_IDLE, but if the tdio + * is marked as timeout (in bfq_biodone()) first, we expire it + * for BFQ_REASON_TIMEOUT. The bfq scheduler should resume working + * (and pick another thread to serve). + * + * It is possible that this function gets called a litter after + * the thread pushes a bio with bfq_queue(), and thus a "fake timeout" + * happens. We treat it as the callout does not strike, and continue + * to serve the active_tdio. + * + * lock: + * BFQ_LOCK: protect bfq_diskctx->blockon and bfq_diskctx->active_tdio + * they should either changed in bfq_queue() or in this function, + * atomically. + * TDIO_LOCK: protect from dequeue() updateing the budget by the + * maybe_timeout branch. (Not necessary, because we already hold the + * BFQ_LOCK, and no one else could change the budget of the tdio) + * + * refcount: + * the refcount of bfq_diskctx->bfq_active_tdio will decrease one + * after this function. (The counterpart increasing is in bfq_dequeue(), + * before resetting the callout alarm.) + * + * AS timeout: + * during the waiting period, no bio is pushed by the being + * waited tdio + * + * Calling path: + * callout facility --> helper_msg_timeout --lwkt_msg--> helper thread + * --> me + */ +void +bfq_timeout(void *p) +{ + /* waiting time out: + * no deceptive idleness, and unblock dispatching + */ + struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)p; + struct bfq_thread_io *bfq_tdio; + + BFQ_LOCK(bfq_diskctx); + + /* + * the timeout occurs after the thread + * pushing one more bio + */ + if (bfq_diskctx->bfq_blockon == NULL) { + dsched_debug(BFQ_DEBUG_VERBOSE , "BFQ: fake AS timeout \n"); + goto rtn; + } + + bfq_diskctx->bfq_as_miss++; + + KKASSERT(bfq_diskctx->bfq_active_tdio); + bfq_tdio = bfq_diskctx->bfq_active_tdio; + + DSCHED_THREAD_IO_LOCK(&bfq_tdio->head); + + bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL|BFQ_AS_STAT_ONLY_MISS); + + bfq_diskctx->bfq_blockon = NULL; + bfq_diskctx->bfq_active_tdio = NULL; + dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: unblocked %p\n", bfq_tdio); + + wf2q_update_vd(bfq_tdio, bfq_tdio->budget - bfq_diskctx->bfq_remaining_budget); + /* + * the time slice expired before as timeout + * this should be REASON_TIMEOUT + */ + if (bfq_tdio->maybe_timeout) { + bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TIMEOUT); + dsched_debug(BFQ_DEBUG_VERBOSE, "%p time out in timeout()\n", bfq_tdio); + } else { + bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TOO_IDLE); + dsched_debug(BFQ_DEBUG_VERBOSE, "%p too idle\n", bfq_tdio); + } + + DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head); + + /* ref'ed in dequeue(), before resetting callout */ + dsched_thread_io_unref(&bfq_tdio->head); +rtn: + BFQ_UNLOCK(bfq_diskctx); + helper_msg_dequeue(bfq_diskctx); +} + +/* + * bfq_queue(): .queue callback of the bfq policy. + * + * A thread calls this function to hand in its I/O requests (bio). + * Their bios are stored in the per-thread queue, in tdio structure. + * Currently, the sync/async bios are queued together, which may cause + * some issues on performance. + * + * Besides queueing bios, this function also calculates the average + * thinking time and average seek distance of a thread, using the + * information in bio structure. + * + * If the calling thread is waiting by the bfq scheduler due to + * the AS feature, this function will cancel the callout alarm + * and resume the scheduler to continue serving this thread. + * + * lock: + * THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue() + * BFQ_LOCK: protect from other insertions/deletions in wf2q_augtree + * in bfq_queue() or bfq_dequeue(). + * + * refcount: + * If the calling thread is waited by the scheduler, the refcount + * of the related tdio will decrease by 1 after this function. The + * counterpart increasing is in bfq_dequeue(), before resetting the + * callout alarm. + * + * Return value: + * EINVAL: if bio->bio_buf->b_cmd == BUF_CMD_FLUSH + * 0: bio is queued successfully. + */ +static int +bfq_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio, + struct bio *bio) +{ + struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx; + struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio; + int original_qlength; + + /* we do not handle flush requests. push it down to dsched */ + if (__predict_false(bio->bio_buf->b_cmd == BUF_CMD_FLUSH)) + return (EINVAL); + + DSCHED_THREAD_IO_LOCK(tdio); + KKASSERT(tdio->debug_priv == 0xF00FF00F); + dsched_debug(BFQ_DEBUG_NORMAL, "bfq: tdio %p pushes bio %p\n", bfq_tdio, bio); + + dsched_set_bio_priv(bio, tdio); + dsched_thread_io_ref(tdio); + + if ((bio->bio_buf->b_cmd == BUF_CMD_READ) || + (bio->bio_buf->b_cmd == BUF_CMD_WRITE)) { + bfq_update_tdio_seek_avg(bfq_tdio, bio); + } + + bfq_update_tdio_ttime_avg(bfq_tdio); + + /* update last_bio_pushed_time */ + getmicrotime(&bfq_tdio->last_bio_pushed_time); + + if ((bfq_tdio->seek_samples > BFQ_VALID_MIN_SAMPLES) && + BFQ_TDIO_SEEKY(bfq_tdio)) + dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p is seeky\n", bfq_tdio); + + /* + * If a tdio taks too long to think, we disable the AS feature of it. + */ + if ((bfq_tdio->ttime_samples > BFQ_VALID_MIN_SAMPLES) && + (bfq_tdio->ttime_avg > BFQ_T_WAIT * (1000 / hz) * 1000) && + (bfq_tdio->service_received > bfq_tdio->budget / 8)) { + dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p takes too long time to think\n", bfq_tdio); + bfq_tdio->tdio_as_switch = 0; + } else { + bfq_tdio->tdio_as_switch = 1; + } + + /* insert the bio into the tdio's own queue */ + KKASSERT(lockstatus(&tdio->lock, curthread) == LK_EXCLUSIVE); + TAILQ_INSERT_TAIL(&tdio->queue, bio, link); +#if 0 + tdio->qlength++; +#endif + original_qlength = atomic_fetchadd_int(&tdio->qlength, 1); + DSCHED_THREAD_IO_UNLOCK(tdio); + /* + * A new thread: + * In dequeue function, we remove the thread + * from the aug-tree if it has no further bios. + * Therefore "new" means a really new thread (a + * newly created thread or a thread that pushed no more + * bios when the scheduler was waiting for it) or + * one that was removed from the aug-tree earlier. + */ + if (original_qlength == 0) { + /* + * a really new thread + */ + BFQ_LOCK(bfq_diskctx); + if (bfq_tdio != bfq_diskctx->bfq_active_tdio) { + /* insert the tdio into the wf2q queue */ + wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, bfq_tdio); + } else { + /* + * the thread being waited by the scheduler + */ + if (bfq_diskctx->bfq_blockon == bfq_tdio) { + /* + * XXX: possible race condition here: + * if the callout function is triggered when + * the following code is executed, then after + * releasing the TDIO lock, the callout function + * will set the thread inactive and it will never + * be inserted into the aug-tree (so its bio pushed + * this time will not be dispatched) until it pushes + * further bios + */ + bfq_diskctx->bfq_as_hit++; + bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL); + + if (callout_pending(&bfq_diskctx->bfq_callout)) + callout_stop(&bfq_diskctx->bfq_callout); + bfq_diskctx->bfq_blockon = NULL; + + /* ref'ed in dequeue(), before resetting callout */ + dsched_thread_io_unref(&bfq_tdio->head); + + dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p pushes a new bio when AS\n", bfq_tdio); + } + } + + BFQ_UNLOCK(bfq_diskctx); + } + + helper_msg_dequeue(bfq_diskctx); + + return 0; +} + +/* + * bfq_dequeue(): dispatch bios to the disk driver. + * + * This function will push as many bios as the number of free slots + * in the tag queue. + * + * In the progress of dispatching, the following events may happen: + * - Current thread is timeout: Expire the current thread for + * BFQ_REASON_TIMEOUT, and select a new thread to serve in the + * wf2q tree. + * + * - Current thread runs out of its budget: Expire the current thread + * for BFQ_REASON_OUT_OF_BUDGET, and select a new thread to serve + * + * - Current thread has no further bios in its queue: if the AS feature + * is turned on, the bfq scheduler sets an alarm and starts to suspend. + * The bfq_timeout() or bfq_queue() calls may resume the scheduler. + * + * Implementation note: The bios selected to be dispatched will first + * be stored in an array bio_do_dispatch. After this function releases + * all the locks it holds, it will call dsched_strategy_request_polling() + * for each bio stored. + * + * With the help of bfq_disk_ctx->pending_dequeue, + * there will be only one bfq_dequeue pending on the BFQ_LOCK. + * + * lock: + * BFQ_LOCK: protect from wf2q_augtree operations in bfq_queue() + * THREAD_IO_LOCK: locks the active_tdio. Protect from queue insertions + * in bfq_queue; Protect the active_tdio->budget + * + * refcount: + * If the scheduler decides to suspend, the refcount of active_tdio + * increases by 1. The counterpart decreasing is in bfq_queue() and + * bfq_timeout() + * blocking: + * May be blocking on the disk driver lock. It depends on drivers. + * + * Calling path: + * The callers could be: + * bfq_queue(), bfq_timeout() and the registered polling function. + * + * caller --> helper_msg_dequeue --lwkt_msg--> helper_thread-> me + * + */ +void +bfq_dequeue(struct dsched_disk_ctx *diskctx) +{ + int free_slots, + bio_index = 0, i, + remaining_budget = 0;/* remaining budget of current active process */ + + struct bio *bio, *bio_to_dispatch[33]; + struct bfq_thread_io *active_tdio = NULL; + struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx; + + BFQ_LOCK(bfq_diskctx); + atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 1, 0); + + /* + * The whole scheduler is waiting for further bios + * from process currently being served + */ + if (bfq_diskctx->bfq_blockon != NULL) + goto rtn; + + remaining_budget = bfq_diskctx->bfq_remaining_budget; + active_tdio = bfq_diskctx->bfq_active_tdio; + dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: dequeue: Im in. active_tdio = %p\n", active_tdio); + + free_slots = diskctx->max_tag_queue_depth - diskctx->current_tag_queue_depth; + KKASSERT(free_slots >= 0 && free_slots <= 32); + + if (active_tdio) + DSCHED_THREAD_IO_LOCK(&active_tdio->head); + + while (free_slots) { + /* Here active_tdio must be locked ! */ + if (active_tdio) { + /* + * the bio_done function has marked the current + * tdio timeout + */ + if (active_tdio->maybe_timeout) { + dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p time out in dequeue()\n", active_tdio); + wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget); + bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_TIMEOUT); + + /* there still exist bios not dispatched, + * reinsert the tdio into aug-tree*/ + if (active_tdio->head.qlength > 0) { + wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio); + KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count); + } + + active_tdio->maybe_timeout = 0; + DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); + active_tdio = NULL; + continue; + } + + /* select next bio to dispatch */ + /* TODO: a wiser slection */ + KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE); + bio = TAILQ_FIRST(&active_tdio->head.queue); + dsched_debug(BFQ_DEBUG_NORMAL, "bfq: the first bio in queue of active_tdio %p is %p\n", active_tdio, bio); + + dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p exists, remaining budget = %d, tdio budget = %d\n, qlength = %d, first bio = %p, first bio cmd = %d, first bio size = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength, bio, bio?bio->bio_buf->b_cmd:-1, bio?bio->bio_buf->b_bcount:-1); + + /* + * The bio is not read or write, just + * push it down. + */ + if (bio && (bio->bio_buf->b_cmd != BUF_CMD_READ) && + (bio->bio_buf->b_cmd != BUF_CMD_WRITE)) { + dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio); + KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE); + TAILQ_REMOVE(&active_tdio->head.queue, bio, link); + active_tdio->head.qlength--; + free_slots--; + +#if 0 + dsched_strategy_request_polling(diskctx->dp, bio, diskctx); +#endif + bio_to_dispatch[bio_index++] = bio; + KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth); + continue; + } + /* + * Run out of budget + * But this is not because the size of bio is larger + * than the complete budget. + * If the size of bio is larger than the complete + * budget, then use a complete budget to cover it. + */ + if (bio && (remaining_budget < BIO_SIZE(bio)) && + (remaining_budget != active_tdio->budget)) { + /* charge budget used */ + wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget); + bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET); + wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio); + dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: thread %p ran out of budget\n", active_tdio); + DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); + active_tdio = NULL; + } else { /* if (bio && remaining_budget < BIO_SIZE(bio) && remaining_budget != active_tdio->budget) */ + + /* + * Having enough budget, + * or having a complete budget and the size of bio + * is larger than that. + */ + if (bio) { + /* dispatch */ + remaining_budget -= BIO_SIZE(bio); + /* + * The size of the first bio is larger + * than the whole budget, we should + * charge the extra part + */ + if (remaining_budget < 0) + wf2q_update_vd(active_tdio, -remaining_budget); + /* compensate */ + wf2q_update_vd(active_tdio, -remaining_budget); + /* + * remaining_budget may be < 0, + * but to prevent the budget of current tdio + * to substract a negative number, + * the remaining_budget has to be >= 0 + */ + remaining_budget = MAX(0, remaining_budget); + dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio); + KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE); + TAILQ_REMOVE(&active_tdio->head.queue, bio, link); + free_slots--; + active_tdio->head.qlength--; + active_tdio->bio_dispatched++; + wf2q_inc_tot_service(&bfq_diskctx->bfq_wf2q, BIO_SIZE(bio)); + dsched_debug(BFQ_DEBUG_VERBOSE, + "BFQ: %p's bio dispatched, size=%d, remaining_budget = %d\n", + active_tdio, BIO_SIZE(bio), remaining_budget); +#if 0 + dsched_strategy_request_polling(diskctx->dp, bio, diskctx); +#endif + bio_to_dispatch[bio_index++] = bio; + KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth); + + } else { /* if (bio) */ + + KKASSERT(active_tdio); + /* + * If AS feature is switched off, + * expire the tdio as well + */ + if ((remaining_budget <= 0) || + !(bfq_diskctx->bfq_flag & BFQ_FLAG_AS) || + !active_tdio->tdio_as_switch) { + active_tdio->budget -= remaining_budget; + wf2q_update_vd(active_tdio, active_tdio->budget); + bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET); + DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); + active_tdio = NULL; + } else { + + /* no further bio, wait for a while */ + bfq_diskctx->bfq_blockon = active_tdio; + /* + * Increase ref count to ensure that + * tdio will not be destroyed during waiting. + */ + dsched_thread_io_ref(&active_tdio->head); + /* + * If the tdio is seeky but not thingking for + * too long, we wait for it a little shorter + */ + if (active_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES && BFQ_TDIO_SEEKY(active_tdio)) + callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT_MIN, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx); + else + callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx); + + /* save the start time of blocking */ + getmicrotime(&active_tdio->as_start_time); + + dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: blocked on %p, remaining_budget = %d\n", active_tdio, remaining_budget); + DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); + goto save_and_rtn; + } + } + } + } else { /* if (active_tdio) */ + /* there is no active tdio */ + + /* no pending bios at all */ + active_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q); + + if (!active_tdio) { + KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count == 0); + dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: no more eligible tdio!\n"); + goto save_and_rtn; + } + + /* + * A new tdio is picked, + * initialize the service related statistic data + */ + DSCHED_THREAD_IO_LOCK(&active_tdio->head); + active_tdio->service_received = 0; + + /* + * Reset the maybe_timeout flag, which + * may be set by a biodone after the the service is done + */ + getmicrotime(&active_tdio->service_start_time); + active_tdio->maybe_timeout = 0; + + remaining_budget = active_tdio->budget; + dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p selected, remaining budget = %d, tdio budget = %d\n, qlength = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength); + } + + }/* while (free_slots) */ + + /* reach here only when free_slots == 0 */ + if (active_tdio) /* && lockcount(&active_tdio->head.lock) > 0) */ + DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); + +save_and_rtn: + /* save the remaining budget */ + bfq_diskctx->bfq_remaining_budget = remaining_budget; + bfq_diskctx->bfq_active_tdio = active_tdio; +rtn: + BFQ_UNLOCK(bfq_diskctx); + /*dispatch the planned bios*/ + for (i = 0; i < bio_index; i++) + dsched_strategy_request_polling(diskctx->dp, bio_to_dispatch[i], diskctx); + +} + +/* + * bfq_slow_tdio(): decide whether a tdio is slow + * + * This function decides whether a tdio is slow by the speed + * estimated from the current time slice start time: if the + * tdio is not fast enough to consume its budget (or 2/3 + * its budget) within the time slice, it is judged slow. + * + * Called by bfq_expire() + * + * lock: + * THREAD_IO_LOCK is expected to be held. + * refcount: + * none + * + */ +static int +bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio) +{ + /** + * A tdio is considered slow if it can not finish its budget + * at its current average speed + */ + uint64_t usec_elapsed, service_received, speed; + int expect; + struct timeval tv = bfq_tdio->last_request_done_time; + + timevalsub (&tv, &bfq_tdio->service_start_time); + usec_elapsed = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec); + + /* discard absurd value */ + if (usec_elapsed < 20000) + return 0; + + service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT; + speed = service_received / usec_elapsed; + expect = (speed * BFQ_SLICE_TIMEOUT * (1000 * 1000 / hz)) >> BFQ_FIXPOINT_SHIFT; + + if (expect < 0) { + dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: overflow on calculating slow_tdio\n"); + return 0; + } + + if (expect < bfq_tdio->budget * 2 / 3) { + dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: %p is judged slow\n", bfq_tdio); + return 1; + } + + return 0; +} + +/* + * bfq_expire(): expire a tdio for a given reason. + * + * Different amount of the new budget will be assign to the expired + * tdio according to the following reasons: + * + * BFQ_REASON_TIMEOUT: + * The tdio does not consume its budget up within BFQ_SLICE_TIMEOUT ticks. + * We shall update the disk peak rate if the tdio is not seeky. The new + * budget will be the budget it actually consumes during this time + * slice. + * + * BFQ_REASON_TOO_IDLE: + * The tdio does not push any further bios during the scheduler is + * suspending. To ensure low global latency, this tdio should be + * punished by assign it the minimum budget. But if the tdio's not + * pushing any bio is because it is waiting for the dispatched bios + * to be done, we just keep the budget unchanged. + * + * BFQ_REASON_OUT_OF_BUDGET: + * The tdio runs out of its budget within the time slice. It usually + * indicates that the tdio is doing well. We increase the budget of it. + * + * lock: + * THREAD_IO_LOCK is expected to be held. + * BFQ_LOCK is expected to be held (needed by bfq_update_peak_rate()). + * + * refcount: none + * + * Callers: bfq_timeout(), bfq_dequeue() + * + */ +static void +bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason) +{ + int max_budget = bfq_diskctx->bfq_max_budget, + budget_left, + bio_in_flight, + service_received; + + service_received = bfq_tdio->service_received; + budget_left = bfq_tdio->budget - bfq_tdio->service_received; + + if (budget_left < 0) { + dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: budget down flow: %d, %d\n", bfq_tdio->budget, bfq_tdio->service_received); + budget_left = 0; + } + + KKASSERT(budget_left >= 0); + + switch (reason) { + case BFQ_REASON_TIMEOUT: + /* the tdio is not seeky so that we can update + * the disk peak rate based on the service received + * by the tdio + */ + if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) && + (!BFQ_TDIO_SEEKY(bfq_tdio))) + bfq_update_peak_rate(bfq_diskctx, bfq_tdio); + + /* max_budget may be updated */ + max_budget = bfq_diskctx->bfq_max_budget; + + /* update budget to service_received*/ + bfq_tdio->budget = MAX(service_received, BFQ_DEFAULT_MIN_BUDGET); + + break; + + case BFQ_REASON_TOO_IDLE: + /* + * the tdio is too slow, charge full budget + */ + if (bfq_slow_tdio(bfq_diskctx, bfq_tdio)) + wf2q_update_vd(bfq_tdio, budget_left); + + bio_in_flight = bfq_tdio->bio_dispatched - bfq_tdio->bio_completed; + KKASSERT(bio_in_flight >= 0); + /* + * maybe the tdio pushes no bio + * because it is waiting for some bios + * dispatched to be done, in this case + * we do not reduce the budget too harshly + */ + if (bio_in_flight > 0) { + bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, service_received); + } else { +#if 0 + bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, bfq_diskctx->bfq_max_budget / BFQ_MIN_BUDGET_FACTOR); +#endif + bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET; + } + + break; + case BFQ_REASON_OUT_OF_BUDGET: + + if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) && + (!BFQ_TDIO_SEEKY(bfq_tdio))) + bfq_update_peak_rate(bfq_diskctx, bfq_tdio); + + /* increase the budget */ + if (bfq_tdio->budget < BFQ_BUDGET_MULTIPLE_THRESHOLD) + bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget * 2); + else + bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget + BFQ_BUDG_INC_STEP); + break; + default: + break; + } +} + +/* + * bfq_update_peak_rate(): update the peak disk speed by sampling + * the throughput within a time slice. + * + * lock: + * BFQ_LOCK is expected to be held + * + * refcount: + * none + * + * Caller: bfq_expire() + */ +static void +bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio) +{ + struct timeval tv = bfq_tdio->last_request_done_time; + uint64_t usec, service_received, peak_rate; + + + timevalsub (&tv, &bfq_tdio->service_start_time); + usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec); + + /* discard absurd value */ + if (usec < 2000 || usec > (BFQ_SLICE_TIMEOUT * (1000 / hz) * 1000)) { + dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: absurd interval for peak rate\n"); + return; + } + + service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT; + peak_rate = service_received / usec; + bfq_diskctx->bfq_peak_rate = (peak_rate + 7 * bfq_diskctx->bfq_peak_rate) / 8; + bfq_diskctx->bfq_peak_rate_samples++; + + /* update the max_budget according to the peak rate */ + if (bfq_diskctx->bfq_peak_rate_samples > BFQ_VALID_MIN_SAMPLES) { + bfq_diskctx->bfq_peak_rate_samples = BFQ_VALID_MIN_SAMPLES; + /* + * if the auto max budget adjust is disabled, + * the bfq_max_budget will always be BFQ_DEFAULT_MAX_BUDGET; + */ + if (bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) { + bfq_diskctx->bfq_max_budget = + (uint32_t)((BFQ_SLICE_TIMEOUT * (1000 / hz) * bfq_diskctx->bfq_peak_rate * 1000) >> BFQ_FIXPOINT_SHIFT); + dsched_debug(BFQ_DEBUG_NORMAL, "max budget updated to %d\n", bfq_diskctx->bfq_max_budget); + } + } +} + +/* + * bfq_update_tdio_seek_avg(): update the average seek distance of a + * tdio. + * + * lock: + * THREAD_IO_LOCK is expected to be held. + * + * refcount: + * none + * + * Caller: bfq_queue() + */ +static void +bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp) +{ + off_t seek; + + /* the first bio it dispatches, + * we do not calculate the seek_avg, + * just update the last_seek_end + */ + if (bfq_tdio->seek_samples == 0) { + ++bfq_tdio->seek_samples; + goto rtn; + } + + seek = ABS(bp->bio_offset - bfq_tdio->last_seek_end); + + /* + * we do not do seek_samples++, + * because the seek_total may overflow if seek_total += seek, + */ + bfq_tdio->seek_samples = (7 * bfq_tdio->seek_samples + 256) / 8; + bfq_tdio->seek_total = (7 * bfq_tdio->seek_total + 256 * seek) / 8; + bfq_tdio->seek_avg = (bfq_tdio->seek_total + bfq_tdio->seek_samples / 2) / bfq_tdio->seek_samples; + + dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: tdio %p seek_avg updated to %" PRIu64 "\n", bfq_tdio, bfq_tdio->seek_avg); + +rtn: + bfq_tdio->last_seek_end = bp->bio_offset + BIO_SIZE(bp); +} + +/* + * bfq_update_tdio_ttime_avg(): update the average thinking time + * of a tdio. + * + * The thinking time is used to switch on / off the tdio's AS feature + * + * lock: + * THREAD_IO_LOCK is expected to be held. + * + * refcount: + * none + * + * Caller: + * bfq_queue() + * + */ +static void +bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio) +{ + struct timeval tv, after_start; + uint64_t usec; + + if (bfq_tdio->ttime_samples == 0) { + ++bfq_tdio->ttime_samples; + return; + } + + getmicrotime(&tv); + after_start = bfq_tdio->last_request_done_time; + +#if 0 + timevalsub (&tv, &bfq_tdio->last_request_done_time); +#endif + /* + * Try the interval between two bios are pushed, + * instead of between last_request_done_time and + * the current time. + */ + + timevalsub (&tv, &bfq_tdio->last_bio_pushed_time); + + timevalsub (&after_start, &bfq_tdio->service_start_time); + + /* + * tv.tv_sec < 0 means the last reauest done time is + * after the current time. + * this may happen because the biodone function is not blocked + * + * after_start.tv_sec < 0 means that the last bio done happens + * before the current service slice, and we should drop this value. + */ + if (tv.tv_sec < 0 || after_start.tv_sec < 0) + return; + + usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec); + + bfq_tdio->ttime_samples = (7 * bfq_tdio->ttime_samples + 256) / 8; + bfq_tdio->ttime_total = (7 * bfq_tdio->ttime_total + 256 * usec) / 8; + bfq_tdio->ttime_avg = (bfq_tdio->ttime_total + 128) / bfq_tdio->ttime_samples; + +} + +/* + * This function will also update the bfq_max_time_slice field + * + * tv: the timeval structure representing the length of time slice + */ +static void +bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv) +{ + uint32_t msec; + + msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 ); + + if (msec > 3 * BFQ_SLICE_TIMEOUT * (1000 / hz)) + atomic_add_int(&bfq_diskctx->bfq_high_time_slice_count, 1); + + bfq_diskctx->bfq_avg_time_slice = + (7 * bfq_diskctx->bfq_avg_time_slice + msec) / 8; + + if (bfq_diskctx->bfq_max_time_slice < msec) + bfq_diskctx->bfq_max_time_slice = msec; +} +/* + * This function will also update the bfq_as_max_wait field + * flag: BFQ_AS_STAT_ALL, BFQ_AS_STAT_ONLY_MISS + * + */ +static void +bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag) +{ + struct timeval tv; + uint32_t msec; + getmicrotime(&tv); + timevalsub (&tv, &bfq_tdio->as_start_time); + + /* approximately divide 1000 by left shift 10 */ + msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 ); + + /* ridiculous value */ + if (msec > 10000) { + dsched_debug(BFQ_DEBUG_NORMAL, "bfq: ridiculous as wait time!\n"); + return; + } + + if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz)) + atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count, 1); + + if (flag & BFQ_AS_STAT_ALL) { + bfq_diskctx->bfq_as_avg_wait_all = + (7 * bfq_diskctx->bfq_as_avg_wait_all + msec) / 8; + } + + if (flag & BFQ_AS_STAT_ONLY_MISS) { + bfq_diskctx->bfq_as_avg_wait_miss = + (7 * bfq_diskctx->bfq_as_avg_wait_miss + msec) / 8; + } + + /* update the maximum waiting time */ + if (bfq_diskctx->bfq_as_max_wait < msec) + bfq_diskctx->bfq_as_max_wait = msec; + + return; +} + +static int +bfq_mod_handler(module_t mod, int type, void *unused) +{ + static struct sysctl_ctx_list sysctl_ctx; + static struct sysctl_oid *oid; + static char version[16]; + int error; + + ksnprintf(version, sizeof(version), "%d.%d", + dsched_bfq_version_maj, dsched_bfq_version_min); + + switch (type) { + case MOD_LOAD: + bzero(&bfq_stats, sizeof(struct dsched_bfq_stats)); + if ((error = dsched_register(&dsched_bfq_policy))) + return (error); + + sysctl_ctx_init(&sysctl_ctx); + oid = SYSCTL_ADD_NODE(&sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_dsched), + OID_AUTO, + "bfq", + CTLFLAG_RD, 0, ""); + bfq_mod_oid = oid; + + SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid), + OID_AUTO, "version", CTLFLAG_RD, version, 0, "bfq version"); + helper_init_global(); + + kprintf("BFQ scheduler policy version %d.%d loaded. sizeof(bfq_thread_io) = %zu\n", + dsched_bfq_version_maj, dsched_bfq_version_min, sizeof(struct bfq_thread_io)); + break; + + case MOD_UNLOAD: + if ((error = dsched_unregister(&dsched_bfq_policy))) + return (error); + sysctl_ctx_free(&sysctl_ctx); + kprintf("BFQ scheduler policy unloaded\n"); + break; + + default: + break; + } + + return 0; +} + +int +bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS) +{ + struct bfq_disk_ctx *bfq_diskctx = arg1; + int as_switch, error; + + as_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ? 1 : 0); + error = sysctl_handle_int(oidp, &as_switch, 0, req); + if (error || !req->newptr) + return error; + + if (as_switch == 1) + bfq_diskctx->bfq_flag |= BFQ_FLAG_AS; + else if (as_switch == 0) + bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AS); + else + return 0; + + return error; +} + +int +bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS) +{ + struct bfq_disk_ctx *bfq_diskctx = arg1; + int auto_max_budget_switch, error; + auto_max_budget_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) ? 1 : 0); + error = sysctl_handle_int(oidp, &auto_max_budget_switch, 0, req); + if (error || !req->newptr) + return error; + + if (auto_max_budget_switch == 1) + bfq_diskctx->bfq_flag |= BFQ_FLAG_AUTO_MAX_BUDGET; + else if (auto_max_budget_switch == 0) + bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AUTO_MAX_BUDGET); + else + return 0; + + return error; +} + +DSCHED_POLICY_MODULE(dsched_bfq, bfq_mod_handler); diff --git a/sys/kern/dsched/bfq/bfq.h b/sys/kern/dsched/bfq/bfq.h new file mode 100644 index 0000000..2ba8d59 --- /dev/null +++ b/sys/kern/dsched/bfq/bfq.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2011 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Brills Peng + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#ifndef _DSCHED_BFQ_H_ +#define _DSCHED_BFQ_H_ + +#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES) + +#ifndef _SYS_QUEUE_H_ +#include +#endif + +#ifndef _SYS_BIO_H_ +#include +#endif + +#ifndef _SYS_BIOTRACK_H_ +#include +#endif + +#ifndef _SYS_SPINLOCK_H_ +#include +#endif + +#ifndef _SYS_TREE_H_ +#include +#endif + +#ifndef _SYS_DSCHED_H_ +#include +#endif + +#ifndef _DSCHED_BFQ_WF2Q_H_ +#include +#endif + +struct wf2q_t; + +struct bfq_thread_io { + struct dsched_thread_io head; + RB_ENTRY(bfq_thread_io) entry; + int budget; /* The budget of a thread */ + int vd; /* Virtual deadline (finish time) */ + int ve; /* Virtual eligible time (start time) */ + int min_vd; /* Minimum vd among the sub trees, used for augmented rb-tree */ + int weight; /* Weight of the thread, the higher, the more + chance to be dispatched the thread will have */ + + volatile int maybe_timeout; /* a flag indicating that the tdio may + expire, only when active_tdio = this is it valid */ + int tdio_as_switch; + + /* Statistic data */ + off_t last_seek_end; /* the end point of seeking of the last bio + pushed down */ + uint32_t seek_samples; /* averange seek length samples */ + off_t seek_avg; /* averange seek length, fixed point */ + off_t seek_total; + + uint32_t ttime_samples; /* averange think time samples */ + uint64_t ttime_avg; /* averange think time, usec */ + uint64_t ttime_total; + + struct timeval service_start_time; /* the time when the first request + of the current service period is dispatched */ + struct timeval last_request_done_time; /* the time when the last + request is done */ + struct timeval as_start_time; /* the start time of AS waiting */ + struct timeval last_bio_pushed_time; + + uint32_t service_received; /* the amount of read/write during + the time slice */ + uint32_t bio_dispatched; /* number of bios dispatched during + the current period */ + uint32_t bio_completed; /* number of bios completed during + the current period */ +}; + +struct bfq_disk_ctx { + struct dsched_disk_ctx head; + + struct lock bfq_lock; + + struct callout bfq_callout; /* the blocking-timer callout */ + struct wf2q_t bfq_wf2q; /* the wf2q scheduler */ + + struct bfq_thread_io *bfq_blockon; /* waiting on any */ + struct bfq_thread_io *bfq_active_tdio; /* currently active tdio */ + + int pending_dequeue; /* number of dequeue() calls pending + on BFQ_LOCK */ + + int bfq_max_budget; + int bfq_remaining_budget; /* remaining budget of the current tdio */ + + uint32_t bfq_flag; /* SEE BFQ_FLAG_* define for all flags */ + + /* Statistic data */ + uint32_t bfq_peak_rate_samples; /* peak rate samples */ + uint64_t bfq_peak_rate; /* peak rate, fixed point */ + + int bfq_as_miss; + int bfq_as_hit; + + uint32_t bfq_as_avg_wait_miss; /* average AS waiting time for + only AS miss, ms */ + uint32_t bfq_as_avg_wait_all; /* average AS waiting time for all, ms */ + uint32_t bfq_as_max_wait; /* maximum AS waiting time, ms */ + uint32_t bfq_as_max_wait2; /* maximum AS waiting time(from callout), ms */ + + int bfq_as_high_wait_count; /* the number of times when AS waiting time + is longer than 5 * BFQ_T_WAIT_MIN (50ms now) */ + int bfq_as_high_wait_count2; /* the number of times when AS waiting + time is longer than 5 * BFQ_T_WAIT_MIN (50ms now) */ + + uint32_t bfq_avg_time_slice; /* average time slice length, ms */ + uint32_t bfq_max_time_slice; /* maximum time slice length, ms */ + int bfq_high_time_slice_count; /* the number of times when a time slice + is longer than 5 * BFQ_SLICE_TIMEOUT */ + + struct sysctl_ctx_list bfq_sysctl_ctx; /* bfq statistics interface + with sysctl */ + /* helper thread and its lwkt message cache and port*/ + struct thread *helper_thread; + struct objcache *helper_msg_cache; + struct lwkt_port helper_msg_port; +}; + +enum bfq_expire_reason { + BFQ_REASON_TIMEOUT = 0, + BFQ_REASON_TOO_IDLE, + BFQ_REASON_OUT_OF_BUDGET, + BFQ_REASON_NO_MORE_REQ +}; + +#define BFQ_FLAG_AS 0x01 +#define BFQ_FLAG_AUTO_MAX_BUDGET 0x02 + +#define BFQ_TDIO_SEEKY(x) (((x)->seek_avg) > (1024 * SECT_SIZE)) + +#define BFQ_LOCKINIT(x) \ + lockinit(&(x)->bfq_lock, "bfqwf2q", 0, LK_CANRECURSE); + +#define BFQ_LOCK(x) do { \ + dsched_disk_ctx_ref(&(x)->head); \ + lockmgr(&(x)->bfq_lock, LK_EXCLUSIVE); \ + } while(0) + +#define BFQ_UNLOCK(x) do { \ + lockmgr(&(x)->bfq_lock, LK_RELEASE); \ + dsched_disk_ctx_unref(&(x)->head); \ + } while(0) + +#define SECT_SIZE 512 /* XXX: DEV_BSIZE? */ +#define BFQ_DEBUG_CRITICAL 1 +#define BFQ_DEBUG_NORMAL 2 +#define BFQ_DEBUG_VERBOSE 3 +#define BFQ_DEFAULT_MAX_BUDGET (1024*512) /* 1024 sectors / 0.2sec */ +#define BFQ_DEFAULT_MIN_BUDGET (32*512) /* 32 sectors / 0.2sec */ +#define BFQ_BUDG_INC_STEP (1*128*512) /* The linear increasing step of budget */ + +/* If the budget is larger than this threshold, + * it will get linear increment, else, + * it will get exponential increment.*/ +#define BFQ_BUDGET_MULTIPLE_THRESHOLD (256*512) + +#define BFQ_DEFAULT_WEIGHT 1 + +/* Get the size of a bio */ +#define BIO_SIZE(x) ((x)->bio_buf->b_bcount) + +/* Anticipatory waiting time (ticks) ~ 20ms, min ~ 10ms */ +#define BFQ_T_WAIT ((hz/50) > 5 ? (hz/50) : 5) + +#define BFQ_T_WAIT_MIN ((hz/100 > 0) ? (hz/100) : 1) + +/* Time slice for each service period ~200ms (ticks) */ +#define BFQ_SLICE_TIMEOUT (hz/5) + +#define BFQ_FIXPOINT_SHIFT 10 /* fixed point arithmetic shift */ + +#define BFQ_VALID_MIN_SAMPLES 80 /* minimum number of samples */ + +#define ABS(x) (((x) < 0) ? (-(x)) : (x)) + +/* as statistics define */ +#define BFQ_AS_STAT_ALL 0x1 +#define BFQ_AS_STAT_ONLY_MISS 0x2 + +/* functions helper thread calls */ +void bfq_timeout(void *); +void bfq_dequeue(struct dsched_disk_ctx *); +void bfq_helper_destroy_tdio(struct dsched_thread_io *, struct bfq_disk_ctx *); + +/* sysctl handlers, registered in the helper thread */ +int bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS); +int bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS); + +#endif /* _KERNEL || _KERNEL_STRUCTURES */ +struct dsched_bfq_stats { + int32_t as_missed; + int32_t as_hit; + int32_t as_fake; + int32_t unused; +}; +#endif /*_DSCHED_BFQ_H_ */ diff --git a/sys/kern/dsched/bfq/bfq_helper_thread.c b/sys/kern/dsched/bfq/bfq_helper_thread.c new file mode 100644 index 0000000..75d8a04 --- /dev/null +++ b/sys/kern/dsched/bfq/bfq_helper_thread.c @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2011 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Brills Peng + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +/* + * bfq_helper_thread.c: + * Thread function of the helper thread and + * message sending routines. + * + * XXX: The current approach of serializing using lwkt messages is suboptimal. + * The idea is to replace it with way more fine-grained and lockless + * accesses spread all over the place. It makes things more complicated, + * but it will also improve performance significantly. + * + * The sysctl node of bfq is also initialized + * here. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +extern struct sysctl_oid *bfq_mod_oid; +extern struct dsched_policy dsched_bfq_policy; + +static void helper_thread(struct bfq_disk_ctx *bfq_diskctx); +static int helper_msg_exec(helper_msg_t msg); +static void helper_sysctl_init(struct bfq_disk_ctx *bfq_diskctx); + +MALLOC_DEFINE(M_HELPER, "bfq", "BFQ helper thread message allocations"); + +/* + * All threads share one dispose port + */ +static struct lwkt_port helper_dispose_port; + +/* XXX: should be an mpipe */ +static struct objcache_malloc_args helper_msg_malloc_args = { + sizeof(struct helper_msg), M_HELPER }; + + +static helper_msg_t +helper_msg_get(struct bfq_disk_ctx *bfq_diskctx) +{ + /* + * XXX: wait is OK? + */ + return objcache_get(bfq_diskctx->helper_msg_cache, M_WAITOK); +} + +static int +helper_msg_put(struct bfq_disk_ctx *bfq_diskctx, helper_msg_t msg) +{ + objcache_put(bfq_diskctx->helper_msg_cache, msg); + return 0; +} + +static void +helper_msg_autofree_reply(lwkt_port_t port, lwkt_msg_t msg) +{ + helper_msg_t hm = (helper_msg_t)msg; + helper_msg_put(hm->bfq_diskctx, (helper_msg_t)msg); +} + +/* + * Initialize the dispose port. All helper threads share this port. + * Must be called only once, and before any helper thread being created. + * + * Called by bfq.c: bfq_moc_handler() + */ +void +helper_init_global(void) +{ + lwkt_initport_replyonly(&helper_dispose_port, helper_msg_autofree_reply); +} + +/* + * Helper thread initialization function: + * initialize the per-disk objcache and create the + * helper thread. + * + * Called by bfq.c:bfq_prepare() + */ +void +helper_init(struct bfq_disk_ctx *bfq_diskctx) +{ + struct thread *phelper_thread; + + bfq_diskctx->helper_msg_cache = objcache_create("bfq-helper-msg-cache", 0, 0, + NULL, NULL, NULL, + objcache_malloc_alloc, + objcache_malloc_free, + &helper_msg_malloc_args); + + lwkt_create((void (*) (void *)) helper_thread, bfq_diskctx, + &phelper_thread, NULL, 0, -1, + "bfq_helper_td_%s", bfq_diskctx->head.dp->d_cdev->si_name); + + bfq_diskctx->helper_thread = phelper_thread; +} + +static void +helper_msg_send(struct bfq_disk_ctx *bfq_diskctx, uint32_t cmd, helper_msg_t helper_msg) +{ + lwkt_port_t port = &bfq_diskctx->helper_msg_port; + + lwkt_initmsg(&helper_msg->hdr, &helper_dispose_port, 0); + helper_msg->bfq_diskctx = bfq_diskctx; + helper_msg->hdr.u.ms_result = cmd; + + if (port->mpu_td == curthread){ + helper_msg_exec(helper_msg); + lwkt_replymsg(&helper_msg->hdr, 0); + } else { + lwkt_sendmsg(port, (lwkt_msg_t)helper_msg); + } +} + +/* + * Deallocate the objcache. + * Called by bfq.c: bfq_teardown() + */ +void +helper_uninit(struct bfq_disk_ctx *bfq_diskctx) +{ + objcache_destroy(bfq_diskctx->helper_msg_cache); +} + +static void +helper_sysctl_init(struct bfq_disk_ctx *bfq_diskctx) +{ + struct sysctl_oid *oid; + + sysctl_ctx_init(&bfq_diskctx->bfq_sysctl_ctx); + + if (!bfq_mod_oid){ + kprintf("Failed to create BFQ dev sysctl node!\n"); + return; + } + + oid = SYSCTL_ADD_NODE(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(bfq_mod_oid), + OID_AUTO, + bfq_diskctx->head.dp->d_cdev->si_name, + CTLFLAG_RD, 0, ""); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "max_budget", + CTLFLAG_RW, + &bfq_diskctx->bfq_max_budget, + 0, + "BFQ max budget"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "peak_rate", + CTLFLAG_RD, + &bfq_diskctx->bfq_peak_rate, + 0, + "BFQ estimated peak rate"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "peak_samples", + CTLFLAG_RD, + &bfq_diskctx->bfq_peak_rate_samples, + 0, + "BFQ estimated peak rate samples"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "as_miss", + CTLFLAG_RD, + &bfq_diskctx->bfq_as_miss, + 0, + "BFQ AS miss"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "as_hit", + CTLFLAG_RD, + &bfq_diskctx->bfq_as_hit, + 0, + "BFQ AS hit"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "as_wait_avg_all", + CTLFLAG_RD, + &bfq_diskctx->bfq_as_avg_wait_all, + 0, + "BFQ AS waitall"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "as_wait_avg_miss", + CTLFLAG_RD, + &bfq_diskctx->bfq_as_avg_wait_miss, + 0, + "BFQ AS waitmiss"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "as_wait_max", + CTLFLAG_RD, + &bfq_diskctx->bfq_as_max_wait, + 0, + "BFQ AS waitmax"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "as_wait_max2", + CTLFLAG_RD, + &bfq_diskctx->bfq_as_max_wait2, + 0, + "BFQ AS waitmax2"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "as_high_wait_count", + CTLFLAG_RD, + &bfq_diskctx->bfq_as_high_wait_count, + 0, + "BFQ AS high count"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "as_high_wait_count2", + CTLFLAG_RD, + &bfq_diskctx->bfq_as_high_wait_count2, + 0, + "BFQ AS high count2"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "avg_time_slice", + CTLFLAG_RD, + &bfq_diskctx->bfq_avg_time_slice, + 0, + "BFQ average time slice"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "max_time_slice", + CTLFLAG_RD, + &bfq_diskctx->bfq_max_time_slice, + 0, + "BFQ max time slice"); + + SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, + SYSCTL_CHILDREN(oid), + OID_AUTO, + "high_time_slice_count", + CTLFLAG_RD, + &bfq_diskctx->bfq_high_time_slice_count, + 0, + "BFQ high time slice count"); + + SYSCTL_ADD_PROC(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid), + OID_AUTO, "as_switch", CTLTYPE_INT|CTLFLAG_RW, + bfq_diskctx, 0, bfq_sysctl_as_switch_handler, "I", "as_switch"); + + SYSCTL_ADD_PROC(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid), + OID_AUTO, "auto_max_budget_switch", CTLTYPE_INT|CTLFLAG_RW, + bfq_diskctx, 0, bfq_sysctl_auto_max_budget_handler, "I", "amb_switch"); +} + +static void +helper_thread(struct bfq_disk_ctx *bfq_diskctx) +{ + struct dsched_thread_io *tdio; + + int r; + helper_msg_t msg; + + tdio = dsched_new_policy_thread_tdio(&bfq_diskctx->head, &dsched_bfq_policy); + + lwkt_initport_thread(&bfq_diskctx->helper_msg_port, curthread); + dsched_disk_ctx_ref(&bfq_diskctx->head); + helper_sysctl_init(bfq_diskctx); + + dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: helper thread created\n"); +#if 0 + /* XXX: why mplock?! */ + get_mplock(); +#endif + + for(;;) { + msg = (helper_msg_t)lwkt_waitport(&bfq_diskctx->helper_msg_port, 0); + dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: helper: msg recv: %d\n", msg->hdr.u.ms_result); + r = helper_msg_exec(msg); + lwkt_replymsg(&msg->hdr, 0); + /* + * received BFQ_MSG_KILL + */ + if (r == -1) + break; + } + +#if 0 + rel_mplock(); +#endif + + sysctl_ctx_free(&bfq_diskctx->bfq_sysctl_ctx); + dsched_disk_ctx_unref(&bfq_diskctx->head); + dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: helper: die peacefully\n"); + lwkt_exit(); +} + +static int +helper_msg_exec(helper_msg_t msg) +{ + struct bfq_disk_ctx *bfq_diskctx; + + bfq_diskctx = msg->bfq_diskctx; + + + switch (msg->hdr.u.ms_result) + { + case BFQ_MSG_DEQUEUE: + if (atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 0, 1)) + bfq_dequeue((struct dsched_disk_ctx *)bfq_diskctx); + break; + case BFQ_MSG_AS_TIMEOUT: + bfq_timeout(bfq_diskctx); + break; + + case BFQ_MSG_DESTROY_TDIO: + bfq_helper_destroy_tdio(msg->tdio, bfq_diskctx); + break; + + case BFQ_MSG_KILL: + return -1; + + default: + break; + } + return 0; +} + +void +helper_msg_dequeue(struct bfq_disk_ctx *bfq_diskctx) +{ + helper_msg_t helper_msg = helper_msg_get(bfq_diskctx); + + helper_msg_send(bfq_diskctx, BFQ_MSG_DEQUEUE, helper_msg); +} + +void +helper_msg_as_timeout(struct bfq_disk_ctx *bfq_diskctx) +{ + helper_msg_t helper_msg = helper_msg_get(bfq_diskctx); + /** + * For statisticsal use, temporary + * ------------------------------ + */ + struct bfq_thread_io *bfq_tdio; + struct timeval tv; + uint32_t msec; + + + bfq_tdio = bfq_diskctx->bfq_blockon; + if (bfq_tdio) { + getmicrotime(&tv); + timevalsub(&tv, &bfq_tdio->as_start_time); + msec = ((uint64_t)(1000000*tv.tv_sec + tv.tv_usec)) >> 10; + if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz)) + atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count2, 1); + if (msec > bfq_diskctx->bfq_as_max_wait2) + bfq_diskctx->bfq_as_max_wait2 = msec; + } + /* ----------------------------- */ + + helper_msg_send(bfq_diskctx, BFQ_MSG_AS_TIMEOUT, helper_msg); +} + +void +helper_msg_destroy_tdio(struct bfq_disk_ctx *bfq_diskctx, struct dsched_thread_io *tdio) +{ + helper_msg_t helper_msg = helper_msg_get(bfq_diskctx); + + helper_msg->tdio = tdio; + helper_msg_send(bfq_diskctx, BFQ_MSG_DESTROY_TDIO, helper_msg); +} + +void +helper_msg_kill(struct bfq_disk_ctx *bfq_diskctx) +{ + helper_msg_t helper_msg = helper_msg_get(bfq_diskctx); + + helper_msg_send(bfq_diskctx, BFQ_MSG_KILL, helper_msg); +} diff --git a/sys/kern/dsched/bfq/bfq_helper_thread.h b/sys/kern/dsched/bfq/bfq_helper_thread.h new file mode 100644 index 0000000..8df731b --- /dev/null +++ b/sys/kern/dsched/bfq/bfq_helper_thread.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2011 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Brills Peng + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#ifndef _DSCHED_BFQ_HELPER_THREAD_H_ +#define _DSCHED_BFQ_HELPER_THREAD_H_ +#include + +typedef struct helper_msg { + struct lwkt_msg hdr; + struct bfq_disk_ctx *bfq_diskctx; + struct dsched_thread_io *tdio; +} *helper_msg_t; + +enum helper_msg_cmd { + BFQ_MSG_DEQUEUE = 1, + BFQ_MSG_AS_TIMEOUT, + BFQ_MSG_DESTROY_TDIO, + BFQ_MSG_KILL +}; + +void helper_init_global(void); +void helper_init(struct bfq_disk_ctx *bfq_diskctx); +void helper_uninit(struct bfq_disk_ctx *bfq_diskctx); +void helper_msg_dequeue(struct bfq_disk_ctx *bfq_diskctx); +void helper_msg_as_timeout(struct bfq_disk_ctx *bfq_diskctx); +void helper_msg_destroy_tdio(struct bfq_disk_ctx *bfq_diskctx, struct dsched_thread_io *tdio); +void helper_msg_kill(struct bfq_disk_ctx *bfq_diskctx); + +#endif + diff --git a/sys/kern/dsched/bfq/bfq_ktr.h b/sys/kern/dsched/bfq/bfq_ktr.h new file mode 100644 index 0000000..6941e68 --- /dev/null +++ b/sys/kern/dsched/bfq/bfq_ktr.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2011 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Brills Peng + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +/* + * Kernel tracing facility definitions for BFQ + * + * This header can ONLY be included by bfq.c + */ +#ifndef _DSCHED_BFQ_BFQ_C_ +#error "bfq_ktr.h should only be included in sys/kern/dsched/bfq/bfq.c!" +#endif + +#ifndef _DSCHED_BFQ_KTR_H_ +#define _DSCHED_BFQ_KTR_H_ +#include + +#if !defined(KTR_DSCHED_BFQ) +#define KTR_DSCHED_BFQ KTR_ALL +#endif +KTR_INFO_MASTER(dsched_bfq); + +/* thread created */ +KTR_INFO(KTR_DSCHED_BFQ, dsched_bfq, thread_created, 0, "%p", sizeof(void *)); + +/* average seek distance per thread */ +KTR_INFO(KTR_DSCHED_BFQ, dsched_bfq, thread_seek_avg, 0, "%p: %" PRIu64, sizeof(off_t) + sizeof(void *)); + +/* average thinking time per thread */ +KTR_INFO(KTR_DSCHED_BFQ, dsched_bfq, thread_ttime_avg, 0, "%p: %" PRIu64, sizeof(off_t) + sizeof(void *)); + +#endif + diff --git a/sys/kern/dsched/bfq/wf2q.c b/sys/kern/dsched/bfq/wf2q.c new file mode 100644 index 0000000..c2f5e54 --- /dev/null +++ b/sys/kern/dsched/bfq/wf2q.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2011 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Brills Peng + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Augment RB-tree for B-WF2Q+ queuing algorithm: + * - The key of the binary tree is the virtual eligible time (start time) + * - Each node maintains an additional min_vd value, which + * is the minimum virtual deadline (finish time) among the node and its + * children + * - Every operation on the tree changing the childs of a node will + * trigger RB_AUGMENT() marco, which change min_vd along the path to + * the root + */ + +#include + + +#undef RB_AUGMENT +#define RB_AUGMENT(x) wf2q_augment_func(x); + +static void +wf2q_augment_func(struct bfq_thread_io *node) +{ + struct bfq_thread_io *tmp = node, *tmp2; + int min_vd; + do{ + min_vd = tmp->vd; + tmp2 = RB_LEFT(tmp, entry); + min_vd = tmp2 ? MIN(tmp2->min_vd, min_vd) : min_vd; + tmp2 = RB_RIGHT(tmp, entry); + min_vd = tmp2 ? MIN(tmp2->min_vd, min_vd) : min_vd; + tmp->min_vd = min_vd; + }while((tmp = RB_PARENT(tmp,entry))); +} + +/* + * The rb-tree is indexed by the virtual eligible (start) time + */ +static int +bfq_thread_io_cmp(struct bfq_thread_io *a, struct bfq_thread_io *b) +{ + if (a->ve - b->ve <= 0) + return -1; + return 1; +} + +RB_PROTOTYPE(wf2q_augtree_t, bfq_thread_io, entry,); +RB_GENERATE(wf2q_augtree_t, bfq_thread_io, entry, bfq_thread_io_cmp); + +/* + * The algorithm is from + * I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline + * First: A Flexible and Accurate Mechanism for Proportional Share + * Resource Allocation,'' technical report. + * + * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf + * + * - Partition the tree into two parts by ve: + * - One part contains nodes with ve smaller than vtime + * - The other part contains nodes with ve larger than vtime + * - In the first part, find the node with minimum vd, along the + * min_vd value path + * + * Returns + * NULL, if no node with ve smaller than vtime + * or the elegible node with minimum vd. + */ +static struct bfq_thread_io * +wf2q_augtree_get_eligible_with_min_vd(struct wf2q_augtree_t *tree, int vtime) +{ + struct bfq_thread_io *node = RB_ROOT(tree), *st_tree = NULL, *path_req = NULL; + while (node) { + if (node->ve <= vtime) { + /* update node with earliest deadline along path. */ + if ((!path_req) || (path_req->vd > node->vd)) + path_req = node; + /* update root of subtree containing earliest deadline */ + if ((!st_tree) || (RB_LEFT(node,entry) && st_tree->min_vd > RB_LEFT(node,entry)->min_vd)) + st_tree = RB_LEFT(node,entry); + node = RB_RIGHT(node, entry); + } else + node = RB_LEFT(node, entry); + } + /* check whether node with earliest deadline was along path */ + if ((!st_tree) || (st_tree->min_vd >= path_req->vd)) + return path_req; + /* return node with earliest deadline from subtree */ + for (node = st_tree; node; ) { + /* if node found, return it */ + if (st_tree->min_vd == node->vd) + return node; + /* XXX: modified temporarily */ + if (RB_LEFT(node, entry) && node->min_vd == RB_LEFT(node, entry)->min_vd) + node = RB_LEFT(node, entry); + else + node = RB_RIGHT(node, entry); + } + return NULL; +} + +/* + * This function initializes a wf2q structure + */ +void +wf2q_init(struct wf2q_t *pwf2q) +{ + RB_INIT(&pwf2q->wf2q_augtree); + pwf2q->wf2q_virtual_time = 0; + pwf2q->wf2q_tdio_count = 0; +} + +/* + * Insert a tdio into a wf2q queue. + * The virtual eligible (start) time and deadline is handled + * according to the current virtual time (in wf2q_t). + */ +void +wf2q_insert_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio) +{ + /* + * TODO: The anticipatory parts + * start time varies on whether the tdio is being waited + */ + tdio->ve = MAX(wf2q->wf2q_virtual_time, tdio->vd); + tdio->vd = tdio->ve + tdio->budget / tdio->weight; + tdio->min_vd = tdio->vd; + RB_INSERT(wf2q_augtree_t, &wf2q->wf2q_augtree, tdio); + wf2q->wf2q_tdio_count++; +} + +/* + * Remove a thread_io struct from the augment tree, + * called before a thread is destroyed. + */ +void +wf2q_remove_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio) +{ + RB_REMOVE(wf2q_augtree_t, &wf2q->wf2q_augtree, tdio); + wf2q->wf2q_tdio_count--; +} + +/* + * Increase the current virtual time as services are provided + */ +void +wf2q_inc_tot_service(struct wf2q_t *wf2q, int amount) +{ + wf2q->wf2q_virtual_time += amount; +} + +/* + * Update a tdio's virtual deadline as it received service + */ +void +wf2q_update_vd(struct bfq_thread_io *tdio, int received_service) +{ + tdio->vd = tdio->ve + received_service / tdio->weight; +} + +static void +wf2q_tree_dump(struct bfq_thread_io *root, int level) +{ + int i; + if (!root) return; + for (i = 0; i < level; i++) + kprintf("-"); + kprintf("vd: %d; ve: %d; min_vd: %d\n", root->vd, root->ve, root->min_vd); + wf2q_tree_dump(RB_LEFT(root,entry), level + 1); + wf2q_tree_dump(RB_RIGHT(root, entry), level + 1); +} + +/* + * Get a tdio with minimum virtual deadline and virtual eligible + * time smaller than the current virtual time. + * If there is no such tdio, update the current virtual time to + * the minimum ve in the queue. (And there must be one eligible then) + */ +struct bfq_thread_io * +wf2q_get_next_thread_io(struct wf2q_t *wf2q) +{ + struct bfq_thread_io *tdio; + struct wf2q_augtree_t *tree = &wf2q->wf2q_augtree; + if (!(tdio = wf2q_augtree_get_eligible_with_min_vd(tree, wf2q->wf2q_virtual_time))) { + tdio = RB_MIN(wf2q_augtree_t, tree); + if (!tdio) + return NULL; + wf2q->wf2q_virtual_time = tdio->ve; + tdio = wf2q_augtree_get_eligible_with_min_vd(tree, wf2q->wf2q_virtual_time); + } + if (!tdio) { + kprintf("!!!wf2q: wf2q_tdio_count=%d\n", wf2q->wf2q_tdio_count); + wf2q_tree_dump(RB_ROOT(tree), 0); + KKASSERT(0); + } + RB_REMOVE(wf2q_augtree_t, tree, tdio); + wf2q->wf2q_tdio_count--; + return tdio; +} + + diff --git a/sys/kern/dsched/bfq/wf2q.h b/sys/kern/dsched/bfq/wf2q.h new file mode 100644 index 0000000..b1a0794 --- /dev/null +++ b/sys/kern/dsched/bfq/wf2q.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2011 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Brills Peng + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#ifndef _DSCHED_BFQ_WF2Q_H_ +#define _DSCHED_BFQ_WF2Q_H_ + +#include + +#ifndef NULL +#define NULL 0x0 +#endif +/* struct bfq_thread_io is defined in bfq.h */ + +struct bfq_thread_io; + +RB_HEAD(wf2q_augtree_t, bfq_thread_io); + +struct wf2q_t { + struct wf2q_augtree_t wf2q_augtree; + int wf2q_virtual_time; + int wf2q_tdio_count; +}; + +#ifndef _DSCHED_BFQ_H_ +#include +#endif + +void wf2q_init(struct wf2q_t *pwf2q); +void wf2q_insert_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio); +void wf2q_remove_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio); +void wf2q_update_vd(struct bfq_thread_io *tdio, int received_service); +struct bfq_thread_io *wf2q_get_next_thread_io(struct wf2q_t *wf2q); +void wf2q_inc_tot_service(struct wf2q_t *wf2q, int amount); + +#endif + -- 1.7.7.2