dsched_bfq - A budget fair-queuing dsched policy
author: Brills Peng <brills@gmail.com>
Sat, 27 Aug 2011 18:19:49 +0000 (18:19 +0000)
committerAlex Hornung <ahornung@gmail.com>
Sun, 28 Aug 2011 01:20:33 +0000 (01:20 +0000)
 * dsched_bfq is a budget fair queuing scheduling policy for the dsched
   framework.

 * NOTE: this scheduler is still highly experimental and
   work-in-progress, it's not recommended for widespread use (yet).
   There are several well-known issues, such as a possible deadlock on
   unloading the module.

Sponsored-by: Google Summer of Code
12 files changed:
sys/conf/files
sys/conf/options
sys/config/LINT
sys/kern/dsched/Makefile
sys/kern/dsched/bfq/Makefile [new file with mode: 0644]
sys/kern/dsched/bfq/bfq.c [new file with mode: 0644]
sys/kern/dsched/bfq/bfq.h [new file with mode: 0644]
sys/kern/dsched/bfq/bfq_helper_thread.c [new file with mode: 0644]
sys/kern/dsched/bfq/bfq_helper_thread.h [new file with mode: 0644]
sys/kern/dsched/bfq/bfq_ktr.h [new file with mode: 0644]
sys/kern/dsched/bfq/wf2q.c [new file with mode: 0644]
sys/kern/dsched/bfq/wf2q.h [new file with mode: 0644]

index 44f9272..e1968cd 100644 (file)
@@ -2044,9 +2044,12 @@ ${OSACPI_MI_DIR}/acpi_toshiba/acpi_toshiba.c     optional acpi_toshiba acpi
 ${OSACPI_MI_DIR}/acpi_video/acpi_video.c       optional acpi_video acpi
 ${OSACPI_MI_DIR}/aibs/atk0110.c                        optional aibs acpi
 
-#dsched stuff
+# dsched stuff
 kern/dsched/fq/fq_core.c               optional dsched_fq
 kern/dsched/fq/fq_diskops.c            optional dsched_fq
+kern/dsched/bfq/bfq.c                  optional dsched_bfq
+kern/dsched/bfq/wf2q.c                 optional dsched_bfq
+kern/dsched/bfq/bfq_helper_thread.c    optional dsched_bfq
 
 # ACPICA code
 ${ACPICA_DIR}/debugger/dbcmds.c                        optional acpi acpi_debug
index 75eb0dd..884e4cb 100644 (file)
@@ -581,6 +581,7 @@ KTR_ENTRIES                 opt_global.h
 KTR_ALL                                opt_ktr.h
 KTR_CTXSW                      opt_ktr.h
 KTR_DMCRYPT                    opt_ktr.h
+KTR_DSCHED_BFQ                 opt_ktr.h
 KTR_ETHERNET                   opt_ktr.h
 KTR_HAMMER                     opt_ktr.h
 KTR_IFQ                                opt_ktr.h
index 51b6dfa..6431306 100644 (file)
@@ -2773,6 +2773,7 @@ options   KTR_ENTRIES=1024
 options        KTR_VERBOSE=1
 #options KTR_CTXSW
 #options KTR_DMCRYPT
+#options KTR_DSCHED_BFQ
 #options KTR_ETHERNET
 #options KTR_HAMMER
 #options KTR_IFQ
@@ -2819,6 +2820,7 @@ options           SCTP_MAP_LOGGING
 
 # DSCHED stuff
 options                DSCHED_FQ
+options                DSCHED_BFQ
 
 # WATCHDOG
 options                WATCHDOG_ENABLE         # Enable watchdog support framework
index 95e95a7..a14d938 100644 (file)
@@ -1,5 +1,5 @@
 # $DragonFly: src/sys/dev/Makefile,v 1.12 2007/01/30 14:50:10 corecode Exp $
 
-SUBDIR=        fq as
+SUBDIR=        fq as bfq
 
 .include <bsd.subdir.mk>
diff --git a/sys/kern/dsched/bfq/Makefile b/sys/kern/dsched/bfq/Makefile
new file mode 100644 (file)
index 0000000..ed9b4c9
--- /dev/null
@@ -0,0 +1,5 @@
+KMOD=  dsched_bfq
+SRCS=  opt_ktr.h
+SRCS+= bfq.c wf2q.c bfq_helper_thread.c
+
+.include <bsd.kmod.mk>
diff --git a/sys/kern/dsched/bfq/bfq.c b/sys/kern/dsched/bfq/bfq.c
new file mode 100644 (file)
index 0000000..d9d29b6
--- /dev/null
@@ -0,0 +1,1382 @@
+/*
+ * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Brills Peng <brillsp@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+/*
+ * BFQ disk scheduler, the algorithm routines and the interfaces with the
+ * dsched framework.
+ *
+ */
+
+#include <inttypes.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/diskslice.h>
+#include <sys/disk.h>
+#include <sys/malloc.h>
+#include <machine/md_var.h>
+#include <sys/ctype.h>
+#include <sys/syslog.h>
+#include <sys/device.h>
+#include <sys/msgport.h>
+#include <sys/msgport2.h>
+#include <sys/buf2.h>
+#include <sys/dsched.h>
+#include <sys/fcntl.h>
+#include <machine/varargs.h>
+
+#include <kern/dsched/bfq/bfq.h>
+#include <kern/dsched/bfq/bfq_helper_thread.h>
+
+#define _DSCHED_BFQ_BFQ_C_
+#include <kern/dsched/bfq/bfq_ktr.h>
+
+/* Make sure our structs fit */
+CTASSERT(sizeof(struct bfq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ);
+CTASSERT(sizeof(struct bfq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ);
+
+
+static dsched_prepare_t                bfq_prepare;
+static dsched_teardown_t       bfq_teardown;
+static dsched_cancel_t         bfq_cancel_all;
+static dsched_queue_t          bfq_queue;
+static dsched_new_tdio_t       bfq_new_tdio;
+static dsched_destroy_tdio_t   bfq_destroy_tdio;
+static dsched_bio_done_t       bfq_bio_done;
+
+
+static void bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
+static int bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
+static void bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason);
+static void bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp);
+static void bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio);
+static void bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag);
+static void bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv);
+
+
+
+struct dsched_policy dsched_bfq_policy = {
+       .name           = "bfq",
+       .prepare        = bfq_prepare,
+       .teardown       = bfq_teardown,
+       .cancel_all     = bfq_cancel_all,
+       .bio_queue      = bfq_queue,
+       .new_tdio       = bfq_new_tdio,
+       .destroy_tdio   = bfq_destroy_tdio,
+       .bio_done       = bfq_bio_done,
+       .polling_func   = (void (*)(struct dsched_disk_ctx *))helper_msg_dequeue,
+};
+
+
+struct sysctl_oid *bfq_mod_oid;
+
+struct dsched_bfq_stats bfq_stats;
+
+static int dsched_bfq_version_maj = 1;
+static int dsched_bfq_version_min = 0;
+
+/*
+ * bfq_prepare(): the .prepare callback of the bfq policy. Initialize
+ * all fields in bfq_diskctx and initialize the corresponding helper
+ * thread.
+ *
+ * lock: none
+ * refcount: none
+ *
+ * Returns 0
+ */
+static int
+bfq_prepare(struct dsched_disk_ctx *diskctx)
+{
+       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
+
+       BFQ_LOCKINIT(bfq_diskctx);
+
+       bfq_diskctx->pending_dequeue = 0;
+
+       wf2q_init(&bfq_diskctx->bfq_wf2q);
+
+       callout_init_mp(&bfq_diskctx->bfq_callout);
+
+       bfq_diskctx->bfq_blockon = NULL;
+       bfq_diskctx->bfq_active_tdio = NULL;
+       bfq_diskctx->bfq_remaining_budget = 0;
+
+       bfq_diskctx->bfq_max_budget = BFQ_DEFAULT_MAX_BUDGET;
+       bfq_diskctx->bfq_peak_rate_samples = 0;
+       bfq_diskctx->bfq_peak_rate = 0;
+
+#if 0
+       bfq_diskctx->bfq_flag = BFQ_FLAG_AS | BFQ_FLAG_AUTO_MAX_BUDGET;
+#endif
+       bfq_diskctx->bfq_flag = BFQ_FLAG_AS;
+
+       bfq_diskctx->bfq_as_miss = 0;
+       bfq_diskctx->bfq_as_hit = 0;
+
+       bfq_diskctx->bfq_as_avg_wait_miss = 0;
+       bfq_diskctx->bfq_as_avg_wait_all = 0;
+       bfq_diskctx->bfq_as_max_wait = 0;
+       bfq_diskctx->bfq_as_max_wait2 = 0;
+       bfq_diskctx->bfq_as_high_wait_count = 0;
+       bfq_diskctx->bfq_as_high_wait_count2 = 0;
+
+       bfq_diskctx->bfq_avg_time_slice = 0;
+       bfq_diskctx->bfq_max_time_slice = 0;
+       bfq_diskctx->bfq_high_time_slice_count = 0;
+
+       /* initialize the helper thread */
+       helper_init(bfq_diskctx);
+
+       dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: initialized!\n");
+       return 0;
+}
+
+/*
+ * bfq_teardown(): .teardown callback of the bfq policy. Send message
+ * of killing to the helper thread and deallocate resources used by
+ * the helper thread (currently the objcache)
+ *
+ * XXX: a deadlock can occur when the caller of bfq_teardown() and the
+ * helper thread are on the same CPU.
+ *
+ * lock: none
+ * refcount: none
+ *
+ */
+
+static void
+bfq_teardown(struct dsched_disk_ctx *diskctx)
+{
+       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
+       KKASSERT(diskctx);
+
+       helper_msg_kill(bfq_diskctx);
+
+       tsleep(diskctx, 0, "teardn", hz * 3 / 2);
+
+       helper_uninit(bfq_diskctx);
+}
+
+/*
+ * bfq_cancel_all(): .cancel_all callback of the bfq policy. Cancel
+ * all bios that queue in each bfq_thread_io structure in the
+ * wf2q tree.
+ *
+ * lock:
+ *     BFQ_LOCK: protect from wf2q_insert operation in bfq_queue() and
+ *     bfq_dequeue(); wf2q_get_next operation in bfq_dequeue()
+ *     THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue() and
+ *     queue insertion in bfq_queue()
+ *
+ * refcount:
+ *     unref thread_io structures; they are referenced in queue(),
+ *     when a bio is queued. The refcount may decrease to zero.
+ *
+ */
+static void
+bfq_cancel_all(struct dsched_disk_ctx *diskctx)
+{
+       struct bio *bio;
+       struct bfq_thread_io *bfq_tdio;
+       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
+
+       BFQ_LOCK(bfq_diskctx);
+
+       while ((bfq_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q))) {
+               DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);
+               KKASSERT(lockstatus(&bfq_tdio->head.lock, curthread) == LK_EXCLUSIVE);
+
+               while ((bio = TAILQ_FIRST(&bfq_tdio->head.queue))) {
+                       bfq_tdio->head.qlength--;
+                       TAILQ_REMOVE(&bfq_tdio->head.queue, bio, link);
+                       dsched_cancel_bio(bio);
+                       dsched_thread_io_unref(&bfq_tdio->head);
+               }
+
+               KKASSERT(bfq_tdio->head.qlength == 0);
+               DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);
+       }
+
+       BFQ_UNLOCK(bfq_diskctx);
+}
+
+/*
+ * bfq_new_tdio(): .new_tdio callback of the bfq policy. Initialize
+ * the bfq_thread_io structure.
+ *
+ * lock: none
+ * refcount: none
+ */
+static void
+bfq_new_tdio(struct dsched_thread_io *tdio)
+{
+       struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *) tdio;
+
+       /* the queue has to be initialized somewhere else */
+       tdio->qlength = 0;
+
+       tdio->debug_priv = 0xF00FF00F;
+
+       bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
+       bfq_tdio->weight = BFQ_DEFAULT_WEIGHT;
+
+       bfq_tdio->tdio_as_switch = 1;
+       bfq_tdio->maybe_timeout = 0;
+
+       bfq_tdio->seek_samples = 0;
+       bfq_tdio->seek_avg = 0;
+       bfq_tdio->seek_total = 0;
+       bfq_tdio->ttime_samples = 0;
+       bfq_tdio->ttime_avg = 0;
+       bfq_tdio->service_received = 0;
+       bfq_tdio->bio_dispatched = 0;
+       bfq_tdio->bio_completed = 0;
+
+       KTR_LOG(dsched_bfq_thread_created, bfq_tdio);
+}
+
+/*
+ * bfq_helper_destroy_tdio(): called after a thread_io struct is destroyed.
+ * if the scheduler is AS waiting for a destroyed tdio, this function resumes
+ * the scheduler.
+ *
+ * lock:
+ *     BFQ_LOCK: protect from nullify bfq_diskctx->bfq_blockon/bfq_active_tdio
+ *     in bfq_timeout()
+ *
+ * refcount: none
+ *
+ * Calling path: bfq_destroy_tdio --lwkt_msg--> helper_thread --call--> me
+ *
+ */
+void
+bfq_helper_destroy_tdio(struct dsched_thread_io *tdio, struct bfq_disk_ctx *bfq_diskctx)
+{
+       KKASSERT(bfq_diskctx);
+
+       BFQ_LOCK(bfq_diskctx);
+
+       /*
+        * Test whether the scheduler is pending on the tdio to
+        * be destroyed.
+        */
+       if (((struct dsched_thread_io *)bfq_diskctx->bfq_blockon == tdio) &&
+           callout_pending(&bfq_diskctx->bfq_callout)) {
+               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: pending on a being destroyed thread!\n");
+
+               callout_stop(&bfq_diskctx->bfq_callout);
+
+               bfq_diskctx->bfq_blockon = NULL;
+               bfq_diskctx->bfq_active_tdio = NULL;
+
+               BFQ_UNLOCK(bfq_diskctx);
+
+               helper_msg_dequeue(bfq_diskctx);
+               return;
+       }
+       BFQ_UNLOCK(bfq_diskctx);
+
+}
+
+/*
+ * bfq_destroy_tdio(): .destroy_tdio callback of the bfq policy
+ *
+ * Called immediate after a dsched_thread_io struct's refcount decreases
+ * to zero. This function will record the seek_avg and ttime_avg of the
+ * destroyed thread with the KTR facility.
+ *
+ * lock: none
+ *
+ * refcount: the tdio's refcount should be zero. It may be nuked, and
+ * any read/write to the tdio is not safe by then.
+ */
+static void
+bfq_destroy_tdio(struct dsched_thread_io *tdio)
+{
+       struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
+
+       /*
+        * do not log threads without I/O
+        */
+       if (bfq_tdio->seek_samples != 0 || bfq_tdio->ttime_samples != 0) {
+               KTR_LOG(dsched_bfq_thread_seek_avg, bfq_tdio, bfq_tdio->seek_avg );
+               KTR_LOG(dsched_bfq_thread_ttime_avg, bfq_tdio, bfq_tdio->ttime_avg);
+       }
+
+       helper_msg_destroy_tdio((struct bfq_disk_ctx *)tdio->diskctx, tdio);
+}
+
+/*
+ * bfq_bio_done(): .bio_done callback of the bfq policy
+ *
+ * Called after a bio is done, (by request_polling_biodone of dsched).
+ * This function judges whether a thread has used up its time slice, and
+ * if so, it will set the maybe_timeout flag in bfq_tdio structure. Any
+ * further action of that thread or the bfq scheduler will cause the
+ * thread to be expired. (in bfq_queue() or in bfq_dequeue())
+ *
+ * This function requires the bfq_tdio pointer of the thread that pushes
+ * bp to be stored by dsched_set_bio_priv() earlier. Currently it is
+ * stored when bfq_queue() is called.
+ *
+ * lock: none. This function CANNOT be blocked by any lock
+ *
+ * refcount:
+ *     the corresponding tdio's refcount should decrease by 1 after
+ *     this function call. The counterpart increasing is in bfq_queue().
+ *     For each bio pushed down, we increase the refcount of the pushing
+ *     tdio.
+ */
+static void
+bfq_bio_done(struct bio *bp)
+{
+       struct disk *dp = dsched_get_bio_dp(bp);
+       struct bfq_thread_io *bfq_tdio = dsched_get_bio_priv(bp);
+       struct bfq_disk_ctx *bfq_diskctx = dsched_get_disk_priv(dp);
+       struct timeval tv;
+       int ticks_expired;
+
+       KKASSERT(bfq_tdio);
+
+       dsched_thread_io_ref(&bfq_tdio->head);
+
+       atomic_add_int(&bfq_tdio->bio_completed, 1);
+
+       /* the tdio has already expired */
+       if (bfq_tdio != bfq_diskctx->bfq_active_tdio)
+               goto rtn;
+       atomic_add_int(&bfq_tdio->service_received, BIO_SIZE(bp));
+
+       /* current time */
+       getmicrotime(&tv);
+       bfq_tdio->last_request_done_time = tv;
+       timevalsub (&tv, &bfq_tdio->service_start_time);
+       ticks_expired = tvtohz_high(&tv);
+
+       /* the thread has run out its time slice */
+       if ((ticks_expired != 0x7fffffff) &&
+           (ticks_expired >= BFQ_SLICE_TIMEOUT)) {
+               /*
+                * we cannot block here, so just set a flag
+                */
+#if 0
+               bfq_tdio->maybe_timeout = 1;
+#endif
+               if (atomic_cmpset_int(&bfq_tdio->maybe_timeout, 0, 1)) {
+                       bfq_update_avg_time_slice(bfq_diskctx, tv);
+                       dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p may time out\n", bfq_tdio);
+               }
+       }
+rtn:
+       dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in this function */
+       dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in queue() */
+
+}
+
+/*
+ * bfq_timeout(): called after the callout alarm strikes.
+ *
+ * This function getting called indicates that after waiting for
+ * BFQ_T_WAIT / BFQ_T_WAIT_MIN ticks, the thread "active_tdio"
+ * represents does not push any further bios. This tdio should
+ * be expired with the reason BFQ_REASON_TOO_IDLE, but if the tdio
+ * is marked as timeout (in bfq_biodone()) first, we expire it
+ * for BFQ_REASON_TIMEOUT. The bfq scheduler should resume working
+ * (and pick another thread to serve).
+ *
+ * It is possible that this function gets called a little after
+ * the thread pushes a bio with bfq_queue(), and thus a "fake timeout"
+ * happens. We treat it as the callout does not strike, and continue
+ * to serve the active_tdio.
+ *
+ * lock:
+ *     BFQ_LOCK: protect bfq_diskctx->blockon and bfq_diskctx->active_tdio
+ *     they should either changed in bfq_queue() or in this function,
+ *     atomically.
+ *     TDIO_LOCK: protect from dequeue() updating the budget by the
+ *     maybe_timeout branch. (Not necessary, because we already hold the
+ *     BFQ_LOCK, and no one else could change the budget of the tdio)
+ *
+ * refcount:
+ *  the refcount of bfq_diskctx->bfq_active_tdio will decrease one
+ *  after this function. (The counterpart increasing is in bfq_dequeue(),
+ *  before resetting the callout alarm.)
+ *
+ * AS timeout:
+ * during the waiting period, no bio is pushed by the being
+ * waited tdio
+ *
+ * Calling path:
+ * callout facility --> helper_msg_timeout --lwkt_msg--> helper thread
+ *  --> me
+ */
+void
+bfq_timeout(void *p)
+{
+       /* waiting time out:
+        * no deceptive idleness, and unblock dispatching
+        */
+       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)p;
+       struct bfq_thread_io *bfq_tdio;
+
+       BFQ_LOCK(bfq_diskctx);
+
+       /*
+        * the timeout occurs after the thread
+        * pushing one more bio
+        */
+       if (bfq_diskctx->bfq_blockon == NULL) {
+               dsched_debug(BFQ_DEBUG_VERBOSE , "BFQ: fake AS timeout \n");
+               goto rtn;
+       }
+
+       bfq_diskctx->bfq_as_miss++;
+
+       KKASSERT(bfq_diskctx->bfq_active_tdio);
+       bfq_tdio = bfq_diskctx->bfq_active_tdio;
+
+       DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);
+
+       bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL|BFQ_AS_STAT_ONLY_MISS);
+
+       bfq_diskctx->bfq_blockon = NULL;
+       bfq_diskctx->bfq_active_tdio = NULL;
+       dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: unblocked %p\n", bfq_tdio);
+
+       wf2q_update_vd(bfq_tdio, bfq_tdio->budget - bfq_diskctx->bfq_remaining_budget);
+       /*
+        * the time slice expired before as timeout
+        * this should be REASON_TIMEOUT
+        */
+       if (bfq_tdio->maybe_timeout) {
+               bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TIMEOUT);
+               dsched_debug(BFQ_DEBUG_VERBOSE, "%p time out in timeout()\n", bfq_tdio);
+       } else {
+               bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TOO_IDLE);
+               dsched_debug(BFQ_DEBUG_VERBOSE, "%p too idle\n", bfq_tdio);
+       }
+
+       DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);
+
+       /* ref'ed in dequeue(), before resetting callout */
+       dsched_thread_io_unref(&bfq_tdio->head);
+rtn:
+       BFQ_UNLOCK(bfq_diskctx);
+       helper_msg_dequeue(bfq_diskctx);
+}
+
+/*
+ * bfq_queue(): .queue callback of the bfq policy.
+ *
+ * A thread calls this function to hand in its I/O requests (bio).
+ * Their bios are stored in the per-thread queue, in tdio structure.
+ * Currently, the sync/async bios are queued together, which may cause
+ * some issues on performance.
+ *
+ * Besides queueing bios, this function also calculates the average
+ * thinking time and average seek distance of a thread, using the
+ * information in bio structure.
+ *
+ * If the calling thread is waiting by the bfq scheduler due to
+ * the AS feature, this function will cancel the callout alarm
+ * and resume the scheduler to continue serving this thread.
+ *
+ * lock:
+ *   THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue()
+ *   BFQ_LOCK: protect from other insertions/deletions in wf2q_augtree
+ *   in bfq_queue() or bfq_dequeue().
+ *
+ * refcount:
+ *   If the calling thread is waited by the scheduler, the refcount
+ *   of the related tdio will decrease by 1 after this function. The
+ *   counterpart increasing is in bfq_dequeue(), before resetting the
+ *   callout alarm.
+ *
+ * Return value:
+ *  EINVAL: if bio->bio_buf->b_cmd == BUF_CMD_FLUSH
+ *  0: bio is queued successfully.
+ */
+static int
+bfq_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
+               struct  bio *bio)
+{
+       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
+       struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
+       int original_qlength;
+
+       /* we do not handle flush requests. push it down to dsched */
+       if (__predict_false(bio->bio_buf->b_cmd == BUF_CMD_FLUSH))
+               return (EINVAL);
+
+       DSCHED_THREAD_IO_LOCK(tdio);
+       KKASSERT(tdio->debug_priv == 0xF00FF00F);
+       dsched_debug(BFQ_DEBUG_NORMAL, "bfq: tdio %p pushes bio %p\n", bfq_tdio, bio);
+
+       dsched_set_bio_priv(bio, tdio);
+       dsched_thread_io_ref(tdio);
+
+       if ((bio->bio_buf->b_cmd == BUF_CMD_READ) ||
+           (bio->bio_buf->b_cmd == BUF_CMD_WRITE)) {
+               bfq_update_tdio_seek_avg(bfq_tdio, bio);
+       }
+
+       bfq_update_tdio_ttime_avg(bfq_tdio);
+
+       /* update last_bio_pushed_time */
+       getmicrotime(&bfq_tdio->last_bio_pushed_time);
+
+       if ((bfq_tdio->seek_samples > BFQ_VALID_MIN_SAMPLES) &&
+           BFQ_TDIO_SEEKY(bfq_tdio))
+               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p is seeky\n", bfq_tdio);
+
+       /*
+        * If a tdio takes too long to think, we disable its AS feature.
+        */
+       if ((bfq_tdio->ttime_samples > BFQ_VALID_MIN_SAMPLES) &&
+           (bfq_tdio->ttime_avg > BFQ_T_WAIT * (1000 / hz) * 1000) &&
+           (bfq_tdio->service_received > bfq_tdio->budget / 8)) {
+               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p takes too long time to think\n", bfq_tdio);
+               bfq_tdio->tdio_as_switch = 0;
+       } else {
+               bfq_tdio->tdio_as_switch = 1;
+       }
+
+       /* insert the bio into the tdio's own queue */
+       KKASSERT(lockstatus(&tdio->lock, curthread) == LK_EXCLUSIVE);
+       TAILQ_INSERT_TAIL(&tdio->queue, bio, link);
+#if 0
+       tdio->qlength++;
+#endif
+       original_qlength = atomic_fetchadd_int(&tdio->qlength, 1);
+       DSCHED_THREAD_IO_UNLOCK(tdio);
+       /*
+        * A new thread:
+        * In dequeue function, we remove the thread
+        * from the aug-tree if it has no further bios.
+        * Therefore "new" means a really new thread (a
+        * newly created thread or a thread that pushed no more
+        * bios when the scheduler was waiting for it) or
+        * one that was removed from the aug-tree earlier.
+        */
+       if (original_qlength == 0) {
+               /*
+                * a really new thread
+                */
+               BFQ_LOCK(bfq_diskctx);
+               if (bfq_tdio != bfq_diskctx->bfq_active_tdio) {
+                       /* insert the tdio into the wf2q queue */
+                       wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, bfq_tdio);
+               } else {
+                       /*
+                        * the thread being waited by the scheduler
+                        */
+                       if (bfq_diskctx->bfq_blockon == bfq_tdio) {
+                               /*
+                                * XXX: possible race condition here:
+                                * if the callout function is triggered when
+                                * the following code is executed, then after
+                                * releasing the TDIO lock, the callout function
+                                * will set the thread inactive and it will never
+                                * be inserted into the aug-tree (so its bio pushed
+                                * this time will not be dispatched) until it pushes
+                                * further bios
+                                */
+                               bfq_diskctx->bfq_as_hit++;
+                               bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL);
+
+                               if (callout_pending(&bfq_diskctx->bfq_callout))
+                                       callout_stop(&bfq_diskctx->bfq_callout);
+                               bfq_diskctx->bfq_blockon = NULL;
+
+                               /* ref'ed in dequeue(), before resetting callout */
+                               dsched_thread_io_unref(&bfq_tdio->head);
+
+                               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p pushes a new bio when AS\n", bfq_tdio);
+                       }
+               }
+
+               BFQ_UNLOCK(bfq_diskctx);
+       }
+
+       helper_msg_dequeue(bfq_diskctx);
+
+       return 0;
+}
+
+/*
+ * bfq_dequeue(): dispatch bios to the disk driver.
+ *
+ * This function will push as many bios as the number of free slots
+ * in the tag queue.
+ *
+ * In the progress of dispatching, the following events may happen:
+ *  - Current thread is timeout: Expire the current thread for
+ *    BFQ_REASON_TIMEOUT, and select a new thread to serve in the
+ *    wf2q tree.
+ *
+ *  - Current thread runs out of its budget: Expire the current thread
+ *    for BFQ_REASON_OUT_OF_BUDGET, and select a new thread to serve
+ *
+ *  - Current thread has no further bios in its queue: if the AS feature
+ *    is turned on, the bfq scheduler sets an alarm and starts to suspend.
+ *    The bfq_timeout() or bfq_queue() calls may resume the scheduler.
+ *
+ * Implementation note: The bios selected to be dispatched will first
+ * be stored in an array bio_do_dispatch. After this function releases
+ * all the locks it holds, it will call dsched_strategy_request_polling()
+ * for each bio stored.
+ *
+ * With the help of bfq_disk_ctx->pending_dequeue,
+ * there will be only one bfq_dequeue pending on the BFQ_LOCK.
+ *
+ * lock:
+ *     BFQ_LOCK: protect from wf2q_augtree operations in bfq_queue()
+ *     THREAD_IO_LOCK: locks the active_tdio. Protect from queue insertions
+ *     in bfq_queue; Protect the active_tdio->budget
+ *
+ * refcount:
+ *  If the scheduler decides to suspend, the refcount of active_tdio
+ *  increases by 1. The counterpart decreasing is in bfq_queue() and
+ *  bfq_timeout()
+ * blocking:
+ *  May be blocking on the disk driver lock. It depends on drivers.
+ *
+ * Calling path:
+ * The callers could be:
+ *     bfq_queue(), bfq_timeout() and the registered polling function.
+ *
+ *     caller --> helper_msg_dequeue --lwkt_msg--> helper_thread-> me
+ *
+ */
+void
+bfq_dequeue(struct dsched_disk_ctx *diskctx)
+{
+       int free_slots,
+           bio_index = 0, i,
+           remaining_budget = 0;/* remaining budget of current active process */
+
+       struct bio *bio, *bio_to_dispatch[33];
+       struct bfq_thread_io *active_tdio = NULL;
+       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
+
+       BFQ_LOCK(bfq_diskctx);
+       atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 1, 0);
+
+       /*
+        * The whole scheduler is waiting for further bios
+        * from process currently being served
+        */
+       if (bfq_diskctx->bfq_blockon != NULL)
+               goto rtn;
+
+       remaining_budget = bfq_diskctx->bfq_remaining_budget;
+       active_tdio = bfq_diskctx->bfq_active_tdio;
+       dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: dequeue: Im in. active_tdio = %p\n", active_tdio);
+
+       free_slots = diskctx->max_tag_queue_depth - diskctx->current_tag_queue_depth;
+       KKASSERT(free_slots >= 0 && free_slots <= 32);
+
+       if (active_tdio)
+               DSCHED_THREAD_IO_LOCK(&active_tdio->head);
+
+       while (free_slots) {
+               /* Here active_tdio must be locked ! */
+               if (active_tdio) {
+                       /*
+                        * the bio_done function has marked the current
+                        * tdio timeout
+                        */
+                       if (active_tdio->maybe_timeout) {
+                               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p time out in dequeue()\n", active_tdio);
+                               wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
+                               bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_TIMEOUT);
+
+                               /* there still exist bios not dispatched,
+                                * reinsert the tdio into aug-tree*/
+                               if (active_tdio->head.qlength > 0) {
+                                       wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
+                                       KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count);
+                               }
+
+                               active_tdio->maybe_timeout = 0;
+                               DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
+                               active_tdio = NULL;
+                               continue;
+                       }
+
+                       /* select next bio to dispatch */
+                       /* TODO: a wiser selection */
+                       KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
+                       bio = TAILQ_FIRST(&active_tdio->head.queue);
+                       dsched_debug(BFQ_DEBUG_NORMAL, "bfq: the first bio in queue of active_tdio %p is %p\n", active_tdio, bio);
+
+                       dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p exists, remaining budget = %d, tdio budget = %d\n, qlength = %d, first bio = %p, first bio cmd = %d, first bio size = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength, bio, bio?bio->bio_buf->b_cmd:-1, bio?bio->bio_buf->b_bcount:-1);
+
+                       /*
+                        * The bio is not read or write, just
+                        * push it down.
+                        */
+                       if (bio && (bio->bio_buf->b_cmd != BUF_CMD_READ) &&
+                           (bio->bio_buf->b_cmd != BUF_CMD_WRITE)) {
+                               dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
+                               KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
+                               TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
+                               active_tdio->head.qlength--;
+                               free_slots--;
+
+#if 0
+                               dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
+#endif
+                               bio_to_dispatch[bio_index++] = bio;
+                               KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
+                               continue;
+                       }
+                       /*
+                        * Run out of budget
+                        * But this is not because the size of bio is larger
+                        * than the complete budget.
+                        * If the size of bio is larger than the complete
+                        * budget, then use a complete budget to cover it.
+                        */
+                       if (bio && (remaining_budget < BIO_SIZE(bio)) &&
+                           (remaining_budget != active_tdio->budget)) {
+                               /* charge budget used */
+                               wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
+                               bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
+                               wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
+                               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: thread %p ran out of budget\n", active_tdio);
+                               DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
+                               active_tdio = NULL;
+                       } else { /* if (bio && remaining_budget < BIO_SIZE(bio) && remaining_budget != active_tdio->budget) */
+
+                               /*
+                                * Having enough budget,
+                                * or having a complete budget and the size of bio
+                                * is larger than that.
+                                */
+                               if (bio) {
+                                       /* dispatch */
+                                       remaining_budget -= BIO_SIZE(bio);
+                                       /*
+                                        * The size of the first bio is larger
+                                        * than the whole budget, we should
+                                        * charge the extra part
+                                        */
+                                       if (remaining_budget < 0)
+                                               wf2q_update_vd(active_tdio, -remaining_budget);
+                                       /* compensate */
+                                       wf2q_update_vd(active_tdio, -remaining_budget);
+                                       /*
+                                        * remaining_budget may be < 0,
+                                        * but to prevent the budget of current tdio
+                                        * to substract a negative number,
+                                        * the remaining_budget has to be >= 0
+                                        */
+                                       remaining_budget = MAX(0, remaining_budget);
+                                       dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
+                                       KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
+                                       TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
+                                       free_slots--;
+                                       active_tdio->head.qlength--;
+                                       active_tdio->bio_dispatched++;
+                                       wf2q_inc_tot_service(&bfq_diskctx->bfq_wf2q, BIO_SIZE(bio));
+                                       dsched_debug(BFQ_DEBUG_VERBOSE,
+                                           "BFQ: %p's bio dispatched, size=%d, remaining_budget = %d\n",
+                                           active_tdio, BIO_SIZE(bio), remaining_budget);
+#if 0
+                                       dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
+#endif
+                                       bio_to_dispatch[bio_index++] = bio;
+                                       KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
+
+                               } else { /* if (bio) */
+
+                                       KKASSERT(active_tdio);
+                                       /*
+                                        * If AS feature is switched off,
+                                        * expire the tdio as well
+                                        */
+                                       if ((remaining_budget <= 0) ||
+                                           !(bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ||
+                                           !active_tdio->tdio_as_switch) {
+                                               active_tdio->budget -= remaining_budget;
+                                               wf2q_update_vd(active_tdio, active_tdio->budget);
+                                               bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
+                                               DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
+                                               active_tdio = NULL;
+                                       } else {
+
+                                               /* no further bio, wait for a while */
+                                               bfq_diskctx->bfq_blockon = active_tdio;
+                                               /*
+                                                * Increase ref count to ensure that
+                                                * tdio will not be destroyed during waiting.
+                                                */
+                                               dsched_thread_io_ref(&active_tdio->head);
+                                               /*
+                                                * If the tdio is seeky but not thingking for
+                                                * too long, we wait for it a little shorter
+                                                */
+                                               if (active_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES && BFQ_TDIO_SEEKY(active_tdio))
+                                                       callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT_MIN, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);
+                                               else
+                                                       callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);
+
+                                               /* save the start time of blocking */
+                                               getmicrotime(&active_tdio->as_start_time);
+
+                                               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: blocked on %p, remaining_budget = %d\n", active_tdio, remaining_budget);
+                                               DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
+                                               goto save_and_rtn;
+                                       }
+                               }
+                       }
+               } else { /* if (active_tdio) */
+                       /* there is no active tdio */
+
+                       /* no pending bios at all */
+                       active_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q);
+
+                       if (!active_tdio) {
+                               KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count == 0);
+                               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: no more eligible tdio!\n");
+                               goto save_and_rtn;
+                       }
+
+                       /*
+                        * A new tdio is picked,
+                        * initialize the service related statistic data
+                        */
+                       DSCHED_THREAD_IO_LOCK(&active_tdio->head);
+                       active_tdio->service_received = 0;
+
+                       /*
+                        * Reset the maybe_timeout flag, which
+                        * may be set by a biodone after the the service is done
+                        */
+                       getmicrotime(&active_tdio->service_start_time);
+                       active_tdio->maybe_timeout = 0;
+
+                       remaining_budget = active_tdio->budget;
+                       dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p selected, remaining budget = %d, tdio budget = %d\n, qlength = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength);
+               }
+
+       }/* while (free_slots) */
+
+       /* reach here only when free_slots == 0 */
+       if (active_tdio) /* && lockcount(&active_tdio->head.lock) > 0) */
+               DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
+
+save_and_rtn:
+       /* save the remaining budget */
+       bfq_diskctx->bfq_remaining_budget = remaining_budget;
+       bfq_diskctx->bfq_active_tdio = active_tdio;
+rtn:
+       BFQ_UNLOCK(bfq_diskctx);
+       /*dispatch the planned bios*/
+       for (i = 0; i < bio_index; i++)
+               dsched_strategy_request_polling(diskctx->dp, bio_to_dispatch[i], diskctx);
+
+}
+
+/*
+ * bfq_slow_tdio(): decide whether a tdio is slow
+ *
+ * This function decides whether a tdio is slow by the speed
+ * estimated from the current time slice start time: if the
+ * tdio is not fast enough to consume its budget (or 2/3 of
+ * its budget) within the time slice, it is judged slow.
+ *
+ * Called by bfq_expire()
+ *
+ * lock:
+ *  THREAD_IO_LOCK is expected to be held.
+ * refcount:
+ *     none
+ *
+ */
+static int
+bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
+{
+       /**
+        * A tdio is considered slow if it can not finish its budget
+        * at its current average speed
+        */
+       uint64_t usec_elapsed, service_received, speed;
+       int expect;
+       struct timeval tv = bfq_tdio->last_request_done_time;
+
+       timevalsub (&tv, &bfq_tdio->service_start_time);
+       usec_elapsed = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);
+
+       /* discard absurd value */
+       if (usec_elapsed < 20000)
+               return 0;
+
+       service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
+       speed = service_received / usec_elapsed;
+       expect = (speed * BFQ_SLICE_TIMEOUT * (1000 * 1000 / hz)) >> BFQ_FIXPOINT_SHIFT;       /* service expected in one full time slice */
+
+       if (expect < 0) {
+               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: overflow on calculating slow_tdio\n");
+               return 0;
+       }
+
+       if (expect < bfq_tdio->budget * 2 / 3) {
+               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: %p is judged slow\n", bfq_tdio);
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * bfq_expire(): expire a tdio for a given reason.
+ *
+ * Different amounts of the new budget will be assigned to the expired
+ * tdio according to the following reasons:
+ *
+ * BFQ_REASON_TIMEOUT:
+ *  The tdio does not use its budget up within BFQ_SLICE_TIMEOUT ticks.
+ *  We shall update the disk peak rate if the tdio is not seeky. The new
+ *  budget will be the budget it actually consumes during this time
+ *  slice.
+ *
+ * BFQ_REASON_TOO_IDLE:
+ *  The tdio does not push any further bios while the scheduler is
+ *  suspending. To ensure low global latency, this tdio should be
+ *  punished by assigning it the minimum budget. But if the tdio is
+ *  not pushing any bio because it is waiting for the dispatched bios
+ *  to be done, we just keep the budget unchanged.
+ *
+ * BFQ_REASON_OUT_OF_BUDGET:
+ *     The tdio runs out of its budget within the time slice. It usually
+ *     indicates that the tdio is doing well. We increase the budget of it.
+ *
+ * lock:
+ *  THREAD_IO_LOCK is expected to be held.
+ *  BFQ_LOCK is expected to be held (needed by bfq_update_peak_rate()).
+ *
+ * refcount: none
+ *
+ * Callers: bfq_timeout(), bfq_dequeue()
+ *
+ */
+static void
+bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason)
+{
+       int max_budget = bfq_diskctx->bfq_max_budget,
+               budget_left,
+               bio_in_flight,
+               service_received;
+
+       service_received = bfq_tdio->service_received;
+       budget_left = bfq_tdio->budget - bfq_tdio->service_received;
+
+       if (budget_left < 0) {
+               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: budget down flow: %d, %d\n", bfq_tdio->budget, bfq_tdio->service_received);
+               budget_left = 0;
+       }
+
+       KKASSERT(budget_left >= 0);
+
+       switch (reason) {
+               case BFQ_REASON_TIMEOUT:
+                       /* The tdio is not seeky, so we can update
+                        * the disk peak rate based on the service received
+                        * by the tdio
+                        */
+                       if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
+                           (!BFQ_TDIO_SEEKY(bfq_tdio)))
+                               bfq_update_peak_rate(bfq_diskctx, bfq_tdio);
+
+                       /* max_budget may be updated */
+                       max_budget = bfq_diskctx->bfq_max_budget;
+
+                       /* update budget to the service received */
+                       bfq_tdio->budget = MAX(service_received, BFQ_DEFAULT_MIN_BUDGET);
+
+                       break;
+
+               case BFQ_REASON_TOO_IDLE:
+                       /*
+                        * the tdio is too slow, charge full budget
+                        */
+                       if (bfq_slow_tdio(bfq_diskctx, bfq_tdio))
+                               wf2q_update_vd(bfq_tdio, budget_left);
+
+                       bio_in_flight = bfq_tdio->bio_dispatched - bfq_tdio->bio_completed;
+                       KKASSERT(bio_in_flight >= 0);
+                       /*
+                        * maybe the tdio pushes no bio
+                        * because it is waiting for some bios
+                        * dispatched to be done, in this case
+                        * we do not reduce the budget too harshly
+                        */
+                       if (bio_in_flight > 0) {
+                               bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, service_received);
+                       } else {
+#if 0
+                               bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, bfq_diskctx->bfq_max_budget / BFQ_MIN_BUDGET_FACTOR);
+#endif
+                               bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
+                       }
+
+                       break;
+               case BFQ_REASON_OUT_OF_BUDGET:
+
+                       if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
+                           (!BFQ_TDIO_SEEKY(bfq_tdio)))
+                               bfq_update_peak_rate(bfq_diskctx, bfq_tdio);
+
+                       /* increase the budget */
+                       if (bfq_tdio->budget < BFQ_BUDGET_MULTIPLE_THRESHOLD)
+                               bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget * 2);
+                       else
+                               bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget + BFQ_BUDG_INC_STEP);
+                       break;
+               default:
+                       break;
+       }
+}
+
+/*
+ * bfq_update_peak_rate(): update the peak disk speed by sampling
+ * the throughput within a time slice.
+ *
+ * lock:
+ *  BFQ_LOCK is expected to be held
+ *
+ * refcount:
+ *     none
+ *
+ * Caller: bfq_expire()
+ */
+static void
+bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
+{
+       struct timeval tv = bfq_tdio->last_request_done_time;
+       uint64_t usec, service_received, peak_rate;
+
+
+       timevalsub (&tv, &bfq_tdio->service_start_time);
+       usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);
+
+       /* discard absurd value */
+       if (usec < 2000 || usec > (BFQ_SLICE_TIMEOUT * (1000 / hz) * 1000)) {
+               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: absurd interval for peak rate\n");
+               return;
+       }
+
+       service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
+       peak_rate = service_received / usec;
+       bfq_diskctx->bfq_peak_rate = (peak_rate + 7 * bfq_diskctx->bfq_peak_rate) / 8;  /* EWMA: 1/8 new, 7/8 old */
+       bfq_diskctx->bfq_peak_rate_samples++;
+
+       /* update the max_budget according to the peak rate */
+       if (bfq_diskctx->bfq_peak_rate_samples > BFQ_VALID_MIN_SAMPLES) {
+               bfq_diskctx->bfq_peak_rate_samples = BFQ_VALID_MIN_SAMPLES;
+               /*
+                * if the auto max budget adjust is disabled,
+                * the bfq_max_budget will always be BFQ_DEFAULT_MAX_BUDGET.
+                */
+               if (bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) {
+                       bfq_diskctx->bfq_max_budget =
+                               (uint32_t)((BFQ_SLICE_TIMEOUT * (1000 / hz) * bfq_diskctx->bfq_peak_rate * 1000) >> BFQ_FIXPOINT_SHIFT);
+                       dsched_debug(BFQ_DEBUG_NORMAL, "max budget updated to %d\n", bfq_diskctx->bfq_max_budget);
+               }
+       }
+}
+
+/*
+ * bfq_update_tdio_seek_avg(): update the average seek distance of a
+ * tdio.
+ *
+ * lock:
+ *     THREAD_IO_LOCK is expected to be held.
+ *
+ * refcount:
+ *  none
+ *
+ * Caller: bfq_queue()
+ */
+static void
+bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp)
+{
+       off_t seek;
+
+       /* for the first bio it dispatches,
+        * we do not calculate the seek_avg,
+        * just update the last_seek_end
+        */
+       if (bfq_tdio->seek_samples == 0) {
+               ++bfq_tdio->seek_samples;
+               goto rtn;
+       }
+
+       seek = ABS(bp->bio_offset - bfq_tdio->last_seek_end);
+
+       /*
+        * use a weighted average instead of seek_samples++,
+        * because seek_total could overflow if we kept seek_total += seek
+        */
+       bfq_tdio->seek_samples = (7 * bfq_tdio->seek_samples + 256) / 8;        /* converges towards 256 */
+       bfq_tdio->seek_total = (7 * bfq_tdio->seek_total + 256 * seek) / 8;
+       bfq_tdio->seek_avg = (bfq_tdio->seek_total + bfq_tdio->seek_samples / 2) / bfq_tdio->seek_samples;
+
+       dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: tdio %p seek_avg updated to %" PRIu64 "\n", bfq_tdio, bfq_tdio->seek_avg);
+
+rtn:
+       bfq_tdio->last_seek_end = bp->bio_offset + BIO_SIZE(bp);
+}
+
+/*
+ * bfq_update_tdio_ttime_avg(): update the average thinking time
+ * of a tdio.
+ *
+ * The thinking time is used to switch on / off the tdio's AS feature
+ *
+ * lock:
+ *  THREAD_IO_LOCK is expected to be held.
+ *
+ * refcount:
+ *  none
+ *
+ * Caller:
+ *  bfq_queue()
+ *
+ */
+static void
+bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio)
+{
+       struct timeval tv, after_start;
+       uint64_t usec;
+
+       if (bfq_tdio->ttime_samples == 0) {
+               ++bfq_tdio->ttime_samples;
+               return;
+       }
+
+       getmicrotime(&tv);
+       after_start = bfq_tdio->last_request_done_time;
+
+#if 0
+       timevalsub (&tv, &bfq_tdio->last_request_done_time);
+#endif
+       /*
+        * Try the interval between two bio pushes,
+        * instead of between last_request_done_time and
+        * the current time.
+        */
+
+       timevalsub (&tv, &bfq_tdio->last_bio_pushed_time);
+
+       timevalsub (&after_start, &bfq_tdio->service_start_time);
+
+       /*
+        * tv.tv_sec < 0 means the last request done time is
+        * after the current time.
+        * this may happen because the biodone function is not blocked
+        *
+        * after_start.tv_sec < 0 means that the last bio done happens
+        * before the current service slice, and we should drop this value.
+        */
+       if (tv.tv_sec < 0 || after_start.tv_sec < 0)
+               return;
+
+       usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);
+
+       bfq_tdio->ttime_samples = (7 * bfq_tdio->ttime_samples + 256) / 8;
+       bfq_tdio->ttime_total = (7 * bfq_tdio->ttime_total + 256 * usec) / 8;
+       bfq_tdio->ttime_avg = (bfq_tdio->ttime_total + 128) / bfq_tdio->ttime_samples;
+
+}
+
+/*
+ * bfq_update_avg_time_slice(): this also updates the bfq_max_time_slice field
+ *
+ * tv: the timeval structure representing the length of time slice
+ */
+static void
+bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv)
+{
+       uint32_t msec;
+
+       msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 );        /* approximate usec / 1000 by >> 10 */
+
+       if (msec > 3 * BFQ_SLICE_TIMEOUT * (1000 / hz))
+               atomic_add_int(&bfq_diskctx->bfq_high_time_slice_count, 1);
+
+       bfq_diskctx->bfq_avg_time_slice =
+               (7 * bfq_diskctx->bfq_avg_time_slice + msec) / 8;
+
+       if (bfq_diskctx->bfq_max_time_slice < msec)
+               bfq_diskctx->bfq_max_time_slice = msec;
+}
+/*
+ * This function will also update the bfq_as_max_wait field
+ * flag: BFQ_AS_STAT_ALL, BFQ_AS_STAT_ONLY_MISS
+ *
+ */
+static void
+bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag)
+{
+       struct timeval tv;
+       uint32_t msec;
+       getmicrotime(&tv);
+       timevalsub (&tv, &bfq_tdio->as_start_time);
+
+       /* approximately divide by 1000 with a right shift by 10 */
+       msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 );
+
+       /* ridiculous value */
+       if (msec > 10000) {
+               dsched_debug(BFQ_DEBUG_NORMAL, "bfq: ridiculous as wait time!\n");
+               return;
+       }
+
+       if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz))
+               atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count, 1);
+
+       if (flag & BFQ_AS_STAT_ALL) {
+               bfq_diskctx->bfq_as_avg_wait_all =
+                       (7 * bfq_diskctx->bfq_as_avg_wait_all + msec) / 8;
+       }
+
+       if (flag & BFQ_AS_STAT_ONLY_MISS) {
+               bfq_diskctx->bfq_as_avg_wait_miss =
+                       (7 * bfq_diskctx->bfq_as_avg_wait_miss + msec) / 8;
+       }
+
+       /* update the maximum waiting time */
+       if (bfq_diskctx->bfq_as_max_wait < msec)
+               bfq_diskctx->bfq_as_max_wait = msec;
+
+       return;
+}
+
+static int
+bfq_mod_handler(module_t mod, int type, void *unused)
+{
+       static struct sysctl_ctx_list sysctl_ctx;       /* initialized on MOD_LOAD, freed on MOD_UNLOAD */
+       static struct sysctl_oid *oid;
+       static char version[16];
+       int error;
+
+       ksnprintf(version, sizeof(version), "%d.%d",
+                       dsched_bfq_version_maj, dsched_bfq_version_min);
+
+       switch (type) {
+       case MOD_LOAD:
+               bzero(&bfq_stats, sizeof(struct dsched_bfq_stats));
+               if ((error = dsched_register(&dsched_bfq_policy)))
+                       return (error);
+
+               sysctl_ctx_init(&sysctl_ctx);
+               oid = SYSCTL_ADD_NODE(&sysctl_ctx,
+                   SYSCTL_STATIC_CHILDREN(_dsched),
+                   OID_AUTO,
+                   "bfq",
+                   CTLFLAG_RD, 0, "");
+               bfq_mod_oid = oid;
+
+               SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
+                   OID_AUTO, "version", CTLFLAG_RD, version, 0, "bfq version");
+               helper_init_global();
+
+               kprintf("BFQ scheduler policy version %d.%d loaded. sizeof(bfq_thread_io) = %zu\n",
+                   dsched_bfq_version_maj, dsched_bfq_version_min, sizeof(struct bfq_thread_io));
+               break;
+
+       case MOD_UNLOAD:
+               if ((error = dsched_unregister(&dsched_bfq_policy)))
+                       return (error);
+               sysctl_ctx_free(&sysctl_ctx);
+               kprintf("BFQ scheduler policy unloaded\n");
+               break;
+
+       default:
+               break;
+       }
+
+       return 0;
+}
+
+int
+bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS)
+{
+       struct bfq_disk_ctx *bfq_diskctx = arg1;
+       int as_switch, error;
+
+       as_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ? 1 : 0);
+       error = sysctl_handle_int(oidp, &as_switch, 0, req);
+       if (error || !req->newptr)      /* error, or read-only request */
+               return error;
+
+       if (as_switch == 1)
+               bfq_diskctx->bfq_flag |= BFQ_FLAG_AS;
+       else if (as_switch == 0)
+               bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AS);
+       else
+               return 0;       /* silently ignore values other than 0 and 1 */
+
+       return error;
+}
+
+int
+bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS)
+{
+       struct bfq_disk_ctx *bfq_diskctx = arg1;
+       int auto_max_budget_switch, error;
+       auto_max_budget_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) ? 1 : 0);
+       error = sysctl_handle_int(oidp, &auto_max_budget_switch, 0, req);
+       if (error || !req->newptr)      /* error, or read-only request */
+               return error;
+
+       if (auto_max_budget_switch == 1)
+               bfq_diskctx->bfq_flag |= BFQ_FLAG_AUTO_MAX_BUDGET;
+       else if (auto_max_budget_switch == 0)
+               bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AUTO_MAX_BUDGET);
+       else
+               return 0;       /* silently ignore values other than 0 and 1 */
+
+       return error;
+}
+
+DSCHED_POLICY_MODULE(dsched_bfq, bfq_mod_handler);
diff --git a/sys/kern/dsched/bfq/bfq.h b/sys/kern/dsched/bfq/bfq.h
new file mode 100644 (file)
index 0000000..2ba8d59
--- /dev/null
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Brills Peng <brillsp@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#ifndef _DSCHED_BFQ_H_
+#define _DSCHED_BFQ_H_
+
+#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
+
+#ifndef _SYS_QUEUE_H_
+#include <sys/queue.h>
+#endif
+
+#ifndef _SYS_BIO_H_
+#include <sys/bio.h>
+#endif
+
+#ifndef _SYS_BIOTRACK_H_
+#include <sys/biotrack.h>
+#endif
+
+#ifndef _SYS_SPINLOCK_H_
+#include <sys/spinlock.h>
+#endif
+
+#ifndef _SYS_TREE_H_
+#include <sys/tree.h>
+#endif
+
+#ifndef _SYS_DSCHED_H_
+#include <sys/dsched.h>
+#endif
+
+#ifndef _DSCHED_BFQ_WF2Q_H_
+#include <kern/dsched/bfq/wf2q.h>
+#endif
+
+struct wf2q_t;
+
+struct bfq_thread_io {
+       struct dsched_thread_io head;
+       RB_ENTRY(bfq_thread_io) entry;
+       int budget;     /* The budget of a thread */
+       int vd;         /* Virtual deadline (finish time) */
+       int ve;         /* Virtual eligible time (start time) */
+       int min_vd;     /* Minimum vd among the sub trees, used for augmented rb-tree */
+       int weight;     /* Weight of the thread, the higher, the more
+                          chance to be dispatched the thread will have */
+
+       volatile int maybe_timeout;     /* a flag indicating that the tdio may
+                                         expire; only valid when active_tdio == this */
+       int tdio_as_switch;
+
+       /* Statistic data */
+       off_t   last_seek_end;  /* the end point of seeking of the last bio
+                                                          pushed down */
+       uint32_t seek_samples;  /* average seek length samples */
+       off_t   seek_avg;       /* average seek length, fixed point */
+       off_t   seek_total;
+
+       uint32_t ttime_samples; /* average think time samples */
+       uint64_t ttime_avg;     /* average think time, usec */
+       uint64_t ttime_total;
+
+       struct timeval service_start_time; /* the time when the first request
+                                                 of the current service period is dispatched */
+       struct timeval last_request_done_time;  /* the time when the last
+                                                  request is done */
+       struct timeval as_start_time;   /* the start time of AS waiting */
+       struct timeval last_bio_pushed_time;
+
+       uint32_t service_received;      /* the amount of read/write during
+                                          the time slice */
+       uint32_t bio_dispatched;        /* number of bios dispatched during
+                                          the current period */
+       uint32_t bio_completed;         /* number of bios completed during
+                                          the current period */
+};
+
+struct bfq_disk_ctx {
+       struct dsched_disk_ctx head;
+
+       struct lock bfq_lock;
+
+       struct callout bfq_callout;     /* the blocking-timer callout */
+       struct wf2q_t bfq_wf2q;         /* the wf2q scheduler */
+
+       struct bfq_thread_io *bfq_blockon;      /* tdio the dispatcher is AS-waiting on */
+       struct bfq_thread_io *bfq_active_tdio;  /* currently active tdio */
+
+       int pending_dequeue; /* number of dequeue() calls pending
+                               on BFQ_LOCK */
+
+       int bfq_max_budget;
+       int bfq_remaining_budget; /* remaining budget of the current tdio */
+
+       uint32_t bfq_flag; /* SEE BFQ_FLAG_* define for all flags */
+
+       /* Statistic data */
+       uint32_t bfq_peak_rate_samples; /* peak rate samples */
+       uint64_t bfq_peak_rate;         /* peak rate, fixed point */
+
+       int bfq_as_miss;
+       int bfq_as_hit;
+
+       uint32_t bfq_as_avg_wait_miss;  /* average AS waiting time for
+                                          only AS miss, ms */
+       uint32_t bfq_as_avg_wait_all;   /* average AS waiting time for all, ms */
+       uint32_t bfq_as_max_wait;       /* maximum AS waiting time, ms */
+       uint32_t bfq_as_max_wait2;      /* maximum AS waiting time(from callout), ms */
+
+       int bfq_as_high_wait_count; /* the number of times when AS waiting time
+                                      is longer than 5 * BFQ_T_WAIT_MIN (50ms now) */
+       int bfq_as_high_wait_count2; /* the number of times when AS waiting
+                                       time is longer than 5 * BFQ_T_WAIT_MIN (50ms now) */
+
+       uint32_t bfq_avg_time_slice;    /* average time slice length, ms */
+       uint32_t bfq_max_time_slice;    /* maximum time slice length, ms */
+       int bfq_high_time_slice_count;  /* the number of times when a time slice
+                                           is longer than 3 * BFQ_SLICE_TIMEOUT */
+
+       struct sysctl_ctx_list bfq_sysctl_ctx; /* bfq statistics interface
+                                                 with sysctl */
+       /* helper thread and its lwkt message cache and port */
+       struct thread *helper_thread;
+       struct objcache *helper_msg_cache;
+       struct lwkt_port helper_msg_port;
+};
+
+enum bfq_expire_reason {
+       BFQ_REASON_TIMEOUT = 0,
+       BFQ_REASON_TOO_IDLE,
+       BFQ_REASON_OUT_OF_BUDGET,
+       BFQ_REASON_NO_MORE_REQ
+};
+
+#define BFQ_FLAG_AS 0x01
+#define BFQ_FLAG_AUTO_MAX_BUDGET 0x02
+
+#define BFQ_TDIO_SEEKY(x) (((x)->seek_avg) > (1024 * SECT_SIZE))
+
+#define BFQ_LOCKINIT(x)                        \
+               lockinit(&(x)->bfq_lock, "bfqwf2q", 0, LK_CANRECURSE);
+
+#define BFQ_LOCK(x)    do {            \
+               dsched_disk_ctx_ref(&(x)->head);        \
+               lockmgr(&(x)->bfq_lock, LK_EXCLUSIVE);  \
+       } while(0)
+
+#define BFQ_UNLOCK(x)  do {            \
+               lockmgr(&(x)->bfq_lock, LK_RELEASE);    \
+               dsched_disk_ctx_unref(&(x)->head);      \
+       } while(0)
+
+#define SECT_SIZE 512 /* XXX: DEV_BSIZE? */
+#define BFQ_DEBUG_CRITICAL 1
+#define BFQ_DEBUG_NORMAL 2
+#define BFQ_DEBUG_VERBOSE 3
+#define BFQ_DEFAULT_MAX_BUDGET (1024*512) /* 1024 sectors / 0.2sec */
+#define BFQ_DEFAULT_MIN_BUDGET (32*512) /* 32 sectors / 0.2sec */
+#define BFQ_BUDG_INC_STEP (1*128*512) /* The linear increasing step of budget */
+
+/* If the budget is larger than this threshold,
+ * it will get linear increment, else,
+ * it will get exponential increment.*/
+#define BFQ_BUDGET_MULTIPLE_THRESHOLD (256*512)
+
+#define BFQ_DEFAULT_WEIGHT 1
+
+/* Get the size of a bio */
+#define BIO_SIZE(x) ((x)->bio_buf->b_bcount)
+
+/* Anticipatory waiting time (ticks) ~ 20ms, min ~ 10ms */
+#define BFQ_T_WAIT ((hz/50) > 5 ? (hz/50) : 5)
+
+#define BFQ_T_WAIT_MIN ((hz/100 > 0) ? (hz/100) : 1)
+
+/* Time slice for each service period ~200ms (ticks) */
+#define BFQ_SLICE_TIMEOUT (hz/5)
+
+#define BFQ_FIXPOINT_SHIFT 10 /* fixed point arithmetic shift */
+
+#define BFQ_VALID_MIN_SAMPLES 80 /* minimum number of samples */
+
+#define ABS(x) (((x) < 0) ? (-(x)) : (x))
+
+/* as statistics define */
+#define BFQ_AS_STAT_ALL 0x1
+#define BFQ_AS_STAT_ONLY_MISS 0x2
+
+/* functions helper thread calls */
+void bfq_timeout(void *);
+void bfq_dequeue(struct dsched_disk_ctx *);
+void bfq_helper_destroy_tdio(struct dsched_thread_io *, struct bfq_disk_ctx *);
+
+/* sysctl handlers, registered in the helper thread */
+int bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS);
+int bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS);
+
+#endif /* _KERNEL || _KERNEL_STRUCTURES */
+struct dsched_bfq_stats {
+       int32_t as_missed;
+       int32_t as_hit;
+       int32_t as_fake;
+       int32_t unused;
+};
+#endif /*_DSCHED_BFQ_H_ */
diff --git a/sys/kern/dsched/bfq/bfq_helper_thread.c b/sys/kern/dsched/bfq/bfq_helper_thread.c
new file mode 100644 (file)
index 0000000..75d8a04
--- /dev/null
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Brills Peng <brillsp@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+/*
+ * bfq_helper_thread.c:
+ * Thread function of the helper thread and
+ * message sending routines.
+ *
+ * XXX: The current approach of serializing using lwkt messages is suboptimal.
+ *     The idea is to replace it with way more fine-grained and lockless
+ *     accesses spread all over the place. It makes things more complicated,
+ *     but it will also improve performance significantly.
+ *
+ * The sysctl node of bfq is also initialized
+ * here.
+ */
+
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/diskslice.h>
+#include <sys/disk.h>
+#include <sys/malloc.h>
+#include <machine/md_var.h>
+#include <sys/ctype.h>
+#include <sys/syslog.h>
+#include <sys/device.h>
+#include <sys/msgport.h>
+#include <sys/msgport2.h>
+#include <sys/mplock2.h>
+#include <sys/buf2.h>
+#include <sys/dsched.h>
+#include <sys/fcntl.h>
+#include <machine/varargs.h>
+
+#include <kern/dsched/bfq/bfq.h>
+#include <kern/dsched/bfq/bfq_helper_thread.h>
+
+extern struct sysctl_oid *bfq_mod_oid;
+extern struct dsched_policy dsched_bfq_policy;
+
+static void helper_thread(struct bfq_disk_ctx *bfq_diskctx);
+static int helper_msg_exec(helper_msg_t msg);
+static void helper_sysctl_init(struct bfq_disk_ctx *bfq_diskctx);
+
+MALLOC_DEFINE(M_HELPER, "bfq", "BFQ helper thread message allocations");
+
+/*
+ * All threads share one dispose port
+ */
+static struct lwkt_port helper_dispose_port;
+
+/* XXX: should be an mpipe */
+static struct objcache_malloc_args helper_msg_malloc_args = {
+       sizeof(struct helper_msg), M_HELPER };
+
+
+/*
+ * Allocate one helper_msg from the per-disk objcache.
+ * May block (M_WAITOK).  XXX: is waiting acceptable here?
+ */
+static helper_msg_t
+helper_msg_get(struct bfq_disk_ctx *bfq_diskctx)
+{
+	helper_msg_t hm;
+
+	hm = objcache_get(bfq_diskctx->helper_msg_cache, M_WAITOK);
+	return (hm);
+}
+
+/*
+ * Return a helper_msg to the per-disk objcache.
+ * Always succeeds; the int return exists for interface symmetry.
+ */
+static int
+helper_msg_put(struct bfq_disk_ctx *bfq_diskctx, helper_msg_t msg)
+{
+	objcache_put(bfq_diskctx->helper_msg_cache, msg);
+	return (0);
+}
+
+/*
+ * Reply handler of the shared dispose port: every message replied to
+ * the port is recycled into its owning disk's objcache.
+ */
+static void
+helper_msg_autofree_reply(lwkt_port_t port, lwkt_msg_t msg)
+{
+	helper_msg_t hm;
+
+	hm = (helper_msg_t)msg;
+	helper_msg_put(hm->bfq_diskctx, hm);
+}
+
+/*
+ * Initialize the dispose (reply) port.  All helper threads share this
+ * port; replied messages are auto-freed back to their objcache by
+ * helper_msg_autofree_reply().
+ *
+ * Must be called only once, and before any helper thread is created.
+ *
+ * Called from bfq.c's module handler.
+ */
+void
+helper_init_global(void)
+{
+	lwkt_initport_replyonly(&helper_dispose_port, helper_msg_autofree_reply);
+}
+
+/*
+ * Helper thread initialization function:
+ * initialize the per-disk message objcache and create the
+ * helper thread for this disk.
+ *
+ * Called by bfq.c:bfq_prepare()
+ */
+void
+helper_init(struct bfq_disk_ctx *bfq_diskctx)
+{
+	struct thread *phelper_thread;
+
+	/* objcache must exist before the thread can allocate messages */
+	bfq_diskctx->helper_msg_cache = objcache_create("bfq-helper-msg-cache", 0, 0,
+			NULL, NULL, NULL,
+			objcache_malloc_alloc,
+			objcache_malloc_free,
+			&helper_msg_malloc_args);
+
+	/* thread is named after the disk device, e.g. "bfq_helper_td_da0" */
+	lwkt_create((void (*) (void *)) helper_thread, bfq_diskctx,
+			&phelper_thread, NULL, 0, -1,
+			"bfq_helper_td_%s", bfq_diskctx->head.dp->d_cdev->si_name);
+
+	bfq_diskctx->helper_thread = phelper_thread;
+}
+
+/*
+ * Fill in the message header and deliver the message to this disk's
+ * helper thread.
+ *
+ * If we are already running as the port owner (the helper thread
+ * itself), execute the command synchronously and reply immediately
+ * instead of queueing the message to ourselves.
+ */
+static void
+helper_msg_send(struct bfq_disk_ctx *bfq_diskctx, uint32_t cmd, helper_msg_t helper_msg)
+{
+	lwkt_port_t port = &bfq_diskctx->helper_msg_port;
+
+	lwkt_initmsg(&helper_msg->hdr, &helper_dispose_port, 0);
+	helper_msg->bfq_diskctx = bfq_diskctx;
+	/* the command is carried in the lwkt_msg result field */
+	helper_msg->hdr.u.ms_result = cmd;
+
+	if (port->mpu_td == curthread){
+		helper_msg_exec(helper_msg);
+		lwkt_replymsg(&helper_msg->hdr, 0);
+	} else {
+		lwkt_sendmsg(port, (lwkt_msg_t)helper_msg);
+	}
+}
+
+/*
+ * Deallocate the per-disk message objcache.
+ * Called by bfq.c: bfq_teardown()
+ */
+void
+helper_uninit(struct bfq_disk_ctx *bfq_diskctx)
+{
+	objcache_destroy(bfq_diskctx->helper_msg_cache);
+}
+
+/*
+ * Build the per-disk sysctl subtree under the BFQ module node and
+ * attach the statistics / tuning variables.  Runs in the helper
+ * thread during its startup.
+ */
+static void
+helper_sysctl_init(struct bfq_disk_ctx *bfq_diskctx)
+{
+	struct sysctl_oid *oid;
+
+	sysctl_ctx_init(&bfq_diskctx->bfq_sysctl_ctx);
+
+	if (bfq_mod_oid == NULL) {
+		kprintf("Failed to create BFQ dev sysctl node!\n");
+		return;
+	}
+
+	/* per-disk node, named after the device (si_name) */
+	oid = SYSCTL_ADD_NODE(&bfq_diskctx->bfq_sysctl_ctx,
+	    SYSCTL_CHILDREN(bfq_mod_oid), OID_AUTO,
+	    bfq_diskctx->head.dp->d_cdev->si_name, CTLFLAG_RD, 0, "");
+
+	/* the only writable tunable */
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "max_budget", CTLFLAG_RW,
+	    &bfq_diskctx->bfq_max_budget, 0, "BFQ max budget");
+
+	/* read-only statistics */
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "peak_rate", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_peak_rate, 0, "BFQ estimated peak rate");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "peak_samples", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_peak_rate_samples, 0, "BFQ estimated peak rate samples");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "as_miss", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_as_miss, 0, "BFQ AS miss");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "as_hit", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_as_hit, 0, "BFQ AS hit");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "as_wait_avg_all", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_as_avg_wait_all, 0, "BFQ AS waitall");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "as_wait_avg_miss", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_as_avg_wait_miss, 0, "BFQ AS waitmiss");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "as_wait_max", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_as_max_wait, 0, "BFQ AS waitmax");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "as_wait_max2", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_as_max_wait2, 0, "BFQ AS waitmax2");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "as_high_wait_count", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_as_high_wait_count, 0, "BFQ AS high count");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "as_high_wait_count2", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_as_high_wait_count2, 0, "BFQ AS high count2");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "avg_time_slice", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_avg_time_slice, 0, "BFQ average time slice");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "max_time_slice", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_max_time_slice, 0, "BFQ max time slice");
+
+	SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "high_time_slice_count", CTLFLAG_RD,
+	    &bfq_diskctx->bfq_high_time_slice_count, 0, "BFQ high time slice count");
+
+	/* switches implemented as procedure nodes */
+	SYSCTL_ADD_PROC(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "as_switch", CTLTYPE_INT|CTLFLAG_RW,
+	    bfq_diskctx, 0, bfq_sysctl_as_switch_handler, "I", "as_switch");
+
+	SYSCTL_ADD_PROC(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
+	    OID_AUTO, "auto_max_budget_switch", CTLTYPE_INT|CTLFLAG_RW,
+	    bfq_diskctx, 0, bfq_sysctl_auto_max_budget_handler, "I", "amb_switch");
+}
+
+/*
+ * Per-disk helper thread main loop.
+ *
+ * Sets up its own message port and the sysctl subtree, holds a
+ * reference on the disk context for its whole lifetime, then serves
+ * messages until helper_msg_exec() returns -1 (BFQ_MSG_KILL).
+ */
+static void
+helper_thread(struct bfq_disk_ctx *bfq_diskctx)
+{
+	struct dsched_thread_io *tdio;
+
+	int r;
+	helper_msg_t msg;
+
+	/* allocate a policy tdio for this thread (return value unused here) */
+	tdio = dsched_new_policy_thread_tdio(&bfq_diskctx->head, &dsched_bfq_policy);
+
+	lwkt_initport_thread(&bfq_diskctx->helper_msg_port, curthread);
+	/* keep the disk ctx alive while this thread runs */
+	dsched_disk_ctx_ref(&bfq_diskctx->head);
+	helper_sysctl_init(bfq_diskctx);
+
+	dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: helper thread created\n");
+#if 0
+	/* XXX: why mplock?! */
+	get_mplock();
+#endif
+
+	for(;;) {
+		/* block until the next message arrives */
+		msg = (helper_msg_t)lwkt_waitport(&bfq_diskctx->helper_msg_port, 0);
+		dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: helper: msg recv: %d\n", msg->hdr.u.ms_result);
+		r = helper_msg_exec(msg);
+		lwkt_replymsg(&msg->hdr, 0);
+		/*
+		 * received BFQ_MSG_KILL
+		 */
+		if (r == -1)
+			break;
+	}
+
+#if 0
+	rel_mplock();
+#endif
+
+	/* tear down in reverse order of setup */
+	sysctl_ctx_free(&bfq_diskctx->bfq_sysctl_ctx);
+	dsched_disk_ctx_unref(&bfq_diskctx->head);
+	dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: helper: die peacefully\n");
+	lwkt_exit();
+}
+
+/*
+ * Dispatch a single helper message.
+ * Returns -1 for BFQ_MSG_KILL (the caller's loop terminates on that),
+ * 0 for everything else.
+ */
+static int
+helper_msg_exec(helper_msg_t msg)
+{
+	struct bfq_disk_ctx *bfq_diskctx = msg->bfq_diskctx;
+	int rv = 0;
+
+	switch (msg->hdr.u.ms_result) {
+	case BFQ_MSG_DEQUEUE:
+		/* only run the dequeue if none is already marked pending */
+		if (atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 0, 1))
+			bfq_dequeue((struct dsched_disk_ctx *)bfq_diskctx);
+		break;
+	case BFQ_MSG_AS_TIMEOUT:
+		bfq_timeout(bfq_diskctx);
+		break;
+	case BFQ_MSG_DESTROY_TDIO:
+		bfq_helper_destroy_tdio(msg->tdio, bfq_diskctx);
+		break;
+	case BFQ_MSG_KILL:
+		rv = -1;
+		break;
+	default:
+		break;
+	}
+	return (rv);
+}
+
+/*
+ * Ask the helper thread to run the dequeue logic for this disk.
+ */
+void
+helper_msg_dequeue(struct bfq_disk_ctx *bfq_diskctx)
+{
+	helper_msg_send(bfq_diskctx, BFQ_MSG_DEQUEUE,
+	    helper_msg_get(bfq_diskctx));
+}
+
+/*
+ * Notify the helper thread that the anticipatory-scheduling wait
+ * timed out.
+ */
+void
+helper_msg_as_timeout(struct bfq_disk_ctx *bfq_diskctx)
+{
+	helper_msg_t helper_msg = helper_msg_get(bfq_diskctx);
+	/**
+	 * For statistical use, temporary
+	 * ------------------------------
+	 */
+	struct bfq_thread_io *bfq_tdio;
+	struct timeval tv;
+	uint32_t msec;
+
+
+	bfq_tdio = bfq_diskctx->bfq_blockon;
+	if (bfq_tdio) {
+		getmicrotime(&tv);
+		timevalsub(&tv, &bfq_tdio->as_start_time);
+		/* elapsed microseconds >> 10: approximately milliseconds */
+		msec = ((uint64_t)(1000000*tv.tv_sec + tv.tv_usec)) >> 10;
+		if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz))
+			atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count2, 1);
+		if (msec > bfq_diskctx->bfq_as_max_wait2)
+			bfq_diskctx->bfq_as_max_wait2 = msec;
+	}
+	/* ----------------------------- */
+
+	helper_msg_send(bfq_diskctx, BFQ_MSG_AS_TIMEOUT, helper_msg);
+}
+
+/*
+ * Ask the helper thread to destroy the given thread_io context.
+ */
+void
+helper_msg_destroy_tdio(struct bfq_disk_ctx *bfq_diskctx, struct dsched_thread_io *tdio)
+{
+	helper_msg_t hm;
+
+	hm = helper_msg_get(bfq_diskctx);
+	hm->tdio = tdio;
+	helper_msg_send(bfq_diskctx, BFQ_MSG_DESTROY_TDIO, hm);
+}
+
+/*
+ * Ask the helper thread to terminate itself.
+ */
+void
+helper_msg_kill(struct bfq_disk_ctx *bfq_diskctx)
+{
+	helper_msg_send(bfq_diskctx, BFQ_MSG_KILL,
+	    helper_msg_get(bfq_diskctx));
+}
diff --git a/sys/kern/dsched/bfq/bfq_helper_thread.h b/sys/kern/dsched/bfq/bfq_helper_thread.h
new file mode 100644 (file)
index 0000000..8df731b
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Brills Peng <brillsp@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#ifndef _DSCHED_BFQ_HELPER_THREAD_H_
+#define _DSCHED_BFQ_HELPER_THREAD_H_
+#include <kern/dsched/bfq/bfq.h>
+
+/*
+ * Message passed to a per-disk helper thread.  The command is carried
+ * in hdr.u.ms_result (see enum helper_msg_cmd below); tdio is only
+ * meaningful for BFQ_MSG_DESTROY_TDIO.
+ */
+typedef struct helper_msg {
+	struct lwkt_msg hdr;
+	struct bfq_disk_ctx *bfq_diskctx;
+	struct dsched_thread_io *tdio;
+} *helper_msg_t;
+
+/* Commands understood by the helper thread */
+enum helper_msg_cmd {
+	BFQ_MSG_DEQUEUE = 1,	/* run the dequeue logic */
+	BFQ_MSG_AS_TIMEOUT,	/* anticipatory wait timed out */
+	BFQ_MSG_DESTROY_TDIO,	/* tear down a thread_io */
+	BFQ_MSG_KILL		/* terminate the helper thread */
+};
+
+void helper_init_global(void);
+void helper_init(struct bfq_disk_ctx *bfq_diskctx);
+void helper_uninit(struct bfq_disk_ctx *bfq_diskctx);
+void helper_msg_dequeue(struct bfq_disk_ctx *bfq_diskctx);
+void helper_msg_as_timeout(struct bfq_disk_ctx *bfq_diskctx);
+void helper_msg_destroy_tdio(struct bfq_disk_ctx *bfq_diskctx, struct dsched_thread_io *tdio);
+void helper_msg_kill(struct bfq_disk_ctx *bfq_diskctx);
+
+#endif
+
diff --git a/sys/kern/dsched/bfq/bfq_ktr.h b/sys/kern/dsched/bfq/bfq_ktr.h
new file mode 100644 (file)
index 0000000..6941e68
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Brills Peng <brillsp@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+/*
+ * Kernel tracing facility definitions for BFQ
+ *
+ * This header can ONLY be included by bfq.c
+ */
+#ifndef _DSCHED_BFQ_BFQ_C_
+#error "bfq_ktr.h should only be included in sys/kern/dsched/bfq/bfq.c!"
+#endif
+
+#ifndef _DSCHED_BFQ_KTR_H_
+#define _DSCHED_BFQ_KTR_H_
+#include <sys/ktr.h>
+
+/* Default the trace mask to all categories unless overridden */
+#if !defined(KTR_DSCHED_BFQ)
+#define KTR_DSCHED_BFQ KTR_ALL
+#endif
+KTR_INFO_MASTER(dsched_bfq);
+
+/* thread created */
+KTR_INFO(KTR_DSCHED_BFQ, dsched_bfq, thread_created, 0, "%p", sizeof(void *));
+
+/* average seek distance per thread */
+KTR_INFO(KTR_DSCHED_BFQ, dsched_bfq, thread_seek_avg, 0, "%p: %" PRIu64, sizeof(off_t) + sizeof(void *));
+
+/* average thinking time per thread */
+KTR_INFO(KTR_DSCHED_BFQ, dsched_bfq, thread_ttime_avg, 0, "%p: %" PRIu64, sizeof(off_t) + sizeof(void *));
+
+#endif
+
diff --git a/sys/kern/dsched/bfq/wf2q.c b/sys/kern/dsched/bfq/wf2q.c
new file mode 100644 (file)
index 0000000..c2f5e54
--- /dev/null
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Brills Peng <brillsp@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Augment RB-tree for B-WF2Q+ queuing algorithm:
+ *  - The key of the binary tree is the virtual eligible time (start time)
+ *  - Each node maintains an additional min_vd value, which
+ *    is the minimum virtual deadline (finish time) among the node and its
+ *    children
+ *  - Every operation on the tree changing the children of a node will
+ *    trigger the RB_AUGMENT() macro, which updates min_vd along the path to
+ *    the root
+ */
+
+#include <kern/dsched/bfq/wf2q.h>
+
+
+/* Hook the RB-tree operations so min_vd is maintained on every update */
+#undef RB_AUGMENT
+#define RB_AUGMENT(x) wf2q_augment_func(x);
+
+/*
+ * Recompute min_vd at 'node' and propagate the change up to the root:
+ * each node's min_vd is the minimum vd among itself and its children.
+ */
+static void
+wf2q_augment_func(struct bfq_thread_io *node)
+{
+	struct bfq_thread_io *tmp = node, *tmp2;
+	int min_vd;
+	do{
+		min_vd = tmp->vd;
+		tmp2 = RB_LEFT(tmp, entry);
+		min_vd = tmp2 ? MIN(tmp2->min_vd, min_vd) : min_vd;
+		tmp2 = RB_RIGHT(tmp, entry);
+		min_vd = tmp2 ? MIN(tmp2->min_vd, min_vd) : min_vd;
+		tmp->min_vd = min_vd;
+	}while((tmp = RB_PARENT(tmp,entry)));
+}
+
+/*
+ * The rb-tree is indexed by the virtual eligible (start) time.
+ *
+ * NOTE: this comparator never returns 0, so nodes with equal keys are
+ * kept as distinct entries in the tree.
+ */
+static int
+bfq_thread_io_cmp(struct bfq_thread_io *a, struct bfq_thread_io *b)
+{
+	if (a->ve - b->ve <= 0)
+		return -1;
+	return 1;
+}
+
+RB_PROTOTYPE(wf2q_augtree_t, bfq_thread_io, entry,);
+RB_GENERATE(wf2q_augtree_t, bfq_thread_io, entry, bfq_thread_io_cmp);
+
+/*
+ * The algorithm is from
+ *	I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
+ *  First: A Flexible and Accurate Mechanism for Proportional Share
+ *  Resource Allocation,'' technical report.
+ *
+ *  http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ *
+ *  - Partition the tree into two parts by ve:
+ *  - One part contains nodes with ve smaller than vtime
+ *  - The other part contains nodes with ve larger than vtime
+ *  - In the first part, find the node with minimum vd, along the
+ *    min_vd value path
+ *
+ *  Returns
+ *	NULL, if no node with ve smaller than vtime
+ *	or the eligible node with minimum vd.
+ */
+static struct bfq_thread_io *
+wf2q_augtree_get_eligible_with_min_vd(struct wf2q_augtree_t *tree, int vtime)
+{
+	struct bfq_thread_io *node = RB_ROOT(tree), *st_tree = NULL, *path_req = NULL;
+	while (node) {
+		if (node->ve <= vtime) {
+			/* update node with earliest deadline along path. */
+			if ((!path_req) || (path_req->vd > node->vd))
+				path_req = node;
+			/* update root of subtree containing earliest deadline */
+			if ((!st_tree) || (RB_LEFT(node,entry) && st_tree->min_vd > RB_LEFT(node,entry)->min_vd))
+				st_tree = RB_LEFT(node,entry);
+			node = RB_RIGHT(node, entry);
+		} else
+			node = RB_LEFT(node, entry);
+	}
+	/* check whether node with earliest deadline was along path */
+	if ((!st_tree) || (st_tree->min_vd >= path_req->vd))
+		return path_req;
+	/* return node with earliest deadline from subtree, following min_vd */
+	for (node = st_tree; node; ) {
+		/* if node found, return it */
+		if (st_tree->min_vd == node->vd)
+			return node;
+		/* XXX: modified temporarily */
+		if (RB_LEFT(node, entry) && node->min_vd == RB_LEFT(node, entry)->min_vd)
+			node = RB_LEFT(node, entry);
+		else
+			node = RB_RIGHT(node, entry);
+	}
+	return NULL;
+}
+
+/*
+ * Reset a wf2q queue to its empty initial state.
+ */
+void
+wf2q_init(struct wf2q_t *pwf2q)
+{
+	pwf2q->wf2q_virtual_time = 0;
+	pwf2q->wf2q_tdio_count = 0;
+	RB_INIT(&pwf2q->wf2q_augtree);
+}
+
+/*
+ * Insert a tdio into a wf2q queue.
+ *
+ * The new virtual eligible (start) time is the maximum of the current
+ * virtual time and the tdio's previous virtual deadline; the new
+ * deadline is ve plus the weight-scaled budget.
+ */
+void
+wf2q_insert_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio)
+{
+	/*
+	 * TODO: The anticipatory parts
+	 * start time varies on whether the tdio is being waited
+	 */
+	tdio->ve = MAX(wf2q->wf2q_virtual_time, tdio->vd);
+	tdio->vd = tdio->ve + tdio->budget / tdio->weight;
+	/* fresh leaf: min_vd is its own vd; RB_AUGMENT fixes ancestors */
+	tdio->min_vd = tdio->vd;
+	RB_INSERT(wf2q_augtree_t, &wf2q->wf2q_augtree, tdio);
+	wf2q->wf2q_tdio_count++;
+}
+
+/*
+ * Detach a thread_io from the augmented tree; called before the
+ * thread is destroyed.
+ */
+void
+wf2q_remove_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio)
+{
+	wf2q->wf2q_tdio_count--;
+	RB_REMOVE(wf2q_augtree_t, &wf2q->wf2q_augtree, tdio);
+}
+
+/*
+ * Advance the queue's virtual time by the amount of service provided.
+ */
+void
+wf2q_inc_tot_service(struct wf2q_t *wf2q, int amount)
+{
+	wf2q->wf2q_virtual_time = wf2q->wf2q_virtual_time + amount;
+}
+
+/*
+ * Recompute a tdio's virtual deadline after it received service:
+ * deadline = eligible time + weight-scaled service received.
+ */
+void
+wf2q_update_vd(struct bfq_thread_io *tdio, int received_service)
+{
+	tdio->vd = tdio->ve + (received_service / tdio->weight);
+}
+
+/*
+ * Debug helper: kprintf the whole subtree, one node per line, each
+ * line prefixed with '-' characters according to its depth.
+ */
+static void
+wf2q_tree_dump(struct bfq_thread_io *root, int level)
+{
+	int depth;
+
+	if (root == NULL)
+		return;
+	for (depth = 0; depth < level; depth++)
+		kprintf("-");
+	kprintf("vd: %d; ve: %d; min_vd: %d\n", root->vd, root->ve, root->min_vd);
+	wf2q_tree_dump(RB_LEFT(root, entry), level + 1);
+	wf2q_tree_dump(RB_RIGHT(root, entry), level + 1);
+}
+
+/*
+ * Remove and return the tdio with minimum virtual deadline among
+ * those whose virtual eligible time is not later than the current
+ * virtual time.
+ *
+ * If no tdio is eligible, advance the virtual time to the minimum ve
+ * in the queue and retry (at least one tdio must then be eligible).
+ * Returns NULL only when the tree is empty.
+ */
+struct bfq_thread_io *
+wf2q_get_next_thread_io(struct wf2q_t *wf2q)
+{
+	struct bfq_thread_io *tdio;
+	struct wf2q_augtree_t *tree = &wf2q->wf2q_augtree;
+	if (!(tdio = wf2q_augtree_get_eligible_with_min_vd(tree, wf2q->wf2q_virtual_time))) {
+		tdio = RB_MIN(wf2q_augtree_t, tree);
+		if (!tdio)
+			return NULL;
+		/* jump vtime forward so the earliest-starting tdio is eligible */
+		wf2q->wf2q_virtual_time = tdio->ve;
+		tdio = wf2q_augtree_get_eligible_with_min_vd(tree, wf2q->wf2q_virtual_time);
+	}
+	if (!tdio) {
+		/* should be impossible: dump the tree and panic */
+		kprintf("!!!wf2q: wf2q_tdio_count=%d\n", wf2q->wf2q_tdio_count);
+		wf2q_tree_dump(RB_ROOT(tree), 0);
+		KKASSERT(0);
+	}
+	RB_REMOVE(wf2q_augtree_t, tree, tdio);
+	wf2q->wf2q_tdio_count--;
+	return tdio;
+}
+
+
diff --git a/sys/kern/dsched/bfq/wf2q.h b/sys/kern/dsched/bfq/wf2q.h
new file mode 100644 (file)
index 0000000..b1a0794
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Brills Peng <brillsp@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#ifndef _DSCHED_BFQ_WF2Q_H_
+#define _DSCHED_BFQ_WF2Q_H_
+
+#include <sys/tree.h>
+
+/*
+ * XXX(review): fallback NULL definition; 0x0 works as a null pointer
+ * constant but ((void *)0) (or relying on <sys/types.h>) would be the
+ * conventional spelling -- confirm which headers precede this one.
+ */
+#ifndef NULL
+#define NULL 0x0
+#endif
+/* struct bfq_thread_io is defined in bfq.h */
+
+struct bfq_thread_io;
+
+/* Augmented red-black tree of runnable tdios, keyed for WF2Q+ lookup. */
+RB_HEAD(wf2q_augtree_t, bfq_thread_io);
+
+/*
+ * A WF2Q+ run queue: the augmented tree of pending tdios plus the
+ * scheduler-wide virtual clock.
+ */
+struct wf2q_t {
+       struct wf2q_augtree_t wf2q_augtree;     /* tree of queued tdios */
+       int wf2q_virtual_time;                  /* global virtual clock */
+       int wf2q_tdio_count;                    /* number of tdios in tree */
+};
+
+#ifndef _DSCHED_BFQ_H_
+#include <kern/dsched/bfq/bfq.h>
+#endif
+
+/* Initialize an empty wf2q run queue. */
+void wf2q_init(struct wf2q_t *pwf2q);
+/* Insert / remove a tdio from the schedule. */
+void wf2q_insert_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio);
+void wf2q_remove_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio);
+/* Recompute a tdio's virtual deadline after it received service. */
+void wf2q_update_vd(struct bfq_thread_io *tdio, int received_service);
+/* Dequeue the next tdio to dispatch (NULL if the queue is empty). */
+struct bfq_thread_io *wf2q_get_next_thread_io(struct wf2q_t *wf2q);
+/* Advance the global virtual clock by "amount" units of service. */
+void wf2q_inc_tot_service(struct wf2q_t *wf2q, int amount);
+
+#endif
+
+