kernel - Remove dsched
author: Matthew Dillon <dillon@apollo.backplane.com>
Wed, 11 Nov 2015 18:52:23 +0000 (10:52 -0800)
committer: Matthew Dillon <dillon@apollo.backplane.com>
Wed, 11 Nov 2015 18:52:23 +0000 (10:52 -0800)
* After consultation, remove dsched from the kernel.  The original idea
  is still valid but the current implementation has had lingering bugs for
  several years now and we've determined that it's just got its fingers into
  too many structures.

  Also, the implementation was designed before SSDs, and doesn't play well
  with SSDs.

* Leave various empty entry points in so we can revisit at some
  future date.

36 files changed:
sys/conf/files
sys/conf/options
sys/config/LINT64
sys/config/X86_64_GENERIC
sys/kern/Makefile
sys/kern/dsched/Makefile [deleted file]
sys/kern/dsched/as/Makefile [deleted file]
sys/kern/dsched/as/as.c [deleted file]
sys/kern/dsched/bfq/Makefile [deleted file]
sys/kern/dsched/bfq/bfq.c [deleted file]
sys/kern/dsched/bfq/bfq.h [deleted file]
sys/kern/dsched/bfq/bfq_helper_thread.c [deleted file]
sys/kern/dsched/bfq/bfq_helper_thread.h [deleted file]
sys/kern/dsched/bfq/bfq_ktr.h [deleted file]
sys/kern/dsched/bfq/doc/bfq.viki [deleted file]
sys/kern/dsched/bfq/wf2q.c [deleted file]
sys/kern/dsched/bfq/wf2q.h [deleted file]
sys/kern/dsched/fq/Makefile [deleted file]
sys/kern/dsched/fq/flow.txt [deleted file]
sys/kern/dsched/fq/fq.h [deleted file]
sys/kern/dsched/fq/fq_core.c [deleted file]
sys/kern/dsched/fq/fq_diskops.c [deleted file]
sys/kern/kern_device.c
sys/kern/kern_dsched.c
sys/kern/kern_fork.c
sys/kern/lwkt_thread.c
sys/kern/subr_disk.c
sys/kern/vfs_bio.c
sys/sys/buf.h
sys/sys/disk.h
sys/sys/dsched.h
sys/sys/proc.h
sys/sys/sysctl.h
sys/sys/thread.h
sys/sys/udev.h
sys/vm/vm_pager.c

index 12d7dbb..7bbf67b 100644 (file)
@@ -2231,14 +2231,6 @@ ${OSACPI_MI_DIR}/acpi_video/acpi_video.c optional acpi_video acpi
 ${OSACPI_MI_DIR}/acpi_wmi/acpi_wmi.c           optional acpi_wmi acpi
 ${OSACPI_MI_DIR}/aibs/atk0110.c                        optional aibs acpi
 
-# dsched stuff
-kern/dsched/as/as.c                    optional dsched_as
-kern/dsched/fq/fq_core.c               optional dsched_fq
-kern/dsched/fq/fq_diskops.c            optional dsched_fq
-kern/dsched/bfq/bfq.c                  optional dsched_bfq
-kern/dsched/bfq/wf2q.c                 optional dsched_bfq
-kern/dsched/bfq/bfq_helper_thread.c    optional dsched_bfq
-
 # ACPICA code
 ${ACPICA_COMP_DIR}/debugger/dbcmds.c           optional acpi acpi_debug
 ${ACPICA_COMP_DIR}/debugger/dbconvert.c                optional acpi acpi_debug
index bf3cb0d..3949865 100644 (file)
@@ -552,7 +552,6 @@ KTR_ALL                             opt_ktr.h
 KTR_ACPI_EC                    opt_ktr.h
 KTR_CTXSW                      opt_ktr.h
 KTR_DMCRYPT                    opt_ktr.h
-KTR_DSCHED_BFQ                 opt_ktr.h
 KTR_ETHERNET                   opt_ktr.h
 KTR_HAMMER                     opt_ktr.h
 KTR_IFQ                                opt_ktr.h
@@ -654,11 +653,6 @@ DCONS_FORCE_GDB            opt_dcons.h
 # deprecated drivers and options
 I_WANT_DEPRECATED_STUFF        opt_deprecated.h
 
-# DSCHED stuff
-DSCHED_AS              opt_dsched.h
-DSCHED_BFQ             opt_dsched.h
-DSCHED_FQ              opt_dsched.h
-
 # Receive Side Scaling (now basecode)
 RSS_DEBUG              opt_rss.h
 
index 93c73f1..4d1e7df 100644 (file)
@@ -2298,7 +2298,6 @@ options   KTR_VERBOSE=1
 #options KTR_ACPI_EC
 #options KTR_CTXSW
 #options KTR_DMCRYPT
-#options KTR_DSCHED_BFQ
 #options KTR_ETHERNET
 #options KTR_HAMMER
 #options KTR_IFQ
@@ -2334,11 +2333,6 @@ options  ALTQ_DEBUG      #for debugging
 # especially with 100baseT
 #options       HZ=1000
 
-# DSCHED stuff
-options                DSCHED_AS
-options                DSCHED_BFQ
-options                DSCHED_FQ
-
 # WATCHDOG
 options                WDOG_DISABLE_ON_PANIC   # Automatically disable watchdogs on panic
 
index 470fe29..6d4e362 100644 (file)
@@ -43,7 +43,6 @@ options       AHC_REG_PRETTY_PRINT    # Print register bitfields in debug
                                        # output.  Adds ~128k to driver.
 options        AHD_REG_PRETTY_PRINT    # Print register bitfields in debug
                                        # output.  Adds ~215k to driver.
-options                DSCHED_FQ               # Fair-queuing disk scheduler
 
 # ALTQ
 options                ALTQ            #alternate queueing
index 533ea35..d433583 100644 (file)
@@ -19,6 +19,6 @@ ${.CURDIR}/../sys/sysunion.h: ${.CURDIR}/makesyscalls.sh ${.CURDIR}/syscalls.mas
        -mv -f ${.CURDIR}/../sys/sysunion.h ${.CURDIR}/../sys/sysunion.h.bak
        cd ${.CURDIR} && sh makesyscalls.sh syscalls.master
 
-SUBDIR= dsched firmware libmchain
+SUBDIR= firmware libmchain
 
 .include <bsd.subdir.mk>
diff --git a/sys/kern/dsched/Makefile b/sys/kern/dsched/Makefile
deleted file mode 100644 (file)
index a14d938..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-# $DragonFly: src/sys/dev/Makefile,v 1.12 2007/01/30 14:50:10 corecode Exp $
-
-SUBDIR=        fq as bfq
-
-.include <bsd.subdir.mk>
diff --git a/sys/kern/dsched/as/Makefile b/sys/kern/dsched/as/Makefile
deleted file mode 100644 (file)
index 13973fe..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-KMOD=  dsched_as
-SRCS=  as.c
-
-.include <bsd.kmod.mk>
diff --git a/sys/kern/dsched/as/as.c b/sys/kern/dsched/as/as.c
deleted file mode 100644 (file)
index f824663..0000000
+++ /dev/null
@@ -1,290 +0,0 @@
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/proc.h>
-#include <sys/sysctl.h>
-#include <sys/buf.h>
-#include <sys/conf.h>
-#include <sys/diskslice.h>
-#include <sys/disk.h>
-#include <sys/malloc.h>
-#include <machine/md_var.h>
-#include <sys/ctype.h>
-#include <sys/syslog.h>
-#include <sys/device.h>
-#include <sys/msgport.h>
-#include <sys/msgport2.h>
-#include <sys/buf2.h>
-#include <sys/dsched.h>
-#include <sys/fcntl.h>
-#include <machine/varargs.h>
-
-/*
- * A simple anticipatory scheduler
- */
-struct as_disk_ctx{
-       struct dsched_disk_ctx head;
-       TAILQ_HEAD(, bio) as_queue_rd;
-       TAILQ_HEAD(, bio) as_queue_wr;
-       /*
-        * TODO: lockmgr may be too heavy here,
-        * use spinlock instead!
-        */
-       struct lock as_queue_rd_lock;
-       struct lock as_queue_wr_lock;
-       int queue_rd_size;
-       int queue_wr_size;
-
-       struct callout as_callout;
-
-       int as_blockall;
-       pid_t as_blockon;
-};
-
-struct dsched_as_stats{
-       int32_t unused;
-}as_stats;
-
-static dsched_prepare_t                as_prepare;
-static dsched_teardown_t       as_teardown;
-static dsched_cancel_t         as_cancel;
-static dsched_queue_t          as_queue;
-static dsched_polling_func_t   as_dequeue;
-
-static struct dsched_policy dsched_as_policy = {
-       .name ="as",
-/*
- * field need_request_polling
- * is removed from struct dsched_policy
- */
-       //.need_request_polling = 1,
-       .prepare = as_prepare,
-       .teardown = as_teardown,
-       .cancel_all = as_cancel,
-       .bio_queue = as_queue,
-       .polling_func = as_dequeue
-};
-
-static int dsched_as_version_maj = 1;
-static int dsched_as_version_min = 0;
-
-static int
-as_prepare(struct dsched_disk_ctx *diskctx)
-{
-       struct as_disk_ctx *as_diskctx = (struct as_disk_ctx *)diskctx;
-       TAILQ_INIT(&as_diskctx->as_queue_wr);
-       as_diskctx->queue_wr_size = 0;
-       TAILQ_INIT(&as_diskctx->as_queue_rd);
-       as_diskctx->queue_rd_size = 0;
-       lockinit(&as_diskctx->as_queue_rd_lock, "as_queue_rd", 0, LK_CANRECURSE);
-       lockinit(&as_diskctx->as_queue_wr_lock, "as_queue_wr", 0, LK_CANRECURSE);
-       callout_init(&as_diskctx->as_callout);
-       as_diskctx->as_blockall = 0;
-       as_diskctx->as_blockon = NO_PID;
-       return 0;
-}
-
-static void
-as_teardown(struct dsched_disk_ctx *diskctx)
-{
-
-
-}
-
-static void
-as_cancel(struct dsched_disk_ctx *diskctx)
-{
-       struct as_disk_ctx *as_diskctx = (struct as_disk_ctx *)diskctx;
-       struct bio *bio, *bio2;
-       struct dsched_thread_io *tdio;
-       DSCHED_DISK_CTX_LOCK(&as_diskctx->head);
-       lockmgr(&as_diskctx->as_queue_rd_lock, LK_EXCLUSIVE);
-       TAILQ_FOREACH_MUTABLE(bio, &as_diskctx->as_queue_rd, link, bio2){
-               TAILQ_REMOVE(&as_diskctx->as_queue_rd, bio, link);
-               tdio = dsched_get_bio_tdio(bio);
-               dsched_cancel_bio(bio);
-               dsched_thread_io_unref(tdio);
-       }
-       lockmgr(&as_diskctx->as_queue_rd_lock, LK_RELEASE);
-
-       lockmgr(&as_diskctx->as_queue_wr_lock, LK_EXCLUSIVE);
-       TAILQ_FOREACH_MUTABLE(bio, &as_diskctx->as_queue_wr, link, bio2){
-               TAILQ_REMOVE(&as_diskctx->as_queue_wr, bio, link);
-               tdio = dsched_get_bio_tdio(bio);
-               dsched_cancel_bio(bio);
-               dsched_thread_io_unref(tdio);
-       }
-       lockmgr(&as_diskctx->as_queue_wr_lock, LK_RELEASE);
-
-       DSCHED_DISK_CTX_UNLOCK(&as_diskctx->head);
-}
-
-static void
-as_timeout(void *p)
-{
-       //pid_t last_blockon;
-       struct as_disk_ctx *as_diskctx = (struct as_disk_ctx *)p;
-       DSCHED_DISK_CTX_LOCK(&as_diskctx->head);
-       as_diskctx->as_blockall = 0;
-       //last_blockon = as_diskctx->as_blockon;
-       as_diskctx->as_blockon = NO_PID;
-       DSCHED_DISK_CTX_UNLOCK(&as_diskctx->head);
-       //dsched_debug(0, "dsched: as, timeout %d\n", last_blockon);
-       as_dequeue((struct dsched_disk_ctx *)as_diskctx);
-}
-
-static int
-as_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
-               struct  bio *bio)
-{
-       struct as_disk_ctx *as_diskctx = (struct as_disk_ctx *)diskctx;
-       //if (tdio->p && (uint32_t)tdio->p != ~0)
-       //      dsched_debug(0, "dsched: user process bio from %d\n", tdio->p->p_pid);
-       /*save tdio for each bio*/
-       dsched_set_bio_priv(bio, tdio);
-       dsched_set_bio_tdio(bio, tdio);
-       /* will be unreferenced in bio_done function */
-       dsched_thread_io_ref(tdio);
-       DSCHED_DISK_CTX_LOCK(&as_diskctx->head);
-       /* blocking for as,
-        * if current bio is from as_blockon, insert it at head
-        */
-       if (bio->bio_buf->b_cmd == BUF_CMD_READ){
-               lockmgr(&as_diskctx->as_queue_rd_lock, LK_EXCLUSIVE);
-               if (as_diskctx->as_blockall && tdio->p && as_diskctx->as_blockon == tdio->p->p_pid)
-                       TAILQ_INSERT_HEAD(&as_diskctx->as_queue_rd, bio, link);
-               else
-                       TAILQ_INSERT_TAIL(&as_diskctx->as_queue_rd, bio, link);
-               atomic_add_int(&as_diskctx->queue_rd_size, 1);
-               lockmgr(&as_diskctx->as_queue_rd_lock, LK_RELEASE);
-       } else {
-               lockmgr(&as_diskctx->as_queue_wr_lock, LK_EXCLUSIVE);
-               TAILQ_INSERT_TAIL(&as_diskctx->as_queue_wr, bio, link);
-               atomic_add_int(&as_diskctx->queue_wr_size, 1);
-               lockmgr(&as_diskctx->as_queue_wr_lock, LK_RELEASE);
-       }
-       DSCHED_DISK_CTX_UNLOCK(&as_diskctx->head);
-       as_dequeue(diskctx);
-       return 0;
-
-}
-static void
-as_dequeue(struct dsched_disk_ctx *diskctx)
-{
-       int free_slots = 0;
-       struct as_disk_ctx *as_diskctx = (struct as_disk_ctx *)diskctx;
-       struct bio *bio;
-       struct dsched_thread_io *tdio;
-       /*Lock the diskctx for the whole dispatching process,
-        * to ensure atomic change to current_tab_queue_depth*/
-
-       DSCHED_DISK_CTX_LOCK(&as_diskctx->head);
-       /* if blocking all dispatching for anticipatory scheduling
-        * return directly
-        */
-       if (as_diskctx->as_blockall){
-       //      dsched_debug(0, "dsched: as, dequeue blocked! %d\n", as_diskctx->as_blockon);
-               goto rtn;
-       }
-       free_slots = as_diskctx->head.max_tag_queue_depth - as_diskctx->head.current_tag_queue_depth;
-       KKASSERT(free_slots>=0 && free_slots <=64);
-       lockmgr(&as_diskctx->as_queue_rd_lock, LK_EXCLUSIVE);
-       while (free_slots > 0){
-               if (TAILQ_EMPTY(&as_diskctx->as_queue_rd))
-                       break;
-               bio = TAILQ_FIRST(&as_diskctx->as_queue_rd);
-               tdio = dsched_get_bio_priv(bio);
-               //kernel thread
-               if (!tdio->p) {
-                       TAILQ_REMOVE(&as_diskctx->as_queue_rd, bio, link);
-                       dsched_strategy_request_polling(as_diskctx->head.dp, bio, diskctx);
-               } else {
-                       //user process, continue
-//                     dsched_debug(0, "dsched: as, user process bio\n");
-                       if (as_diskctx->as_blockon == NO_PID || as_diskctx->as_blockon == tdio->p->p_pid){
-                               as_diskctx->as_blockon = tdio->p->p_pid;
-                               TAILQ_REMOVE(&as_diskctx->as_queue_rd, bio, link);
-                               dsched_strategy_request_polling(as_diskctx->head.dp, bio, diskctx);
-                       } else {
-                               //user process, before switching, as!
-                               as_diskctx->as_blockall = 1;
-//                             dsched_debug(0, "dsched: as, block on %d\n", as_diskctx->as_blockon);
-                               callout_reset(&as_diskctx->as_callout, 10, as_timeout, as_diskctx);
-                               break;
-                       }
-
-               }
-               free_slots --;
-       }
-       lockmgr(&as_diskctx->as_queue_rd_lock, LK_RELEASE);
-       lockmgr(&as_diskctx->as_queue_wr_lock, LK_EXCLUSIVE);
-       while (free_slots > 0){
-               if (TAILQ_EMPTY(&as_diskctx->as_queue_wr))
-                       break;
-               bio = TAILQ_FIRST(&as_diskctx->as_queue_wr);
-               TAILQ_REMOVE(&as_diskctx->as_queue_wr, bio, link);
-               dsched_strategy_request_polling(as_diskctx->head.dp, bio, diskctx);
-               free_slots --;
-       }
-       lockmgr(&as_diskctx->as_queue_wr_lock, LK_RELEASE);
-
-rtn:
-       DSCHED_DISK_CTX_UNLOCK(&as_diskctx->head);
-}
-
-static int
-do_asstats(SYSCTL_HANDLER_ARGS)
-{
-       return (sysctl_handle_opaque(oidp, &as_stats, sizeof(struct dsched_as_stats), req));
-}
-
-static int
-as_mod_handler(module_t mod, int type, void *unused)
-{
-       static struct sysctl_ctx_list sysctl_ctx;
-       static struct sysctl_oid *oid;
-       static char version[16];
-       int error;
-
-       ksnprintf(version, sizeof(version), "%d.%d",
-           dsched_as_version_maj, dsched_as_version_min);
-
-       switch (type) {
-       case MOD_LOAD:
-               bzero(&as_stats, sizeof(struct dsched_as_stats));
-               if ((error = dsched_register(&dsched_as_policy)))
-                       return (error);
-
-               sysctl_ctx_init(&sysctl_ctx);
-               oid = SYSCTL_ADD_NODE(&sysctl_ctx,
-                   SYSCTL_STATIC_CHILDREN(_dsched),
-                   OID_AUTO,
-                   "as",
-                   CTLFLAG_RD, 0, "");
-
-               SYSCTL_ADD_PROC(&sysctl_ctx, SYSCTL_CHILDREN(oid),
-                   OID_AUTO, "stats", CTLTYPE_OPAQUE|CTLFLAG_RD,
-                   0, 0, do_asstats, "S,dsched_as_stats", "as statistics");
-
-               SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
-                   OID_AUTO, "version", CTLFLAG_RD, version, 0, "as version");
-
-               kprintf("AS scheduler policy version %d.%d loaded\n",
-                   dsched_as_version_maj, dsched_as_version_min);
-               break;
-
-       case MOD_UNLOAD:
-               if ((error = dsched_unregister(&dsched_as_policy)))
-                       return (error);
-               sysctl_ctx_free(&sysctl_ctx);
-               kprintf("AS scheduler policy unloaded\n");
-               break;
-
-       default:
-               break;
-       }
-
-       return 0;
-
-}
-DSCHED_POLICY_MODULE(dsched_as, as_mod_handler, 1);
diff --git a/sys/kern/dsched/bfq/Makefile b/sys/kern/dsched/bfq/Makefile
deleted file mode 100644 (file)
index ed9b4c9..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-KMOD=  dsched_bfq
-SRCS=  opt_ktr.h
-SRCS+= bfq.c wf2q.c bfq_helper_thread.c
-
-.include <bsd.kmod.mk>
diff --git a/sys/kern/dsched/bfq/bfq.c b/sys/kern/dsched/bfq/bfq.c
deleted file mode 100644 (file)
index 1cd88b0..0000000
+++ /dev/null
@@ -1,1382 +0,0 @@
-/*
- * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
- *
- * This code is derived from software contributed to The DragonFly Project
- * by Brills Peng <brillsp@gmail.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- * 3. Neither the name of The DragonFly Project nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific, prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
- * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-
-/*
- * BFQ disk scheduler, the algorithm routines and the interfaces with the
- * dsched framework.
- *
- */
-
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/proc.h>
-#include <sys/sysctl.h>
-#include <sys/buf.h>
-#include <sys/conf.h>
-#include <sys/diskslice.h>
-#include <sys/disk.h>
-#include <sys/malloc.h>
-#include <machine/md_var.h>
-#include <sys/ctype.h>
-#include <sys/syslog.h>
-#include <sys/device.h>
-#include <sys/msgport.h>
-#include <sys/msgport2.h>
-#include <sys/buf2.h>
-#include <sys/dsched.h>
-#include <sys/fcntl.h>
-#include <machine/inttypes.h>
-#include <machine/varargs.h>
-
-#include <kern/dsched/bfq/bfq.h>
-#include <kern/dsched/bfq/bfq_helper_thread.h>
-
-#define _DSCHED_BFQ_BFQ_C_
-#include <kern/dsched/bfq/bfq_ktr.h>
-
-/* Make sure our structs fit */
-CTASSERT(sizeof(struct bfq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ);
-CTASSERT(sizeof(struct bfq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ);
-
-
-static dsched_prepare_t                bfq_prepare;
-static dsched_teardown_t       bfq_teardown;
-static dsched_cancel_t         bfq_cancel_all;
-static dsched_queue_t          bfq_queue;
-static dsched_new_tdio_t       bfq_new_tdio;
-static dsched_destroy_tdio_t   bfq_destroy_tdio;
-static dsched_bio_done_t       bfq_bio_done;
-
-
-static void bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
-static int bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
-static void bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason);
-static void bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp);
-static void bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio);
-static void bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag);
-static void bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv);
-
-
-
-struct dsched_policy dsched_bfq_policy = {
-       .name           = "bfq",
-       .prepare        = bfq_prepare,
-       .teardown       = bfq_teardown,
-       .cancel_all     = bfq_cancel_all,
-       .bio_queue      = bfq_queue,
-       .new_tdio       = bfq_new_tdio,
-       .destroy_tdio   = bfq_destroy_tdio,
-       .bio_done       = bfq_bio_done,
-       .polling_func   = (void (*)(struct dsched_disk_ctx *))helper_msg_dequeue,
-};
-
-
-struct sysctl_oid *bfq_mod_oid;
-
-struct dsched_bfq_stats bfq_stats;
-
-static int dsched_bfq_version_maj = 1;
-static int dsched_bfq_version_min = 0;
-
-/*
- * bfq_prepare(): the .prepare callback of the bfq policy. Initialize
- * all fields in bfq_diskctx and initialize the corresponding helper
- * thread.
- *
- * lock: none
- * refcount: none
- *
- * Returns 0
- */
-static int
-bfq_prepare(struct dsched_disk_ctx *diskctx)
-{
-       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
-
-       BFQ_LOCKINIT(bfq_diskctx);
-
-       bfq_diskctx->pending_dequeue = 0;
-
-       wf2q_init(&bfq_diskctx->bfq_wf2q);
-
-       callout_init_mp(&bfq_diskctx->bfq_callout);
-
-       bfq_diskctx->bfq_blockon = NULL;
-       bfq_diskctx->bfq_active_tdio = NULL;
-       bfq_diskctx->bfq_remaining_budget = 0;
-
-       bfq_diskctx->bfq_max_budget = BFQ_DEFAULT_MAX_BUDGET;
-       bfq_diskctx->bfq_peak_rate_samples = 0;
-       bfq_diskctx->bfq_peak_rate = 0;
-
-#if 0
-       bfq_diskctx->bfq_flag = BFQ_FLAG_AS | BFQ_FLAG_AUTO_MAX_BUDGET;
-#endif
-       bfq_diskctx->bfq_flag = BFQ_FLAG_AS;
-
-       bfq_diskctx->bfq_as_miss = 0;
-       bfq_diskctx->bfq_as_hit = 0;
-
-       bfq_diskctx->bfq_as_avg_wait_miss = 0;
-       bfq_diskctx->bfq_as_avg_wait_all = 0;
-       bfq_diskctx->bfq_as_max_wait = 0;
-       bfq_diskctx->bfq_as_max_wait2 = 0;
-       bfq_diskctx->bfq_as_high_wait_count = 0;
-       bfq_diskctx->bfq_as_high_wait_count2 = 0;
-
-       bfq_diskctx->bfq_avg_time_slice = 0;
-       bfq_diskctx->bfq_max_time_slice = 0;
-       bfq_diskctx->bfq_high_time_slice_count = 0;
-
-       /* initiailize the helper thread */
-       helper_init(bfq_diskctx);
-
-       dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: initialized!\n");
-       return 0;
-}
-
-/*
- * bfq_teardown(): .teardown callback of the bfq policy. Send message
- * of killing to the helper thread and deallocate resources used by
- * the helper thread (currently the objcache)
- *
- * XXX: deadlock causing when the caller of bfq_teardown() and the
- * helper thread are on the same CPU.
- *
- * lock: none
- * refcount: none
- *
- */
-
-static void
-bfq_teardown(struct dsched_disk_ctx *diskctx)
-{
-       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
-       KKASSERT(diskctx);
-
-       helper_msg_kill(bfq_diskctx);
-
-       tsleep(diskctx, 0, "teardn", hz * 3 / 2);
-
-       helper_uninit(bfq_diskctx);
-}
-
-/*
- * bfq_cancel_all(): .cancel_all callback of the bfq policy. Cancel
- * all bios that queue in each bfq_thread_io structure in the
- * wf2q tree.
- *
- * lock:
- *     BFQ_LOCK: protect from wf2q_insert operation in bfq_queue() and
- *     bfq_dequeue(); wf2q_get_next operation in bfq_dequeue()
- *     THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue() and
- *     queue insertion in bfq_queue()
- *
- * refcount:
- *     unref thread_io structures; they are referenced in queue(),
- *     when a bio is queued. The refcount may decrease to zero.
- *
- */
-static void
-bfq_cancel_all(struct dsched_disk_ctx *diskctx)
-{
-       struct bio *bio;
-       struct bfq_thread_io *bfq_tdio;
-       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
-
-       BFQ_LOCK(bfq_diskctx);
-
-       while ((bfq_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q))) {
-               DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);
-               KKASSERT(lockstatus(&bfq_tdio->head.lock, curthread) == LK_EXCLUSIVE);
-
-               while ((bio = TAILQ_FIRST(&bfq_tdio->head.queue))) {
-                       bfq_tdio->head.qlength--;
-                       TAILQ_REMOVE(&bfq_tdio->head.queue, bio, link);
-                       dsched_cancel_bio(bio);
-                       dsched_thread_io_unref(&bfq_tdio->head);
-               }
-
-               KKASSERT(bfq_tdio->head.qlength == 0);
-               DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);
-       }
-
-       BFQ_UNLOCK(bfq_diskctx);
-}
-
-/*
- * bfq_new_tdio(): .new_tdio callback of the bfq policy. Initialize
- * the bfq_thread_io structure.
- *
- * lock: none
- * refcount: none
- */
-static void
-bfq_new_tdio(struct dsched_thread_io *tdio)
-{
-       struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *) tdio;
-
-       /* the queue has to be initialized some where else */
-       tdio->qlength = 0;
-
-       tdio->debug_priv = 0xF00FF00F;
-
-       bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
-       bfq_tdio->weight = BFQ_DEFAULT_WEIGHT;
-
-       bfq_tdio->tdio_as_switch = 1;
-       bfq_tdio->maybe_timeout = 0;
-
-       bfq_tdio->seek_samples = 0;
-       bfq_tdio->seek_avg = 0;
-       bfq_tdio->seek_total = 0;
-       bfq_tdio->ttime_samples = 0;
-       bfq_tdio->ttime_avg = 0;
-       bfq_tdio->service_received = 0;
-       bfq_tdio->bio_dispatched = 0;
-       bfq_tdio->bio_completed = 0;
-
-       KTR_LOG(dsched_bfq_thread_created, bfq_tdio);
-}
-
-/*
- * bfq_helper_destroy_tdio(): called after a thread_io struct is destroyed.
- * if the scheduler is AS waiting for a destroyed tdio, this function resumes
- * the scheduler.
- *
- * lock:
- *     BFQ_LOCK: protect from nullify bfq_diskctx->bfq_blockon/bfq_active_tdio
- *     in bfq_timeout()
- *
- * refcount: none
- *
- * Calling path: bfq_destroy_tdio --lwkt_msg--> helper_thread --call--> me
- *
- */
-void
-bfq_helper_destroy_tdio(struct dsched_thread_io *tdio, struct bfq_disk_ctx *bfq_diskctx)
-{
-       KKASSERT(bfq_diskctx);
-
-       BFQ_LOCK(bfq_diskctx);
-
-       /*
-        * Test whether the scheduler is pending on the tdio to
-        * be destroyed.
-        */
-       if (((struct dsched_thread_io *)bfq_diskctx->bfq_blockon == tdio) &&
-           callout_pending(&bfq_diskctx->bfq_callout)) {
-               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: pending on a being destroyed thread!\n");
-
-               callout_stop(&bfq_diskctx->bfq_callout);
-
-               bfq_diskctx->bfq_blockon = NULL;
-               bfq_diskctx->bfq_active_tdio = NULL;
-
-               BFQ_UNLOCK(bfq_diskctx);
-
-               helper_msg_dequeue(bfq_diskctx);
-               return;
-       }
-       BFQ_UNLOCK(bfq_diskctx);
-
-}
-
-/*
- * bfq_destroy_tdio(): .destroy_tdio callback of the bfq policy
- *
- * Called immediate after a dsched_thread_io struct's refcount decreases
- * to zero. This function will record the seek_avg and ttime_avg of the
- * destroyed thread with the KTR facility.
- *
- * lock: none
- *
- * refcount: the tdio's refcount should be zero. It may be nuked, and
- * any read/write to the tdio is not safe by then.
- */
-static void
-bfq_destroy_tdio(struct dsched_thread_io *tdio)
-{
-       struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
-
-       /*
-        * do not log threads without I/O
-        */
-       if (bfq_tdio->seek_samples != 0 || bfq_tdio->ttime_samples != 0) {
-               KTR_LOG(dsched_bfq_thread_seek_avg, bfq_tdio, bfq_tdio->seek_avg );
-               KTR_LOG(dsched_bfq_thread_ttime_avg, bfq_tdio, bfq_tdio->ttime_avg);
-       }
-
-       helper_msg_destroy_tdio((struct bfq_disk_ctx *)tdio->diskctx, tdio);
-}
-
-/*
- * bfq_bio_done(): .bio_done callback of the bfq policy
- *
- * Called after a bio is done, (by request_polling_biodone of dsched).
- * This function judges whet her a thread consumes up its time slice, and
- * if so, it will set the maybe_timeout flag in bfq_tdio structure. Any
- * further action of that thread or the bfq scheduler will cause the
- * thread to be expired. (in bfq_queue() or in bfq_dequeue())
- *
- * This function requires the bfq_tdio pointer of the thread that pushes
- * bp to be stored by dsched_set_bio_priv() earlier. Currently it is
- * stored when bfq_queue() is called.
- *
- * lock: none. This function CANNOT be blocked by any lock
- *
- * refcount:
- *     the corresponding tdio's refcount should decrease by 1 after
- *     this function call. The counterpart increasing is in bfq_queue().
- *     For each bio pushed down, we increase the refcount of the pushing
- *     tdio.
- */
-static void
-bfq_bio_done(struct bio *bp)
-{
-       struct disk *dp = dsched_get_bio_dp(bp);
-       struct bfq_thread_io *bfq_tdio = dsched_get_bio_priv(bp);
-       struct bfq_disk_ctx *bfq_diskctx = dsched_get_disk_priv(dp);
-       struct timeval tv;
-       int ticks_expired;
-
-       KKASSERT(bfq_tdio);
-
-       dsched_thread_io_ref(&bfq_tdio->head);
-
-       atomic_add_int(&bfq_tdio->bio_completed, 1);
-
-       /* the tdio has already expired */
-       if (bfq_tdio != bfq_diskctx->bfq_active_tdio)
-               goto rtn;
-       atomic_add_int(&bfq_tdio->service_received, BIO_SIZE(bp));
-
-       /* current time */
-       getmicrotime(&tv);
-       bfq_tdio->last_request_done_time = tv;
-       timevalsub (&tv, &bfq_tdio->service_start_time);
-       ticks_expired = tvtohz_high(&tv);
-
-       /* the thread has run out its time slice */
-       if ((ticks_expired != 0x7fffffff) &&
-           (ticks_expired >= BFQ_SLICE_TIMEOUT)) {
-               /*
-                * we cannot block here, so just set a flag
-                */
-#if 0
-               bfq_tdio->maybe_timeout = 1;
-#endif
-               if (atomic_cmpset_int(&bfq_tdio->maybe_timeout, 0, 1)) {
-                       bfq_update_avg_time_slice(bfq_diskctx, tv);
-                       dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p may time out\n", bfq_tdio);
-               }
-       }
-rtn:
-       dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in this function */
-       dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in queue() */
-
-}
-
-/*
- * bfq_timeout(): called after the callout alarm strikes.
- *
- * This function getting called indicates that after waiting for
- * BFQ_T_WAIT / BFQ_T_WAIT_MIN ticks, the thread "active_tdio"
- * represents does not push any further bios. This tdio should
- * be expired with the reason BFQ_REASON_TOO_IDLE, but if the tdio
- * is marked as timeout (in bfq_biodone()) first, we expire it
- * for BFQ_REASON_TIMEOUT. The bfq scheduler should resume working
- * (and pick another thread to serve).
- *
- * It is possible that this function gets called a litter after
- * the thread pushes a bio with bfq_queue(), and thus a "fake timeout"
- * happens. We treat it as the callout does not strike, and continue
- * to serve the active_tdio.
- *
- * lock:
- *     BFQ_LOCK: protect bfq_diskctx->blockon and bfq_diskctx->active_tdio
- *     they should either changed in bfq_queue() or in this function,
- *     atomically.
- *     TDIO_LOCK: protect from dequeue() updateing the budget by the
- *     maybe_timeout branch. (Not necessary, because we already hold the
- *     BFQ_LOCK, and no one else could change the budget of the tdio)
- *
- * refcount:
- *  the refcount of bfq_diskctx->bfq_active_tdio will decrease one
- *  after this function. (The counterpart increasing is in bfq_dequeue(),
- *  before resetting the callout alarm.)
- *
- * AS timeout:
- * during the waiting period, no bio is pushed by the being
- * waited tdio
- *
- * Calling path:
- * callout facility --> helper_msg_timeout --lwkt_msg--> helper thread
- *  --> me
- */
-void
-bfq_timeout(void *p)
-{
-       /* waiting time out:
-        * no deceptive idleness, and unblock dispatching
-        */
-       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)p;
-       struct bfq_thread_io *bfq_tdio;
-
-       BFQ_LOCK(bfq_diskctx);
-
-       /*
-        * the timeout occurs after the thread
-        * pushing one more bio
-        */
-       if (bfq_diskctx->bfq_blockon == NULL) {
-               dsched_debug(BFQ_DEBUG_VERBOSE , "BFQ: fake AS timeout \n");
-               goto rtn;
-       }
-
-       bfq_diskctx->bfq_as_miss++;
-
-       KKASSERT(bfq_diskctx->bfq_active_tdio);
-       bfq_tdio = bfq_diskctx->bfq_active_tdio;
-
-       DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);
-
-       bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL|BFQ_AS_STAT_ONLY_MISS);
-
-       bfq_diskctx->bfq_blockon = NULL;
-       bfq_diskctx->bfq_active_tdio = NULL;
-       dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: unblocked %p\n", bfq_tdio);
-
-       wf2q_update_vd(bfq_tdio, bfq_tdio->budget - bfq_diskctx->bfq_remaining_budget);
-       /*
-        * the time slice expired before as timeout
-        * this should be REASON_TIMEOUT
-        */
-       if (bfq_tdio->maybe_timeout) {
-               bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TIMEOUT);
-               dsched_debug(BFQ_DEBUG_VERBOSE, "%p time out in timeout()\n", bfq_tdio);
-       } else {
-               bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TOO_IDLE);
-               dsched_debug(BFQ_DEBUG_VERBOSE, "%p too idle\n", bfq_tdio);
-       }
-
-       DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);
-
-       /* ref'ed in dequeue(), before resetting callout */
-       dsched_thread_io_unref(&bfq_tdio->head);
-rtn:
-       BFQ_UNLOCK(bfq_diskctx);
-       helper_msg_dequeue(bfq_diskctx);
-}
-
-/*
- * bfq_queue(): .queue callback of the bfq policy.
- *
- * A thread calls this function to hand in its I/O requests (bio).
- * Their bios are stored in the per-thread queue, in tdio structure.
- * Currently, the sync/async bios are queued together, which may cause
- * some issues on performance.
- *
- * Besides queueing bios, this function also calculates the average
- * thinking time and average seek distance of a thread, using the
- * information in bio structure.
- *
- * If the calling thread is waiting by the bfq scheduler due to
- * the AS feature, this function will cancel the callout alarm
- * and resume the scheduler to continue serving this thread.
- *
- * lock:
- *   THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue()
- *   BFQ_LOCK: protect from other insertions/deletions in wf2q_augtree
- *   in bfq_queue() or bfq_dequeue().
- *
- * refcount:
- *   If the calling thread is waited by the scheduler, the refcount
- *   of the related tdio will decrease by 1 after this function. The
- *   counterpart increasing is in bfq_dequeue(), before resetting the
- *   callout alarm.
- *
- * Return value:
- *  EINVAL: if bio->bio_buf->b_cmd == BUF_CMD_FLUSH
- *  0: bio is queued successfully.
- */
-static int
-bfq_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
-               struct  bio *bio)
-{
-       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
-       struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
-       int original_qlength;
-
-       /* we do not handle flush requests. push it down to dsched */
-       if (__predict_false(bio->bio_buf->b_cmd == BUF_CMD_FLUSH))
-               return (EINVAL);
-
-       DSCHED_THREAD_IO_LOCK(tdio);
-       KKASSERT(tdio->debug_priv == 0xF00FF00F);
-       dsched_debug(BFQ_DEBUG_NORMAL, "bfq: tdio %p pushes bio %p\n", bfq_tdio, bio);
-
-       dsched_set_bio_priv(bio, tdio);
-       dsched_thread_io_ref(tdio);
-
-       if ((bio->bio_buf->b_cmd == BUF_CMD_READ) ||
-           (bio->bio_buf->b_cmd == BUF_CMD_WRITE)) {
-               bfq_update_tdio_seek_avg(bfq_tdio, bio);
-       }
-
-       bfq_update_tdio_ttime_avg(bfq_tdio);
-
-       /* update last_bio_pushed_time */
-       getmicrotime(&bfq_tdio->last_bio_pushed_time);
-
-       if ((bfq_tdio->seek_samples > BFQ_VALID_MIN_SAMPLES) &&
-           BFQ_TDIO_SEEKY(bfq_tdio))
-               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p is seeky\n", bfq_tdio);
-
-       /*
-        * If a tdio taks too long to think, we disable the AS feature of it.
-        */
-       if ((bfq_tdio->ttime_samples > BFQ_VALID_MIN_SAMPLES) &&
-           (bfq_tdio->ttime_avg > BFQ_T_WAIT * (1000 / hz) * 1000) &&
-           (bfq_tdio->service_received > bfq_tdio->budget / 8)) {
-               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p takes too long time to think\n", bfq_tdio);
-               bfq_tdio->tdio_as_switch = 0;
-       } else {
-               bfq_tdio->tdio_as_switch = 1;
-       }
-
-       /* insert the bio into the tdio's own queue */
-       KKASSERT(lockstatus(&tdio->lock, curthread) == LK_EXCLUSIVE);
-       TAILQ_INSERT_TAIL(&tdio->queue, bio, link);
-#if 0
-       tdio->qlength++;
-#endif
-       original_qlength = atomic_fetchadd_int(&tdio->qlength, 1);
-       DSCHED_THREAD_IO_UNLOCK(tdio);
-       /*
-        * A new thread:
-        * In dequeue function, we remove the thread
-        * from the aug-tree if it has no further bios.
-        * Therefore "new" means a really new thread (a
-        * newly created thread or a thread that pushed no more
-        * bios when the scheduler was waiting for it) or
-        * one that was removed from the aug-tree earlier.
-        */
-       if (original_qlength == 0) {
-               /*
-                * a really new thread
-                */
-               BFQ_LOCK(bfq_diskctx);
-               if (bfq_tdio != bfq_diskctx->bfq_active_tdio) {
-                       /* insert the tdio into the wf2q queue */
-                       wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, bfq_tdio);
-               } else {
-                       /*
-                        * the thread being waited by the scheduler
-                        */
-                       if (bfq_diskctx->bfq_blockon == bfq_tdio) {
-                               /*
-                                * XXX: possible race condition here:
-                                * if the callout function is triggered when
-                                * the following code is executed, then after
-                                * releasing the TDIO lock, the callout function
-                                * will set the thread inactive and it will never
-                                * be inserted into the aug-tree (so its bio pushed
-                                * this time will not be dispatched) until it pushes
-                                * further bios
-                                */
-                               bfq_diskctx->bfq_as_hit++;
-                               bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL);
-
-                               if (callout_pending(&bfq_diskctx->bfq_callout))
-                                       callout_stop(&bfq_diskctx->bfq_callout);
-                               bfq_diskctx->bfq_blockon = NULL;
-
-                               /* ref'ed in dequeue(), before resetting callout */
-                               dsched_thread_io_unref(&bfq_tdio->head);
-
-                               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p pushes a new bio when AS\n", bfq_tdio);
-                       }
-               }
-
-               BFQ_UNLOCK(bfq_diskctx);
-       }
-
-       helper_msg_dequeue(bfq_diskctx);
-
-       return 0;
-}
-
-/*
- * bfq_dequeue(): dispatch bios to the disk driver.
- *
- * This function will push as many bios as the number of free slots
- * in the tag queue.
- *
- * In the progress of dispatching, the following events may happen:
- *  - Current thread is timeout: Expire the current thread for
- *    BFQ_REASON_TIMEOUT, and select a new thread to serve in the
- *    wf2q tree.
- *
- *  - Current thread runs out of its budget: Expire the current thread
- *    for BFQ_REASON_OUT_OF_BUDGET, and select a new thread to serve
- *
- *  - Current thread has no further bios in its queue: if the AS feature
- *    is turned on, the bfq scheduler sets an alarm and starts to suspend.
- *    The bfq_timeout() or bfq_queue() calls may resume the scheduler.
- *
- * Implementation note: The bios selected to be dispatched will first
- * be stored in an array bio_do_dispatch. After this function releases
- * all the locks it holds, it will call dsched_strategy_request_polling()
- * for each bio stored.
- *
- * With the help of bfq_disk_ctx->pending_dequeue,
- * there will be only one bfq_dequeue pending on the BFQ_LOCK.
- *
- * lock:
- *     BFQ_LOCK: protect from wf2q_augtree operations in bfq_queue()
- *     THREAD_IO_LOCK: locks the active_tdio. Protect from queue insertions
- *     in bfq_queue; Protect the active_tdio->budget
- *
- * refcount:
- *  If the scheduler decides to suspend, the refcount of active_tdio
- *  increases by 1. The counterpart decreasing is in bfq_queue() and
- *  bfq_timeout()
- * blocking:
- *  May be blocking on the disk driver lock. It depends on drivers.
- *
- * Calling path:
- * The callers could be:
- *     bfq_queue(), bfq_timeout() and the registered polling function.
- *
- *     caller --> helper_msg_dequeue --lwkt_msg--> helper_thread-> me
- *
- */
-void
-bfq_dequeue(struct dsched_disk_ctx *diskctx)
-{
-       int free_slots,
-           bio_index = 0, i,
-           remaining_budget = 0;/* remaining budget of current active process */
-
-       struct bio *bio, *bio_to_dispatch[33];
-       struct bfq_thread_io *active_tdio = NULL;
-       struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
-
-       BFQ_LOCK(bfq_diskctx);
-       atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 1, 0);
-
-       /*
-        * The whole scheduler is waiting for further bios
-        * from process currently being served
-        */
-       if (bfq_diskctx->bfq_blockon != NULL)
-               goto rtn;
-
-       remaining_budget = bfq_diskctx->bfq_remaining_budget;
-       active_tdio = bfq_diskctx->bfq_active_tdio;
-       dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: dequeue: Im in. active_tdio = %p\n", active_tdio);
-
-       free_slots = diskctx->max_tag_queue_depth - diskctx->current_tag_queue_depth;
-       KKASSERT(free_slots >= 0 && free_slots <= 32);
-
-       if (active_tdio)
-               DSCHED_THREAD_IO_LOCK(&active_tdio->head);
-
-       while (free_slots) {
-               /* Here active_tdio must be locked ! */
-               if (active_tdio) {
-                       /*
-                        * the bio_done function has marked the current
-                        * tdio timeout
-                        */
-                       if (active_tdio->maybe_timeout) {
-                               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p time out in dequeue()\n", active_tdio);
-                               wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
-                               bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_TIMEOUT);
-
-                               /* there still exist bios not dispatched,
-                                * reinsert the tdio into aug-tree*/
-                               if (active_tdio->head.qlength > 0) {
-                                       wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
-                                       KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count);
-                               }
-
-                               active_tdio->maybe_timeout = 0;
-                               DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
-                               active_tdio = NULL;
-                               continue;
-                       }
-
-                       /* select next bio to dispatch */
-                       /* TODO: a wiser slection */
-                       KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
-                       bio = TAILQ_FIRST(&active_tdio->head.queue);
-                       dsched_debug(BFQ_DEBUG_NORMAL, "bfq: the first bio in queue of active_tdio %p is %p\n", active_tdio, bio);
-
-                       dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p exists, remaining budget = %d, tdio budget = %d\n, qlength = %d, first bio = %p, first bio cmd = %d, first bio size = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength, bio, bio?bio->bio_buf->b_cmd:-1, bio?bio->bio_buf->b_bcount:-1);
-
-                       /*
-                        * The bio is not read or write, just
-                        * push it down.
-                        */
-                       if (bio && (bio->bio_buf->b_cmd != BUF_CMD_READ) &&
-                           (bio->bio_buf->b_cmd != BUF_CMD_WRITE)) {
-                               dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
-                               KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
-                               TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
-                               active_tdio->head.qlength--;
-                               free_slots--;
-
-#if 0
-                               dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
-#endif
-                               bio_to_dispatch[bio_index++] = bio;
-                               KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
-                               continue;
-                       }
-                       /*
-                        * Run out of budget
-                        * But this is not because the size of bio is larger
-                        * than the complete budget.
-                        * If the size of bio is larger than the complete
-                        * budget, then use a complete budget to cover it.
-                        */
-                       if (bio && (remaining_budget < BIO_SIZE(bio)) &&
-                           (remaining_budget != active_tdio->budget)) {
-                               /* charge budget used */
-                               wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
-                               bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
-                               wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
-                               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: thread %p ran out of budget\n", active_tdio);
-                               DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
-                               active_tdio = NULL;
-                       } else { /* if (bio && remaining_budget < BIO_SIZE(bio) && remaining_budget != active_tdio->budget) */
-
-                               /*
-                                * Having enough budget,
-                                * or having a complete budget and the size of bio
-                                * is larger than that.
-                                */
-                               if (bio) {
-                                       /* dispatch */
-                                       remaining_budget -= BIO_SIZE(bio);
-                                       /*
-                                        * The size of the first bio is larger
-                                        * than the whole budget, we should
-                                        * charge the extra part
-                                        */
-                                       if (remaining_budget < 0)
-                                               wf2q_update_vd(active_tdio, -remaining_budget);
-                                       /* compensate */
-                                       wf2q_update_vd(active_tdio, -remaining_budget);
-                                       /*
-                                        * remaining_budget may be < 0,
-                                        * but to prevent the budget of current tdio
-                                        * to substract a negative number,
-                                        * the remaining_budget has to be >= 0
-                                        */
-                                       remaining_budget = MAX(0, remaining_budget);
-                                       dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
-                                       KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
-                                       TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
-                                       free_slots--;
-                                       active_tdio->head.qlength--;
-                                       active_tdio->bio_dispatched++;
-                                       wf2q_inc_tot_service(&bfq_diskctx->bfq_wf2q, BIO_SIZE(bio));
-                                       dsched_debug(BFQ_DEBUG_VERBOSE,
-                                           "BFQ: %p's bio dispatched, size=%d, remaining_budget = %d\n",
-                                           active_tdio, BIO_SIZE(bio), remaining_budget);
-#if 0
-                                       dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
-#endif
-                                       bio_to_dispatch[bio_index++] = bio;
-                                       KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
-
-                               } else { /* if (bio) */
-
-                                       KKASSERT(active_tdio);
-                                       /*
-                                        * If AS feature is switched off,
-                                        * expire the tdio as well
-                                        */
-                                       if ((remaining_budget <= 0) ||
-                                           !(bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ||
-                                           !active_tdio->tdio_as_switch) {
-                                               active_tdio->budget -= remaining_budget;
-                                               wf2q_update_vd(active_tdio, active_tdio->budget);
-                                               bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
-                                               DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
-                                               active_tdio = NULL;
-                                       } else {
-
-                                               /* no further bio, wait for a while */
-                                               bfq_diskctx->bfq_blockon = active_tdio;
-                                               /*
-                                                * Increase ref count to ensure that
-                                                * tdio will not be destroyed during waiting.
-                                                */
-                                               dsched_thread_io_ref(&active_tdio->head);
-                                               /*
-                                                * If the tdio is seeky but not thingking for
-                                                * too long, we wait for it a little shorter
-                                                */
-                                               if (active_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES && BFQ_TDIO_SEEKY(active_tdio))
-                                                       callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT_MIN, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);
-                                               else
-                                                       callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);
-
-                                               /* save the start time of blocking */
-                                               getmicrotime(&active_tdio->as_start_time);
-
-                                               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: blocked on %p, remaining_budget = %d\n", active_tdio, remaining_budget);
-                                               DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
-                                               goto save_and_rtn;
-                                       }
-                               }
-                       }
-               } else { /* if (active_tdio) */
-                       /* there is no active tdio */
-
-                       /* no pending bios at all */
-                       active_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q);
-
-                       if (!active_tdio) {
-                               KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count == 0);
-                               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: no more eligible tdio!\n");
-                               goto save_and_rtn;
-                       }
-
-                       /*
-                        * A new tdio is picked,
-                        * initialize the service related statistic data
-                        */
-                       DSCHED_THREAD_IO_LOCK(&active_tdio->head);
-                       active_tdio->service_received = 0;
-
-                       /*
-                        * Reset the maybe_timeout flag, which
-                        * may be set by a biodone after the the service is done
-                        */
-                       getmicrotime(&active_tdio->service_start_time);
-                       active_tdio->maybe_timeout = 0;
-
-                       remaining_budget = active_tdio->budget;
-                       dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p selected, remaining budget = %d, tdio budget = %d\n, qlength = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength);
-               }
-
-       }/* while (free_slots) */
-
-       /* reach here only when free_slots == 0 */
-       if (active_tdio) /* && lockcount(&active_tdio->head.lock) > 0) */
-               DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
-
-save_and_rtn:
-       /* save the remaining budget */
-       bfq_diskctx->bfq_remaining_budget = remaining_budget;
-       bfq_diskctx->bfq_active_tdio = active_tdio;
-rtn:
-       BFQ_UNLOCK(bfq_diskctx);
-       /*dispatch the planned bios*/
-       for (i = 0; i < bio_index; i++)
-               dsched_strategy_request_polling(diskctx->dp, bio_to_dispatch[i], diskctx);
-
-}
-
-/*
- * bfq_slow_tdio(): decide whether a tdio is slow
- *
- * This function decides whether a tdio is slow by the speed
- * estimated from the current time slice start time: if the
- * tdio is not fast enough to consume its budget (or 2/3
- * its budget) within the time slice, it is judged slow.
- *
- * Called by bfq_expire()
- *
- * lock:
- *  THREAD_IO_LOCK is expected to be held.
- * refcount:
- *     none
- *
- */
-static int
-bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
-{
-       /**
-        * A tdio is considered slow if it can not finish its budget
-        * at its current average speed
-        */
-       uint64_t usec_elapsed, service_received, speed;
-       int expect;
-       struct timeval tv = bfq_tdio->last_request_done_time;
-
-       timevalsub (&tv, &bfq_tdio->service_start_time);
-       usec_elapsed = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);
-
-       /* discard absurd value */
-       if (usec_elapsed < 20000)
-               return 0;
-
-       service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
-       speed = service_received / usec_elapsed;
-       expect = (speed * BFQ_SLICE_TIMEOUT * (1000 * 1000 / hz)) >> BFQ_FIXPOINT_SHIFT;
-
-       if (expect < 0) {
-               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: overflow on calculating slow_tdio\n");
-               return 0;
-       }
-
-       if (expect < bfq_tdio->budget * 2 / 3) {
-               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: %p is judged slow\n", bfq_tdio);
-               return 1;
-       }
-
-       return 0;
-}
-
-/*
- * bfq_expire(): expire a tdio for a given reason.
- *
- * Different amount of the new budget will be assign to the expired
- * tdio according to the following reasons:
- *
- * BFQ_REASON_TIMEOUT:
- *  The tdio does not consume its budget up within BFQ_SLICE_TIMEOUT ticks.
- *  We shall update the disk peak rate if the tdio is not seeky. The new
- *  budget will be the budget it actually consumes during this time
- *  slice.
- *
- * BFQ_REASON_TOO_IDLE:
- *  The tdio does not push any further bios during the scheduler is
- *  suspending. To ensure low global latency, this tdio should be
- *  punished by assign it the minimum budget. But if the tdio's not
- *  pushing any bio is because it is waiting for the dispatched bios
- *  to be done, we just keep the budget unchanged.
- *
- * BFQ_REASON_OUT_OF_BUDGET:
- *     The tdio runs out of its budget within the time slice. It usually
- *     indicates that the tdio is doing well. We increase the budget of it.
- *
- * lock:
- *  THREAD_IO_LOCK is expected to be held.
- *  BFQ_LOCK is expected to be held (needed by bfq_update_peak_rate()).
- *
- * refcount: none
- *
- * Callers: bfq_timeout(), bfq_dequeue()
- *
- */
-static void
-bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason)
-{
-       int max_budget = bfq_diskctx->bfq_max_budget,
-               budget_left,
-               bio_in_flight,
-               service_received;
-
-       service_received = bfq_tdio->service_received;
-       budget_left = bfq_tdio->budget - bfq_tdio->service_received;
-
-       if (budget_left < 0) {
-               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: budget down flow: %d, %d\n", bfq_tdio->budget, bfq_tdio->service_received);
-               budget_left = 0;
-       }
-
-       KKASSERT(budget_left >= 0);
-
-       switch (reason) {
-               case BFQ_REASON_TIMEOUT:
-                       /* the tdio is not seeky so that we can update
-                        * the disk peak rate based on the service received
-                        * by the tdio
-                        */
-                       if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
-                           (!BFQ_TDIO_SEEKY(bfq_tdio)))
-                               bfq_update_peak_rate(bfq_diskctx, bfq_tdio);
-
-                       /* max_budget may be updated */
-                       max_budget = bfq_diskctx->bfq_max_budget;
-
-                       /* update budget to service_received*/
-                       bfq_tdio->budget = MAX(service_received, BFQ_DEFAULT_MIN_BUDGET);
-
-                       break;
-
-               case BFQ_REASON_TOO_IDLE:
-                       /*
-                        * the tdio is too slow, charge full budget
-                        */
-                       if (bfq_slow_tdio(bfq_diskctx, bfq_tdio))
-                               wf2q_update_vd(bfq_tdio, budget_left);
-
-                       bio_in_flight = bfq_tdio->bio_dispatched - bfq_tdio->bio_completed;
-                       KKASSERT(bio_in_flight >= 0);
-                       /*
-                        * maybe the tdio pushes no bio
-                        * because it is waiting for some bios
-                        * dispatched to be done, in this case
-                        * we do not reduce the budget too harshly
-                        */
-                       if (bio_in_flight > 0) {
-                               bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, service_received);
-                       } else {
-#if 0
-                               bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, bfq_diskctx->bfq_max_budget / BFQ_MIN_BUDGET_FACTOR);
-#endif
-                               bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
-                       }
-
-                       break;
-               case BFQ_REASON_OUT_OF_BUDGET:
-
-                       if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
-                           (!BFQ_TDIO_SEEKY(bfq_tdio)))
-                               bfq_update_peak_rate(bfq_diskctx, bfq_tdio);
-
-                       /* increase the budget */
-                       if (bfq_tdio->budget < BFQ_BUDGET_MULTIPLE_THRESHOLD)
-                               bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget * 2);
-                       else
-                               bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget + BFQ_BUDG_INC_STEP);
-                       break;
-               default:
-                       break;
-       }
-}
-
-/*
- * bfq_update_peak_rate(): update the peak disk speed by sampling
- * the throughput within a time slice.
- *
- * lock:
- *  BFQ_LOCK is expected to be held
- *
- * refcount:
- *     none
- *
- * Caller: bfq_expire()
- */
-static void
-bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
-{
-       struct timeval tv = bfq_tdio->last_request_done_time;
-       uint64_t usec, service_received, peak_rate;
-
-
-       timevalsub (&tv, &bfq_tdio->service_start_time);
-       usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);
-
-       /* discard absurd value */
-       if (usec < 2000 || usec > (BFQ_SLICE_TIMEOUT * (1000 / hz) * 1000)) {
-               dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: absurd interval for peak rate\n");
-               return;
-       }
-
-       service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
-       peak_rate = service_received / usec;
-       bfq_diskctx->bfq_peak_rate = (peak_rate + 7 * bfq_diskctx->bfq_peak_rate) / 8;
-       bfq_diskctx->bfq_peak_rate_samples++;
-
-       /* update the max_budget according to the peak rate */
-       if (bfq_diskctx->bfq_peak_rate_samples > BFQ_VALID_MIN_SAMPLES) {
-               bfq_diskctx->bfq_peak_rate_samples = BFQ_VALID_MIN_SAMPLES;
-               /*
-                * if the auto max budget adjust is disabled,
-                * the bfq_max_budget will always be BFQ_DEFAULT_MAX_BUDGET;
-                */
-               if (bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) {
-                       bfq_diskctx->bfq_max_budget =
-                               (uint32_t)((BFQ_SLICE_TIMEOUT * (1000 / hz) * bfq_diskctx->bfq_peak_rate * 1000) >> BFQ_FIXPOINT_SHIFT);
-                       dsched_debug(BFQ_DEBUG_NORMAL, "max budget updated to %d\n", bfq_diskctx->bfq_max_budget);
-               }
-       }
-}
-
-/*
- * bfq_update_tdio_seek_avg(): update the average seek distance of a
- * tdio.
- *
- * lock:
- *     THREAD_IO_LOCK is expected to be held.
- *
- * refcount:
- *  none
- *
- * Caller: bfq_queue()
- */
-static void
-bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp)
-{
-       off_t seek;
-
-       /* the first bio it dispatches,
-        * we do not calculate the seek_avg,
-        * just update the last_seek_end
-        */
-       if (bfq_tdio->seek_samples == 0) {
-               ++bfq_tdio->seek_samples;
-               goto rtn;
-       }
-
-       seek = ABS(bp->bio_offset - bfq_tdio->last_seek_end);
-
-       /*
-        * we do not do seek_samples++,
-        * because the seek_total may overflow if seek_total += seek,
-        */
-       bfq_tdio->seek_samples = (7 * bfq_tdio->seek_samples + 256) / 8;
-       bfq_tdio->seek_total = (7 * bfq_tdio->seek_total + 256 * seek) / 8;
-       bfq_tdio->seek_avg = (bfq_tdio->seek_total + bfq_tdio->seek_samples / 2) / bfq_tdio->seek_samples;
-
-       dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: tdio %p seek_avg updated to %" PRIu64 "\n", bfq_tdio, bfq_tdio->seek_avg);
-
-rtn:
-       bfq_tdio->last_seek_end = bp->bio_offset + BIO_SIZE(bp);
-}
-
-/*
- * bfq_update_tdio_ttime_avg(): update the average thinking time
- * of a tdio.
- *
- * The thinking time is used to switch on / off the tdio's AS feature
- *
- * lock:
- *  THREAD_IO_LOCK is expected to be held.
- *
- * refcount:
- *  none
- *
- * Caller:
- *  bfq_queue()
- *
- */
-static void
-bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio)
-{
-       struct timeval tv, after_start;
-       uint64_t usec;
-
-       if (bfq_tdio->ttime_samples == 0) {
-               ++bfq_tdio->ttime_samples;
-               return;
-       }
-
-       getmicrotime(&tv);
-       after_start = bfq_tdio->last_request_done_time;
-
-#if 0
-       timevalsub (&tv, &bfq_tdio->last_request_done_time);
-#endif
-       /*
-        * Try the interval between two bios are pushed,
-        * instead of between last_request_done_time and
-        * the current time.
-        */
-
-       timevalsub (&tv, &bfq_tdio->last_bio_pushed_time);
-
-       timevalsub (&after_start, &bfq_tdio->service_start_time);
-
-       /*
-        * tv.tv_sec < 0 means the last reauest done time is
-        * after the current time.
-        * this may happen because the biodone function is not blocked
-        *
-        * after_start.tv_sec < 0 means that the last bio done happens
-        * before the current service slice, and we should drop this value.
-        */
-       if (tv.tv_sec < 0 || after_start.tv_sec < 0)
-               return;
-
-       usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);
-
-       bfq_tdio->ttime_samples = (7 * bfq_tdio->ttime_samples + 256) / 8;
-       bfq_tdio->ttime_total = (7 * bfq_tdio->ttime_total + 256 * usec) / 8;
-       bfq_tdio->ttime_avg = (bfq_tdio->ttime_total + 128) / bfq_tdio->ttime_samples;
-
-}
-
-/*
- * This function will also update the bfq_max_time_slice field
- *
- * tv: the timeval structure representing the length of time slice
- */
-static void
-bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv)
-{
-       uint32_t msec;
-
-       msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 );
-
-       if (msec > 3 * BFQ_SLICE_TIMEOUT * (1000 / hz))
-               atomic_add_int(&bfq_diskctx->bfq_high_time_slice_count, 1);
-
-       bfq_diskctx->bfq_avg_time_slice =
-               (7 * bfq_diskctx->bfq_avg_time_slice + msec) / 8;
-
-       if (bfq_diskctx->bfq_max_time_slice < msec)
-               bfq_diskctx->bfq_max_time_slice = msec;
-}
-/*
- * This function will also update the bfq_as_max_wait field
- * flag: BFQ_AS_STAT_ALL, BFQ_AS_STAT_ONLY_MISS
- *
- */
-static void
-bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag)
-{
-       struct timeval tv;
-       uint32_t msec;
-       getmicrotime(&tv);
-       timevalsub (&tv, &bfq_tdio->as_start_time);
-
-       /* approximately divide 1000 by left shift 10 */
-       msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 );
-
-       /* ridiculous value */
-       if (msec > 10000) {
-               dsched_debug(BFQ_DEBUG_NORMAL, "bfq: ridiculous as wait time!\n");
-               return;
-       }
-
-       if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz))
-               atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count, 1);
-
-       if (flag & BFQ_AS_STAT_ALL) {
-               bfq_diskctx->bfq_as_avg_wait_all =
-                       (7 * bfq_diskctx->bfq_as_avg_wait_all + msec) / 8;
-       }
-
-       if (flag & BFQ_AS_STAT_ONLY_MISS) {
-               bfq_diskctx->bfq_as_avg_wait_miss =
-                       (7 * bfq_diskctx->bfq_as_avg_wait_miss + msec) / 8;
-       }
-
-       /* update the maximum waiting time */
-       if (bfq_diskctx->bfq_as_max_wait < msec)
-               bfq_diskctx->bfq_as_max_wait = msec;
-
-       return;
-}
-
-static int
-bfq_mod_handler(module_t mod, int type, void *unused)
-{
-       static struct sysctl_ctx_list sysctl_ctx;
-       static struct sysctl_oid *oid;
-       static char version[16];
-       int error;
-
-       ksnprintf(version, sizeof(version), "%d.%d",
-                       dsched_bfq_version_maj, dsched_bfq_version_min);
-
-       switch (type) {
-       case MOD_LOAD:
-               bzero(&bfq_stats, sizeof(struct dsched_bfq_stats));
-               if ((error = dsched_register(&dsched_bfq_policy)))
-                       return (error);
-
-               sysctl_ctx_init(&sysctl_ctx);
-               oid = SYSCTL_ADD_NODE(&sysctl_ctx,
-                   SYSCTL_STATIC_CHILDREN(_dsched),
-                   OID_AUTO,
-                   "bfq",
-                   CTLFLAG_RD, 0, "");
-               bfq_mod_oid = oid;
-
-               SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
-                   OID_AUTO, "version", CTLFLAG_RD, version, 0, "bfq version");
-               helper_init_global();
-
-               kprintf("BFQ scheduler policy version %d.%d loaded. sizeof(bfq_thread_io) = %zu\n",
-                   dsched_bfq_version_maj, dsched_bfq_version_min, sizeof(struct bfq_thread_io));
-               break;
-
-       case MOD_UNLOAD:
-               if ((error = dsched_unregister(&dsched_bfq_policy)))
-                       return (error);
-               sysctl_ctx_free(&sysctl_ctx);
-               kprintf("BFQ scheduler policy unloaded\n");
-               break;
-
-       default:
-               break;
-       }
-
-       return 0;
-}
-
-int
-bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS)
-{
-       struct bfq_disk_ctx *bfq_diskctx = arg1;
-       int as_switch, error;
-
-       as_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ? 1 : 0);
-       error = sysctl_handle_int(oidp, &as_switch, 0, req);
-       if (error || !req->newptr)
-               return error;
-
-       if (as_switch == 1)
-               bfq_diskctx->bfq_flag |= BFQ_FLAG_AS;
-       else if (as_switch == 0)
-               bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AS);
-       else
-               return 0;
-
-       return error;
-}
-
-int
-bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS)
-{
-       struct bfq_disk_ctx *bfq_diskctx = arg1;
-       int auto_max_budget_switch, error;
-       auto_max_budget_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) ? 1 : 0);
-       error = sysctl_handle_int(oidp, &auto_max_budget_switch, 0, req);
-       if (error || !req->newptr)
-               return error;
-
-       if (auto_max_budget_switch == 1)
-               bfq_diskctx->bfq_flag |= BFQ_FLAG_AUTO_MAX_BUDGET;
-       else if (auto_max_budget_switch == 0)
-               bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AUTO_MAX_BUDGET);
-       else
-               return 0;
-
-       return error;
-}
-
-DSCHED_POLICY_MODULE(dsched_bfq, bfq_mod_handler, 1);
diff --git a/sys/kern/dsched/bfq/bfq.h b/sys/kern/dsched/bfq/bfq.h
deleted file mode 100644 (file)
index 2ba8d59..0000000
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
- *
- * This code is derived from software contributed to The DragonFly Project
- * by Brills Peng <brillsp@gmail.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- * 3. Neither the name of The DragonFly Project nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific, prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
- * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-
-#ifndef _DSCHED_BFQ_H_
-#define _DSCHED_BFQ_H_
-
-#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
-
-#ifndef _SYS_QUEUE_H_
-#include <sys/queue.h>
-#endif
-
-#ifndef _SYS_BIO_H_
-#include <sys/bio.h>
-#endif
-
-#ifndef _SYS_BIOTRACK_H_
-#include <sys/biotrack.h>
-#endif
-
-#ifndef _SYS_SPINLOCK_H_
-#include <sys/spinlock.h>
-#endif
-
-#ifndef _SYS_TREE_H_
-#include <sys/tree.h>
-#endif
-
-#ifndef _SYS_DSCHED_H_
-#include <sys/dsched.h>
-#endif
-
-#ifndef _DSCHED_BFQ_WF2Q_H_
-#include <kern/dsched/bfq/wf2q.h>
-#endif
-
-struct wf2q_t;
-
-struct bfq_thread_io {
-       struct dsched_thread_io head;
-       RB_ENTRY(bfq_thread_io) entry;
-       int budget;     /* The budget of a thread */
-       int vd;         /* Virtual deadline (finish time) */
-       int ve;         /* Virtual eligible time (start time) */
-       int min_vd;     /* Minimum vd among the sub trees, used for augmented rb-tree */
-       int weight;     /* Weight of the thread, the higher, the more
-                          chance to be dispatched the thread will have */
-
-       volatile int maybe_timeout;     /* a flag indicating that the tdio may
-                                         expire, only when active_tdio = this is it valid */
-       int tdio_as_switch;
-
-       /* Statistic data */
-       off_t   last_seek_end;  /* the end point of seeking of the last bio
-                                                          pushed down */
-       uint32_t seek_samples;  /* averange seek length samples */
-       off_t   seek_avg;       /* averange seek length, fixed point */
-       off_t   seek_total;
-
-       uint32_t ttime_samples; /* averange think time samples */
-       uint64_t ttime_avg;     /* averange think time, usec */
-       uint64_t ttime_total;
-
-       struct timeval service_start_time; /* the time when the first request
-                                                 of the current service period is dispatched */
-       struct timeval last_request_done_time;  /* the time when the last
-                                                  request is done */
-       struct timeval as_start_time;   /* the start time of AS waiting */
-       struct timeval last_bio_pushed_time;
-
-       uint32_t service_received;      /* the amount of read/write during
-                                          the time slice */
-       uint32_t bio_dispatched;        /* number of bios dispatched during
-                                          the current period */
-       uint32_t bio_completed;         /* number of bios completed during
-                                          the current period */
-};
-
-struct bfq_disk_ctx {
-       struct dsched_disk_ctx head;
-
-       struct lock bfq_lock;
-
-       struct callout bfq_callout;     /* the blocking-timer callout */
-       struct wf2q_t bfq_wf2q;         /* the wf2q scheduler */
-
-       struct bfq_thread_io *bfq_blockon;      /* waiting on any */
-       struct bfq_thread_io *bfq_active_tdio;  /* currently active tdio */
-
-       int pending_dequeue; /* number of dequeue() calls pending
-                               on BFQ_LOCK */
-
-       int bfq_max_budget;
-       int bfq_remaining_budget; /* remaining budget of the current tdio */
-
-       uint32_t bfq_flag; /* SEE BFQ_FLAG_* define for all flags */
-
-       /* Statistic data */
-       uint32_t bfq_peak_rate_samples; /* peak rate samples */
-       uint64_t bfq_peak_rate;         /* peak rate, fixed point */
-
-       int bfq_as_miss;
-       int bfq_as_hit;
-
-       uint32_t bfq_as_avg_wait_miss;  /* average AS waiting time for
-                                          only AS miss, ms */
-       uint32_t bfq_as_avg_wait_all;   /* average AS waiting time for all, ms */
-       uint32_t bfq_as_max_wait;       /* maximum AS waiting time, ms */
-       uint32_t bfq_as_max_wait2;      /* maximum AS waiting time(from callout), ms */
-
-       int bfq_as_high_wait_count; /* the number of times when AS waiting time
-                                      is longer than 5 * BFQ_T_WAIT_MIN (50ms now) */
-       int bfq_as_high_wait_count2; /* the number of times when AS waiting
-                                       time is longer than 5 * BFQ_T_WAIT_MIN (50ms now) */
-
-       uint32_t bfq_avg_time_slice;    /* average time slice length, ms */
-       uint32_t bfq_max_time_slice;    /* maximum time slice length, ms */
-       int bfq_high_time_slice_count;  /* the number of times when a time slice
-                                           is longer than 5 * BFQ_SLICE_TIMEOUT */
-
-       struct sysctl_ctx_list bfq_sysctl_ctx; /* bfq statistics interface
-                                                 with sysctl */
-       /* helper thread and its lwkt message cache and port*/
-       struct thread *helper_thread;
-       struct objcache *helper_msg_cache;
-       struct lwkt_port helper_msg_port;
-};
-
-enum bfq_expire_reason {
-       BFQ_REASON_TIMEOUT = 0,
-       BFQ_REASON_TOO_IDLE,
-       BFQ_REASON_OUT_OF_BUDGET,
-       BFQ_REASON_NO_MORE_REQ
-};
-
-#define BFQ_FLAG_AS 0x01
-#define BFQ_FLAG_AUTO_MAX_BUDGET 0x02
-
-#define BFQ_TDIO_SEEKY(x) (((x)->seek_avg) > (1024 * SECT_SIZE))
-
-#define BFQ_LOCKINIT(x)                        \
-               lockinit(&(x)->bfq_lock, "bfqwf2q", 0, LK_CANRECURSE);
-
-#define BFQ_LOCK(x)    do {            \
-               dsched_disk_ctx_ref(&(x)->head);        \
-               lockmgr(&(x)->bfq_lock, LK_EXCLUSIVE);  \
-       } while(0)
-
-#define BFQ_UNLOCK(x)  do {            \
-               lockmgr(&(x)->bfq_lock, LK_RELEASE);    \
-               dsched_disk_ctx_unref(&(x)->head);      \
-       } while(0)
-
-#define SECT_SIZE 512 /* XXX: DEV_BSIZE? */
-#define BFQ_DEBUG_CRITICAL 1
-#define BFQ_DEBUG_NORMAL 2
-#define BFQ_DEBUG_VERBOSE 3
-#define BFQ_DEFAULT_MAX_BUDGET (1024*512) /* 1024 sectors / 0.2sec */
-#define BFQ_DEFAULT_MIN_BUDGET (32*512) /* 32 sectors / 0.2sec */
-#define BFQ_BUDG_INC_STEP (1*128*512) /* The linear increasing step of budget */
-
-/* If the budget is larger than this threshold,
- * it will get linear increment, else,
- * it will get exponential increment.*/
-#define BFQ_BUDGET_MULTIPLE_THRESHOLD (256*512)
-
-#define BFQ_DEFAULT_WEIGHT 1
-
-/* Get the size of a bio */
-#define BIO_SIZE(x) ((x)->bio_buf->b_bcount)
-
-/* Anticipatory waiting time (ticks) ~ 20ms, min ~ 10ms */
-#define BFQ_T_WAIT ((hz/50) > 5 ? (hz/50) : 5)
-
-#define BFQ_T_WAIT_MIN ((hz/100 > 0) ? (hz/100) : 1)
-
-/* Time slice for each service period ~200ms (ticks) */
-#define BFQ_SLICE_TIMEOUT (hz/5)
-
-#define BFQ_FIXPOINT_SHIFT 10 /* fixed point arithmetic shift */
-
-#define BFQ_VALID_MIN_SAMPLES 80 /* minimum number of samples */
-
-#define ABS(x) (((x) < 0) ? (-(x)) : (x))
-
-/* as statistics define */
-#define BFQ_AS_STAT_ALL 0x1
-#define BFQ_AS_STAT_ONLY_MISS 0x2
-
-/* functions helper thread calls */
-void bfq_timeout(void *);
-void bfq_dequeue(struct dsched_disk_ctx *);
-void bfq_helper_destroy_tdio(struct dsched_thread_io *, struct bfq_disk_ctx *);
-
-/* sysctl handlers, registered in the helper thread */
-int bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS);
-int bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS);
-
-#endif /* _KERNEL || _KERNEL_STRUCTURES */
-struct dsched_bfq_stats {
-       int32_t as_missed;
-       int32_t as_hit;
-       int32_t as_fake;
-       int32_t unused;
-};
-#endif /*_DSCHED_BFQ_H_ */
diff --git a/sys/kern/dsched/bfq/bfq_helper_thread.c b/sys/kern/dsched/bfq/bfq_helper_thread.c
deleted file mode 100644 (file)
index 6a5bffe..0000000
+++ /dev/null
@@ -1,455 +0,0 @@
-/*
- * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
- *
- * This code is derived from software contributed to The DragonFly Project
- * by Brills Peng <brillsp@gmail.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- * 3. Neither the name of The DragonFly Project nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific, prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
- * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-
-/*
- * bfq_helper_thread.c:
- * Thread function of the helper thread and
- * message sending routines.
- *
- * XXX: The current approach of serializing using lwkt messages is suboptimal.
- *     The idea is to replace it with way more fine-grained and lockless
- *     accesses spread all over the place. It makes things more complicated,
- *     but it will also improve performance significantly.
- *
- * The sysctl node of bfq is also initialized
- * here.
- */
-
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/proc.h>
-#include <sys/sysctl.h>
-#include <sys/buf.h>
-#include <sys/conf.h>
-#include <sys/diskslice.h>
-#include <sys/disk.h>
-#include <sys/malloc.h>
-#include <machine/md_var.h>
-#include <sys/ctype.h>
-#include <sys/syslog.h>
-#include <sys/device.h>
-#include <sys/msgport.h>
-#include <sys/msgport2.h>
-#include <sys/mplock2.h>
-#include <sys/buf2.h>
-#include <sys/dsched.h>
-#include <sys/fcntl.h>
-#include <machine/varargs.h>
-
-#include <kern/dsched/bfq/bfq.h>
-#include <kern/dsched/bfq/bfq_helper_thread.h>
-
-extern struct sysctl_oid *bfq_mod_oid;
-extern struct dsched_policy dsched_bfq_policy;
-
-static void helper_thread(struct bfq_disk_ctx *bfq_diskctx);
-static int helper_msg_exec(helper_msg_t msg);
-static void helper_sysctl_init(struct bfq_disk_ctx *bfq_diskctx);
-
-MALLOC_DEFINE(M_HELPER, "bfq", "BFQ helper thread message allocations");
-
-/*
- * All threads share one dispose port
- */
-static struct lwkt_port helper_dispose_port;
-
-/* XXX: should be an mpipe */
-static struct objcache_malloc_args helper_msg_malloc_args = {
-       sizeof(struct helper_msg), M_HELPER };
-
-
-static helper_msg_t
-helper_msg_get(struct bfq_disk_ctx *bfq_diskctx)
-{
-       /*
-        * XXX: wait is OK?
-        */
-       return objcache_get(bfq_diskctx->helper_msg_cache, M_WAITOK);
-}
-
-static int
-helper_msg_put(struct bfq_disk_ctx *bfq_diskctx, helper_msg_t msg)
-{
-       objcache_put(bfq_diskctx->helper_msg_cache, msg);
-       return 0;
-}
-
-static void
-helper_msg_autofree_reply(lwkt_port_t port, lwkt_msg_t msg)
-{
-       helper_msg_t hm = (helper_msg_t)msg;
-       helper_msg_put(hm->bfq_diskctx, (helper_msg_t)msg);
-}
-
-/*
- * Initialize the dispose port. All helper threads share this port.
- * Must be called only once, and before any helper thread being created.
- *
- * Called by bfq.c: bfq_moc_handler()
- */
-void
-helper_init_global(void)
-{
-       lwkt_initport_replyonly(&helper_dispose_port, helper_msg_autofree_reply);
-}
-
-/*
- * Helper thread initialization function:
- * initialize the per-disk objcache and create the
- * helper thread.
- *
- * Called by bfq.c:bfq_prepare()
- */
-void
-helper_init(struct bfq_disk_ctx *bfq_diskctx)
-{
-       struct thread *phelper_thread;
-
-       bfq_diskctx->helper_msg_cache = objcache_create("bfq-helper-msg-cache", 0, 0,
-                       NULL, NULL, NULL,
-                       objcache_malloc_alloc,
-                       objcache_malloc_free,
-                       &helper_msg_malloc_args);
-
-       lwkt_create((void (*) (void *)) helper_thread, bfq_diskctx,
-                       &phelper_thread, NULL, 0, -1,
-                       "bfq_helper_td_%s", bfq_diskctx->head.dp->d_cdev->si_name);
-
-       bfq_diskctx->helper_thread = phelper_thread;
-}
-
-static void
-helper_msg_send(struct bfq_disk_ctx *bfq_diskctx, uint32_t cmd, helper_msg_t helper_msg)
-{
-       lwkt_port_t port = &bfq_diskctx->helper_msg_port;
-
-       lwkt_initmsg(&helper_msg->hdr, &helper_dispose_port, 0);
-       helper_msg->bfq_diskctx = bfq_diskctx;
-       helper_msg->hdr.u.ms_result = cmd;
-
-       if (port->mpu_td == curthread){
-               helper_msg_exec(helper_msg);
-               lwkt_replymsg(&helper_msg->hdr, 0);
-       } else {
-               lwkt_sendmsg(port, (lwkt_msg_t)helper_msg);
-       }
-}
-
-/*
- * Deallocate the objcache.
- * Called by bfq.c: bfq_teardown()
- */
-void
-helper_uninit(struct bfq_disk_ctx *bfq_diskctx)
-{
-       objcache_destroy(bfq_diskctx->helper_msg_cache);
-}
-
-static void
-helper_sysctl_init(struct bfq_disk_ctx *bfq_diskctx)
-{
-       struct sysctl_oid *oid;
-
-       sysctl_ctx_init(&bfq_diskctx->bfq_sysctl_ctx);
-
-       if (!bfq_mod_oid){
-               kprintf("Failed to create BFQ dev sysctl node!\n");
-               return;
-       }
-
-       oid = SYSCTL_ADD_NODE(&bfq_diskctx->bfq_sysctl_ctx,
-               SYSCTL_CHILDREN(bfq_mod_oid),
-               OID_AUTO,
-               bfq_diskctx->head.dp->d_cdev->si_name,
-               CTLFLAG_RD, 0, "");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "max_budget",
-                       CTLFLAG_RW,
-                       &bfq_diskctx->bfq_max_budget,
-                       0,
-                       "BFQ max budget");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "peak_rate",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_peak_rate,
-                       0,
-                       "BFQ estimated peak rate");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "peak_samples",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_peak_rate_samples,
-                       0,
-                       "BFQ estimated peak rate samples");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "as_miss",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_as_miss,
-                       0,
-                       "BFQ AS miss");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "as_hit",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_as_hit,
-                       0,
-                       "BFQ AS hit");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "as_wait_avg_all",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_as_avg_wait_all,
-                       0,
-                       "BFQ AS waitall");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "as_wait_avg_miss",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_as_avg_wait_miss,
-                       0,
-                       "BFQ AS waitmiss");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "as_wait_max",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_as_max_wait,
-                       0,
-                       "BFQ AS waitmax");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "as_wait_max2",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_as_max_wait2,
-                       0,
-                       "BFQ AS waitmax2");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "as_high_wait_count",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_as_high_wait_count,
-                       0,
-                       "BFQ AS high count");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "as_high_wait_count2",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_as_high_wait_count2,
-                       0,
-                       "BFQ AS high count2");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "avg_time_slice",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_avg_time_slice,
-                       0,
-                       "BFQ average time slice");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "max_time_slice",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_max_time_slice,
-                       0,
-                       "BFQ max time slice");
-
-       SYSCTL_ADD_INT(&bfq_diskctx->bfq_sysctl_ctx,
-                       SYSCTL_CHILDREN(oid),
-                       OID_AUTO,
-                       "high_time_slice_count",
-                       CTLFLAG_RD,
-                       &bfq_diskctx->bfq_high_time_slice_count,
-                       0,
-                       "BFQ high time slice count");
-
-       SYSCTL_ADD_PROC(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
-                       OID_AUTO, "as_switch", CTLTYPE_INT|CTLFLAG_RW,
-                       bfq_diskctx, 0, bfq_sysctl_as_switch_handler, "I", "as_switch");
-
-       SYSCTL_ADD_PROC(&bfq_diskctx->bfq_sysctl_ctx, SYSCTL_CHILDREN(oid),
-                       OID_AUTO, "auto_max_budget_switch", CTLTYPE_INT|CTLFLAG_RW,
-                       bfq_diskctx, 0, bfq_sysctl_auto_max_budget_handler, "I", "amb_switch");
-}
-
-static void
-helper_thread(struct bfq_disk_ctx *bfq_diskctx)
-{
-       int r;
-       helper_msg_t msg;
-
-       dsched_new_policy_thread_tdio(&bfq_diskctx->head, &dsched_bfq_policy);
-
-       lwkt_initport_thread(&bfq_diskctx->helper_msg_port, curthread);
-       dsched_disk_ctx_ref(&bfq_diskctx->head);
-       helper_sysctl_init(bfq_diskctx);
-
-       dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: helper thread created\n");
-#if 0
-       /* XXX: why mplock?! */
-       get_mplock();
-#endif
-
-       for(;;) {
-               msg = (helper_msg_t)lwkt_waitport(&bfq_diskctx->helper_msg_port, 0);
-               dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: helper: msg recv: %d\n", msg->hdr.u.ms_result);
-               r = helper_msg_exec(msg);
-               lwkt_replymsg(&msg->hdr, 0);
-               /*
-                * received BFQ_MSG_KILL
-                */
-               if (r == -1)
-                       break;
-       }
-
-#if 0
-       rel_mplock();
-#endif
-
-       sysctl_ctx_free(&bfq_diskctx->bfq_sysctl_ctx);
-       dsched_disk_ctx_unref(&bfq_diskctx->head);
-       dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: helper: die peacefully\n");
-       lwkt_exit();
-}
-
-static int
-helper_msg_exec(helper_msg_t msg)
-{
-       struct bfq_disk_ctx *bfq_diskctx;
-
-       bfq_diskctx = msg->bfq_diskctx;
-
-
-       switch (msg->hdr.u.ms_result)
-       {
-               case BFQ_MSG_DEQUEUE:
-                       if (atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 0, 1))
-                               bfq_dequeue((struct dsched_disk_ctx *)bfq_diskctx);
-                       break;
-               case BFQ_MSG_AS_TIMEOUT:
-                       bfq_timeout(bfq_diskctx);
-                       break;
-
-               case BFQ_MSG_DESTROY_TDIO:
-                       bfq_helper_destroy_tdio(msg->tdio, bfq_diskctx);
-                       break;
-
-               case BFQ_MSG_KILL:
-                       return -1;
-
-               default:
-                       break;
-       }
-       return 0;
-}
-
-void
-helper_msg_dequeue(struct bfq_disk_ctx *bfq_diskctx)
-{
-       helper_msg_t helper_msg = helper_msg_get(bfq_diskctx);
-
-       helper_msg_send(bfq_diskctx, BFQ_MSG_DEQUEUE, helper_msg);
-}
-
-void
-helper_msg_as_timeout(struct bfq_disk_ctx *bfq_diskctx)
-{
-       helper_msg_t helper_msg = helper_msg_get(bfq_diskctx);
-       /**
-        * For statisticsal use, temporary
-        * ------------------------------
-        */
-       struct bfq_thread_io *bfq_tdio;
-       struct timeval tv;
-       uint32_t msec;
-
-
-       bfq_tdio = bfq_diskctx->bfq_blockon;
-       if (bfq_tdio) {
-               getmicrotime(&tv);
-               timevalsub(&tv, &bfq_tdio->as_start_time);
-               msec = ((uint64_t)(1000000*tv.tv_sec + tv.tv_usec)) >> 10;
-               if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz))
-                       atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count2, 1);
-               if (msec > bfq_diskctx->bfq_as_max_wait2)
-                       bfq_diskctx->bfq_as_max_wait2 = msec;
-       }
-       /* ----------------------------- */
-
-       helper_msg_send(bfq_diskctx, BFQ_MSG_AS_TIMEOUT, helper_msg);
-}
-
-void
-helper_msg_destroy_tdio(struct bfq_disk_ctx *bfq_diskctx, struct dsched_thread_io *tdio)
-{
-       helper_msg_t helper_msg = helper_msg_get(bfq_diskctx);
-
-       helper_msg->tdio = tdio;
-       helper_msg_send(bfq_diskctx, BFQ_MSG_DESTROY_TDIO, helper_msg);
-}
-
-void
-helper_msg_kill(struct bfq_disk_ctx *bfq_diskctx)
-{
-       helper_msg_t helper_msg = helper_msg_get(bfq_diskctx);
-
-       helper_msg_send(bfq_diskctx, BFQ_MSG_KILL, helper_msg);
-}
diff --git a/sys/kern/dsched/bfq/bfq_helper_thread.h b/sys/kern/dsched/bfq/bfq_helper_thread.h
deleted file mode 100644 (file)
index 8df731b..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
- *
- * This code is derived from software contributed to The DragonFly Project
- * by Brills Peng <brillsp@gmail.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- * 3. Neither the name of The DragonFly Project nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific, prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
- * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-
-#ifndef _DSCHED_BFQ_HELPER_THREAD_H_
-#define _DSCHED_BFQ_HELPER_THREAD_H_
-#include <kern/dsched/bfq/bfq.h>
-
-typedef struct helper_msg {
-       struct lwkt_msg hdr;
-       struct bfq_disk_ctx *bfq_diskctx;
-       struct dsched_thread_io *tdio;
-} *helper_msg_t;
-
-enum helper_msg_cmd {
-       BFQ_MSG_DEQUEUE = 1,
-       BFQ_MSG_AS_TIMEOUT,
-       BFQ_MSG_DESTROY_TDIO,
-       BFQ_MSG_KILL
-};
-
-void helper_init_global(void);
-void helper_init(struct bfq_disk_ctx *bfq_diskctx);
-void helper_uninit(struct bfq_disk_ctx *bfq_diskctx);
-void helper_msg_dequeue(struct bfq_disk_ctx *bfq_diskctx);
-void helper_msg_as_timeout(struct bfq_disk_ctx *bfq_diskctx);
-void helper_msg_destroy_tdio(struct bfq_disk_ctx *bfq_diskctx, struct dsched_thread_io *tdio);
-void helper_msg_kill(struct bfq_disk_ctx *bfq_diskctx);
-
-#endif
-
diff --git a/sys/kern/dsched/bfq/bfq_ktr.h b/sys/kern/dsched/bfq/bfq_ktr.h
deleted file mode 100644 (file)
index 54f6f13..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
- *
- * This code is derived from software contributed to The DragonFly Project
- * by Brills Peng <brillsp@gmail.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- * 3. Neither the name of The DragonFly Project nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific, prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
- * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-
-/*
- * Kernel tracing facility definitions for BFQ
- *
- * This header can ONLY be included by bfq.c
- */
-#ifndef _DSCHED_BFQ_BFQ_C_
-#error "bfq_ktr.h should only be included in sys/kern/dsched/bfq/bfq.c!"
-#endif
-
-#ifndef _DSCHED_BFQ_KTR_H_
-#define _DSCHED_BFQ_KTR_H_
-#include <sys/ktr.h>
-
-#if !defined(KTR_DSCHED_BFQ)
-#define KTR_DSCHED_BFQ KTR_ALL
-#endif
-KTR_INFO_MASTER(dsched_bfq);
-
-/* thread created */
-KTR_INFO(KTR_DSCHED_BFQ, dsched_bfq, thread_created, 0, "%p",
-    struct bfq_thread_io *bfq_tdio);
-
-/* average seek distance per thread */
-KTR_INFO(KTR_DSCHED_BFQ, dsched_bfq, thread_seek_avg, 0, "%p: %" PRIu64,
-    struct bfq_thread_io *bfq_tdio, off_t seek_avg);
-
-/* average thinking time per thread */
-KTR_INFO(KTR_DSCHED_BFQ, dsched_bfq, thread_ttime_avg, 0, "%p: %" PRIu64,
-    struct bfq_thread_io *bfq_tdio, off_t ttime_avg);
-
-#endif
-
diff --git a/sys/kern/dsched/bfq/doc/bfq.viki b/sys/kern/dsched/bfq/doc/bfq.viki
deleted file mode 100644 (file)
index 7edccd5..0000000
+++ /dev/null
@@ -1,332 +0,0 @@
-#TITLE: The Budget Fair Queueing Disk Scheduler for DragonFlyBSD
-#INCLUDE: note.sty
-#MAKETITLE
-* Introduction
-The BFQ disk scheduler is invented by Paolo Valente.  The current version
-of BFQ in DragonFlyBSD is implemented according to his technique report[1].
-Also, some additional features are added into the current version,
-they are inspired by the Linux version[2], but are totally written from
-scratch.
-
-Like the CFQ (complete fair queue) disk scheduler under Linux, BFQ is a
-fair queueing scheduler that aims to improve the interactivity and lower
-the latency of the system. Maximize throughput, however, is not the major
-design goal of BFQ. So it is better to switch to BFQ if the computer is for
-desktop usage, in which interactivity eclipses throughput in general.
-
-* Basic Principles of the BFQ Scheduler
-
-** Budget
-
-The core conception of BFQ is the "budget" of every thread. It means the
-maximum amount of service (measured by the size of the I/O requests) that a
-thread can receive when the scheduler is serving it exclusively. Once a
-thread consumes up its budget, it gets off from the scheduler, assigned
-with a new budget, and queued (again) into the fair queue. Then BFQ will
-select another thread to serve exclusively.
-
-** The WF^2Q+ fair queueing algorithm
-
-BFQ is based on a fair queueing algorithm named WF^2Q+. This algorithm was
-first used on routers to fairly dispatch network packets from various
-connections. If we replace the term "packets" and "connections" by "I/O
-requests" and "threads (or processes)", we have reached the basic idea of
-how this algorithm is applied to BFQ scheduler.
-
-The WF^2Q+ algorithm decides which thread to select and to be served by BFQ
-when the last thread runs up its budget. It is based on the term "virtual
-time", which is actually the service offered and received (measured by
-bytes or sectors in implementation). It maintains a global virtual time,
-which is the amount of service offered globally. It also maintains two
-attributes for every thread: the virtual eligible time and the virtual
-deadline. The former one means the total service received while the latter
-one means the expected "time" to be selected, that is, it expects to be
-selected by the algorithm when the global virtual time reaches its
-deadline.
-
-The WF^2Q+ algorithm will always select the thread with minimum deadline
-among the threads whose eligible time is no later than the global virtual
-time. Intuitively, if all threads consume the same amount of budget, they
-will be selected alternately and have a same share of disk distribution; if
-one thread consumes more budget than others, it will get selected fewer. 
-
-* Implementation
-The BFQ scheduler is written on top of the ''dsched'' framework.
-However, more features are needed from ''dsched'' than it could provide:
-the scheduler has to be notified when the disk is idle or about to idle and
-only with this notification can it dispatch further I/O requests to the
-driver. Therefore, before implementing the scheduler itself, request
-polling feature is added to ''dsched'' framework.
-
-** Request polling in dsched
-Before request polling is implemented, the ''dsched'' framework does not
-have a ''dequeue()'' interface for scheduling policy running on top of it.
-Instead, it provides some ''strategy()'' functions for a scheduler to call
-when it "guesses" that the disk may be able to receive more I/O requests.
-
-The request polling feature transfers the guessing work to ''dsched'' by
-maintaining a variable called ''tag_queue_depth'', which is the estimated
-depth of the disk's NCQ or TCQ. A variable called
-''max_tag_queue_depth'' is initialized as the maximum depth of the disk's
-TCQ or NCQ, which can be acquired from the driver.
-
-The request polling feature is not restricted only to BFQ but can be made
-use of by any policy on ''dsched'' framework. To use this feature, a policy
-must:
-    @ Monitor ''current_tag_queue_depth'', and push as many ''bio''s as it
-      can until the depth reaches the maximum value. Monitoring can be
-      achieved by:
-        @ Creating a monitor thread and poll the value periodically (not
-          recommended)
-        @ Monitoring the value when:
-            @ some ''bio''s are done
-            @ some ''bio''s are pushed to the scheduler by ''dsched'''s
-              ''queue()'' interface. Actually, the policy may register a
-              ''polling_func'' callback, being called by ''dsched'' when
-              a ''bio'' dispatched by
-              ''dsched_strategy_request_polling()''is done.
-    @ Use ''dsched_strategy_request_polling()'' to dispatch the ''bio''s.
-      This ''strategy()'' call will decrease the
-      ''current_tag_queue_depth''. Note that unlike
-      ''dsched_strategy_async()'', a policy cannot register a ''biodone()''
-      callback which gets called when the dispatched ''bio'' is done.
-      Instead, if such a callback is needed, the policy should:
-    @ [optional] Register a biodone callback function (type
-      ''dsched_bio_done_t'') by assigning it to ''polling_func'' in
-      the policy structure. Note: this function should not be
-      blocked, (eg. by locks) and should be MPSAFE; this function should
-      not be changed after the ''prepare()'' interface is called.
-
-** The WF^2Q fair queue
-The WF^2Q fair queueing algorithm is implemented in
-''sys/kern/dsched/bfq/wf2q.c''.
-
-To efficiently implement the functions that WF^2Q provides, a data
-structure named "augmented binary tree" is used. With its help, WF^2Q+
-can select a proper thread described above within O(log(N)) time, where N
-is the number of threads in the tree. The inserting and deleting
-operations are scaled  O(log(N)) as well. The detailed information about
-how to implement WF2^Q with augmented tree is in [3].
-
-Before the implementation of BFQ, the ''tree.h'', which contains the
-definition of red-black tree in \DragonFly does not support the augment
-function. Thus the ''tree.h'' from FreeBSD is ported.
-
-** The structure of the BFQ scheduler: helper thread, lwkt message and why
-
-In current version, a helper thread is used to executing the following
-operations:
-
-*** Serialized ''bfq_dequeue()''
-The ''bfq_dequeue()'' function is the core of the BFQ scheduler. It takes
-the responsibility to serve a thread within a preset time slice, dispatche
-''bio''s of that thread and select another thread from the WF^2Q+ fair
-queue when current thread runs out of its budget. It should be called
-whenever the disk is idle or about to idle.
-
-To avoid blocking ''ithreads'' (interrupt threads), we use a helper thread
-to dispatch all bios to the lower driver in current version, that is to
-say, the ''bfq_dequeue()'' function is only called by the helper thread. 
-
-Originally, ''bfq_dequeue()'' could be called by:
-    @ ''dsched_request_polling_biodone()'', which is called by a interrupt
-      thread when a I/O request is done by the hard drive.
-    @ ''bfq_queue()'', after a user thread pushing its bios to the
-      scheduler.
-    @ ''bfq_timeout()'', after the scheduler finishing suspending.
-    @ ''bfq_destroy_tdio()'', when the tdio being destroyed is waited by
-      the scheduler.
-
-Now these callers will uniformly send an lwkt message to the helper thread,
-and all bfq_dequeue() will thus be serialized.
-
-*** Non-blocking ''bfq_timeout()''
-''bfq_timeout()'' needs to acquire BFQ_LOCK, which may cause the calling
-thread, the callout facility to block on it. To avoid this situation,
-in current version a function sending message to the helper thread will
-be called when the callout alarm strikes.
-
-*** Non-blocking ''bfq_destroy_tdio()''
-Due to high latency experienced in some test case (blogbench), we have
-found that blocking on destroying a thread is not healthy. Therefore the
-helper thread now receives message of destroying a tdio and call
-''bfq_destroy_tdio()'' instead. Note that in that function, no operation on
-the destroyed ''thread_io'' structure should be applied, because it may
-have been recycled.
-
-*** Possible Performance Issues
-
-As almost all major scheduler operations are serialized (actually, only
-''bfq_queue()'' and the customized biodone function are exceptions), the
-performance will be not as high as expected, and it is proved in some
-benchmarks. The helper thread seems to be the most possible source of the
-high latency, and this should be fixed in the next version, by refactoring
-all the synchronizing operations and use as few lockings as possible.
-
-
-** How the budget of a thread is adjusted by its behaviors 
-Ideally, equal budgets is excepted to assegned to all threads, and they
-should run out of their budgets immediately. However, the abstract is far
-from the real world conditions. First, a thread could do random I/O which
-is very time consuming. Second, it could be a CPU-intensive thread that
-seldom does I/O and thus consumes its budget very slowly. 
-
-As the BFQ scheduler runs on the service domain and it cares no time domain
-latency issues, the actual performance (and interactivity) could be
-affected by the two types of threads above. As a result, we have to add
-time domain restrictions to all threads to ensure low latency.
-
-First, we assign a preset time slice to every thread and they
-are only served within the interval (200ms). If a thread does not consume
-up its budget, the scheduler will reduce its budget to the amount it has
-consumed in the current time slice. Note that a lower budget does mean that
-lower bandwidth shared, because of the WF^2Q+ algorithm, the thread will be 
-more frequently selected.
-
-Second, if a thread having enough budget pushes no further I/O requests
-even after the whole scheduler suspends to wait a while for it, the budget
-of it will be reduced as well. And if the the thread consumes its budget
-too slow (for example, at current speed, it will only consume less than 2/3
-of its budget), it will be punished by ''charging a full budget''. As a
-result, the time when it is selected next time will be later than expected.
-
-Third, if a thread runs up its budget within the time slice, its budget
-gets increased. There are two types of the increment:
-    @ If the current budget is less than a threshold, it gets doubled, or
-    @ it gets a pre-defined linear increment.
-
-As one can expect, through the process of budget adjusting, every thread
-will be assigned a proper budget to be consumed just in the time slice.
-
-** The AS feature
-It is possible that a thread pushes one ''bio'' and then waits for it to be
-done before pushing another. Although it may be doing sequential I/O, the
-scheduler could misunderstand this behavior and switch to another thread
-too early.
-
-To avoid the above issue, the AS feature is introduced in BFQ: the
-scheduler suspends for a while, when the current serving thread has enough
-budget but no ''bio'' exists in its queue. If the thread pushes one or more
-''bio''s during waiting, the service will not be interrupted after the
-scheduler resumes.
-
-However, if a thread takes too long to "think", it can not enjoy the AS
-feature. This will be described in the next section.
-
-Now the AS feature is implemented with the help of the ''callout''
-facility.
-
-** Additional features: ''ttime_avg'', ''seek_avg'' and ''peak_rate''
-
-*** Average Thinking Time
-''ttime'' means the interval between the time when a thread pushes a
-''bio'' and the time when the last ''bio'' of it is done.
-
-We accumulate the think time and calculate an average value, by which the
-scheduler judges whether a thread takes too long to "think".
-
-If a thread is too "thinking", the AS waiting could be wasting of time,
-thus we turn of the AS feature of such a thread.
-
-*** Average Seek Distance
-''seek_avg'' is calculated by accumulating value ''current_seek_start -
-last_seek_end''. A "seeky" thread tends to have less budget, and the
-scheduler will not sample the disk peak rate after serving it.
-
-*** Disk Peak Rate Estimate
-The peak speed of the hard drive is estimated by the amount I/O done when:
-    @ a thread runs out of its budget
-    @ a not "seeky" thread runs out of its time slice
-
-The peak rate is used to adapt the max budget automatically:
-
-''max_budget = peak_rate * time_slice_length''
-
-** Debug interfaces
-*** ''dsched_debug''
-We have defined three ''dsched_debug'' levels:
-    @ ''BFQ_DEBUG_CRITICAL'': printing errors or warnings.
-    @ ''BFQ_DEBUG_NORMAL'': printing important and non-frequently appearing
-      scheduler decisions.
-    @ ''BFQ_DEBUG_VERBOSE'': printing all scheduler decisions.
-
-*** Kernel Tracing
-Also, we make use of the KTR facility to print the ''seek_avg'' and
-''ttime_avg'' before a thread is destroyed. To enable KTR, add the
-following lines in your kernel configure file:
-
-''options KTR''
-
-''options KTR_DSCHED_BFQ''
-
-*** ''sysctl'' subtree
-BFQ creates a subtree under node ''dsched'' for every device using it. The subtree has the following nodes:
-    @ ''max_budget'': [R/W] global maximum budget; if the auto max budget feature is turned on, this is the automatically adjusted maximum budget.
-    @ ''peak_rate'': [R] Estimated disk speed, unit: 1/1024 byte per microsecond (fixed point representation)
-    @ ''peak_samples'': [R] Valid number of samples that are used to calculate the peak rate. It remains still after reaching 80.
-    @ ''as_miss'': [R] Counter of times that a thread does not push any ''bio'' after AS waiting.
-    @ ''as_hit'': [R] Counter of times that a thread pushes at least one ''bio'' after AS waiting.
-    @ ''as_wait_avg_all'': [R] Average AS waiting time (ms).
-    @ ''as_wait_avg_miss'': [R] Average AS waiting time (ms), when AS is missed.
-    @ ''as_wait_max'': [R] The Maximum AS waiting time (ms), measured in the helper thread.
-    @ ''as_wait_max2'': [R] The Maximum AS waiting time (ms), measured in the ''callout'' callback.
-    @ ''as_high_wait_count'': [R] Counter of times that the scheduler does an AS waiting for longer than 50ms, measured in the helper thread.
-    @ ''as_high_wait_count'': [R] Counter of times that the scheduler does an AS waiting for longer than 50ms, measured in the ''callout'' callback.
-    @ ''avg_time_slice'': [R] Average length of time slice.
-    @ ''max_time_slice'': [R] Maximum length of time slice.
-    @ ''as_switch'': [R/W] Switch controlling the global AS feature.
-    @ ''auto_max_budget_switch'': [R/W] Switch controlling the auto max budget adapting feature. 
-* Tuning
-Now BFQ has two tunable parameters: the global AS switch and the max
-budget.
-** AS feature: on/off
-It is reported that turning AS on may affect the interactivity and increase
-max latency greatly. It is probably due to the over-serialized
-implementation of BFQ. However, the blogbench result shows that turning AS
-on will also increase the throughput greatly.
-
-Suggestion: turn on the AS feature, for it effects little on averate latency.
-** max budget: the advantages/disadvantages of a higher/lower/auto max budget
-One thread could be assigned a budget no more than the max budget. Generally,
-a higher budget means higher throughput because of less operations on WF2Q+
-augtree, while a lower budget force the scheduler cost more on those
-operations.
-
-However, the real world experiments show that a too high budget affects
-interactivity heavily. A too low budget will also cause higher latency, and
-if the budget is less than 64KB (65536), which is smaller than the size of
-some ''bio''s , the scheduler will retrograde to a round-robin scheduler,
-which is not a good form for a disk scheduler.
-
-Suggestions:
-Do not use auto max budget, it is usually too high. A budget of
-1/10 of the automatic max budget may be proper. In general, 512K(default), 256K, 192K 
-can be good. It is better to determine the best max budget by binary
-selecting by the result of some benchmarks.
-
-* Benchmark Results
-See http://leaf.dragonflybsd.org/~brillsp/bfq_bench/bfq_bench.html
-* Known Bugs & Bottlenecks
- @ When switching to another ''dsched'' policy from BFQ, the system may
-   deadlock. (Happens when the sysctl process and the helper thread are on the
-   same CPU.)
- @ Currently, the performance is not so ideal and it is not tested on large
-   number of machines. It is not recommanded to use this version in a
-   productivity environment.
-* Future Plans
-
- @ Rewrite the scheduler to carefully and properly synchronize the operations
-   to acquire better performance
-
- @ Distinguish sync and async ''bio''s, as the async ones takes less time to complete,
-   the budget and the length of time slice should be different from those of
-   the sync ''bio''s.
-
-
-* References
-[1] Paolo Valente, Fabio Checconi, High Throughput Disk Scheduling with Fair Bandwidth Distribution, IEEE Transactions on Computers, vol. 59 no. 9
-
-[2] http://retis.sssup.it/~fabio/linux/bfq/patches/
-
-[3] I. Stoica and H. Abdel-Wahab, Earliest eligible virtual deadline first: A flexible and accurate mechanism for proportional share resource allocation
diff --git a/sys/kern/dsched/bfq/wf2q.c b/sys/kern/dsched/bfq/wf2q.c
deleted file mode 100644 (file)
index c2f5e54..0000000
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
- *
- * This code is derived from software contributed to The DragonFly Project
- * by Brills Peng <brillsp@gmail.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- * 3. Neither the name of The DragonFly Project nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific, prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
- * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * Augment RB-tree for B-WF2Q+ queuing algorithm:
- *  - The key of the binary tree is the virtual eligible time (start time)
- *  - Each node maintains an additional min_vd value, which
- *    is the minimum virtual deadline (finish time) among the node and its
- *    children
- *  - Every operation on the tree changing the childs of a node will
- *    trigger RB_AUGMENT() marco, which change min_vd along the path to
- *    the root
- */
-
-#include <kern/dsched/bfq/wf2q.h>
-
-
-#undef RB_AUGMENT
-#define RB_AUGMENT(x) wf2q_augment_func(x);
-
-static void
-wf2q_augment_func(struct bfq_thread_io *node)
-{
-       struct bfq_thread_io *tmp = node, *tmp2;
-       int min_vd;
-       do{
-               min_vd = tmp->vd;
-               tmp2 = RB_LEFT(tmp, entry);
-               min_vd = tmp2 ? MIN(tmp2->min_vd, min_vd) : min_vd;
-               tmp2 = RB_RIGHT(tmp, entry);
-               min_vd = tmp2 ? MIN(tmp2->min_vd, min_vd) : min_vd;
-               tmp->min_vd = min_vd;
-       }while((tmp = RB_PARENT(tmp,entry)));
-}
-
-/*
- * The rb-tree is indexed by the virtual eligible (start) time
- */
-static int
-bfq_thread_io_cmp(struct bfq_thread_io *a, struct bfq_thread_io *b)
-{
-       if (a->ve - b->ve <= 0)
-               return -1;
-       return 1;
-}
-
-RB_PROTOTYPE(wf2q_augtree_t, bfq_thread_io, entry,);
-RB_GENERATE(wf2q_augtree_t, bfq_thread_io, entry, bfq_thread_io_cmp);
-
-/*
- * The algorithm is from
- *     I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
- *  First: A Flexible and Accurate Mechanism for Proportional Share
- *  Resource Allocation,'' technical report.
- *
- *  http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
- *
- *  - Partition the tree into two parts by ve:
- *  - One part contains nodes with ve smaller than vtime
- *  - The other part contains nodes with ve larger than vtime
- *  - In the first part, find the node with minimum vd, along the
- *    min_vd value path
- *
- *  Returns
- *     NULL, if no node with ve smaller than vtime
- *     or the elegible node with minimum vd.
- */
-static struct bfq_thread_io *
-wf2q_augtree_get_eligible_with_min_vd(struct wf2q_augtree_t *tree, int vtime)
-{
-       struct bfq_thread_io *node = RB_ROOT(tree), *st_tree = NULL, *path_req = NULL;
-       while (node) {
-               if (node->ve <= vtime) {
-                       /* update node with earliest deadline along path. */
-                       if ((!path_req) || (path_req->vd > node->vd))
-                               path_req = node;
-                       /* update root of subtree containing earliest deadline */
-                       if ((!st_tree) || (RB_LEFT(node,entry) && st_tree->min_vd > RB_LEFT(node,entry)->min_vd))
-                               st_tree = RB_LEFT(node,entry);
-                       node = RB_RIGHT(node, entry);
-               } else
-                       node = RB_LEFT(node, entry);
-       }
-       /* check whether node with earliest deadline was along path */
-       if ((!st_tree) || (st_tree->min_vd >= path_req->vd))
-               return path_req;
-       /* return node with earliest deadline from subtree */
-       for (node = st_tree; node; ) {
-               /* if node found, return it */
-               if (st_tree->min_vd == node->vd)
-                       return node;
-               /* XXX: modified temporarily */
-               if (RB_LEFT(node, entry) && node->min_vd == RB_LEFT(node, entry)->min_vd)
-                       node = RB_LEFT(node, entry);
-               else
-                       node = RB_RIGHT(node, entry);
-       }
-       return NULL;
-}
-
-/*
- * This function initializes a wf2q structure
- */
-void
-wf2q_init(struct wf2q_t *pwf2q)
-{
-       RB_INIT(&pwf2q->wf2q_augtree);
-       pwf2q->wf2q_virtual_time = 0;
-       pwf2q->wf2q_tdio_count = 0;
-}
-
-/*
- * Insert a tdio into a wf2q queue.
- * The virtual eligible (start) time and deadline is handled
- * according to the current virtual time (in wf2q_t).
- */
-void
-wf2q_insert_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio)
-{
-       /*
-        * TODO: The anticipatory parts
-        * start time varies on whether the tdio is being waited
-        */
-       tdio->ve = MAX(wf2q->wf2q_virtual_time, tdio->vd);
-       tdio->vd = tdio->ve + tdio->budget / tdio->weight;
-       tdio->min_vd = tdio->vd;
-       RB_INSERT(wf2q_augtree_t, &wf2q->wf2q_augtree, tdio);
-       wf2q->wf2q_tdio_count++;
-}
-
-/*
- * Remove a thread_io struct from the augment tree,
- * called before a thread is destroyed.
- */
-void
-wf2q_remove_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio)
-{
-       RB_REMOVE(wf2q_augtree_t, &wf2q->wf2q_augtree, tdio);
-       wf2q->wf2q_tdio_count--;
-}
-
-/*
- * Increase the current virtual time as services are provided
- */
-void
-wf2q_inc_tot_service(struct wf2q_t *wf2q, int amount)
-{
-       wf2q->wf2q_virtual_time += amount;
-}
-
-/*
- * Update a tdio's virtual deadline as it received service
- */
-void
-wf2q_update_vd(struct bfq_thread_io *tdio, int received_service)
-{
-       tdio->vd = tdio->ve + received_service / tdio->weight;
-}
-
-static void
-wf2q_tree_dump(struct bfq_thread_io *root, int level)
-{
-       int i;
-       if (!root) return;
-       for (i = 0; i < level; i++)
-               kprintf("-");
-       kprintf("vd: %d; ve: %d; min_vd: %d\n", root->vd, root->ve, root->min_vd);
-       wf2q_tree_dump(RB_LEFT(root,entry), level + 1);
-       wf2q_tree_dump(RB_RIGHT(root, entry), level + 1);
-}
-
-/*
- * Get a tdio with minimum virtual deadline and virtual eligible
- * time smaller than the current virtual time.
- * If there is no such tdio, update the current virtual time to
- * the minimum ve in the queue. (And there must be one eligible then)
- */
-struct bfq_thread_io *
-wf2q_get_next_thread_io(struct wf2q_t *wf2q)
-{
-       struct bfq_thread_io *tdio;
-       struct wf2q_augtree_t *tree = &wf2q->wf2q_augtree;
-       if (!(tdio = wf2q_augtree_get_eligible_with_min_vd(tree, wf2q->wf2q_virtual_time))) {
-               tdio = RB_MIN(wf2q_augtree_t, tree);
-               if (!tdio)
-                       return NULL;
-               wf2q->wf2q_virtual_time = tdio->ve;
-               tdio = wf2q_augtree_get_eligible_with_min_vd(tree, wf2q->wf2q_virtual_time);
-       }
-       if (!tdio) {
-               kprintf("!!!wf2q: wf2q_tdio_count=%d\n", wf2q->wf2q_tdio_count);
-               wf2q_tree_dump(RB_ROOT(tree), 0);
-               KKASSERT(0);
-       }
-       RB_REMOVE(wf2q_augtree_t, tree, tdio);
-       wf2q->wf2q_tdio_count--;
-       return tdio;
-}
-
-
diff --git a/sys/kern/dsched/bfq/wf2q.h b/sys/kern/dsched/bfq/wf2q.h
deleted file mode 100644 (file)
index a98994f..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
- *
- * This code is derived from software contributed to The DragonFly Project
- * by Brills Peng <brillsp@gmail.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- * 3. Neither the name of The DragonFly Project nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific, prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
- * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifndef _DSCHED_BFQ_WF2Q_H_
-#define _DSCHED_BFQ_WF2Q_H_
-
-#include <sys/tree.h>
-
-/* struct bfq_thread_io is defined in bfq.h */
-struct bfq_thread_io;
-
-RB_HEAD(wf2q_augtree_t, bfq_thread_io);
-
-struct wf2q_t {
-       struct wf2q_augtree_t wf2q_augtree;
-       int wf2q_virtual_time;
-       int wf2q_tdio_count;
-};
-
-#ifndef _DSCHED_BFQ_H_
-#include <kern/dsched/bfq/bfq.h>
-#endif
-
-void wf2q_init(struct wf2q_t *pwf2q);
-void wf2q_insert_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio);
-void wf2q_remove_thread_io(struct wf2q_t *wf2q, struct bfq_thread_io *tdio);
-void wf2q_update_vd(struct bfq_thread_io *tdio, int received_service);
-struct bfq_thread_io *wf2q_get_next_thread_io(struct wf2q_t *wf2q);
-void wf2q_inc_tot_service(struct wf2q_t *wf2q, int amount);
-
-#endif /* !_DSCHED_BFQ_WF2Q_H_ */
diff --git a/sys/kern/dsched/fq/Makefile b/sys/kern/dsched/fq/Makefile
deleted file mode 100644 (file)
index 5e38e9a..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-KMOD=  dsched_fq
-SRCS=  fq_core.c fq_diskops.c
-
-.include <bsd.kmod.mk>
diff --git a/sys/kern/dsched/fq/flow.txt b/sys/kern/dsched/fq/flow.txt
deleted file mode 100644 (file)
index affe8f2..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-O create process -> assign master proc struct + fqps for each disk present
-O  create disk -> walk all master proc struct and add new fqp for new disk
-O      -> also keep some list on what disks are associated to the scheduler
-  destroy disk -> walk all master proc struct and remove new fqp; cancel all bios
-       -> remove from global list of disks with this scheduler
-  zombie process -> cancel all reads in all fqps; push all writes in all fqps to disk
-O new buf -> assign master proc struct using curthread/curproc
-O queue -> find actual fqp in master proc struct of buf
-
-
-  master proc ref count:
-O      + create process
-O      + new buf
-       - zombie process
-       - queue
-
-  fqp ref count:
-O      + queue
-O      - completed
-
-  master proc list (kept globally):
-O      + create process
-       - zombie process
-
-  fqp list (kept per proc):
-O      + create disk (walk master list, create fqps)
-O      + create process (one fqp per disk on create process)
-       - destroy disk (walk master list, destroy fqps)
-       - zombie process (destroy all fqps in master proc struct)
-
-
-so how do we do buckets?
-option 1)
-       - each process has a member pointing to a bucket
-       - bucket has also a list of all processes that are members
-       - per-disk list of buckets
-       - each bucket is allocated a proportional amount of total throughput
-       - each fqp in each bucket gets a fair share of the bucket's proportional amount
-       => so when do we instantiate them?
-               -> each fq disk has a list of all buckets that exist
-               -> but buckets are marked either active or inactive
-                       -> inactive buckets are skipped and not accounted for
-                          in bandwidth allocation.
-
-
-if fqp has references and is in flight but disk is destroyed?
-       -> set to inactive
-       -> on completion see if it's cancelled (?)
-
-subr_dsched:
-stuff to iterate through disks with a particular scheduler
-
-
-http://pcca.ath.cx/updatedb.php
-
-
-
-
-
-
-
-
-------------------------
-
-calculate budget in respective thread
-Andrew File System Benchmark
-TPC-B database benchmark
diff --git a/sys/kern/dsched/fq/fq.h b/sys/kern/dsched/fq/fq.h
deleted file mode 100644 (file)
index 75c9f0d..0000000
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
- *
- * This code is derived from software contributed to The DragonFly Project
- * by Alex Hornung <ahornung@gmail.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- * 3. Neither the name of The DragonFly Project nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific, prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
- * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#ifndef        _DSCHED_FQ_H_
-#define        _DSCHED_FQ_H_
-
-#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES)
-
-#ifndef _SYS_QUEUE_H_
-#include <sys/queue.h>
-#endif
-#ifndef _SYS_BIO_H_
-#include <sys/bio.h>
-#endif
-#ifndef _SYS_BIOTRACK_H_
-#include <sys/biotrack.h>
-#endif
-#ifndef _SYS_SPINLOCK_H_
-#include <sys/spinlock.h>
-#endif
-
-#define FQ_PRIO_BIAS           5
-#define FQ_PRIO_MAX            10
-#define FQ_PRIO_MIN            1
-#define FQ_PRIO_IDLE           -1
-#define        FQ_BUCKET_ACTIVE        0x01
-
-#define FQ_DISPATCH_SML_ARRAY_SZ       128
-#define FQ_DISPATCH_ARRAY_SZ   512
-
-#define        FQ_DRAIN_CANCEL 0x1
-#define        FQ_DRAIN_FLUSH  0x2
-
-struct disk;
-struct proc;
-
-struct fq_disk_ctx {
-       struct dsched_disk_ctx head;
-
-       struct thread   *td;            /* dispatcher thread td */
-       struct thread   *td_balance;    /* balancer thread td */
-       int     avg_rq_time;            /* XXX: not yet used */
-       int32_t incomplete_tp;          /* IOs issued but not completed */
-       int     idle;                   /* disk idle ? */
-       struct timeval start_idle;      /* disk idleness start time */
-       int     idle_time;              /* aggregate idle time in interval */
-       int     die;                    /* flag to kill related threads */
-       struct timeval start_interval;  /* current interval start time */
-
-       int     prev_full;              /* disk >90% busy during prev. to last
-                                          interval? */
-       int     last_full;              /* disk >90% busy during last interval */
-       int     disk_busy;              /* disk >90% busy during cur. interval */
-       int64_t budgetpb[FQ_PRIO_MAX+1];/* next interval budget for each thread
-                                          in each prio */
-};
-
-struct fq_thread_io {
-       struct dsched_thread_io head;
-
-       int32_t transactions;   /* IOs completed so far during current interval */
-       int32_t avg_latency;    /* avg latency for current interval IOs */
-       int32_t interval_transactions;  /* IOs completed during last interval */
-       int32_t interval_avg_latency;   /* avg latency for last interval IOs */
-       int32_t max_tp;         /* rate limit of transactions per interval */
-       int32_t issued;         /* IOs issued to disk (but not completed) */
-
-       int     rebalance;      /* thread needs to rebalance w/ fq_balance_self */
-};
-
-struct dispatch_prep {
-       struct fq_thread_io     *tdio;
-       struct bio              *bio;
-};
-
-
-void   fq_balance_thread(struct fq_disk_ctx *diskctx);
-void   fq_dispatcher(struct fq_disk_ctx *diskctx);
-biodone_t      fq_completed;
-
-void   fq_dispatch(struct fq_disk_ctx *diskctx, struct bio *bio,
-                       struct fq_thread_io *tdio);
-void   fq_drain(struct fq_disk_ctx *diskctx, int mode);
-void   fq_balance_self(struct fq_thread_io *tdio);
-#endif /* _KERNEL || _KERNEL_STRUCTURES */
-
-
-struct dsched_fq_stats {
-       int32_t procs_limited;
-
-       int32_t transactions;
-       int32_t transactions_completed;
-       int32_t cancelled;
-};
-
-#endif /* _DSCHED_FQ_H_ */
diff --git a/sys/kern/dsched/fq/fq_core.c b/sys/kern/dsched/fq/fq_core.c
deleted file mode 100644 (file)
index 2bd45ff..0000000
+++ /dev/null
@@ -1,430 +0,0 @@
-/*
- * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
- *
- * This code is derived from software contributed to The DragonFly Project
- * by Alex Hornung <ahornung@gmail.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- * 3. Neither the name of The DragonFly Project nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific, prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
- * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/proc.h>
-#include <sys/sysctl.h>
-#include <sys/buf.h>
-#include <sys/conf.h>
-#include <sys/diskslice.h>
-#include <sys/disk.h>
-#include <machine/atomic.h>
-#include <sys/thread.h>
-#include <sys/thread2.h>
-#include <sys/ctype.h>
-#include <sys/buf2.h>
-#include <sys/syslog.h>
-#include <sys/dsched.h>
-#include <machine/param.h>
-
-#include <kern/dsched/fq/fq.h>
-
-static int     dsched_fq_version_maj = 1;
-static int     dsched_fq_version_min = 1;
-
-/* Make sure our structs fit */
-CTASSERT(sizeof(struct fq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ);
-CTASSERT(sizeof(struct fq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ);
-
-struct dsched_fq_stats fq_stats;
-
-extern struct dsched_policy dsched_fq_policy;
-
-void
-fq_dispatcher(struct fq_disk_ctx *diskctx)
-{
-       struct dispatch_prep *dispatch_ary;
-       struct dsched_thread_io *ds_tdio, *ds_tdio2;
-       struct fq_thread_io *tdio;
-       struct bio *bio, *bio2;
-       int idle;
-       int i, prepd_io;
-
-       /*
-        * Array is dangerously big for an on-stack declaration, allocate
-        * it instead.
-        */
-       dispatch_ary = kmalloc(sizeof(*dispatch_ary) * FQ_DISPATCH_ARRAY_SZ,
-                              M_TEMP, M_INTWAIT | M_ZERO);
-
-       /*
-        * We need to manually assign an tdio to the tdctx of this thread
-        * since it isn't assigned one during fq_prepare, as the disk
-        * is not set up yet.
-        */
-       dsched_new_policy_thread_tdio(&diskctx->head, &dsched_fq_policy);
-
-       DSCHED_DISK_CTX_LOCK(&diskctx->head);
-       for(;;) {
-               idle = 0;
-               /*
-                * sleep ~60 ms, failsafe low hz rates.
-                */
-               if ((lksleep(diskctx, &diskctx->head.lock, 0,
-                            "fq_dispatcher", (hz + 14) / 15) == 0)) {
-                       /*
-                        * We've been woken up; this either means that we are
-                        * supposed to die away nicely or that the disk is idle.
-                        */
-
-                       if (__predict_false(diskctx->die == 1))
-                               break;
-
-                       /*
-                        * We have been awakened because the disk is idle.
-                        * So let's get ready to dispatch some extra bios.
-                        */
-                       idle = 1;
-               }
-
-               /* Maybe the disk is idle and we just didn't get the wakeup */
-               if (idle == 0)
-                       idle = diskctx->idle;
-
-               /* Set the number of prepared requests to 0 */
-               i = 0;
-
-               /*
-                * XXX: further room for improvements here. It would be better
-                *      to dispatch a few requests from each tdio as to ensure
-                *      real fairness.
-                */
-               TAILQ_FOREACH_MUTABLE(ds_tdio, &diskctx->head.tdio_list,
-                                     dlink, ds_tdio2) {
-                       tdio = (struct fq_thread_io *)ds_tdio;
-                       if (tdio->head.qlength == 0)
-                               continue;
-
-                       DSCHED_THREAD_IO_LOCK(&tdio->head);
-                       if (atomic_cmpset_int(&tdio->rebalance, 1, 0))
-                               fq_balance_self(tdio);
-                       /*
-                        * XXX: why 5 extra? should probably be dynamic,
-                        *      relying on information on latency.
-                        */
-                       if ((tdio->max_tp > 0) && idle &&
-                           (tdio->issued >= tdio->max_tp)) {
-                               tdio->max_tp += 5;
-                       }
-
-                       prepd_io = 0;
-                       TAILQ_FOREACH_MUTABLE(bio, &tdio->head.queue, link, bio2) {
-                               if (atomic_cmpset_int(&tdio->rebalance, 1, 0))
-                                       fq_balance_self(tdio);
-                               if (((tdio->max_tp > 0) &&
-                                   (tdio->issued + prepd_io >= tdio->max_tp)) ||
-                                   (i == FQ_DISPATCH_ARRAY_SZ))
-                                       break;
-
-                               TAILQ_REMOVE(&tdio->head.queue, bio, link);
-                               --tdio->head.qlength;
-
-                               /*
-                                * beware that we do have an tdio reference
-                                * from the queueing
-                                *
-                                * XXX: note that here we don't dispatch it yet
-                                *      but just prepare it for dispatch so
-                                *      that no locks are held when calling
-                                *      into the drivers.
-                                */
-                               dispatch_ary[i].bio = bio;
-                               dispatch_ary[i].tdio = tdio;
-                               ++i;
-                               ++prepd_io;
-                       }
-                       DSCHED_THREAD_IO_UNLOCK(&tdio->head);
-
-               }
-
-               dsched_disk_ctx_ref(&diskctx->head);
-               DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
-
-               /*
-                * Dispatch all the previously prepared bios, now without
-                * holding any locks.
-                */
-               for (--i; i >= 0; i--) {
-                       bio = dispatch_ary[i].bio;
-                       tdio = dispatch_ary[i].tdio;
-                       fq_dispatch(diskctx, bio, tdio);
-               }
-
-               DSCHED_DISK_CTX_LOCK(&diskctx->head);
-               dsched_disk_ctx_unref(&diskctx->head);
-       }
-
-       /*
-        * If we are supposed to die, drain all queues, then
-        * unlock and exit.
-        */
-       fq_drain(diskctx, FQ_DRAIN_FLUSH);
-       DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
-       kfree(dispatch_ary, M_TEMP);
-
-       kprintf("fq_dispatcher is peacefully dying\n");
-       lwkt_exit();
-       /* NOTREACHED */
-}
-
-void
-fq_balance_thread(struct fq_disk_ctx *diskctx)
-{
-       struct dsched_thread_io *ds_tdio;
-       struct  fq_thread_io    *tdio;
-       struct timeval tv, old_tv;
-       int64_t total_budget, product;
-       int64_t budget[FQ_PRIO_MAX+1];
-       int     n, i, sum, total_disk_time;
-       int     lost_bits;
-
-       DSCHED_DISK_CTX_LOCK(&diskctx->head);
-
-       getmicrotime(&diskctx->start_interval);
-
-       for (;;) {
-               /* sleep ~1s */
-               if ((lksleep(curthread, &diskctx->head.lock, 0, "fq_balancer", hz/2) == 0)) {
-                       if (__predict_false(diskctx->die)) {
-                               DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
-                               lwkt_exit();
-                       }
-               }
-
-               bzero(budget, sizeof(budget));
-               total_budget = 0;
-               n = 0;
-
-               old_tv = diskctx->start_interval;
-               getmicrotime(&tv);
-
-               total_disk_time = (int)(1000000*((tv.tv_sec - old_tv.tv_sec)) +
-                   (tv.tv_usec - old_tv.tv_usec));
-
-               if (total_disk_time == 0)
-                       total_disk_time = 1;
-
-               dsched_debug(LOG_INFO, "total_disk_time = %d\n", total_disk_time);
-
-               diskctx->start_interval = tv;
-
-               diskctx->disk_busy = (100*(total_disk_time - diskctx->idle_time)) / total_disk_time;
-               if (diskctx->disk_busy < 0)
-                       diskctx->disk_busy = 0;
-
-               diskctx->idle_time = 0;
-               lost_bits = 0;
-
-               TAILQ_FOREACH(ds_tdio, &diskctx->head.tdio_list, dlink) {
-                       tdio = (struct fq_thread_io *)ds_tdio;
-                       tdio->interval_avg_latency = tdio->avg_latency;
-                       tdio->interval_transactions = tdio->transactions;
-                       if (tdio->interval_transactions > 0) {
-                               product = (int64_t)tdio->interval_avg_latency *
-                                   tdio->interval_transactions;
-                               product >>= lost_bits;
-                               while(total_budget >= INT64_MAX - product) {
-                                       ++lost_bits;
-                                       product >>= 1;
-                                       total_budget >>= 1;
-                               }
-                               total_budget += product;
-                               ++budget[(tdio->head.p) ? tdio->head.p->p_ionice : 0];
-                               KKASSERT(total_budget >= 0);
-                               dsched_debug(LOG_INFO,
-                                   "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
-                                   n, tdio->interval_avg_latency, tdio->interval_transactions,
-                                   (tdio->head.p) ? tdio->head.p->p_ionice : 0);
-                               ++n;
-                       } else {
-                               tdio->max_tp = 0;
-                       }
-                       tdio->rebalance = 0;
-                       tdio->transactions = 0;
-                       tdio->avg_latency = 0;
-                       tdio->issued = 0;
-               }
-
-               dsched_debug(LOG_INFO, "%d procs competing for disk\n"
-                   "total_budget = %jd (lost bits = %d)\n"
-                   "incomplete tp = %d\n", n, (intmax_t)total_budget,
-                   lost_bits, diskctx->incomplete_tp);
-
-               if (n == 0)
-                       continue;
-
-               sum = 0;
-
-               for (i = 0; i < FQ_PRIO_MAX+1; i++) {
-                       if (budget[i] == 0)
-                               continue;
-                       sum += (FQ_PRIO_BIAS+i)*budget[i];
-               }
-
-               if (sum == 0)
-                       sum = 1;
-
-               dsched_debug(LOG_INFO, "sum = %d\n", sum);
-
-               for (i = 0; i < FQ_PRIO_MAX+1; i++) {
-                       if (budget[i] == 0)
-                               continue;
-
-                       /*
-                        * XXX: if we still overflow here, we really need to switch to
-                        *      some more advanced mechanism such as compound int128 or
-                        *      storing the lost bits so they can be used in the
-                        *      fq_balance_self.
-                        */
-                       diskctx->budgetpb[i] = ((FQ_PRIO_BIAS+i)*total_budget/sum) << lost_bits;
-                       KKASSERT(diskctx->budgetpb[i] >= 0);
-               }
-
-               dsched_debug(4, "disk is %d%% busy\n", diskctx->disk_busy);
-               TAILQ_FOREACH(ds_tdio, &diskctx->head.tdio_list, dlink) {
-                       tdio = (struct fq_thread_io *)ds_tdio;
-                       tdio->rebalance = 1;
-               }
-
-               diskctx->prev_full = diskctx->last_full;
-               diskctx->last_full = (diskctx->disk_busy >= 90)?1:0;
-       }
-}
-
-
-/*
- * fq_balance_self should be called from all sorts of dispatchers. It basically
- * offloads some of the heavier calculations on throttling onto the process that
- * wants to do I/O instead of doing it in the fq_balance thread.
- * - should be called with diskctx lock held
- */
-void
-fq_balance_self(struct fq_thread_io *tdio) {
-       struct fq_disk_ctx *diskctx;
-
-       int64_t budget, used_budget;
-       int64_t avg_latency;
-       int64_t transactions;
-
-       transactions = (int64_t)tdio->interval_transactions;
-       avg_latency = (int64_t)tdio->interval_avg_latency;
-       diskctx = (struct fq_disk_ctx *)tdio->head.diskctx;
-
-#if 0
-       /* XXX: do we really require the lock? */
-       DSCHED_DISK_CTX_LOCK_ASSERT(diskctx);
-#endif
-
-       used_budget = avg_latency * transactions;
-       budget = diskctx->budgetpb[(tdio->head.p) ? tdio->head.p->p_ionice : 0];
-
-       if (used_budget > 0) {
-               dsched_debug(LOG_INFO,
-                   "info: used_budget = %jd, budget = %jd\n",
-                   (intmax_t)used_budget, budget);
-       }
-
-       if ((used_budget > budget) && (diskctx->disk_busy >= 90)) {
-               KKASSERT(avg_latency != 0);
-
-               tdio->max_tp = budget/(avg_latency);
-               atomic_add_int(&fq_stats.procs_limited, 1);
-
-               dsched_debug(LOG_INFO,
-                   "rate limited to %d transactions\n", tdio->max_tp);
-
-       } else if (((used_budget*2 < budget) || (diskctx->disk_busy < 80)) &&
-           (!diskctx->prev_full && !diskctx->last_full)) {
-               tdio->max_tp = 0;
-       }
-}
-
-
-static int
-do_fqstats(SYSCTL_HANDLER_ARGS)
-{
-       return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
-}
-
-static int
-fq_mod_handler(module_t mod, int type, void *unused)
-{
-       static struct sysctl_ctx_list sysctl_ctx;
-       static struct sysctl_oid *oid;
-       static char version[16];
-       int error;
-
-       ksnprintf(version, sizeof(version), "%d.%d",
-           dsched_fq_version_maj, dsched_fq_version_min);
-
-       switch (type) {
-       case MOD_LOAD:
-               bzero(&fq_stats, sizeof(struct dsched_fq_stats));
-               if ((error = dsched_register(&dsched_fq_policy)))
-                       return (error);
-
-               sysctl_ctx_init(&sysctl_ctx);
-               oid = SYSCTL_ADD_NODE(&sysctl_ctx,
-                   SYSCTL_STATIC_CHILDREN(_dsched),
-                   OID_AUTO,
-                   "fq",
-                   CTLFLAG_RD, 0, "");
-
-               SYSCTL_ADD_PROC(&sysctl_ctx, SYSCTL_CHILDREN(oid),
-                   OID_AUTO, "stats", CTLTYPE_OPAQUE|CTLFLAG_RD,
-                   0, 0, do_fqstats, "S,dsched_fq_stats", "fq statistics");
-
-               SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
-                   OID_AUTO, "version", CTLFLAG_RD, version, 0, "fq version");
-
-               kprintf("FQ scheduler policy version %d.%d loaded\n",
-                   dsched_fq_version_maj, dsched_fq_version_min);
-               break;
-
-       case MOD_UNLOAD:
-               if ((error = dsched_unregister(&dsched_fq_policy)))
-                       return (error);
-               sysctl_ctx_free(&sysctl_ctx);
-               kprintf("FQ scheduler policy unloaded\n");
-               break;
-
-       default:
-               break;
-       }
-
-       return 0;
-}
-
-DSCHED_POLICY_MODULE(dsched_fq, fq_mod_handler, 1);
diff --git a/sys/kern/dsched/fq/fq_diskops.c b/sys/kern/dsched/fq/fq_diskops.c
deleted file mode 100644 (file)
index c10dcc2..0000000
+++ /dev/null
@@ -1,349 +0,0 @@
-/*
- * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
- *
- * This code is derived from software contributed to The DragonFly Project
- * by Alex Hornung <ahornung@gmail.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- * 3. Neither the name of The DragonFly Project nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific, prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
- * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/proc.h>
-#include <sys/sysctl.h>
-#include <sys/buf.h>
-#include <sys/conf.h>
-#include <sys/diskslice.h>
-#include <sys/disk.h>
-#include <machine/atomic.h>
-#include <sys/thread.h>
-#include <sys/thread2.h>
-#include <sys/ctype.h>
-#include <sys/buf2.h>
-#include <sys/syslog.h>
-#include <sys/dsched.h>
-#include <machine/param.h>
-
-#include <kern/dsched/fq/fq.h>
-
-static dsched_prepare_t                fq_prepare;
-static dsched_teardown_t       fq_teardown;
-static dsched_cancel_t         fq_cancel;
-static dsched_queue_t          fq_queue;
-
-extern struct dsched_fq_stats  fq_stats;
-
-struct dsched_policy dsched_fq_policy = {
-       .name = "fq",
-
-       .prepare = fq_prepare,
-       .teardown = fq_teardown,
-       .cancel_all = fq_cancel,
-       .bio_queue = fq_queue
-};
-
-static int
-fq_prepare(struct dsched_disk_ctx *ds_diskctx)
-{
-       struct  fq_disk_ctx     *diskctx = (struct fq_disk_ctx *)ds_diskctx;
-       struct thread *td_core, *td_balance;
-
-       lwkt_create((void (*)(void *))fq_dispatcher, diskctx, &td_core,
-                   NULL, 0, -1, "fq_dispatch_%s",
-                   ds_diskctx->dp->d_cdev->si_name);
-       lwkt_create((void (*)(void *))fq_balance_thread, diskctx, &td_balance,
-                   NULL, 0, -1, "fq_balance_%s",
-                   ds_diskctx->dp->d_cdev->si_name);
-       diskctx->td_balance = td_balance;
-
-       return 0;
-}
-
-
-
-static void
-fq_teardown(struct dsched_disk_ctx *ds_diskctx)
-{
-       struct fq_disk_ctx *diskctx = (struct fq_disk_ctx *)ds_diskctx;
-       KKASSERT(diskctx != NULL);
-
-       /* Basically kill the dispatcher thread */
-       diskctx->die = 1;
-       wakeup(diskctx->td_balance);
-       wakeup(diskctx);
-       tsleep(diskctx, 0, "fq_dispatcher", hz/5); /* wait 200 ms */
-       wakeup(diskctx->td_balance);
-       wakeup(diskctx);
-       tsleep(diskctx, 0, "fq_dispatcher", hz/10); /* wait 100 ms */
-       wakeup(diskctx->td_balance);
-       wakeup(diskctx);
-}
-
-
-/* Must be called with locked diskctx */
-void
-fq_drain(struct fq_disk_ctx *diskctx, int mode)
-{
-       struct dsched_thread_io *ds_tdio, *ds_tdio2;
-       struct fq_thread_io *tdio;
-       struct bio *bio, *bio2;
-
-       TAILQ_FOREACH_MUTABLE(ds_tdio, &diskctx->head.tdio_list, dlink, ds_tdio2) {
-               tdio = (struct fq_thread_io *)ds_tdio;
-               if (tdio->head.qlength == 0)
-                       continue;
-
-               DSCHED_THREAD_IO_LOCK(&tdio->head);
-               TAILQ_FOREACH_MUTABLE(bio, &tdio->head.queue, link, bio2) {
-                       TAILQ_REMOVE(&tdio->head.queue, bio, link);
-                       --tdio->head.qlength;
-                       if (__predict_false(mode == FQ_DRAIN_CANCEL)) {
-                               /* FQ_DRAIN_CANCEL */
-                               dsched_cancel_bio(bio);
-                               atomic_add_int(&fq_stats.cancelled, 1);
-
-                               /* Release ref acquired on fq_queue */
-                               /* XXX: possible failure point */
-                               dsched_thread_io_unref(&tdio->head);
-                       } else {
-                               /* FQ_DRAIN_FLUSH */
-                               fq_dispatch(diskctx, bio, tdio);
-                       }
-               }
-               DSCHED_THREAD_IO_UNLOCK(&tdio->head);
-       }
-       return;
-}
-
-static void
-fq_cancel(struct dsched_disk_ctx *ds_diskctx)
-{
-       struct fq_disk_ctx      *diskctx = (struct fq_disk_ctx *)ds_diskctx;
-
-       KKASSERT(diskctx != NULL);
-
-       /*
-        * all bios not in flight are queued in their respective tdios.
-        * good thing we have a list of tdios per disk diskctx.
-        */
-       DSCHED_DISK_CTX_LOCK(&diskctx->head);
-       fq_drain(diskctx, FQ_DRAIN_CANCEL);
-       DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
-}
-
-
-static int
-fq_queue(struct dsched_disk_ctx *ds_diskctx, struct dsched_thread_io *ds_tdio, struct bio *obio)
-{
-       struct bio *b_dispatch_ary[FQ_DISPATCH_SML_ARRAY_SZ];
-       struct bio *bio, *bio2;
-       struct fq_thread_io     *tdio;
-       struct fq_disk_ctx      *diskctx;
-       int max_tp, transactions;
-       int i;
-
-       /* We don't handle flushes, let dsched dispatch them */
-       if (__predict_false(obio->bio_buf->b_cmd == BUF_CMD_FLUSH))
-               return (EINVAL);
-
-       tdio = (struct fq_thread_io *)ds_tdio;
-       diskctx = (struct fq_disk_ctx *)ds_diskctx;
-
-       if (atomic_cmpset_int(&tdio->rebalance, 1, 0))
-               fq_balance_self(tdio);
-
-       max_tp = tdio->max_tp;
-       transactions = tdio->issued;
-
-       /* | No rate limiting || Hasn't reached limit rate | */
-       if ((max_tp == 0) || (transactions < max_tp)) {
-               /*
-                * Process pending bios from previous _queue() actions that
-                * have been rate-limited and hence queued in the tdio.
-                */
-               KKASSERT(tdio->head.qlength >= 0);
-
-               if (tdio->head.qlength > 0) {
-                       i = 0;
-
-                       DSCHED_THREAD_IO_LOCK(&tdio->head);
-
-                       TAILQ_FOREACH_MUTABLE(bio, &tdio->head.queue, link, bio2) {
-                               /* Rebalance ourselves if required */
-                               if (atomic_cmpset_int(&tdio->rebalance, 1, 0))
-                                       fq_balance_self(tdio);
-                               if ((tdio->max_tp > 0) &&
-                                   (tdio->issued + i >= tdio->max_tp))
-                                       break;
-                               if (i == FQ_DISPATCH_SML_ARRAY_SZ)
-                                       break;
-
-                               TAILQ_REMOVE(&tdio->head.queue, bio, link);
-                               --tdio->head.qlength;
-
-                               /*
-                                * beware that we do have an tdio reference from the
-                                * queueing
-                                *
-                                * XXX: note that here we don't dispatch the BIOs yet
-                                *      but just prepare them for dispatch so that
-                                *      later they are pushed down to the driver
-                                *      without holding locks.
-                                */
-                               b_dispatch_ary[i++] = bio;
-                       }
-
-                       DSCHED_THREAD_IO_UNLOCK(&tdio->head);
-
-                       /*
-                        * Now dispatch all the prepared BIOs without holding
-                        * the thread_io lock.
-                        */
-                       for (--i; i >= 0; i--)
-                               fq_dispatch(diskctx, b_dispatch_ary[i], tdio);
-               }
-
-               /* Nothing is pending from previous IO, so just pass it down */
-               dsched_thread_io_ref(&tdio->head);
-
-               fq_dispatch(diskctx, obio, tdio);
-       } else {
-               /*
-                * This thread has exceeeded its fair share,
-                * the transactions are now rate limited. At
-                * this point, the rate would be exceeded, so
-                * we just queue requests instead of
-                * despatching them.
-                */
-               DSCHED_THREAD_IO_LOCK(&tdio->head);
-               dsched_thread_io_ref(&tdio->head);
-
-               /*
-                * Prioritize reads by inserting them at the front of the
-                * queue.
-                *
-                * XXX: this might cause issues with data that should
-                *      have been written and is being read, but hasn't
-                *      actually been written yet.
-                */
-               if (obio->bio_buf->b_cmd == BUF_CMD_READ)
-                       TAILQ_INSERT_HEAD(&tdio->head.queue, obio, link);
-               else
-                       TAILQ_INSERT_TAIL(&tdio->head.queue, obio, link);
-
-               ++tdio->head.qlength;
-               DSCHED_THREAD_IO_UNLOCK(&tdio->head);
-       }
-
-       return 0;
-}
-
-
-void
-fq_completed(struct bio *bp)
-{
-       struct bio *obio;
-       int     delta;
-       struct fq_thread_io     *tdio;
-       struct fq_disk_ctx      *diskctx;
-       struct disk     *dp;
-       int transactions, latency;
-
-       struct timeval tv;
-
-       getmicrotime(&tv);
-
-       dp = dsched_get_bio_dp(bp);
-       diskctx = dsched_get_disk_priv(dp);
-       tdio = dsched_get_bio_priv(bp);
-       KKASSERT(tdio != NULL);
-       KKASSERT(diskctx != NULL);
-
-       dsched_disk_ctx_ref(&diskctx->head);
-       atomic_subtract_int(&diskctx->incomplete_tp, 1);
-
-       if (!(bp->bio_buf->b_flags & B_ERROR)) {
-               /*
-                * Get the start ticks from when the bio was dispatched and calculate
-                * how long it took until completion.
-                */
-               delta = (int)(1000000*((tv.tv_sec - bp->bio_caller_info3.tv.tv_sec)) +
-                   (tv.tv_usec - bp->bio_caller_info3.tv.tv_usec));
-               if (delta <= 0)
-                       delta = 10000; /* default assume 10 ms */
-
-               /* This is the last in-flight request and the disk is not idle yet */
-               if ((diskctx->incomplete_tp <= 1) && (!diskctx->idle)) {
-                       diskctx->idle = 1;      /* Mark disk as idle */
-                       diskctx->start_idle = tv;       /* Save start idle time */
-                       wakeup(diskctx);                /* Wake up fq_dispatcher */
-               }
-               transactions = atomic_fetchadd_int(&tdio->transactions, 1);
-               latency = tdio->avg_latency;
-
-               if (latency != 0) {
-                       /* Moving averager, ((n-1)*avg_{n-1} + x) / n */
-                       latency = (int)(((int64_t)(transactions) *
-                           (int64_t)latency + (int64_t)delta) / ((int64_t)transactions + 1));
-                       KKASSERT(latency > 0);
-               } else {
-                       latency = delta;
-               }
-
-               tdio->avg_latency = latency;
-
-               atomic_add_int(&fq_stats.transactions_completed, 1);
-       }
-
-       dsched_disk_ctx_unref(&diskctx->head);
-       /* decrease the ref count that was bumped for us on dispatch */
-       dsched_thread_io_unref(&tdio->head);
-
-       obio = pop_bio(bp);
-       biodone(obio);
-}
-
-void
-fq_dispatch(struct fq_disk_ctx *diskctx, struct bio *bio,
-    struct fq_thread_io *tdio)
-{
-       struct timeval tv;
-
-       if (diskctx->idle) {
-               getmicrotime(&tv);
-               atomic_add_int(&diskctx->idle_time,
-                   (int)(1000000*((tv.tv_sec - diskctx->start_idle.tv_sec)) +
-                   (tv.tv_usec - diskctx->start_idle.tv_usec)));
-               diskctx->idle = 0;
-       }
-       dsched_strategy_async(diskctx->head.dp, bio, fq_completed, tdio);
-
-       atomic_add_int(&tdio->issued, 1);
-       atomic_add_int(&diskctx->incomplete_tp, 1);
-       atomic_add_int(&fq_stats.transactions, 1);
-}
index 2a2fc39..a15a457 100644 (file)
 #include <sys/tree.h>
 #include <sys/syslink_rpc.h>
 #include <sys/proc.h>
-#include <machine/stdarg.h>
-#include <sys/devfs.h>
 #include <sys/dsched.h>
+#include <sys/devfs.h>
+
+#include <machine/stdarg.h>
 
 #include <sys/thread2.h>
 #include <sys/mplock2.h>
@@ -379,9 +380,7 @@ dev_dstrategy(cdev_t dev, struct bio *bio)
            track = &dev->si_track_write;
        bio_track_ref(track);
        bio->bio_track = track;
-
-       if (dsched_is_clear_buf_priv(bio->bio_buf))
-               dsched_new_buf(bio->bio_buf);
+       dsched_buf_enter(bio->bio_buf); /* might stack */
 
        KKASSERT((bio->bio_flags & BIO_DONE) == 0);
        if (needmplock) {
index bf6daf2..5142822 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2009, 2010, 2015 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Alex Hornung <ahornung@gmail.com>
 #include <sys/fcntl.h>
 #include <machine/varargs.h>
 
-TAILQ_HEAD(tdio_list_head, dsched_thread_io);
-
-MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");
-
-static dsched_prepare_t                noop_prepare;
-static dsched_teardown_t       noop_teardown;
-static dsched_cancel_t         noop_cancel;
-static dsched_queue_t          noop_queue;
-
-static void dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio);
-static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
-static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
-static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
-static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);
-
-static struct dsched_thread_io *dsched_thread_io_alloc(
-               struct disk *dp, struct dsched_thread_ctx *tdctx,
-               struct dsched_policy *pol, int tdctx_locked);
-
-static int     dsched_inited = 0;
-static int     default_set = 0;
-
-struct lock    dsched_lock;
-static int     dsched_debug_enable = 0;
-
-struct dsched_stats    dsched_stats;
-
-struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
-       DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
-struct objcache_malloc_args dsched_thread_io_malloc_args = {
-       DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
-struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
-       DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };
-
-static struct objcache *dsched_diskctx_cache;
-static struct objcache *dsched_tdctx_cache;
-static struct objcache *dsched_tdio_cache;
-
-struct lock    dsched_tdctx_lock;
-
-static struct dsched_policy_head dsched_policy_list =
-               TAILQ_HEAD_INITIALIZER(dsched_policy_list);
-
-static struct dsched_policy dsched_noop_policy = {
-       .name = "noop",
-
-       .prepare = noop_prepare,
-       .teardown = noop_teardown,
-       .cancel_all = noop_cancel,
-       .bio_queue = noop_queue
-};
-
-static struct dsched_policy *default_policy = &dsched_noop_policy;
-
-/*
- * dsched_debug() is a SYSCTL and TUNABLE controlled debug output function
- * using kvprintf
- */
-int
-dsched_debug(int level, char *fmt, ...)
-{
-       __va_list ap;
-
-       __va_start(ap, fmt);
-       if (level <= dsched_debug_enable)
-               kvprintf(fmt, ap);
-       __va_end(ap);
-
-       return 0;
-}
-
 /*
  * Called on disk_create()
  * tries to read which policy to use from loader.conf, if there's
  * none specified, the default policy is used.
  */
 void
-dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
+dsched_disk_create(struct disk *dp, const char *head_name, int unit)
 {
-       char tunable_key[SPECNAMELEN + 48];
-       char sched_policy[DSCHED_POLICY_NAME_LENGTH];
-       char *ptr;
-       struct dsched_policy *policy = NULL;
-
-       /* Also look for serno stuff? */
-       lockmgr(&dsched_lock, LK_EXCLUSIVE);
-
-       ksnprintf(tunable_key, sizeof(tunable_key),
-                 "dsched.policy.%s%d", head_name, unit);
-       if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
-           sizeof(sched_policy)) != 0) {
-               policy = dsched_find_policy(sched_policy);
-       }
-
-       ksnprintf(tunable_key, sizeof(tunable_key),
-                 "dsched.policy.%s", head_name);
-
-       for (ptr = tunable_key; *ptr; ptr++) {
-               if (*ptr == '/')
-                       *ptr = '-';
-       }
-       if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
-           sizeof(sched_policy)) != 0)) {
-               policy = dsched_find_policy(sched_policy);
-       }
-
-       ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
-       if (!policy && !default_set &&
-           (TUNABLE_STR_FETCH(tunable_key, sched_policy,
-                              sizeof(sched_policy)) != 0)) {
-               policy = dsched_find_policy(sched_policy);
-       }
-
-       if (!policy) {
-               if (!default_set && bootverbose) {
-                       dsched_debug(0,
-                                    "No policy for %s%d specified, "
-                                    "or policy not found\n",
-                                    head_name, unit);
-               }
-               dsched_set_policy(dp, default_policy);
-       } else {
-               dsched_set_policy(dp, policy);
-       }
-
-       if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
-               ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
-       else
-               ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
-       for (ptr = tunable_key; *ptr; ptr++) {
-               if (*ptr == '/')
-                       *ptr = '-';
-       }
-       dsched_sysctl_add_disk(
-           (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
-           tunable_key);
-
-       lockmgr(&dsched_lock, LK_RELEASE);
 }
 
 /*
@@ -197,34 +67,8 @@ dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
  * there's any policy associated with the serial number of the device.
  */
 void
-dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
+dsched_disk_update(struct disk *dp, struct disk_info *info)
 {
-       char tunable_key[SPECNAMELEN + 48];
-       char sched_policy[DSCHED_POLICY_NAME_LENGTH];
-       struct dsched_policy *policy = NULL;
-
-       if (info->d_serialno == NULL)
-               return;
-
-       lockmgr(&dsched_lock, LK_EXCLUSIVE);
-
-       ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
-           info->d_serialno);
-
-       if((TUNABLE_STR_FETCH(tunable_key, sched_policy,
-           sizeof(sched_policy)) != 0)) {
-               policy = dsched_find_policy(sched_policy);      
-       }
-
-       if (policy) {
-               dsched_switch(dp, policy);      
-       }
-
-       dsched_sysctl_add_disk(
-           (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
-           info->d_serialno);
-
-       lockmgr(&dsched_lock, LK_RELEASE);
 }
 
 /*
@@ -232,1210 +76,6 @@ dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
  * shuts down the scheduler core and cancels all remaining bios
  */
 void
-dsched_disk_destroy_callback(struct disk *dp)
-{
-       struct dsched_policy *old_policy;
-       struct dsched_disk_ctx *diskctx;
-
-       lockmgr(&dsched_lock, LK_EXCLUSIVE);
-
-       diskctx = dsched_get_disk_priv(dp);
-
-       old_policy = dp->d_sched_policy;
-       dp->d_sched_policy = &dsched_noop_policy;
-       old_policy->cancel_all(dsched_get_disk_priv(dp));
-       old_policy->teardown(dsched_get_disk_priv(dp));
-
-       if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
-               sysctl_ctx_free(&diskctx->sysctl_ctx);
-
-       policy_destroy(dp);
-       atomic_subtract_int(&old_policy->ref_count, 1);
-       KKASSERT(old_policy->ref_count >= 0);
-
-       lockmgr(&dsched_lock, LK_RELEASE);
-}
-
-
-/*
- * Caller must have dp->diskctx locked
- */
-void
-dsched_queue(struct disk *dp, struct bio *bio)
-{
-       struct dsched_thread_ctx        *tdctx;
-       struct dsched_thread_io         *tdio;
-       struct dsched_disk_ctx          *diskctx;
-       int     error;
-
-       if (dp->d_sched_policy == &dsched_noop_policy) {
-               dsched_clr_buf_priv(bio->bio_buf);
-               atomic_add_int(&dsched_stats.no_tdctx, 1);
-               dsched_strategy_raw(dp, bio);
-               return;
-       }
-
-       error = 0;
-       tdctx = dsched_get_buf_priv(bio->bio_buf);
-       if (tdctx == NULL) {
-               /* We don't handle this case, let dsched dispatch */
-               atomic_add_int(&dsched_stats.no_tdctx, 1);
-               dsched_strategy_raw(dp, bio);
-               return;
-       }
-
-       DSCHED_THREAD_CTX_LOCK(tdctx);
-
-       /*
-        * XXX:
-        * iterate in reverse to make sure we find the most up-to-date
-        * tdio for a given disk. After a switch it may take some time
-        * for everything to clean up.
-        */
-       TAILQ_FOREACH_REVERSE(tdio, &tdctx->tdio_list, tdio_list_head, link) {
-               if (tdio->dp == dp) {
-                       dsched_thread_io_ref(tdio);
-                       break;
-               }
-       }
-       if (tdio == NULL) {
-               tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy, 1);
-               dsched_thread_io_ref(tdio);
-       }
-
-       DSCHED_THREAD_CTX_UNLOCK(tdctx);
-       dsched_clr_buf_priv(bio->bio_buf);
-       dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */
-
-       diskctx = dsched_get_disk_priv(dp);
-       dsched_disk_ctx_ref(diskctx);
-
-       if (dp->d_sched_policy != &dsched_noop_policy)
-               KKASSERT(tdio->debug_policy == dp->d_sched_policy);
-
-       KKASSERT(tdio->debug_inited == 0xF00F1234);
-
-       error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);
-
-       if (error) {
-               dsched_strategy_raw(dp, bio);
-       }
-       dsched_disk_ctx_unref(diskctx);
-       dsched_thread_io_unref(tdio);
-}
-
-
-/*
- * Called from each module_init or module_attach of each policy
- * registers the policy in the local policy list.
- */
-int
-dsched_register(struct dsched_policy *d_policy)
-{
-       struct dsched_policy *policy;
-       int error = 0;
-
-       lockmgr(&dsched_lock, LK_EXCLUSIVE);
-
-       policy = dsched_find_policy(d_policy->name);
-
-       if (!policy) {
-               TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
-               atomic_add_int(&d_policy->ref_count, 1);
-       } else {
-               dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
-                   d_policy->name);
-               error = EEXIST;
-       }
-
-       lockmgr(&dsched_lock, LK_RELEASE);
-       return error;
-}
-
-/*
- * Called from each module_detach of each policy
- * unregisters the policy
- */
-int
-dsched_unregister(struct dsched_policy *d_policy)
-{
-       struct dsched_policy *policy;
-
-       lockmgr(&dsched_lock, LK_EXCLUSIVE);
-       policy = dsched_find_policy(d_policy->name);
-
-       if (policy) {
-               if (policy->ref_count > 1) {
-                       lockmgr(&dsched_lock, LK_RELEASE);
-                       return EBUSY;
-               }
-               TAILQ_REMOVE(&dsched_policy_list, policy, link);
-               atomic_subtract_int(&policy->ref_count, 1);
-               KKASSERT(policy->ref_count == 0);
-       }
-       lockmgr(&dsched_lock, LK_RELEASE);
-
-       return 0;
-}
-
-
-/*
- * switches the policy by first removing the old one and then
- * enabling the new one.
- */
-int
-dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
-{
-       struct dsched_policy *old_policy;
-
-       /* If we are asked to set the same policy, do nothing */
-       if (dp->d_sched_policy == new_policy)
-               return 0;
-
-       /* lock everything down, diskwise */
-       lockmgr(&dsched_lock, LK_EXCLUSIVE);
-       old_policy = dp->d_sched_policy;
-
-       atomic_subtract_int(&old_policy->ref_count, 1);
-       KKASSERT(old_policy->ref_count >= 0);
-
-       dp->d_sched_policy = &dsched_noop_policy;
-       old_policy->teardown(dsched_get_disk_priv(dp));
-       policy_destroy(dp);
-
-       /* Bring everything back to life */
-       dsched_set_policy(dp, new_policy);
-       lockmgr(&dsched_lock, LK_RELEASE);
-
-       return 0;
-}
-
-
-/*
- * Loads a given policy and attaches it to the specified disk.
- * Also initializes the core for the policy
- */
-void
-dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
-{
-       int locked = 0;
-
-       /* Check if it is locked already. if not, we acquire the devfs lock */
-       if ((lockstatus(&dsched_lock, curthread)) != LK_EXCLUSIVE) {
-               lockmgr(&dsched_lock, LK_EXCLUSIVE);
-               locked = 1;
-       }
-
-       DSCHED_GLOBAL_THREAD_CTX_LOCK();
-
-       policy_new(dp, new_policy);
-       new_policy->prepare(dsched_get_disk_priv(dp));
-       dp->d_sched_policy = new_policy;
-       atomic_add_int(&new_policy->ref_count, 1);
-
-       DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
-
-       kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
-           new_policy->name);
-
-       /* If we acquired the lock, we also get rid of it */
-       if (locked)
-               lockmgr(&dsched_lock, LK_RELEASE);
-}
-
-struct dsched_policy*
-dsched_find_policy(char *search)
-{
-       struct dsched_policy *policy;
-       struct dsched_policy *policy_found = NULL;
-       int locked = 0;
-
-       /* Check if it is locked already. if not, we acquire the devfs lock */
-       if ((lockstatus(&dsched_lock, curthread)) != LK_EXCLUSIVE) {
-               lockmgr(&dsched_lock, LK_EXCLUSIVE);
-               locked = 1;
-       }
-
-       TAILQ_FOREACH(policy, &dsched_policy_list, link) {
-               if (!strcmp(policy->name, search)) {
-                       policy_found = policy;
-                       break;
-               }
-       }
-
-       /* If we acquired the lock, we also get rid of it */
-       if (locked)
-               lockmgr(&dsched_lock, LK_RELEASE);
-
-       return policy_found;
-}
-
-/*
- * Returns ref'd disk
- */
-struct disk *
-dsched_find_disk(char *search)
-{
-       struct disk marker;
-       struct disk *dp = NULL;
-
-       while ((dp = disk_enumerate(&marker, dp)) != NULL) {
-               if (strcmp(dp->d_cdev->si_name, search) == 0) {
-                       disk_enumerate_stop(&marker, NULL);
-                       /* leave ref on dp */
-                       break;
-               }
-       }
-       return dp;
-}
-
-struct disk *
-dsched_disk_enumerate(struct disk *marker, struct disk *dp,
-                     struct dsched_policy *policy)
-{
-       while ((dp = disk_enumerate(marker, dp)) != NULL) {
-               if (dp->d_sched_policy == policy)
-                       break;
-       }
-       return NULL;
-}
-
-struct dsched_policy *
-dsched_policy_enumerate(struct dsched_policy *pol)
-{
-       if (!pol)
-               return (TAILQ_FIRST(&dsched_policy_list));
-       else
-               return (TAILQ_NEXT(pol, link));
-}
-
-void
-dsched_cancel_bio(struct bio *bp)
-{
-       bp->bio_buf->b_error = ENXIO;
-       bp->bio_buf->b_flags |= B_ERROR;
-       bp->bio_buf->b_resid = bp->bio_buf->b_bcount;
-
-       biodone(bp);
-}
-
-void
-dsched_strategy_raw(struct disk *dp, struct bio *bp)
-{
-       /*
-        * Ideally, this stuff shouldn't be needed... but just in case, we leave it in
-        * to avoid panics
-        */
-       KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
-       if(bp->bio_track != NULL) {
-               dsched_debug(LOG_INFO,
-                   "dsched_strategy_raw sees non-NULL bio_track!! "
-                   "bio: %p\n", bp);
-               bp->bio_track = NULL;
-       }
-       dev_dstrategy(dp->d_rawdev, bp);
-}
-
-void
-dsched_strategy_sync(struct disk *dp, struct bio *bio)
-{
-       struct buf *bp, *nbp;
-       struct bio *nbio;
-
-       bp = bio->bio_buf;
-
-       nbp = getpbuf(NULL);
-       nbio = &nbp->b_bio1;
-
-       nbp->b_cmd = bp->b_cmd;
-       nbp->b_bufsize = bp->b_bufsize;
-       nbp->b_runningbufspace = bp->b_runningbufspace;
-       nbp->b_bcount = bp->b_bcount;
-       nbp->b_resid = bp->b_resid;
-       nbp->b_data = bp->b_data;
-#if 0
-       /*
-        * Buffers undergoing device I/O do not need a kvabase/size.
-        */
-       nbp->b_kvabase = bp->b_kvabase;
-       nbp->b_kvasize = bp->b_kvasize;
-#endif
-       nbp->b_dirtyend = bp->b_dirtyend;
-
-       nbio->bio_done = biodone_sync;
-       nbio->bio_flags |= BIO_SYNC;
-       nbio->bio_track = NULL;
-
-       nbio->bio_caller_info1.ptr = dp;
-       nbio->bio_offset = bio->bio_offset;
-
-       dev_dstrategy(dp->d_rawdev, nbio);
-       biowait(nbio, "dschedsync");
-       bp->b_resid = nbp->b_resid;
-       bp->b_error = nbp->b_error;
-       biodone(bio);
-#if 0
-       nbp->b_kvabase = NULL;
-       nbp->b_kvasize = 0;
-#endif
-       relpbuf(nbp, NULL);
-}
-
-void
-dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
-{
-       struct bio *nbio;
-
-       nbio = push_bio(bio);
-       nbio->bio_done = done;
-       nbio->bio_offset = bio->bio_offset;
-
-       dsched_set_bio_dp(nbio, dp);
-       dsched_set_bio_priv(nbio, priv);
-
-       getmicrotime(&nbio->bio_caller_info3.tv);
-       dev_dstrategy(dp->d_rawdev, nbio);
-}
-
-/*
- * A special bio done call back function
- * used by policy having request polling implemented.
- */
-static void
-request_polling_biodone(struct bio *bp)
-{
-       struct dsched_disk_ctx *diskctx = NULL;
-       struct disk *dp = NULL;
-       struct bio *obio;
-       struct dsched_policy *policy;
-
-       dp = dsched_get_bio_dp(bp);
-       policy = dp->d_sched_policy;
-       diskctx = dsched_get_disk_priv(dp);
-       KKASSERT(diskctx && policy);
-       dsched_disk_ctx_ref(diskctx);
-
-       /*
-        * XXX:
-        * the bio_done function should not be blocked !
-        */
-       if (diskctx->dp->d_sched_policy->bio_done)
-               diskctx->dp->d_sched_policy->bio_done(bp);
-
-       obio = pop_bio(bp);
-       biodone(obio);
-
-       atomic_subtract_int(&diskctx->current_tag_queue_depth, 1);
-
-       /* call the polling function,
-        * XXX:
-        * the polling function should not be blocked!
-        */
-       if (policy->polling_func)
-               policy->polling_func(diskctx);
-       else
-               dsched_debug(0, "dsched: the policy uses request polling without a polling function!\n");
-       dsched_disk_ctx_unref(diskctx);
-}
-
-/*
- * A special dsched strategy used by policy having request polling
- * (polling function) implemented.
- *
- * The strategy is the just like dsched_strategy_async(), but
- * the biodone call back is set to a preset one.
- *
- * If the policy needs its own biodone callback, it should
- * register it in the policy structure. (bio_done field)
- *
- * The current_tag_queue_depth is maintained by this function
- * and the request_polling_biodone() function
- */
-
-void
-dsched_strategy_request_polling(struct disk *dp, struct bio *bio, struct dsched_disk_ctx *diskctx)
-{
-       atomic_add_int(&diskctx->current_tag_queue_depth, 1);
-       dsched_strategy_async(dp, bio, request_polling_biodone, dsched_get_bio_priv(bio));
-}
-
-/*
- * Ref and deref various structures.  The 1->0 transition of the reference
- * count actually transitions 1->0x80000000 and causes the object to be
- * destroyed.  It is possible for transitory references to occur on the
- * object while it is being destroyed.  We use bit 31 to indicate that
- * destruction is in progress and to prevent nested destructions.
- */
-void
-dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
-{
-       int refcount __unused;
-
-       refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
-}
-
-void
-dsched_thread_io_ref(struct dsched_thread_io *tdio)
-{
-       int refcount __unused;
-
-       refcount = atomic_fetchadd_int(&tdio->refcount, 1);
-}
-
-void
-dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
-{
-       int refcount __unused;
-
-       refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
-}
-
-void
-dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
-{
-       int refs;
-       int nrefs;
-
-       /*
-        * Handle 1->0 transitions for diskctx and nested destruction
-        * recursions.  If the refs are already in destruction mode (bit 31
-        * set) on the 1->0 transition we don't try to destruct it again.
-        *
-        * 0x80000001->0x80000000 transitions are handled normally and
-        * thus avoid nested dstruction.
-        */
-       for (;;) {
-               refs = diskctx->refcount;
-               cpu_ccfence();
-               nrefs = refs - 1;
-
-               KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
-               if (nrefs) {
-                       if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
-                               break;
-                       continue;
-               }
-               nrefs = 0x80000000;
-               if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
-                       dsched_disk_ctx_destroy(diskctx);
-                       break;
-               }
-       }
-}
-
-static
-void
-dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
-{
-       struct dsched_thread_io *tdio;
-       int refs;
-       int nrefs;
-
-#if 0
-       kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
-       print_backtrace(4);
-#endif
-       lockmgr(&diskctx->lock, LK_EXCLUSIVE);
-       while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
-               KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
-               TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
-               atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
-               tdio->diskctx = NULL;
-               /* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
-               lockmgr(&diskctx->lock, LK_RELEASE);
-               dsched_thread_io_unref_destroy(tdio);
-               lockmgr(&diskctx->lock, LK_EXCLUSIVE);
-       }
-       lockmgr(&diskctx->lock, LK_RELEASE);
-
-       /*
-        * Expect diskctx->refcount to be 0x80000000.  If it isn't someone
-        * else still has a temporary ref on the diskctx and we have to
-        * transition it back to an undestroyed-state (albeit without any
-        * associations), so the other user destroys it properly when the
-        * ref is released.
-        */
-       while ((refs = diskctx->refcount) != 0x80000000) {
-               kprintf("dsched_thread_io: destroy race diskctx=%p\n", diskctx);
-               cpu_ccfence();
-               KKASSERT(refs & 0x80000000);
-               nrefs = refs & 0x7FFFFFFF;
-               if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
-                       return;
-       }
-
-       /*
-        * Really for sure now.
-        */
-       if (diskctx->dp->d_sched_policy->destroy_diskctx)
-               diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
-       objcache_put(dsched_diskctx_cache, diskctx);
-       atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
-}
-
-void
-dsched_thread_io_unref(struct dsched_thread_io *tdio)
-{
-       int refs;
-       int nrefs;
-
-       /*
-        * Handle 1->0 transitions for tdio and nested destruction
-        * recursions.  If the refs are already in destruction mode (bit 31
-        * set) on the 1->0 transition we don't try to destruct it again.
-        *
-        * 0x80000001->0x80000000 transitions are handled normally and
-        * thus avoid nested dstruction.
-        */
-       for (;;) {
-               refs = tdio->refcount;
-               cpu_ccfence();
-               nrefs = refs - 1;
-
-               KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
-               if (nrefs) {
-                       if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
-                               break;
-                       continue;
-               }
-               nrefs = 0x80000000;
-               if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
-                       dsched_thread_io_destroy(tdio);
-                       break;
-               }
-       }
-}
-
-/*
- * Unref and destroy the tdio even if additional refs are present.
- */
-static
-void
-dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio)
-{
-       int refs;
-       int nrefs;
-
-       /*
-        * If not already transitioned to destroy-in-progress we transition
-        * to destroy-in-progress, cleanup our ref, and destroy the tdio.
-        */
-       for (;;) {
-               refs = tdio->refcount;
-               cpu_ccfence();
-               nrefs = refs - 1;
-
-               KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
-               if (nrefs & 0x80000000) {
-                       if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
-                               break;
-                       continue;
-               }
-               nrefs |= 0x80000000;
-               if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
-                       dsched_thread_io_destroy(tdio);
-                       break;
-               }
-       }
-}
-
-static void
-dsched_thread_io_destroy(struct dsched_thread_io *tdio)
-{
-       struct dsched_thread_ctx *tdctx;
-       struct dsched_disk_ctx  *diskctx;
-       int refs;
-       int nrefs;
-
-#if 0
-       kprintf("tdio (%p) destruction started, trace:\n", tdio);
-       print_backtrace(8);
-#endif
-       KKASSERT(tdio->qlength == 0);
-
-       while ((diskctx = tdio->diskctx) != NULL) {
-               dsched_disk_ctx_ref(diskctx);
-               lockmgr(&diskctx->lock, LK_EXCLUSIVE);
-               if (diskctx != tdio->diskctx) {
-                       lockmgr(&diskctx->lock, LK_RELEASE);
-                       dsched_disk_ctx_unref(diskctx);
-                       continue;
-               }
-               KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
-               if (diskctx->dp->d_sched_policy->destroy_tdio)
-                       diskctx->dp->d_sched_policy->destroy_tdio(tdio);
-               TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
-               atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
-               tdio->diskctx = NULL;
-               dsched_thread_io_unref(tdio);
-               lockmgr(&diskctx->lock, LK_RELEASE);
-               dsched_disk_ctx_unref(diskctx);
-       }
-       while ((tdctx = tdio->tdctx) != NULL) {
-               dsched_thread_ctx_ref(tdctx);
-               lockmgr(&tdctx->lock, LK_EXCLUSIVE);
-               if (tdctx != tdio->tdctx) {
-                       lockmgr(&tdctx->lock, LK_RELEASE);
-                       dsched_thread_ctx_unref(tdctx);
-                       continue;
-               }
-               KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
-               TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
-               atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
-               tdio->tdctx = NULL;
-               dsched_thread_io_unref(tdio);
-               lockmgr(&tdctx->lock, LK_RELEASE);
-               dsched_thread_ctx_unref(tdctx);
-       }
-
-       /*
-        * Expect tdio->refcount to be 0x80000000.  If it isn't someone else
-        * still has a temporary ref on the tdio and we have to transition
-        * it back to an undestroyed-state (albeit without any associations)
-        * so the other user destroys it properly when the ref is released.
-        */
-       while ((refs = tdio->refcount) != 0x80000000) {
-               kprintf("dsched_thread_io: destroy race tdio=%p\n", tdio);
-               cpu_ccfence();
-               KKASSERT(refs & 0x80000000);
-               nrefs = refs & 0x7FFFFFFF;
-               if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
-                       return;
-       }
-
-       /*
-        * Really for sure now.
-        */
-       objcache_put(dsched_tdio_cache, tdio);
-       atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
-}
-
-void
-dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
-{
-       int refs;
-       int nrefs;
-
-       /*
-        * Handle 1->0 transitions for tdctx and nested destruction
-        * recursions.  If the refs are already in destruction mode (bit 31
-        * set) on the 1->0 transition we don't try to destruct it again.
-        *
-        * 0x80000001->0x80000000 transitions are handled normally and
-        * thus avoid nested dstruction.
-        */
-       for (;;) {
-               refs = tdctx->refcount;
-               cpu_ccfence();
-               nrefs = refs - 1;
-
-               KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
-               if (nrefs) {
-                       if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
-                               break;
-                       continue;
-               }
-               nrefs = 0x80000000;
-               if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
-                       dsched_thread_ctx_destroy(tdctx);
-                       break;
-               }
-       }
-}
-
-static void
-dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
-{
-       struct dsched_thread_io *tdio;
-
-       lockmgr(&tdctx->lock, LK_EXCLUSIVE);
-
-       while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
-               KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
-               TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
-               atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
-               tdio->tdctx = NULL;
-               lockmgr(&tdctx->lock, LK_RELEASE);      /* avoid deadlock */
-               dsched_thread_io_unref_destroy(tdio);
-               lockmgr(&tdctx->lock, LK_EXCLUSIVE);
-       }
-       KKASSERT(tdctx->refcount == 0x80000000);
-
-       lockmgr(&tdctx->lock, LK_RELEASE);
-
-       objcache_put(dsched_tdctx_cache, tdctx);
-       atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
-}
-
-/*
- * Ensures that a tdio is assigned to tdctx and disk.
- */
-static
-struct dsched_thread_io *
-dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
-                      struct dsched_policy *pol, int tdctx_locked)
-{
-       struct dsched_thread_io *tdio;
-#if 0
-       dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
-#endif
-       tdio = objcache_get(dsched_tdio_cache, M_INTWAIT);
-       bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);
-
-       dsched_thread_io_ref(tdio);     /* prevent ripout */
-       dsched_thread_io_ref(tdio);     /* for diskctx ref */
-
-       DSCHED_THREAD_IO_LOCKINIT(tdio);
-       tdio->dp = dp;
-
-       tdio->diskctx = dsched_get_disk_priv(dp);
-       TAILQ_INIT(&tdio->queue);
-
-       if (pol->new_tdio)
-               pol->new_tdio(tdio);
-
-       DSCHED_DISK_CTX_LOCK(tdio->diskctx);
-       TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
-       atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
-       DSCHED_DISK_CTX_UNLOCK(tdio->diskctx);
-
-       if (tdctx) {
-               /*
-                * Put the tdio in the tdctx list.  Inherit the temporary
-                * ref (one ref for each list).
-                */
-               if (tdctx_locked == 0)
-                       DSCHED_THREAD_CTX_LOCK(tdctx);
-               tdio->tdctx = tdctx;
-               tdio->p = tdctx->p;
-               TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
-               atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
-               if (tdctx_locked == 0)
-                       DSCHED_THREAD_CTX_UNLOCK(tdctx);
-       } else {
-               dsched_thread_io_unref(tdio);
-       }
-
-       tdio->debug_policy = pol;
-       tdio->debug_inited = 0xF00F1234;
-
-       atomic_add_int(&dsched_stats.tdio_allocations, 1);
-
-       return(tdio);
-}
-
-
-struct dsched_disk_ctx *
-dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
-{
-       struct dsched_disk_ctx *diskctx;
-
-       diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
-       bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
-       dsched_disk_ctx_ref(diskctx);
-       diskctx->dp = dp;
-       DSCHED_DISK_CTX_LOCKINIT(diskctx);
-       TAILQ_INIT(&diskctx->tdio_list);
-       /*
-        * XXX: magic number 32: most device has a tag queue
-        * of depth 32.
-        * Better to retrive more precise value from the driver
-        */
-       diskctx->max_tag_queue_depth = 32;
-       diskctx->current_tag_queue_depth = 0;
-
-       atomic_add_int(&dsched_stats.diskctx_allocations, 1);
-       if (pol->new_diskctx)
-               pol->new_diskctx(diskctx);
-       return diskctx;
-}
-
-
-struct dsched_thread_ctx *
-dsched_thread_ctx_alloc(struct proc *p)
-{
-       struct dsched_thread_ctx        *tdctx;
-
-       tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
-       bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
-       dsched_thread_ctx_ref(tdctx);
-#if 0
-       kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
-#endif
-       DSCHED_THREAD_CTX_LOCKINIT(tdctx);
-       TAILQ_INIT(&tdctx->tdio_list);
-       tdctx->p = p;
-
-       atomic_add_int(&dsched_stats.tdctx_allocations, 1);
-       /* XXX: no callback here */
-
-       return tdctx;
-}
-
-void
-policy_new(struct disk *dp, struct dsched_policy *pol)
-{
-       struct dsched_disk_ctx *diskctx;
-
-       diskctx = dsched_disk_ctx_alloc(dp, pol);
-       dsched_disk_ctx_ref(diskctx);
-       dsched_set_disk_priv(dp, diskctx);
-}
-
-void
-policy_destroy(struct disk *dp) {
-       struct dsched_disk_ctx *diskctx;
-
-       diskctx = dsched_get_disk_priv(dp);
-       KKASSERT(diskctx != NULL);
-
-       dsched_disk_ctx_unref(diskctx); /* from prepare */
-       dsched_disk_ctx_unref(diskctx); /* from alloc */
-
-       dsched_set_disk_priv(dp, NULL);
-}
-
-void
-dsched_new_buf(struct buf *bp)
+dsched_disk_destroy(struct disk *dp)
 {
-       struct dsched_thread_ctx        *tdctx = NULL;
-
-       if (dsched_inited == 0)
-               return;
-
-       if (curproc != NULL) {
-               tdctx = dsched_get_proc_priv(curproc);
-       } else {
-               /* This is a kernel thread, so no proc info is available */
-               tdctx = dsched_get_thread_priv(curthread);
-       }
-
-#if 0
-       /*
-        * XXX: hack. we don't want this assert because we aren't catching all
-        *      threads. mi_startup() is still getting away without an tdctx.
-        */
-
-       /* by now we should have an tdctx. if not, something bad is going on */
-       KKASSERT(tdctx != NULL);
-#endif
-
-       if (tdctx) {
-               dsched_thread_ctx_ref(tdctx);
-       }
-       dsched_set_buf_priv(bp, tdctx);
-}
-
-void
-dsched_exit_buf(struct buf *bp)
-{
-       struct dsched_thread_ctx        *tdctx;
-
-       tdctx = dsched_get_buf_priv(bp);
-       if (tdctx != NULL) {
-               dsched_clr_buf_priv(bp);
-               dsched_thread_ctx_unref(tdctx);
-       }
-}
-
-void
-dsched_new_proc(struct proc *p)
-{
-       struct dsched_thread_ctx        *tdctx;
-
-       if (dsched_inited == 0)
-               return;
-
-       KKASSERT(p != NULL);
-
-       tdctx = dsched_thread_ctx_alloc(p);
-       tdctx->p = p;
-       dsched_thread_ctx_ref(tdctx);
-
-       dsched_set_proc_priv(p, tdctx);
-       atomic_add_int(&dsched_stats.nprocs, 1);
-}
-
-
-void
-dsched_new_thread(struct thread *td)
-{
-       struct dsched_thread_ctx        *tdctx;
-
-       if (dsched_inited == 0)
-               return;
-
-       KKASSERT(td != NULL);
-
-       tdctx = dsched_thread_ctx_alloc(NULL);
-       tdctx->td = td;
-       dsched_thread_ctx_ref(tdctx);
-
-       dsched_set_thread_priv(td, tdctx);
-       atomic_add_int(&dsched_stats.nthreads, 1);
-}
-
-void
-dsched_exit_proc(struct proc *p)
-{
-       struct dsched_thread_ctx        *tdctx;
-
-       if (dsched_inited == 0)
-               return;
-
-       KKASSERT(p != NULL);
-
-       tdctx = dsched_get_proc_priv(p);
-       KKASSERT(tdctx != NULL);
-
-       tdctx->dead = 0xDEAD;
-       dsched_set_proc_priv(p, NULL);
-
-       dsched_thread_ctx_unref(tdctx); /* one for alloc, */
-       dsched_thread_ctx_unref(tdctx); /* one for ref */
-       atomic_subtract_int(&dsched_stats.nprocs, 1);
-}
-
-
-void
-dsched_exit_thread(struct thread *td)
-{
-       struct dsched_thread_ctx        *tdctx;
-
-       if (dsched_inited == 0)
-               return;
-
-       KKASSERT(td != NULL);
-
-       tdctx = dsched_get_thread_priv(td);
-       KKASSERT(tdctx != NULL);
-
-       tdctx->dead = 0xDEAD;
-       dsched_set_thread_priv(td, 0);
-
-       dsched_thread_ctx_unref(tdctx); /* one for alloc, */
-       dsched_thread_ctx_unref(tdctx); /* one for ref */
-       atomic_subtract_int(&dsched_stats.nthreads, 1);
-}
-
-/*
- * Returns ref'd tdio.
- *
- * tdio may have additional refs for the diskctx and tdctx it resides on.
- */
-void
-dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
-                             struct dsched_policy *pol)
-{
-       struct dsched_thread_ctx *tdctx;
-
-       tdctx = dsched_get_thread_priv(curthread);
-       KKASSERT(tdctx != NULL);
-       dsched_thread_io_alloc(diskctx->dp, tdctx, pol, 0);
-}
-
-/* DEFAULT NOOP POLICY */
-
-static int
-noop_prepare(struct dsched_disk_ctx *diskctx)
-{
-       return 0;
-}
-
-static void
-noop_teardown(struct dsched_disk_ctx *diskctx)
-{
-
-}
-
-static void
-noop_cancel(struct dsched_disk_ctx *diskctx)
-{
-
-}
-
-static int
-noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
-          struct bio *bio)
-{
-       dsched_strategy_raw(diskctx->dp, bio);
-#if 0
-       dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
-#endif
-       return 0;
-}
-
-/*
- * SYSINIT stuff
- */
-static void
-dsched_init(void)
-{
-       dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
-                                          NULL, NULL, NULL,
-                                          objcache_malloc_alloc,
-                                          objcache_malloc_free,
-                                          &dsched_thread_io_malloc_args );
-
-       dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
-                                          NULL, NULL, NULL,
-                                          objcache_malloc_alloc,
-                                          objcache_malloc_free,
-                                          &dsched_thread_ctx_malloc_args );
-
-       dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
-                                          NULL, NULL, NULL,
-                                          objcache_malloc_alloc,
-                                          objcache_malloc_free,
-                                          &dsched_disk_ctx_malloc_args );
-
-       bzero(&dsched_stats, sizeof(struct dsched_stats));
-
-       lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
-       DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();
-
-       dsched_register(&dsched_noop_policy);
-
-       dsched_inited = 1;
-}
-
-static void
-dsched_uninit(void)
-{
-}
-
-SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
-SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
-
-/*
- * SYSCTL stuff
- */
-static int
-sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
-{
-       return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
-}
-
-static int
-sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
-{
-       struct dsched_policy *pol = NULL;
-       int error, first = 1;
-
-       lockmgr(&dsched_lock, LK_EXCLUSIVE);
-
-       while ((pol = dsched_policy_enumerate(pol))) {
-               if (!first) {
-                       error = SYSCTL_OUT(req, " ", 1);
-                       if (error)
-                               break;
-               } else {
-                       first = 0;
-               }
-               error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
-               if (error)
-                       break;
-
-       }
-
-       lockmgr(&dsched_lock, LK_RELEASE);
-
-       error = SYSCTL_OUT(req, "", 1);
-
-       return error;
-}
-
-static int
-sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
-{
-       char buf[DSCHED_POLICY_NAME_LENGTH];
-       struct dsched_disk_ctx *diskctx = arg1;
-       struct dsched_policy *pol = NULL;
-       int error;
-
-       if (diskctx == NULL) {
-               return 0;
-       }
-
-       lockmgr(&dsched_lock, LK_EXCLUSIVE);
-
-       pol = diskctx->dp->d_sched_policy;
-       memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
-
-       error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
-       if (error || req->newptr == NULL) {
-               lockmgr(&dsched_lock, LK_RELEASE);
-               return (error);
-       }
-
-       pol = dsched_find_policy(buf);
-       if (pol == NULL) {
-               lockmgr(&dsched_lock, LK_RELEASE);
-               return 0;
-       }
-
-       dsched_switch(diskctx->dp, pol);
-
-       lockmgr(&dsched_lock, LK_RELEASE);
-
-       return error;
-}
-
-static int
-sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
-{
-       char buf[DSCHED_POLICY_NAME_LENGTH];
-       struct dsched_policy *pol = NULL;
-       int error;
-
-       lockmgr(&dsched_lock, LK_EXCLUSIVE);
-
-       pol = default_policy;
-       memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
-
-       error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
-       if (error || req->newptr == NULL) {
-               lockmgr(&dsched_lock, LK_RELEASE);
-               return (error);
-       }
-
-       pol = dsched_find_policy(buf);
-       if (pol == NULL) {
-               lockmgr(&dsched_lock, LK_RELEASE);
-               return 0;
-       }
-
-       default_set = 1;
-       default_policy = pol;
-
-       lockmgr(&dsched_lock, LK_RELEASE);
-
-       return error;
-}
-
-SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
-    "Disk Scheduler Framework (dsched) magic");
-SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
-    "List of disks and their policies");
-SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
-    0, "Enable dsched debugging");
-SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
-    0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
-    "dsched statistics");
-SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
-    NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
-SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
-    NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
-
-static void
-dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
-{
-       if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
-               diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
-               sysctl_ctx_init(&diskctx->sysctl_ctx);
-       }
-
-       SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
-           OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
-           diskctx, 0, sysctl_dsched_policy, "A", "policy");
 }
index 3bd4bd1..2717f6c 100644 (file)
@@ -443,7 +443,7 @@ fork1(struct lwp *lp1, int flags, struct proc **procp)
 
        p2->p_usched = p1->p_usched;
        /* XXX: verify copy of the secondary iosched stuff */
-       dsched_new_proc(p2);
+       dsched_enter_proc(p2);
 
        if (flags & RFSIGSHARE) {
                p2->p_sigacts = p1->p_sigacts;
index 3f8bb57..3355f86 100644 (file)
@@ -460,7 +460,7 @@ lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
     } else {
        lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
     }
-    dsched_new_thread(td);
+    dsched_enter_thread(td);
 }
 
 void
index 67d1be4..6dc5cc1 100644 (file)
@@ -484,16 +484,14 @@ disk_msg_core(void *arg)
                         * Interlock against struct disk enumerations.
                         * Wait for enumerations to complete then remove
                         * the dp from the list before tearing it down.
-                        *
-                        * This avoids races against e.g.
-                        * dsched_thread_io_alloc().
+                        * This avoids numerous races.
                         */
                        lwkt_gettoken(&disklist_token);
                        while (dp->d_refs)
                                tsleep(&dp->d_refs, 0, "diskdel", hz / 10);
                        LIST_REMOVE(dp, d_list);
 
-                       dsched_disk_destroy_callback(dp);
+                       dsched_disk_destroy(dp);
                        devfs_destroy_related(dp->d_cdev);
                        destroy_dev(dp->d_cdev);
                        destroy_only_dev(dp->d_rawdev);
@@ -694,9 +692,9 @@ _disk_create_named(const char *name, int unit, struct disk *dp,
        dp->d_cdev->si_disk = dp;
 
        if (name)
-               dsched_disk_create_callback(dp, name, unit);
+               dsched_disk_create(dp, name, unit);
        else
-               dsched_disk_create_callback(dp, raw_ops->head.name, unit);
+               dsched_disk_create(dp, raw_ops->head.name, unit);
 
        lwkt_gettoken(&disklist_token);
        LIST_INSERT_HEAD(&disklist, dp, d_list);
@@ -758,7 +756,7 @@ _setdiskinfo(struct disk *disk, struct disk_info *info)
        if (oldserialno)
                kfree(oldserialno, M_TEMP);
 
-       dsched_disk_update_callback(disk, info);
+       dsched_disk_update(disk, info);
 
        /*
         * The caller may set d_media_size or d_media_blocks and we
@@ -1175,7 +1173,7 @@ diskstrategy(struct dev_strategy_args *ap)
         * or error due to being beyond the device size).
         */
        if ((nbio = dscheck(dev, bio, dp->d_slice)) != NULL) {
-               dsched_queue(dp, nbio);
+               dev_dstrategy(dp->d_rawdev, nbio);
        } else {
                biodone(bio);
        }
index 4c31e17..2b80535 100644 (file)
@@ -772,7 +772,7 @@ reinitbufbio(struct buf *bp)
 void
 uninitbufbio(struct buf *bp)
 {
-       dsched_exit_buf(bp);
+       dsched_buf_exit(bp);
        BUF_LOCKFREE(bp);
 }
 
@@ -1127,8 +1127,7 @@ bdwrite(struct buf *bp)
        }
        bdirty(bp);
 
-       if (dsched_is_clear_buf_priv(bp))
-               dsched_new_buf(bp);
+       dsched_buf_enter(bp);   /* might stack */
 
        /*
         * Set B_CACHE, indicating that the buffer is fully valid.  This is
@@ -1420,7 +1419,7 @@ brelse(struct buf *bp)
         * or B_RELBUF flags.
         */
        bp->b_cmd = BUF_CMD_DONE;
-       dsched_exit_buf(bp);
+       dsched_buf_exit(bp);
 
        /*
         * VMIO buffer rundown.  Make sure the VM page array is restored
@@ -1777,7 +1776,7 @@ bqrelse(struct buf *bp)
         * buffer is actively locked.
         */
        bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF);
-       dsched_exit_buf(bp);
+       dsched_buf_exit(bp);
        BUF_UNLOCK(bp);
 }
 
@@ -3103,7 +3102,6 @@ loop:
 
                allocbuf(bp, size);
        }
-       KKASSERT(dsched_is_clear_buf_priv(bp));
        return (bp);
 }
 
@@ -3146,7 +3144,6 @@ geteblk(int size)
                ;
        allocbuf(bp, size);
        bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
-       KKASSERT(dsched_is_clear_buf_priv(bp));
        return (bp);
 }
 
@@ -3540,9 +3537,8 @@ void
 bio_start_transaction(struct bio *bio, struct bio_track *track)
 {
        bio->bio_track = track;
-       if (dsched_is_clear_buf_priv(bio->bio_buf))
-               dsched_new_buf(bio->bio_buf);
        bio_track_ref(track);
+       dsched_buf_enter(bio->bio_buf); /* might stack */
 }
 
 /*
@@ -3602,9 +3598,8 @@ vn_strategy(struct vnode *vp, struct bio *bio)
                 track = &vp->v_track_write;
        KKASSERT((bio->bio_flags & BIO_DONE) == 0);
        bio->bio_track = track;
-       if (dsched_is_clear_buf_priv(bio->bio_buf))
-               dsched_new_buf(bio->bio_buf);
        bio_track_ref(track);
+       dsched_buf_enter(bp);   /* might stack */
         vop_strategy(*vp->v_ops, vp, bio);
 }
 
index e1ed51c..970dfc4 100644 (file)
@@ -166,7 +166,6 @@ struct buf {
        unsigned char b_act_count;      /* similar to vm_page act_count */
        unsigned char b_unused01;
        struct lock b_lock;             /* Buffer lock */
-       void    *b_iosched;             /* I/O scheduler priv data */
        buf_cmd_t b_cmd;                /* I/O command */
        int     b_bufsize;              /* Allocated buffer size. */
        int     b_runningbufspace;      /* when I/O is running, pipelining */
index 761aa7b..9729e41 100644 (file)
@@ -142,9 +142,9 @@ struct disk {
        cdev_t                  d_cdev;         /* special whole-disk part */
        struct diskslices       *d_slice;
        struct disk_info        d_info;         /* info structure for media */
-       void                    *d_dsched_priv1;/* I/O scheduler priv. data */
-       void                    *d_dsched_priv2;/* I/O scheduler priv. data */
-       struct dsched_policy    *d_sched_policy;/* I/O scheduler policy */
+       void                    *d_unused01;
+       void                    *d_unused02;
+       void                    *d_unused03;
        const char              *d_disktype;    /* Disk type information */
        LIST_ENTRY(disk)        d_list;
        kdmsg_iocom_t           d_iocom;        /* cluster import/export */
index 20025c1..d040229 100644 (file)
 
 #define        DSCHED_POLICY_NAME_LENGTH       64
 
-#define dsched_set_disk_priv(dp, x)    ((dp)->d_dsched_priv1 = (x))
-#define dsched_get_disk_priv(dp)       ((dp)?((dp)->d_dsched_priv1):NULL)
-#define dsched_set_proc_priv(pp, x)    ((pp)->p_dsched_priv1 = (x))
-#define dsched_get_proc_priv(pp)       ((pp)?((pp)->p_dsched_priv1):NULL)
-
-#define dsched_set_thread_priv(td, x)  ((td)->td_dsched_priv1 = (x))
-#define dsched_get_thread_priv(td)     ((td)?((td)->td_dsched_priv1):NULL)
-
-#define dsched_set_buf_priv(bp, x)     ((bp)->b_iosched = (x))
-#define dsched_get_buf_priv(bp)                ((bp)?((bp)->b_iosched):NULL)
-#define        dsched_clr_buf_priv(bp)         ((bp)->b_iosched = NULL)
-#define        dsched_is_clear_buf_priv(bp)    ((bp)->b_iosched == NULL)
-
-
-#define        dsched_set_bio_dp(bio, x)       ((bio)->bio_caller_info1.ptr = (x))
-#define        dsched_get_bio_dp(bio)          ((bio)?((bio)->bio_caller_info1.ptr):NULL)
-#define        dsched_set_bio_priv(bio, x)     ((bio)->bio_caller_info2.ptr = (x))
-#define        dsched_get_bio_priv(bio)        ((bio)?((bio)->bio_caller_info2.ptr):NULL)
-#define        dsched_set_bio_stime(bio, x)    ((bio)->bio_caller_info3.lvalue = (x))
-#define        dsched_get_bio_stime(bio)       ((bio)?((bio)->bio_caller_info3.lvalue):0)
-#define        dsched_set_bio_tdio(bio, x)     ((bio)->bio_caller_info3.ptr = (x))
-#define        dsched_get_bio_tdio(bio)        ((bio)?((bio)->bio_caller_info3.ptr):0)
-
-
-struct dsched_thread_ctx {
-       TAILQ_ENTRY(dsched_thread_ctx)  link;
-
-       TAILQ_HEAD(, dsched_thread_io)  tdio_list;      /* list of thread_io */
-       struct lock     lock;
-
-       int32_t         refcount;
-       
-       struct proc *p;
-       struct thread *td;      
-       int32_t dead;
-};
-
-struct dsched_disk_ctx {
-       TAILQ_ENTRY(dsched_disk_ctx)    link;
-
-       TAILQ_HEAD(, dsched_thread_io)  tdio_list;      /* list of thread_io of disk */
-       struct lock     lock;
-
-       int32_t         refcount;
-       int32_t         flags;
-
-       int             max_tag_queue_depth;            /* estimated max tag queue depth */
-       int             current_tag_queue_depth;        /* estimated current tag queue depth */
-
-       struct disk     *dp;            /* back pointer to disk struct */
-
-       struct sysctl_ctx_list sysctl_ctx;
-};
-
-struct dsched_policy;
-
-struct dsched_thread_io {
-       TAILQ_ENTRY(dsched_thread_io)   link;
-       TAILQ_ENTRY(dsched_thread_io)   dlink;
-
-       TAILQ_HEAD(, bio)       queue;  /* IO queue (bio) */
-       struct lock             lock;
-       int32_t                 qlength;/* IO queue length */
-
-       int32_t refcount;
-
-       int32_t flags;
-       
-       struct disk             *dp;
-       struct dsched_disk_ctx  *diskctx;
-       struct dsched_thread_ctx        *tdctx;
-       struct proc             *p;
-       struct dsched_policy    *debug_policy;
-       int                     debug_inited;
-       int                     debug_priv;
-};
-
-typedef int    dsched_prepare_t(struct dsched_disk_ctx *diskctx);
-typedef void   dsched_teardown_t(struct dsched_disk_ctx *diskctx);
-typedef void   dsched_cancel_t(struct dsched_disk_ctx *diskctx);
-typedef int    dsched_queue_t(struct dsched_disk_ctx *diskctx,
-                   struct dsched_thread_io *tdio, struct bio *bio);
-typedef void dsched_dequeue_t(struct dsched_disk_ctx *diskctx);
-
-typedef        void    dsched_new_tdio_t(struct dsched_thread_io *tdio);
-typedef        void    dsched_new_diskctx_t(struct dsched_disk_ctx *diskctx);
-typedef        void    dsched_destroy_tdio_t(struct dsched_thread_io *tdio);
-typedef        void    dsched_destroy_diskctx_t(struct dsched_disk_ctx *diskctx);
-typedef void   dsched_bio_done_t(struct bio *bio);
-typedef void   dsched_polling_func_t(struct dsched_disk_ctx *diskctx);
-
-struct dsched_policy {
-       char                    name[DSCHED_POLICY_NAME_LENGTH];
-       uint64_t                uniq_id;
-       int                     ref_count;
-
-       TAILQ_ENTRY(dsched_policy) link;
-
-       dsched_prepare_t        *prepare;
-       dsched_teardown_t       *teardown;
-       dsched_cancel_t         *cancel_all;
-       dsched_queue_t          *bio_queue;
-
-       dsched_new_tdio_t       *new_tdio;
-       dsched_new_diskctx_t    *new_diskctx;
-       dsched_destroy_tdio_t   *destroy_tdio;
-       dsched_destroy_diskctx_t        *destroy_diskctx;
-
-       dsched_bio_done_t       *bio_done;      /* call back when a bio dispatched by dsched_strategy_request_polling() is done */
-       dsched_polling_func_t   *polling_func; /* it gets called when the disk is idle or about to idle */
-};
-
-TAILQ_HEAD(dsched_policy_head, dsched_policy);
-
-
-#define        DSCHED_THREAD_IO_LOCKINIT(x)    \
-               lockinit(&(x)->lock, "tdiobioq", 0, LK_CANRECURSE)
-
-#define        DSCHED_THREAD_IO_LOCK(x)        do {                    \
-                       dsched_thread_io_ref((x));              \
-                       lockmgr(&(x)->lock, LK_EXCLUSIVE);      \
-               } while(0)
-
-#define        DSCHED_THREAD_IO_UNLOCK(x)      do {                    \
-                       lockmgr(&(x)->lock, LK_RELEASE);        \
-                       dsched_thread_io_unref((x));            \
-               } while(0)
-
-#define        DSCHED_DISK_CTX_LOCKINIT(x)     \
-               lockinit(&(x)->lock, "tdiodiskq", 0, LK_CANRECURSE)
-
-#define        DSCHED_DISK_CTX_LOCK(x)         do {                    \
-                       dsched_disk_ctx_ref((x));               \
-                       lockmgr(&(x)->lock, LK_EXCLUSIVE);      \
-               } while(0)
-
-#define        DSCHED_DISK_CTX_UNLOCK(x)       do {                    \
-                       lockmgr(&(x)->lock, LK_RELEASE);        \
-                       dsched_disk_ctx_unref((x));             \
-               } while(0)
-
-#define DSCHED_DISK_CTX_LOCK_ASSERT(x) \
-               KKASSERT(lockstatus(&(x)->lock, curthread) == LK_EXCLUSIVE)
-
-#define        DSCHED_GLOBAL_THREAD_CTX_LOCKINIT(x)    \
-               lockinit(&dsched_tdctx_lock, "tdctxglob", 0, LK_CANRECURSE)
-#define        DSCHED_GLOBAL_THREAD_CTX_LOCK(x)        \
-               lockmgr(&dsched_tdctx_lock, LK_EXCLUSIVE)
-#define        DSCHED_GLOBAL_THREAD_CTX_UNLOCK(x)      \
-               lockmgr(&dsched_tdctx_lock, LK_RELEASE)
-
-#define        DSCHED_THREAD_CTX_LOCKINIT(x)   \
-               lockinit(&(x)->lock, "tdctx", 0, LK_CANRECURSE)
-
-#define        DSCHED_THREAD_CTX_LOCK(x)       do {                    \
-                       dsched_thread_ctx_ref((x));             \
-                       lockmgr(&(x)->lock, LK_EXCLUSIVE);      \
-               } while(0)
-
-#define DSCHED_THREAD_CTX_UNLOCK(x)    do {                    \
-                       lockmgr(&(x)->lock, LK_RELEASE);        \
-                       dsched_thread_ctx_unref((x));           \
-               } while(0)
-
-/* flags for thread_io */
-#define        DSCHED_LINKED_DISK_CTX          0x01
-#define        DSCHED_LINKED_THREAD_CTX        0x02
-/* flags for disk_ctx */
-#define        DSCHED_SYSCTL_CTX_INITED        0x01
-
-#define DSCHED_THREAD_CTX_MAX_SZ       sizeof(struct dsched_thread_ctx)
-#define DSCHED_THREAD_IO_MAX_SZ                384
-#define DSCHED_DISK_CTX_MAX_SZ         1024
-
-#define DSCHED_POLICY_MODULE(name, evh, version)                       \
-static moduledata_t name##_mod = {                                     \
-    #name,                                                             \
-    evh,                                                               \
-    NULL                                                               \
-};                                                                     \
-DECLARE_MODULE(name, name##_mod, SI_SUB_PRE_DRIVERS, SI_ORDER_MIDDLE); \
-MODULE_VERSION(name, version)
-
-void   dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit);
-void   dsched_disk_update_callback(struct disk *dp, struct disk_info *info);
-void   dsched_disk_destroy_callback(struct disk *dp);
-void   dsched_queue(struct disk *dp, struct bio *bio);
-int    dsched_register(struct dsched_policy *d_policy);
-int    dsched_unregister(struct dsched_policy *d_policy);
-int    dsched_switch(struct disk *dp, struct dsched_policy *new_policy);
-void   dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy);
-struct dsched_policy *dsched_find_policy(char *search);
-struct disk *dsched_find_disk(char *search);
-struct dsched_policy *dsched_policy_enumerate(struct dsched_policy *pol);
-struct disk *dsched_disk_enumerate(struct disk *marker, struct disk *dp,
-                       struct dsched_policy *policy);
-void   dsched_cancel_bio(struct bio *bp);
-void   dsched_strategy_raw(struct disk *dp, struct bio *bp);
-void   dsched_strategy_sync(struct disk *dp, struct bio *bp);
-void   dsched_strategy_async(struct disk *dp, struct bio *bp, biodone_t *done, void *priv);
-void   dsched_strategy_request_polling(struct disk *bp, struct bio *bio, struct dsched_disk_ctx *diskctx);
-int    dsched_debug(int level, char *fmt, ...) __printflike(2, 3);
-
-void   policy_new(struct disk *dp, struct dsched_policy *pol);
-void   policy_destroy(struct disk *dp);
-
-void   dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx);
-void   dsched_thread_io_ref(struct dsched_thread_io *tdio);
-void   dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx);
-void   dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx);
-void   dsched_thread_io_unref(struct dsched_thread_io *tdio);
-void   dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx);
-
-void   dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
-                       struct dsched_policy *pol);
-struct dsched_disk_ctx *dsched_disk_ctx_alloc(struct disk *dp,
-                       struct dsched_policy *pol);
-struct dsched_thread_ctx *dsched_thread_ctx_alloc(struct proc *p);
-
-typedef        void    dsched_new_buf_t(struct buf *bp);
-typedef        void    dsched_new_proc_t(struct proc *p);
-typedef        void    dsched_new_thread_t(struct thread *td);
-typedef        void    dsched_exit_buf_t(struct buf *bp);
-typedef        void    dsched_exit_proc_t(struct proc *p);
-typedef        void    dsched_exit_thread_t(struct thread *td);
-
-dsched_new_buf_t       dsched_new_buf;
-dsched_new_proc_t      dsched_new_proc;
-dsched_new_thread_t    dsched_new_thread;
-dsched_exit_buf_t      dsched_exit_buf;
-dsched_exit_proc_t     dsched_exit_proc;
-dsched_exit_thread_t   dsched_exit_thread;
+void   dsched_disk_create(struct disk *dp, const char *head_name, int unit);
+void   dsched_disk_update(struct disk *dp, struct disk_info *info);
+void   dsched_disk_destroy(struct disk *dp);
+
+/* placemarkers for future work */
+#define        dsched_buf_enter(bp)
+#define        dsched_buf_exit(bp)
+#define        dsched_enter_proc(p)
+#define        dsched_enter_thread(td)
+#define        dsched_exit_proc(p)
+#define        dsched_exit_thread(td)
 
 #endif /* _KERNEL */
 
-
-#define        DSCHED_NAME_LENGTH              64
-#define        DSCHED_SET_DEVICE_POLICY        _IOWR('d', 1, struct dsched_ioctl)
-#define        DSCHED_LIST_DISKS               _IOWR('d', 2, struct dsched_ioctl)
-#define        DSCHED_LIST_DISK                _IOWR('d', 3, struct dsched_ioctl)
-#define        DSCHED_LIST_POLICIES            _IOWR('d', 4, struct dsched_ioctl)
-
-struct dsched_ioctl {
-       uint16_t        num_elem;
-       char            dev_name[DSCHED_NAME_LENGTH];
-       char            pol_name[DSCHED_NAME_LENGTH];
-};
-
-struct dsched_stats {
-       int32_t tdctx_allocations;
-       int32_t tdio_allocations;
-       int32_t diskctx_allocations;
-
-       int32_t no_tdctx;
-
-       int32_t nthreads;
-       int32_t nprocs;
-};
-
 #endif /* !_SYS_DSCHED_H_ */
index fda1290..6309b28 100644 (file)
@@ -295,7 +295,7 @@ struct      proc {
 
        struct rusage   p_ru;           /* stats for this proc */
        struct rusage   p_cru;          /* sum of stats for reaped children */
-       void            *p_dsched_priv1;
+       void            *p_unused01;
 
 /* The following fields are all copied upon creation in fork. */
 #define        p_startcopy     p_comm
@@ -316,9 +316,9 @@ struct      proc {
        u_short         p_xstat;        /* Exit status or last stop signal */
 
        int             p_ionice;
-       void            *p_dsched_priv2;
+       void            *p_unused02;
 /* End area that is copied on creation. */
-#define        p_endcopy       p_dsched_priv2
+#define        p_endcopy       p_unused02
        u_short         p_acflag;       /* Accounting flags. */
 
        int             p_lock;         /* Prevent proc destruction */
index ff9064d..03b0f01 100644 (file)
@@ -648,7 +648,6 @@ SYSCTL_DECL(_user);
 SYSCTL_DECL(_compat);
 SYSCTL_DECL(_lwkt);
 SYSCTL_DECL(_security);
-SYSCTL_DECL(_dsched);
 
 /*
  * Common second-level oids.
index 2aa72fa..643e242 100644 (file)
@@ -271,7 +271,7 @@ struct thread {
     __uint64_t td_sticks;      /* Statclock hits in system mode (uS) */
     __uint64_t td_iticks;      /* Statclock hits processing intr (uS) */
     int                td_locks;       /* lockmgr lock debugging */
-    void       *td_dsched_priv1;       /* priv data for I/O schedulers */
+    void       *td_unused01;   /* (future I/O scheduler heuristic) */
     int                td_refs;        /* hold position in gd_tdallq / hold free */
     int                td_nest_count;  /* prevent splz nesting */
     int                td_contended;   /* token contention count */
index a1a11fd..26b3d57 100644 (file)
@@ -81,4 +81,4 @@ struct udev_event {
        prop_dictionary_t       ev_dict;
 };
 
-#endif /* _SYS_DSCHED_H_ */
+#endif /* _SYS_UDEV_H_ */
index 9d01a65..b2799ac 100644 (file)
@@ -379,7 +379,6 @@ getpbuf(int *pfreecnt)
        spin_unlock(&bswspin);
 
        initpbuf(bp);
-       KKASSERT(dsched_is_clear_buf_priv(bp));
 
        return (bp);
 }
@@ -412,7 +411,6 @@ getpbuf_kva(int *pfreecnt)
        spin_unlock(&bswspin);
 
        initpbuf(bp);
-       KKASSERT(dsched_is_clear_buf_priv(bp));
 
        return (bp);
 }
@@ -485,7 +483,7 @@ relpbuf(struct buf *bp, int *pfreecnt)
        int wake_freecnt = 0;
 
        KKASSERT(bp->b_flags & B_PAGING);
-       dsched_exit_buf(bp);
+       dsched_buf_exit(bp);
 
        BUF_UNLOCK(bp);