sys/kern/dsched/bfq/bfq.c

   1 /*
   2  * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Brills Peng <brillsp@gmail.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34
  35
  36 /*
  37  * BFQ disk scheduler, the algorithm routines and the interfaces with the
  38  * dsched framework.
  39  *
  40  */
  41
  42 #include <inttypes.h>
  43 #include <sys/systm.h>
  44 #include <sys/kernel.h>
  45 #include <sys/proc.h>
  46 #include <sys/sysctl.h>
  47 #include <sys/buf.h>
  48 #include <sys/conf.h>
  49 #include <sys/diskslice.h>
  50 #include <sys/disk.h>
  51 #include <sys/malloc.h>
  52 #include <machine/md_var.h>
  53 #include <sys/ctype.h>
  54 #include <sys/syslog.h>
  55 #include <sys/device.h>
  56 #include <sys/msgport.h>
  57 #include <sys/msgport2.h>
  58 #include <sys/buf2.h>
  59 #include <sys/dsched.h>
  60 #include <sys/fcntl.h>
  61 #include <machine/varargs.h>
  62
  63 #include <kern/dsched/bfq/bfq.h>
  64 #include <kern/dsched/bfq/bfq_helper_thread.h>
  65
  66 #define _DSCHED_BFQ_BFQ_C_
  67 #include <kern/dsched/bfq/bfq_ktr.h>
  68
  69 /* Make sure our structs fit */
  70 CTASSERT(sizeof(struct bfq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ);
  71 CTASSERT(sizeof(struct bfq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ);
  72
  73
  74 static dsched_prepare_t         bfq_prepare;
  75 static dsched_teardown_t        bfq_teardown;
  76 static dsched_cancel_t          bfq_cancel_all;
  77 static dsched_queue_t           bfq_queue;
  78 static dsched_new_tdio_t        bfq_new_tdio;
  79 static dsched_destroy_tdio_t    bfq_destroy_tdio;
  80 static dsched_bio_done_t        bfq_bio_done;
  81
  82
  83 static void bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
  84 static int bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
  85 static void bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason);
  86 static void bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp);
  87 static void bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio);
  88 static void bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag);
  89 static void bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv);
  90
  91
  92
  93 struct dsched_policy dsched_bfq_policy = {
  94         .name           = "bfq",
  95         .prepare        = bfq_prepare,
  96         .teardown       = bfq_teardown,
  97         .cancel_all     = bfq_cancel_all,
  98         .bio_queue      = bfq_queue,
  99         .new_tdio       = bfq_new_tdio,
 100         .destroy_tdio   = bfq_destroy_tdio,
 101         .bio_done       = bfq_bio_done,
 102         .polling_func   = (void (*)(struct dsched_disk_ctx *))helper_msg_dequeue,
 103 };
 104
 105
 106 struct sysctl_oid *bfq_mod_oid;
 107
 108 struct dsched_bfq_stats bfq_stats;
 109
 110 static int dsched_bfq_version_maj = 1;
 111 static int dsched_bfq_version_min = 0;
 112
 113 /*
 114  * bfq_prepare(): the .prepare callback of the bfq policy. Initialize
 115  * all fields in bfq_diskctx and initialize the corresponding helper
 116  * thread.
 117  *
 118  * lock: none
 119  * refcount: none
 120  *
 121  * Returns 0
 122  */
 123 static int
 124 bfq_prepare(struct dsched_disk_ctx *diskctx)
 125 {
 126         struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
 127
 128         BFQ_LOCKINIT(bfq_diskctx);
 129
 130         bfq_diskctx->pending_dequeue = 0;
 131
 132         wf2q_init(&bfq_diskctx->bfq_wf2q);
 133
 134         callout_init_mp(&bfq_diskctx->bfq_callout);
 135
 136         bfq_diskctx->bfq_blockon = NULL;
 137         bfq_diskctx->bfq_active_tdio = NULL;
 138         bfq_diskctx->bfq_remaining_budget = 0;
 139
 140         bfq_diskctx->bfq_max_budget = BFQ_DEFAULT_MAX_BUDGET;
 141         bfq_diskctx->bfq_peak_rate_samples = 0;
 142         bfq_diskctx->bfq_peak_rate = 0;
 143
 144 #if 0
 145         bfq_diskctx->bfq_flag = BFQ_FLAG_AS | BFQ_FLAG_AUTO_MAX_BUDGET;
 146 #endif
 147         bfq_diskctx->bfq_flag = BFQ_FLAG_AS;
 148
 149         bfq_diskctx->bfq_as_miss = 0;
 150         bfq_diskctx->bfq_as_hit = 0;
 151
 152         bfq_diskctx->bfq_as_avg_wait_miss = 0;
 153         bfq_diskctx->bfq_as_avg_wait_all = 0;
 154         bfq_diskctx->bfq_as_max_wait = 0;
 155         bfq_diskctx->bfq_as_max_wait2 = 0;
 156         bfq_diskctx->bfq_as_high_wait_count = 0;
 157         bfq_diskctx->bfq_as_high_wait_count2 = 0;
 158
 159         bfq_diskctx->bfq_avg_time_slice = 0;
 160         bfq_diskctx->bfq_max_time_slice = 0;
 161         bfq_diskctx->bfq_high_time_slice_count = 0;
 162
 163         /* initiailize the helper thread */
 164         helper_init(bfq_diskctx);
 165
 166         dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: initialized!\n");
 167         return 0;
 168 }
 169
 170 /*
 171  * bfq_teardown(): .teardown callback of the bfq policy. Send message
 172  * of killing to the helper thread and deallocate resources used by
 173  * the helper thread (currently the objcache)
 174  *
 175  * XXX: deadlock causing when the caller of bfq_teardown() and the
 176  * helper thread are on the same CPU.
 177  *
 178  * lock: none
 179  * refcount: none
 180  *
 181  */
 182
 183 static void
 184 bfq_teardown(struct dsched_disk_ctx *diskctx)
 185 {
 186         struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
 187         KKASSERT(diskctx);
 188
 189         helper_msg_kill(bfq_diskctx);
 190
 191         tsleep(diskctx, 0, "teardn", hz * 3 / 2);
 192
 193         helper_uninit(bfq_diskctx);
 194 }
 195
 196 /*
 197  * bfq_cancel_all(): .cancel_all callback of the bfq policy. Cancel
 198  * all bios that queue in each bfq_thread_io structure in the
 199  * wf2q tree.
 200  *
 201  * lock:
 202  *      BFQ_LOCK: protect from wf2q_insert operation in bfq_queue() and
 203  *      bfq_dequeue(); wf2q_get_next operation in bfq_dequeue()
 204  *      THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue() and
 205  *      queue insertion in bfq_queue()
 206  *
 207  * refcount:
 208  *      unref thread_io structures; they are referenced in queue(),
 209  *      when a bio is queued. The refcount may decrease to zero.
 210  *
 211  */
 212 static void
 213 bfq_cancel_all(struct dsched_disk_ctx *diskctx)
 214 {
 215         struct bio *bio;
 216         struct bfq_thread_io *bfq_tdio;
 217         struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
 218
 219         BFQ_LOCK(bfq_diskctx);
 220
 221         while ((bfq_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q))) {
 222                 DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);
 223                 KKASSERT(lockstatus(&bfq_tdio->head.lock, curthread) == LK_EXCLUSIVE);
 224
 225                 while ((bio = TAILQ_FIRST(&bfq_tdio->head.queue))) {
 226                         bfq_tdio->head.qlength--;
 227                         TAILQ_REMOVE(&bfq_tdio->head.queue, bio, link);
 228                         dsched_cancel_bio(bio);
 229                         dsched_thread_io_unref(&bfq_tdio->head);
 230                 }
 231
 232                 KKASSERT(bfq_tdio->head.qlength == 0);
 233                 DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);
 234         }
 235
 236         BFQ_UNLOCK(bfq_diskctx);
 237 }
 238
 239 /*
 240  * bfq_new_tdio(): .new_tdio callback of the bfq policy. Initialize
 241  * the bfq_thread_io structure.
 242  *
 243  * lock: none
 244  * refcount: none
 245  */
 246 static void
 247 bfq_new_tdio(struct dsched_thread_io *tdio)
 248 {
 249         struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *) tdio;
 250
 251         /* the queue has to be initialized some where else */
 252         tdio->qlength = 0;
 253
 254         tdio->debug_priv = 0xF00FF00F;
 255
 256         bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
 257         bfq_tdio->weight = BFQ_DEFAULT_WEIGHT;
 258
 259         bfq_tdio->tdio_as_switch = 1;
 260         bfq_tdio->maybe_timeout = 0;
 261
 262         bfq_tdio->seek_samples = 0;
 263         bfq_tdio->seek_avg = 0;
 264         bfq_tdio->seek_total = 0;
 265         bfq_tdio->ttime_samples = 0;
 266         bfq_tdio->ttime_avg = 0;
 267         bfq_tdio->service_received = 0;
 268         bfq_tdio->bio_dispatched = 0;
 269         bfq_tdio->bio_completed = 0;
 270
 271         KTR_LOG(dsched_bfq_thread_created, bfq_tdio);
 272 }
 273
 274 /*
 275  * bfq_helper_destroy_tdio(): called after a thread_io struct is destroyed.
 276  * if the scheduler is AS waiting for a destroyed tdio, this function resumes
 277  * the scheduler.
 278  *
 279  * lock:
 280  *      BFQ_LOCK: protect from nullify bfq_diskctx->bfq_blockon/bfq_active_tdio
 281  *      in bfq_timeout()
 282  *
 283  * refcount: none
 284  *
 285  * Calling path: bfq_destroy_tdio --lwkt_msg--> helper_thread --call--> me
 286  *
 287  */
 288 void
 289 bfq_helper_destroy_tdio(struct dsched_thread_io *tdio, struct bfq_disk_ctx *bfq_diskctx)
 290 {
 291         KKASSERT(bfq_diskctx);
 292
 293         BFQ_LOCK(bfq_diskctx);
 294
 295         /*
 296          * Test whether the scheduler is pending on the tdio to
 297          * be destroyed.
 298          */
 299         if (((struct dsched_thread_io *)bfq_diskctx->bfq_blockon == tdio) &&
 300             callout_pending(&bfq_diskctx->bfq_callout)) {
 301                 dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: pending on a being destroyed thread!\n");
 302
 303                 callout_stop(&bfq_diskctx->bfq_callout);
 304
 305                 bfq_diskctx->bfq_blockon = NULL;
 306                 bfq_diskctx->bfq_active_tdio = NULL;
 307
 308                 BFQ_UNLOCK(bfq_diskctx);
 309
 310                 helper_msg_dequeue(bfq_diskctx);
 311                 return;
 312         }
 313         BFQ_UNLOCK(bfq_diskctx);
 314
 315 }
 316
 317 /*
 318  * bfq_destroy_tdio(): .destroy_tdio callback of the bfq policy
 319  *
 320  * Called immediate after a dsched_thread_io struct's refcount decreases
 321  * to zero. This function will record the seek_avg and ttime_avg of the
 322  * destroyed thread with the KTR facility.
 323  *
 324  * lock: none
 325  *
 326  * refcount: the tdio's refcount should be zero. It may be nuked, and
 327  * any read/write to the tdio is not safe by then.
 328  */
 329 static void
 330 bfq_destroy_tdio(struct dsched_thread_io *tdio)
 331 {
 332         struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
 333
 334         /*
 335          * do not log threads without I/O
 336          */
 337         if (bfq_tdio->seek_samples != 0 || bfq_tdio->ttime_samples != 0) {
 338                 KTR_LOG(dsched_bfq_thread_seek_avg, bfq_tdio, bfq_tdio->seek_avg );
 339                 KTR_LOG(dsched_bfq_thread_ttime_avg, bfq_tdio, bfq_tdio->ttime_avg);
 340         }
 341
 342         helper_msg_destroy_tdio((struct bfq_disk_ctx *)tdio->diskctx, tdio);
 343 }
 344
 345 /*
 346  * bfq_bio_done(): .bio_done callback of the bfq policy
 347  *
 348  * Called after a bio is done, (by request_polling_biodone of dsched).
 349  * This function judges whet her a thread consumes up its time slice, and
 350  * if so, it will set the maybe_timeout flag in bfq_tdio structure. Any
 351  * further action of that thread or the bfq scheduler will cause the
 352  * thread to be expired. (in bfq_queue() or in bfq_dequeue())
 353  *
 354  * This function requires the bfq_tdio pointer of the thread that pushes
 355  * bp to be stored by dsched_set_bio_priv() earlier. Currently it is
 356  * stored when bfq_queue() is called.
 357  *
 358  * lock: none. This function CANNOT be blocked by any lock
 359  *
 360  * refcount:
 361  *      the corresponding tdio's refcount should decrease by 1 after
 362  *      this function call. The counterpart increasing is in bfq_queue().
 363  *      For each bio pushed down, we increase the refcount of the pushing
 364  *      tdio.
 365  */
 366 static void
 367 bfq_bio_done(struct bio *bp)
 368 {
 369         struct disk *dp = dsched_get_bio_dp(bp);
 370         struct bfq_thread_io *bfq_tdio = dsched_get_bio_priv(bp);
 371         struct bfq_disk_ctx *bfq_diskctx = dsched_get_disk_priv(dp);
 372         struct timeval tv;
 373         int ticks_expired;
 374
 375         KKASSERT(bfq_tdio);
 376
 377         dsched_thread_io_ref(&bfq_tdio->head);
 378
 379         atomic_add_int(&bfq_tdio->bio_completed, 1);
 380
 381         /* the tdio has already expired */
 382         if (bfq_tdio != bfq_diskctx->bfq_active_tdio)
 383                 goto rtn;
 384         atomic_add_int(&bfq_tdio->service_received, BIO_SIZE(bp));
 385
 386         /* current time */
 387         getmicrotime(&tv);
 388         bfq_tdio->last_request_done_time = tv;
 389         timevalsub (&tv, &bfq_tdio->service_start_time);
 390         ticks_expired = tvtohz_high(&tv);
 391
 392         /* the thread has run out its time slice */
 393         if ((ticks_expired != 0x7fffffff) &&
 394             (ticks_expired >= BFQ_SLICE_TIMEOUT)) {
 395                 /*
 396                  * we cannot block here, so just set a flag
 397                  */
 398 #if 0
 399                 bfq_tdio->maybe_timeout = 1;
 400 #endif
 401                 if (atomic_cmpset_int(&bfq_tdio->maybe_timeout, 0, 1)) {
 402                         bfq_update_avg_time_slice(bfq_diskctx, tv);
 403                         dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p may time out\n", bfq_tdio);
 404                 }
 405         }
 406 rtn:
 407         dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in this function */
 408         dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in queue() */
 409
 410 }
 411
 412 /*
 413  * bfq_timeout(): called after the callout alarm strikes.
 414  *
 415  * This function getting called indicates that after waiting for
 416  * BFQ_T_WAIT / BFQ_T_WAIT_MIN ticks, the thread "active_tdio"
 417  * represents does not push any further bios. This tdio should
 418  * be expired with the reason BFQ_REASON_TOO_IDLE, but if the tdio
 419  * is marked as timeout (in bfq_biodone()) first, we expire it
 420  * for BFQ_REASON_TIMEOUT. The bfq scheduler should resume working
 421  * (and pick another thread to serve).
 422  *
 423  * It is possible that this function gets called a litter after
 424  * the thread pushes a bio with bfq_queue(), and thus a "fake timeout"
 425  * happens. We treat it as the callout does not strike, and continue
 426  * to serve the active_tdio.
 427  *
 428  * lock:
 429  *      BFQ_LOCK: protect bfq_diskctx->blockon and bfq_diskctx->active_tdio
 430  *      they should either changed in bfq_queue() or in this function,
 431  *      atomically.
 432  *      TDIO_LOCK: protect from dequeue() updateing the budget by the
 433  *      maybe_timeout branch. (Not necessary, because we already hold the
 434  *      BFQ_LOCK, and no one else could change the budget of the tdio)
 435  *
 436  * refcount:
 437  *  the refcount of bfq_diskctx->bfq_active_tdio will decrease one
 438  *  after this function. (The counterpart increasing is in bfq_dequeue(),
 439  *  before resetting the callout alarm.)
 440  *
 441  * AS timeout:
 442  * during the waiting period, no bio is pushed by the being
 443  * waited tdio
 444  *
 445  * Calling path:
 446  * callout facility --> helper_msg_timeout --lwkt_msg--> helper thread
 447  *  --> me
 448  */
 449 void
 450 bfq_timeout(void *p)
 451 {
 452         /* waiting time out:
 453          * no deceptive idleness, and unblock dispatching
 454          */
 455         struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)p;
 456         struct bfq_thread_io *bfq_tdio;
 457
 458         BFQ_LOCK(bfq_diskctx);
 459
 460         /*
 461          * the timeout occurs after the thread
 462          * pushing one more bio
 463          */
 464         if (bfq_diskctx->bfq_blockon == NULL) {
 465                 dsched_debug(BFQ_DEBUG_VERBOSE , "BFQ: fake AS timeout \n");
 466                 goto rtn;
 467         }
 468
 469         bfq_diskctx->bfq_as_miss++;
 470
 471         KKASSERT(bfq_diskctx->bfq_active_tdio);
 472         bfq_tdio = bfq_diskctx->bfq_active_tdio;
 473
 474         DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);
 475
 476         bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL|BFQ_AS_STAT_ONLY_MISS);
 477
 478         bfq_diskctx->bfq_blockon = NULL;
 479         bfq_diskctx->bfq_active_tdio = NULL;
 480         dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: unblocked %p\n", bfq_tdio);
 481
 482         wf2q_update_vd(bfq_tdio, bfq_tdio->budget - bfq_diskctx->bfq_remaining_budget);
 483         /*
 484          * the time slice expired before as timeout
 485          * this should be REASON_TIMEOUT
 486          */
 487         if (bfq_tdio->maybe_timeout) {
 488                 bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TIMEOUT);
 489                 dsched_debug(BFQ_DEBUG_VERBOSE, "%p time out in timeout()\n", bfq_tdio);
 490         } else {
 491                 bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TOO_IDLE);
 492                 dsched_debug(BFQ_DEBUG_VERBOSE, "%p too idle\n", bfq_tdio);
 493         }
 494
 495         DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);
 496
 497         /* ref'ed in dequeue(), before resetting callout */
 498         dsched_thread_io_unref(&bfq_tdio->head);
 499 rtn:
 500         BFQ_UNLOCK(bfq_diskctx);
 501         helper_msg_dequeue(bfq_diskctx);
 502 }
 503
 504 /*
 505  * bfq_queue(): .queue callback of the bfq policy.
 506  *
 507  * A thread calls this function to hand in its I/O requests (bio).
 508  * Their bios are stored in the per-thread queue, in tdio structure.
 509  * Currently, the sync/async bios are queued together, which may cause
 510  * some issues on performance.
 511  *
 512  * Besides queueing bios, this function also calculates the average
 513  * thinking time and average seek distance of a thread, using the
 514  * information in bio structure.
 515  *
 516  * If the calling thread is waiting by the bfq scheduler due to
 517  * the AS feature, this function will cancel the callout alarm
 518  * and resume the scheduler to continue serving this thread.
 519  *
 520  * lock:
 521  *   THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue()
 522  *   BFQ_LOCK: protect from other insertions/deletions in wf2q_augtree
 523  *   in bfq_queue() or bfq_dequeue().
 524  *
 525  * refcount:
 526  *   If the calling thread is waited by the scheduler, the refcount
 527  *   of the related tdio will decrease by 1 after this function. The
 528  *   counterpart increasing is in bfq_dequeue(), before resetting the
 529  *   callout alarm.
 530  *
 531  * Return value:
 532  *  EINVAL: if bio->bio_buf->b_cmd == BUF_CMD_FLUSH
 533  *  0: bio is queued successfully.
 534  */
 535 static int
 536 bfq_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
 537                 struct  bio *bio)
 538 {
 539         struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
 540         struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
 541         int original_qlength;
 542
 543         /* we do not handle flush requests. push it down to dsched */
 544         if (__predict_false(bio->bio_buf->b_cmd == BUF_CMD_FLUSH))
 545                 return (EINVAL);
 546
 547         DSCHED_THREAD_IO_LOCK(tdio);
 548         KKASSERT(tdio->debug_priv == 0xF00FF00F);
 549         dsched_debug(BFQ_DEBUG_NORMAL, "bfq: tdio %p pushes bio %p\n", bfq_tdio, bio);
 550
 551         dsched_set_bio_priv(bio, tdio);
 552         dsched_thread_io_ref(tdio);
 553
 554         if ((bio->bio_buf->b_cmd == BUF_CMD_READ) ||
 555             (bio->bio_buf->b_cmd == BUF_CMD_WRITE)) {
 556                 bfq_update_tdio_seek_avg(bfq_tdio, bio);
 557         }
 558
 559         bfq_update_tdio_ttime_avg(bfq_tdio);
 560
 561         /* update last_bio_pushed_time */
 562         getmicrotime(&bfq_tdio->last_bio_pushed_time);
 563
 564         if ((bfq_tdio->seek_samples > BFQ_VALID_MIN_SAMPLES) &&
 565             BFQ_TDIO_SEEKY(bfq_tdio))
 566                 dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p is seeky\n", bfq_tdio);
 567
 568         /*
 569          * If a tdio taks too long to think, we disable the AS feature of it.
 570          */
 571         if ((bfq_tdio->ttime_samples > BFQ_VALID_MIN_SAMPLES) &&
 572             (bfq_tdio->ttime_avg > BFQ_T_WAIT * (1000 / hz) * 1000) &&
 573             (bfq_tdio->service_received > bfq_tdio->budget / 8)) {
 574                 dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p takes too long time to think\n", bfq_tdio);
 575                 bfq_tdio->tdio_as_switch = 0;
 576         } else {
 577                 bfq_tdio->tdio_as_switch = 1;
 578         }
 579
 580         /* insert the bio into the tdio's own queue */
 581         KKASSERT(lockstatus(&tdio->lock, curthread) == LK_EXCLUSIVE);
 582         TAILQ_INSERT_TAIL(&tdio->queue, bio, link);
 583 #if 0
 584         tdio->qlength++;
 585 #endif
 586         original_qlength = atomic_fetchadd_int(&tdio->qlength, 1);
 587         DSCHED_THREAD_IO_UNLOCK(tdio);
 588         /*
 589          * A new thread:
 590          * In dequeue function, we remove the thread
 591          * from the aug-tree if it has no further bios.
 592          * Therefore "new" means a really new thread (a
 593          * newly created thread or a thread that pushed no more
 594          * bios when the scheduler was waiting for it) or
 595          * one that was removed from the aug-tree earlier.
 596          */
 597         if (original_qlength == 0) {
 598                 /*
 599                  * a really new thread
 600                  */
 601                 BFQ_LOCK(bfq_diskctx);
 602                 if (bfq_tdio != bfq_diskctx->bfq_active_tdio) {
 603                         /* insert the tdio into the wf2q queue */
 604                         wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, bfq_tdio);
 605                 } else {
 606                         /*
 607                          * the thread being waited by the scheduler
 608                          */
 609                         if (bfq_diskctx->bfq_blockon == bfq_tdio) {
 610                                 /*
 611                                  * XXX: possible race condition here:
 612                                  * if the callout function is triggered when
 613                                  * the following code is executed, then after
 614                                  * releasing the TDIO lock, the callout function
 615                                  * will set the thread inactive and it will never
 616                                  * be inserted into the aug-tree (so its bio pushed
 617                                  * this time will not be dispatched) until it pushes
 618                                  * further bios
 619                                  */
 620                                 bfq_diskctx->bfq_as_hit++;
 621                                 bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL);
 622
 623                                 if (callout_pending(&bfq_diskctx->bfq_callout))
 624                                         callout_stop(&bfq_diskctx->bfq_callout);
 625                                 bfq_diskctx->bfq_blockon = NULL;
 626
 627                                 /* ref'ed in dequeue(), before resetting callout */
 628                                 dsched_thread_io_unref(&bfq_tdio->head);
 629
 630                                 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p pushes a new bio when AS\n", bfq_tdio);
 631                         }
 632                 }
 633
 634                 BFQ_UNLOCK(bfq_diskctx);
 635         }
 636
 637         helper_msg_dequeue(bfq_diskctx);
 638
 639         return 0;
 640 }
 641
 642 /*
 643  * bfq_dequeue(): dispatch bios to the disk driver.
 644  *
 645  * This function will push as many bios as the number of free slots
 646  * in the tag queue.
 647  *
 648  * In the progress of dispatching, the following events may happen:
 649  *  - Current thread is timeout: Expire the current thread for
 650  *    BFQ_REASON_TIMEOUT, and select a new thread to serve in the
 651  *    wf2q tree.
 652  *
 653  *  - Current thread runs out of its budget: Expire the current thread
 654  *    for BFQ_REASON_OUT_OF_BUDGET, and select a new thread to serve
 655  *
 656  *  - Current thread has no further bios in its queue: if the AS feature
 657  *    is turned on, the bfq scheduler sets an alarm and starts to suspend.
 658  *    The bfq_timeout() or bfq_queue() calls may resume the scheduler.
 659  *
 660  * Implementation note: The bios selected to be dispatched will first
 661  * be stored in an array bio_do_dispatch. After this function releases
 662  * all the locks it holds, it will call dsched_strategy_request_polling()
 663  * for each bio stored.
 664  *
 665  * With the help of bfq_disk_ctx->pending_dequeue,
 666  * there will be only one bfq_dequeue pending on the BFQ_LOCK.
 667  *
 668  * lock:
 669  *      BFQ_LOCK: protect from wf2q_augtree operations in bfq_queue()
 670  *      THREAD_IO_LOCK: locks the active_tdio. Protect from queue insertions
 671  *      in bfq_queue; Protect the active_tdio->budget
 672  *
 673  * refcount:
 674  *  If the scheduler decides to suspend, the refcount of active_tdio
 675  *  increases by 1. The counterpart decreasing is in bfq_queue() and
 676  *  bfq_timeout()
 677  * blocking:
 678  *  May be blocking on the disk driver lock. It depends on drivers.
 679  *
 680  * Calling path:
 681  * The callers could be:
 682  *      bfq_queue(), bfq_timeout() and the registered polling function.
 683  *
 684  *      caller --> helper_msg_dequeue --lwkt_msg--> helper_thread-> me
 685  *
 686  */
 687 void
 688 bfq_dequeue(struct dsched_disk_ctx *diskctx)
 689 {
 690         int free_slots,
 691             bio_index = 0, i,
 692             remaining_budget = 0;/* remaining budget of current active process */
 693
 694         struct bio *bio, *bio_to_dispatch[33];
 695         struct bfq_thread_io *active_tdio = NULL;
 696         struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
 697
 698         BFQ_LOCK(bfq_diskctx);
 699         atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 1, 0);
 700
 701         /*
 702          * The whole scheduler is waiting for further bios
 703          * from process currently being served
 704          */
 705         if (bfq_diskctx->bfq_blockon != NULL)
 706                 goto rtn;
 707
 708         remaining_budget = bfq_diskctx->bfq_remaining_budget;
 709         active_tdio = bfq_diskctx->bfq_active_tdio;
 710         dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: dequeue: Im in. active_tdio = %p\n", active_tdio);
 711
 712         free_slots = diskctx->max_tag_queue_depth - diskctx->current_tag_queue_depth;
 713         KKASSERT(free_slots >= 0 && free_slots <= 32);
 714
 715         if (active_tdio)
 716                 DSCHED_THREAD_IO_LOCK(&active_tdio->head);
 717
 718         while (free_slots) {
 719                 /* Here active_tdio must be locked ! */
 720                 if (active_tdio) {
 721                         /*
 722                          * the bio_done function has marked the current
 723                          * tdio timeout
 724                          */
 725                         if (active_tdio->maybe_timeout) {
 726                                 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p time out in dequeue()\n", active_tdio);
 727                                 wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
 728                                 bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_TIMEOUT);
 729
 730                                 /* there still exist bios not dispatched,
 731                                  * reinsert the tdio into aug-tree*/
 732                                 if (active_tdio->head.qlength > 0) {
 733                                         wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
 734                                         KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count);
 735                                 }
 736
 737                                 active_tdio->maybe_timeout = 0;
 738                                 DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
 739                                 active_tdio = NULL;
 740                                 continue;
 741                         }
 742
 743                         /* select next bio to dispatch */
 744                         /* TODO: a wiser slection */
 745                         KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
 746                         bio = TAILQ_FIRST(&active_tdio->head.queue);
 747                         dsched_debug(BFQ_DEBUG_NORMAL, "bfq: the first bio in queue of active_tdio %p is %p\n", active_tdio, bio);
 748
 749                         dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p exists, remaining budget = %d, tdio budget = %d\n, qlength = %d, first bio = %p, first bio cmd = %d, first bio size = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength, bio, bio?bio->bio_buf->b_cmd:-1, bio?bio->bio_buf->b_bcount:-1);
 750
 751                         /*
 752                          * The bio is not read or write, just
 753                          * push it down.
 754                          */
 755                         if (bio && (bio->bio_buf->b_cmd != BUF_CMD_READ) &&
 756                             (bio->bio_buf->b_cmd != BUF_CMD_WRITE)) {
 757                                 dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
 758                                 KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
 759                                 TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
 760                                 active_tdio->head.qlength--;
 761                                 free_slots--;
 762
 763 #if 0
 764                                 dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
 765 #endif
 766                                 bio_to_dispatch[bio_index++] = bio;
 767                                 KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
 768                                 continue;
 769                         }
 770                         /*
 771                          * Run out of budget
 772                          * But this is not because the size of bio is larger
 773                          * than the complete budget.
 774                          * If the size of bio is larger than the complete
 775                          * budget, then use a complete budget to cover it.
 776                          */
 777                         if (bio && (remaining_budget < BIO_SIZE(bio)) &&
 778                             (remaining_budget != active_tdio->budget)) {
 779                                 /* charge budget used */
 780                                 wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
 781                                 bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
 782                                 wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
 783                                 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: thread %p ran out of budget\n", active_tdio);
 784                                 DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
 785                                 active_tdio = NULL;
 786                         } else { /* if (bio && remaining_budget < BIO_SIZE(bio) && remaining_budget != active_tdio->budget) */
 787
 788                                 /*
 789                                  * Having enough budget,
 790                                  * or having a complete budget and the size of bio
 791                                  * is larger than that.
 792                                  */
 793                                 if (bio) {
 794                                         /* dispatch */
 795                                         remaining_budget -= BIO_SIZE(bio);
 796                                         /*
 797                                          * The size of the first bio is larger
 798                                          * than the whole budget, we should
 799                                          * charge the extra part
 800                                          */
 801                                         if (remaining_budget < 0)
 802                                                 wf2q_update_vd(active_tdio, -remaining_budget);
 803                                         /* compensate */
 804                                         wf2q_update_vd(active_tdio, -remaining_budget);
 805                                         /*
 806                                          * remaining_budget may be < 0,
 807                                          * but to prevent the budget of current tdio
 808                                          * to substract a negative number,
 809                                          * the remaining_budget has to be >= 0
 810                                          */
 811                                         remaining_budget = MAX(0, remaining_budget);
 812                                         dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
 813                                         KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
 814                                         TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
 815                                         free_slots--;
 816                                         active_tdio->head.qlength--;
 817                                         active_tdio->bio_dispatched++;
 818                                         wf2q_inc_tot_service(&bfq_diskctx->bfq_wf2q, BIO_SIZE(bio));
 819                                         dsched_debug(BFQ_DEBUG_VERBOSE,
 820                                             "BFQ: %p's bio dispatched, size=%d, remaining_budget = %d\n",
 821                                             active_tdio, BIO_SIZE(bio), remaining_budget);
 822 #if 0
 823                                         dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
 824 #endif
 825                                         bio_to_dispatch[bio_index++] = bio;
 826                                         KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
 827
 828                                 } else { /* if (bio) */
 829
 830                                         KKASSERT(active_tdio);
 831                                         /*
 832                                          * If AS feature is switched off,
 833                                          * expire the tdio as well
 834                                          */
 835                                         if ((remaining_budget <= 0) ||
 836                                             !(bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ||
 837                                             !active_tdio->tdio_as_switch) {
 838                                                 active_tdio->budget -= remaining_budget;
 839                                                 wf2q_update_vd(active_tdio, active_tdio->budget);
 840                                                 bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
 841                                                 DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
 842                                                 active_tdio = NULL;
 843                                         } else {
 844
 845                                                 /* no further bio, wait for a while */
 846                                                 bfq_diskctx->bfq_blockon = active_tdio;
 847                                                 /*
 848                                                  * Increase ref count to ensure that
 849                                                  * tdio will not be destroyed during waiting.
 850                                                  */
 851                                                 dsched_thread_io_ref(&active_tdio->head);
 852                                                 /*
 853                                                  * If the tdio is seeky but not thingking for
 854                                                  * too long, we wait for it a little shorter
 855                                                  */
 856                                                 if (active_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES && BFQ_TDIO_SEEKY(active_tdio))
 857                                                         callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT_MIN, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);
 858                                                 else
 859                                                         callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);
 860
 861                                                 /* save the start time of blocking */
 862                                                 getmicrotime(&active_tdio->as_start_time);
 863
 864                                                 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: blocked on %p, remaining_budget = %d\n", active_tdio, remaining_budget);
 865                                                 DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
 866                                                 goto save_and_rtn;
 867                                         }
 868                                 }
 869                         }
 870                 } else { /* if (active_tdio) */
 871                         /* there is no active tdio */
 872
 873                         /* no pending bios at all */
 874                         active_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q);
 875
 876                         if (!active_tdio) {
 877                                 KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count == 0);
 878                                 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: no more eligible tdio!\n");
 879                                 goto save_and_rtn;
 880                         }
 881
 882                         /*
 883                          * A new tdio is picked,
 884                          * initialize the service related statistic data
 885                          */
 886                         DSCHED_THREAD_IO_LOCK(&active_tdio->head);
 887                         active_tdio->service_received = 0;
 888
 889                         /*
 890                          * Reset the maybe_timeout flag, which
 891                          * may be set by a biodone after the the service is done
 892                          */
 893                         getmicrotime(&active_tdio->service_start_time);
 894                         active_tdio->maybe_timeout = 0;
 895
 896                         remaining_budget = active_tdio->budget;
 897                         dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p selected, remaining budget = %d, tdio budget = %d\n, qlength = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength);
 898                 }
 899
 900         }/* while (free_slots) */
 901
 902         /* reach here only when free_slots == 0 */
 903         if (active_tdio) /* && lockcount(&active_tdio->head.lock) > 0) */
 904                 DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
 905
 906 save_and_rtn:
 907         /* save the remaining budget */
 908         bfq_diskctx->bfq_remaining_budget = remaining_budget;
 909         bfq_diskctx->bfq_active_tdio = active_tdio;
 910 rtn:
 911         BFQ_UNLOCK(bfq_diskctx);
 912         /*dispatch the planned bios*/
 913         for (i = 0; i < bio_index; i++)
 914                 dsched_strategy_request_polling(diskctx->dp, bio_to_dispatch[i], diskctx);
 915
 916 }
 917
 918 /*
 919  * bfq_slow_tdio(): decide whether a tdio is slow
 920  *
 921  * This function decides whether a tdio is slow by the speed
 922  * estimated from the current time slice start time: if the
 923  * tdio is not fast enough to consume its budget (or 2/3
 924  * its budget) within the time slice, it is judged slow.
 925  *
 926  * Called by bfq_expire()
 927  *
 928  * lock:
 929  *  THREAD_IO_LOCK is expected to be held.
 930  * refcount:
 931  *      none
 932  *
 933  */
 934 static int
 935 bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
 936 {
 937         /**
 938          * A tdio is considered slow if it can not finish its budget
 939          * at its current average speed
 940          */
 941         uint64_t usec_elapsed, service_received, speed;
 942         int expect;
 943         struct timeval tv = bfq_tdio->last_request_done_time;
 944
 945         timevalsub (&tv, &bfq_tdio->service_start_time);
 946         usec_elapsed = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);
 947
 948         /* discard absurd value */
 949         if (usec_elapsed < 20000)
 950                 return 0;
 951
 952         service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
 953         speed = service_received / usec_elapsed;
 954         expect = (speed * BFQ_SLICE_TIMEOUT * (1000 * 1000 / hz)) >> BFQ_FIXPOINT_SHIFT;
 955
 956         if (expect < 0) {
 957                 dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: overflow on calculating slow_tdio\n");
 958                 return 0;
 959         }
 960
 961         if (expect < bfq_tdio->budget * 2 / 3) {
 962                 dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: %p is judged slow\n", bfq_tdio);
 963                 return 1;
 964         }
 965
 966         return 0;
 967 }
 968
 969 /*
 970  * bfq_expire(): expire a tdio for a given reason.
 971  *
 972  * Different amount of the new budget will be assign to the expired
 973  * tdio according to the following reasons:
 974  *
 975  * BFQ_REASON_TIMEOUT:
 976  *  The tdio does not consume its budget up within BFQ_SLICE_TIMEOUT ticks.
 977  *  We shall update the disk peak rate if the tdio is not seeky. The new
 978  *  budget will be the budget it actually consumes during this time
 979  *  slice.
 980  *
 981  * BFQ_REASON_TOO_IDLE:
 982  *  The tdio does not push any further bios during the scheduler is
 983  *  suspending. To ensure low global latency, this tdio should be
 984  *  punished by assign it the minimum budget. But if the tdio's not
 985  *  pushing any bio is because it is waiting for the dispatched bios
 986  *  to be done, we just keep the budget unchanged.
 987  *
 988  * BFQ_REASON_OUT_OF_BUDGET:
 989  *      The tdio runs out of its budget within the time slice. It usually
 990  *      indicates that the tdio is doing well. We increase the budget of it.
 991  *
 992  * lock:
 993  *  THREAD_IO_LOCK is expected to be held.
 994  *  BFQ_LOCK is expected to be held (needed by bfq_update_peak_rate()).
 995  *
 996  * refcount: none
 997  *
 998  * Callers: bfq_timeout(), bfq_dequeue()
 999  *
1000  */
1001 static void
1002 bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason)
1003 {
1004         int max_budget = bfq_diskctx->bfq_max_budget,
1005                 budget_left,
1006                 bio_in_flight,
1007                 service_received;
1008
1009         service_received = bfq_tdio->service_received;
1010         budget_left = bfq_tdio->budget - bfq_tdio->service_received;
1011
1012         if (budget_left < 0) {
1013                 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: budget down flow: %d, %d\n", bfq_tdio->budget, bfq_tdio->service_received);
1014                 budget_left = 0;
1015         }
1016
1017         KKASSERT(budget_left >= 0);
1018
1019         switch (reason) {
1020                 case BFQ_REASON_TIMEOUT:
1021                         /* the tdio is not seeky so that we can update
1022                          * the disk peak rate based on the service received
1023                          * by the tdio
1024                          */
1025                         if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
1026                             (!BFQ_TDIO_SEEKY(bfq_tdio)))
1027                                 bfq_update_peak_rate(bfq_diskctx, bfq_tdio);
1028
1029                         /* max_budget may be updated */
1030                         max_budget = bfq_diskctx->bfq_max_budget;
1031
1032                         /* update budget to service_received*/
1033                         bfq_tdio->budget = MAX(service_received, BFQ_DEFAULT_MIN_BUDGET);
1034
1035                         break;
1036
1037                 case BFQ_REASON_TOO_IDLE:
1038                         /*
1039                          * the tdio is too slow, charge full budget
1040                          */
1041                         if (bfq_slow_tdio(bfq_diskctx, bfq_tdio))
1042                                 wf2q_update_vd(bfq_tdio, budget_left);
1043
1044                         bio_in_flight = bfq_tdio->bio_dispatched - bfq_tdio->bio_completed;
1045                         KKASSERT(bio_in_flight >= 0);
1046                         /*
1047                          * maybe the tdio pushes no bio
1048                          * because it is waiting for some bios
1049                          * dispatched to be done, in this case
1050                          * we do not reduce the budget too harshly
1051                          */
1052                         if (bio_in_flight > 0) {
1053                                 bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, service_received);
1054                         } else {
1055 #if 0
1056                                 bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, bfq_diskctx->bfq_max_budget / BFQ_MIN_BUDGET_FACTOR);
1057 #endif
1058                                 bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
1059                         }
1060
1061                         break;
1062                 case BFQ_REASON_OUT_OF_BUDGET:
1063
1064                         if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
1065                             (!BFQ_TDIO_SEEKY(bfq_tdio)))
1066                                 bfq_update_peak_rate(bfq_diskctx, bfq_tdio);
1067
1068                         /* increase the budget */
1069                         if (bfq_tdio->budget < BFQ_BUDGET_MULTIPLE_THRESHOLD)
1070                                 bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget * 2);
1071                         else
1072                                 bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget + BFQ_BUDG_INC_STEP);
1073                         break;
1074                 default:
1075                         break;
1076         }
1077 }
1078
1079 /*
1080  * bfq_update_peak_rate(): update the peak disk speed by sampling
1081  * the throughput within a time slice.
1082  *
1083  * lock:
1084  *  BFQ_LOCK is expected to be held
1085  *
1086  * refcount:
1087  *      none
1088  *
1089  * Caller: bfq_expire()
1090  */
1091 static void
1092 bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
1093 {
1094         struct timeval tv = bfq_tdio->last_request_done_time;
1095         uint64_t usec, service_received, peak_rate;
1096
1097
1098         timevalsub (&tv, &bfq_tdio->service_start_time);
1099         usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);
1100
1101         /* discard absurd value */
1102         if (usec < 2000 || usec > (BFQ_SLICE_TIMEOUT * (1000 / hz) * 1000)) {
1103                 dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: absurd interval for peak rate\n");
1104                 return;
1105         }
1106
1107         service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
1108         peak_rate = service_received / usec;
1109         bfq_diskctx->bfq_peak_rate = (peak_rate + 7 * bfq_diskctx->bfq_peak_rate) / 8;
1110         bfq_diskctx->bfq_peak_rate_samples++;
1111
1112         /* update the max_budget according to the peak rate */
1113         if (bfq_diskctx->bfq_peak_rate_samples > BFQ_VALID_MIN_SAMPLES) {
1114                 bfq_diskctx->bfq_peak_rate_samples = BFQ_VALID_MIN_SAMPLES;
1115                 /*
1116                  * if the auto max budget adjust is disabled,
1117                  * the bfq_max_budget will always be BFQ_DEFAULT_MAX_BUDGET;
1118                  */
1119                 if (bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) {
1120                         bfq_diskctx->bfq_max_budget =
1121                                 (uint32_t)((BFQ_SLICE_TIMEOUT * (1000 / hz) * bfq_diskctx->bfq_peak_rate * 1000) >> BFQ_FIXPOINT_SHIFT);
1122                         dsched_debug(BFQ_DEBUG_NORMAL, "max budget updated to %d\n", bfq_diskctx->bfq_max_budget);
1123                 }
1124         }
1125 }
1126
1127 /*
1128  * bfq_update_tdio_seek_avg(): update the average seek distance of a
1129  * tdio.
1130  *
1131  * lock:
1132  *      THREAD_IO_LOCK is expected to be held.
1133  *
1134  * refcount:
1135  *  none
1136  *
1137  * Caller: bfq_queue()
1138  */
1139 static void
1140 bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp)
1141 {
1142         off_t seek;
1143
1144         /* the first bio it dispatches,
1145          * we do not calculate the seek_avg,
1146          * just update the last_seek_end
1147          */
1148         if (bfq_tdio->seek_samples == 0) {
1149                 ++bfq_tdio->seek_samples;
1150                 goto rtn;
1151         }
1152
1153         seek = ABS(bp->bio_offset - bfq_tdio->last_seek_end);
1154
1155         /*
1156          * we do not do seek_samples++,
1157          * because the seek_total may overflow if seek_total += seek,
1158          */
1159         bfq_tdio->seek_samples = (7 * bfq_tdio->seek_samples + 256) / 8;
1160         bfq_tdio->seek_total = (7 * bfq_tdio->seek_total + 256 * seek) / 8;
1161         bfq_tdio->seek_avg = (bfq_tdio->seek_total + bfq_tdio->seek_samples / 2) / bfq_tdio->seek_samples;
1162
1163         dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: tdio %p seek_avg updated to %" PRIu64 "\n", bfq_tdio, bfq_tdio->seek_avg);
1164
1165 rtn:
1166         bfq_tdio->last_seek_end = bp->bio_offset + BIO_SIZE(bp);
1167 }
1168
1169 /*
1170  * bfq_update_tdio_ttime_avg(): update the average thinking time
1171  * of a tdio.
1172  *
1173  * The thinking time is used to switch on / off the tdio's AS feature
1174  *
1175  * lock:
1176  *  THREAD_IO_LOCK is expected to be held.
1177  *
1178  * refcount:
1179  *  none
1180  *
1181  * Caller:
1182  *  bfq_queue()
1183  *
1184  */
1185 static void
1186 bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio)
1187 {
1188         struct timeval tv, after_start;
1189         uint64_t usec;
1190
1191         if (bfq_tdio->ttime_samples == 0) {
1192                 ++bfq_tdio->ttime_samples;
1193                 return;
1194         }
1195
1196         getmicrotime(&tv);
1197         after_start = bfq_tdio->last_request_done_time;
1198
1199 #if 0
1200         timevalsub (&tv, &bfq_tdio->last_request_done_time);
1201 #endif
1202         /*
1203          * Try the interval between two bios are pushed,
1204          * instead of between last_request_done_time and
1205          * the current time.
1206          */
1207
1208         timevalsub (&tv, &bfq_tdio->last_bio_pushed_time);
1209
1210         timevalsub (&after_start, &bfq_tdio->service_start_time);
1211
1212         /*
1213          * tv.tv_sec < 0 means the last reauest done time is
1214          * after the current time.
1215          * this may happen because the biodone function is not blocked
1216          *
1217          * after_start.tv_sec < 0 means that the last bio done happens
1218          * before the current service slice, and we should drop this value.
1219          */
1220         if (tv.tv_sec < 0 || after_start.tv_sec < 0)
1221                 return;
1222
1223         usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);
1224
1225         bfq_tdio->ttime_samples = (7 * bfq_tdio->ttime_samples + 256) / 8;
1226         bfq_tdio->ttime_total = (7 * bfq_tdio->ttime_total + 256 * usec) / 8;
1227         bfq_tdio->ttime_avg = (bfq_tdio->ttime_total + 128) / bfq_tdio->ttime_samples;
1228
1229 }
1230
1231 /*
1232  * This function will also update the bfq_max_time_slice field
1233  *
1234  * tv: the timeval structure representing the length of time slice
1235  */
1236 static void
1237 bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv)
1238 {
1239         uint32_t msec;
1240
1241         msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 );
1242
1243         if (msec > 3 * BFQ_SLICE_TIMEOUT * (1000 / hz))
1244                 atomic_add_int(&bfq_diskctx->bfq_high_time_slice_count, 1);
1245
1246         bfq_diskctx->bfq_avg_time_slice =
1247                 (7 * bfq_diskctx->bfq_avg_time_slice + msec) / 8;
1248
1249         if (bfq_diskctx->bfq_max_time_slice < msec)
1250                 bfq_diskctx->bfq_max_time_slice = msec;
1251 }
1252 /*
1253  * This function will also update the bfq_as_max_wait field
1254  * flag: BFQ_AS_STAT_ALL, BFQ_AS_STAT_ONLY_MISS
1255  *
1256  */
1257 static void
1258 bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag)
1259 {
1260         struct timeval tv;
1261         uint32_t msec;
1262         getmicrotime(&tv);
1263         timevalsub (&tv, &bfq_tdio->as_start_time);
1264
1265         /* approximately divide 1000 by left shift 10 */
1266         msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 );
1267
1268         /* ridiculous value */
1269         if (msec > 10000) {
1270                 dsched_debug(BFQ_DEBUG_NORMAL, "bfq: ridiculous as wait time!\n");
1271                 return;
1272         }
1273
1274         if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz))
1275                 atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count, 1);
1276
1277         if (flag & BFQ_AS_STAT_ALL) {
1278                 bfq_diskctx->bfq_as_avg_wait_all =
1279                         (7 * bfq_diskctx->bfq_as_avg_wait_all + msec) / 8;
1280         }
1281
1282         if (flag & BFQ_AS_STAT_ONLY_MISS) {
1283                 bfq_diskctx->bfq_as_avg_wait_miss =
1284                         (7 * bfq_diskctx->bfq_as_avg_wait_miss + msec) / 8;
1285         }
1286
1287         /* update the maximum waiting time */
1288         if (bfq_diskctx->bfq_as_max_wait < msec)
1289                 bfq_diskctx->bfq_as_max_wait = msec;
1290
1291         return;
1292 }
1293
1294 static int
1295 bfq_mod_handler(module_t mod, int type, void *unused)
1296 {
1297         static struct sysctl_ctx_list sysctl_ctx;
1298         static struct sysctl_oid *oid;
1299         static char version[16];
1300         int error;
1301
1302         ksnprintf(version, sizeof(version), "%d.%d",
1303                         dsched_bfq_version_maj, dsched_bfq_version_min);
1304
1305         switch (type) {
1306         case MOD_LOAD:
1307                 bzero(&bfq_stats, sizeof(struct dsched_bfq_stats));
1308                 if ((error = dsched_register(&dsched_bfq_policy)))
1309                         return (error);
1310
1311                 sysctl_ctx_init(&sysctl_ctx);
1312                 oid = SYSCTL_ADD_NODE(&sysctl_ctx,
1313                     SYSCTL_STATIC_CHILDREN(_dsched),
1314                     OID_AUTO,
1315                     "bfq",
1316                     CTLFLAG_RD, 0, "");
1317                 bfq_mod_oid = oid;
1318
1319                 SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
1320                     OID_AUTO, "version", CTLFLAG_RD, version, 0, "bfq version");
1321                 helper_init_global();
1322
1323                 kprintf("BFQ scheduler policy version %d.%d loaded. sizeof(bfq_thread_io) = %zu\n",
1324                     dsched_bfq_version_maj, dsched_bfq_version_min, sizeof(struct bfq_thread_io));
1325                 break;
1326
1327         case MOD_UNLOAD:
1328                 if ((error = dsched_unregister(&dsched_bfq_policy)))
1329                         return (error);
1330                 sysctl_ctx_free(&sysctl_ctx);
1331                 kprintf("BFQ scheduler policy unloaded\n");
1332                 break;
1333
1334         default:
1335                 break;
1336         }
1337
1338         return 0;
1339 }
1340
1341 int
1342 bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS)
1343 {
1344         struct bfq_disk_ctx *bfq_diskctx = arg1;
1345         int as_switch, error;
1346
1347         as_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ? 1 : 0);
1348         error = sysctl_handle_int(oidp, &as_switch, 0, req);
1349         if (error || !req->newptr)
1350                 return error;
1351
1352         if (as_switch == 1)
1353                 bfq_diskctx->bfq_flag |= BFQ_FLAG_AS;
1354         else if (as_switch == 0)
1355                 bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AS);
1356         else
1357                 return 0;
1358
1359         return error;
1360 }
1361
1362 int
1363 bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS)
1364 {
1365         struct bfq_disk_ctx *bfq_diskctx = arg1;
1366         int auto_max_budget_switch, error;
1367         auto_max_budget_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) ? 1 : 0);
1368         error = sysctl_handle_int(oidp, &auto_max_budget_switch, 0, req);
1369         if (error || !req->newptr)
1370                 return error;
1371
1372         if (auto_max_budget_switch == 1)
1373                 bfq_diskctx->bfq_flag |= BFQ_FLAG_AUTO_MAX_BUDGET;
1374         else if (auto_max_budget_switch == 0)
1375                 bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AUTO_MAX_BUDGET);
1376         else
1377                 return 0;
1378
1379         return error;
1380 }
1381
1382 DSCHED_POLICY_MODULE(dsched_bfq, bfq_mod_handler);