1cd88b06cddd0bfe9cc32f074c8b5f4c67e7a005
[dragonfly.git] / sys / kern / dsched / bfq / bfq.c
1 /*
2  * Copyright (c) 2011 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Brills Peng <brillsp@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35
36 /*
37  * BFQ disk scheduler, the algorithm routines and the interfaces with the
38  * dsched framework.
39  *
40  */
41
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/proc.h>
45 #include <sys/sysctl.h>
46 #include <sys/buf.h>
47 #include <sys/conf.h>
48 #include <sys/diskslice.h>
49 #include <sys/disk.h>
50 #include <sys/malloc.h>
51 #include <machine/md_var.h>
52 #include <sys/ctype.h>
53 #include <sys/syslog.h>
54 #include <sys/device.h>
55 #include <sys/msgport.h>
56 #include <sys/msgport2.h>
57 #include <sys/buf2.h>
58 #include <sys/dsched.h>
59 #include <sys/fcntl.h>
60 #include <machine/inttypes.h>
61 #include <machine/varargs.h>
62
63 #include <kern/dsched/bfq/bfq.h>
64 #include <kern/dsched/bfq/bfq_helper_thread.h>
65
66 #define _DSCHED_BFQ_BFQ_C_
67 #include <kern/dsched/bfq/bfq_ktr.h>
68
69 /* Make sure our structs fit */
70 CTASSERT(sizeof(struct bfq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ);
71 CTASSERT(sizeof(struct bfq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ);
72
73
74 static dsched_prepare_t         bfq_prepare;
75 static dsched_teardown_t        bfq_teardown;
76 static dsched_cancel_t          bfq_cancel_all;
77 static dsched_queue_t           bfq_queue;
78 static dsched_new_tdio_t        bfq_new_tdio;
79 static dsched_destroy_tdio_t    bfq_destroy_tdio;
80 static dsched_bio_done_t        bfq_bio_done;
81
82
83 static void bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
84 static int bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
85 static void bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason);
86 static void bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp);
87 static void bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio);
88 static void bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag);
89 static void bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv);
90
91
92
93 struct dsched_policy dsched_bfq_policy = {
94         .name           = "bfq",
95         .prepare        = bfq_prepare,
96         .teardown       = bfq_teardown,
97         .cancel_all     = bfq_cancel_all,
98         .bio_queue      = bfq_queue,
99         .new_tdio       = bfq_new_tdio,
100         .destroy_tdio   = bfq_destroy_tdio,
101         .bio_done       = bfq_bio_done,
102         .polling_func   = (void (*)(struct dsched_disk_ctx *))helper_msg_dequeue,
103 };
104
105
106 struct sysctl_oid *bfq_mod_oid;
107
108 struct dsched_bfq_stats bfq_stats;
109
110 static int dsched_bfq_version_maj = 1;
111 static int dsched_bfq_version_min = 0;
112
113 /*
114  * bfq_prepare(): the .prepare callback of the bfq policy. Initialize
115  * all fields in bfq_diskctx and initialize the corresponding helper
116  * thread.
117  *
118  * lock: none
119  * refcount: none
120  *
121  * Returns 0
122  */
123 static int
124 bfq_prepare(struct dsched_disk_ctx *diskctx)
125 {
126         struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
127
128         BFQ_LOCKINIT(bfq_diskctx);
129
130         bfq_diskctx->pending_dequeue = 0;
131
132         wf2q_init(&bfq_diskctx->bfq_wf2q);
133
134         callout_init_mp(&bfq_diskctx->bfq_callout);
135
136         bfq_diskctx->bfq_blockon = NULL;
137         bfq_diskctx->bfq_active_tdio = NULL;
138         bfq_diskctx->bfq_remaining_budget = 0;
139
140         bfq_diskctx->bfq_max_budget = BFQ_DEFAULT_MAX_BUDGET;
141         bfq_diskctx->bfq_peak_rate_samples = 0;
142         bfq_diskctx->bfq_peak_rate = 0;
143
144 #if 0
145         bfq_diskctx->bfq_flag = BFQ_FLAG_AS | BFQ_FLAG_AUTO_MAX_BUDGET;
146 #endif
147         bfq_diskctx->bfq_flag = BFQ_FLAG_AS;
148
149         bfq_diskctx->bfq_as_miss = 0;
150         bfq_diskctx->bfq_as_hit = 0;
151
152         bfq_diskctx->bfq_as_avg_wait_miss = 0;
153         bfq_diskctx->bfq_as_avg_wait_all = 0;
154         bfq_diskctx->bfq_as_max_wait = 0;
155         bfq_diskctx->bfq_as_max_wait2 = 0;
156         bfq_diskctx->bfq_as_high_wait_count = 0;
157         bfq_diskctx->bfq_as_high_wait_count2 = 0;
158
159         bfq_diskctx->bfq_avg_time_slice = 0;
160         bfq_diskctx->bfq_max_time_slice = 0;
161         bfq_diskctx->bfq_high_time_slice_count = 0;
162
163         /* initiailize the helper thread */
164         helper_init(bfq_diskctx);
165
166         dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: initialized!\n");
167         return 0;
168 }
169
/*
 * bfq_teardown(): .teardown callback of the bfq policy. Send a kill
 * message to the helper thread and deallocate the resources used by
 * the helper thread (currently the objcache).
 *
 * XXX: a deadlock can occur when the caller of bfq_teardown() and the
 * helper thread are on the same CPU.
 *
 * lock: none
 * refcount: none
 *
 */

static void
bfq_teardown(struct dsched_disk_ctx *diskctx)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
	KKASSERT(diskctx);

	helper_msg_kill(bfq_diskctx);

	/* give the helper thread up to 1.5s to process the kill message */
	tsleep(diskctx, 0, "teardn", hz * 3 / 2);

	helper_uninit(bfq_diskctx);
}
195
/*
 * bfq_cancel_all(): .cancel_all callback of the bfq policy. Cancel
 * all bios that are queued in each bfq_thread_io structure in the
 * wf2q tree.
 *
 * lock:
 *      BFQ_LOCK: protect from wf2q_insert operation in bfq_queue() and
 *      bfq_dequeue(); wf2q_get_next operation in bfq_dequeue()
 *      THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue() and
 *      queue insertion in bfq_queue()
 *
 * refcount:
 *      unref thread_io structures; they are referenced in queue(),
 *      when a bio is queued. The refcount may decrease to zero.
 *
 */
static void
bfq_cancel_all(struct dsched_disk_ctx *diskctx)
{
	struct bio *bio;
	struct bfq_thread_io *bfq_tdio;
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;

	BFQ_LOCK(bfq_diskctx);

	/* drain the wf2q tree one tdio at a time */
	while ((bfq_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q))) {
		DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);
		KKASSERT(lockstatus(&bfq_tdio->head.lock, curthread) == LK_EXCLUSIVE);

		/* cancel every bio still queued on this tdio */
		while ((bio = TAILQ_FIRST(&bfq_tdio->head.queue))) {
			bfq_tdio->head.qlength--;
			TAILQ_REMOVE(&bfq_tdio->head.queue, bio, link);
			dsched_cancel_bio(bio);
			/* drop the per-bio reference taken in bfq_queue() */
			dsched_thread_io_unref(&bfq_tdio->head);
		}

		KKASSERT(bfq_tdio->head.qlength == 0);
		DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);
	}

	BFQ_UNLOCK(bfq_diskctx);
}
238
239 /*
240  * bfq_new_tdio(): .new_tdio callback of the bfq policy. Initialize
241  * the bfq_thread_io structure.
242  *
243  * lock: none
244  * refcount: none
245  */
246 static void
247 bfq_new_tdio(struct dsched_thread_io *tdio)
248 {
249         struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *) tdio;
250
251         /* the queue has to be initialized some where else */
252         tdio->qlength = 0;
253
254         tdio->debug_priv = 0xF00FF00F;
255
256         bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
257         bfq_tdio->weight = BFQ_DEFAULT_WEIGHT;
258
259         bfq_tdio->tdio_as_switch = 1;
260         bfq_tdio->maybe_timeout = 0;
261
262         bfq_tdio->seek_samples = 0;
263         bfq_tdio->seek_avg = 0;
264         bfq_tdio->seek_total = 0;
265         bfq_tdio->ttime_samples = 0;
266         bfq_tdio->ttime_avg = 0;
267         bfq_tdio->service_received = 0;
268         bfq_tdio->bio_dispatched = 0;
269         bfq_tdio->bio_completed = 0;
270
271         KTR_LOG(dsched_bfq_thread_created, bfq_tdio);
272 }
273
/*
 * bfq_helper_destroy_tdio(): called after a thread_io struct is destroyed.
 * If the scheduler is AS-waiting for a destroyed tdio, this function
 * resumes the scheduler.
 *
 * lock:
 *      BFQ_LOCK: protect from nullifying bfq_diskctx->bfq_blockon /
 *      bfq_active_tdio in bfq_timeout()
 *
 * refcount: none
 *
 * Calling path: bfq_destroy_tdio --lwkt_msg--> helper_thread --call--> me
 *
 */
void
bfq_helper_destroy_tdio(struct dsched_thread_io *tdio, struct bfq_disk_ctx *bfq_diskctx)
{
	KKASSERT(bfq_diskctx);

	BFQ_LOCK(bfq_diskctx);

	/*
	 * Test whether the scheduler is pending on the tdio to
	 * be destroyed.
	 */
	if (((struct dsched_thread_io *)bfq_diskctx->bfq_blockon == tdio) &&
	    callout_pending(&bfq_diskctx->bfq_callout)) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: pending on a being destroyed thread!\n");

		/* cancel the AS alarm and clear the wait state */
		callout_stop(&bfq_diskctx->bfq_callout);

		bfq_diskctx->bfq_blockon = NULL;
		bfq_diskctx->bfq_active_tdio = NULL;

		BFQ_UNLOCK(bfq_diskctx);

		/* kick the scheduler so it selects another thread to serve */
		helper_msg_dequeue(bfq_diskctx);
		return;
	}
	BFQ_UNLOCK(bfq_diskctx);

}
316
317 /*
318  * bfq_destroy_tdio(): .destroy_tdio callback of the bfq policy
319  *
320  * Called immediate after a dsched_thread_io struct's refcount decreases
321  * to zero. This function will record the seek_avg and ttime_avg of the
322  * destroyed thread with the KTR facility.
323  *
324  * lock: none
325  *
326  * refcount: the tdio's refcount should be zero. It may be nuked, and
327  * any read/write to the tdio is not safe by then.
328  */
329 static void
330 bfq_destroy_tdio(struct dsched_thread_io *tdio)
331 {
332         struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
333
334         /*
335          * do not log threads without I/O
336          */
337         if (bfq_tdio->seek_samples != 0 || bfq_tdio->ttime_samples != 0) {
338                 KTR_LOG(dsched_bfq_thread_seek_avg, bfq_tdio, bfq_tdio->seek_avg );
339                 KTR_LOG(dsched_bfq_thread_ttime_avg, bfq_tdio, bfq_tdio->ttime_avg);
340         }
341
342         helper_msg_destroy_tdio((struct bfq_disk_ctx *)tdio->diskctx, tdio);
343 }
344
/*
 * bfq_bio_done(): .bio_done callback of the bfq policy
 *
 * Called after a bio is done, (by request_polling_biodone of dsched).
 * This function judges whether a thread has consumed its time slice, and
 * if so, it will set the maybe_timeout flag in bfq_tdio structure. Any
 * further action of that thread or the bfq scheduler will cause the
 * thread to be expired. (in bfq_queue() or in bfq_dequeue())
 *
 * This function requires the bfq_tdio pointer of the thread that pushes
 * bp to be stored by dsched_set_bio_priv() earlier. Currently it is
 * stored when bfq_queue() is called.
 *
 * lock: none. This function CANNOT be blocked by any lock
 *
 * refcount:
 *      the corresponding tdio's refcount should decrease by 1 after
 *      this function call. The counterpart increasing is in bfq_queue().
 *      For each bio pushed down, we increase the refcount of the pushing
 *      tdio.
 */
static void
bfq_bio_done(struct bio *bp)
{
	struct disk *dp = dsched_get_bio_dp(bp);
	struct bfq_thread_io *bfq_tdio = dsched_get_bio_priv(bp);
	struct bfq_disk_ctx *bfq_diskctx = dsched_get_disk_priv(dp);
	struct timeval tv;
	int ticks_expired;

	KKASSERT(bfq_tdio);

	/* temporary reference while this function touches the tdio */
	dsched_thread_io_ref(&bfq_tdio->head);

	atomic_add_int(&bfq_tdio->bio_completed, 1);

	/* the tdio has already expired */
	if (bfq_tdio != bfq_diskctx->bfq_active_tdio)
		goto rtn;
	atomic_add_int(&bfq_tdio->service_received, BIO_SIZE(bp));

	/* current time */
	getmicrotime(&tv);
	bfq_tdio->last_request_done_time = tv;
	/* elapsed time since this tdio's service slice started */
	timevalsub (&tv, &bfq_tdio->service_start_time);
	ticks_expired = tvtohz_high(&tv);

	/*
	 * the thread has run out its time slice
	 * (0x7fffffff presumably is tvtohz_high()'s saturation/overflow
	 * value and is ignored -- TODO confirm against tvtohz_high())
	 */
	if ((ticks_expired != 0x7fffffff) &&
	    (ticks_expired >= BFQ_SLICE_TIMEOUT)) {
		/*
		 * we cannot block here, so just set a flag
		 */
#if 0
		bfq_tdio->maybe_timeout = 1;
#endif
		/* only the 0 -> 1 transition updates the statistics once */
		if (atomic_cmpset_int(&bfq_tdio->maybe_timeout, 0, 1)) {
			bfq_update_avg_time_slice(bfq_diskctx, tv);
			dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p may time out\n", bfq_tdio);
		}
	}
rtn:
	dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in this function */
	dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in queue() */

}
411
/*
 * bfq_timeout(): called after the callout alarm strikes.
 *
 * This function getting called indicates that after waiting for
 * BFQ_T_WAIT / BFQ_T_WAIT_MIN ticks, the thread "active_tdio"
 * represents does not push any further bios. This tdio should
 * be expired with the reason BFQ_REASON_TOO_IDLE, but if the tdio
 * is marked as timeout (in bfq_biodone()) first, we expire it
 * for BFQ_REASON_TIMEOUT. The bfq scheduler should resume working
 * (and pick another thread to serve).
 *
 * It is possible that this function gets called a little after
 * the thread pushes a bio with bfq_queue(), and thus a "fake timeout"
 * happens. We treat it as the callout does not strike, and continue
 * to serve the active_tdio.
 *
 * lock:
 *      BFQ_LOCK: protect bfq_diskctx->blockon and bfq_diskctx->active_tdio
 *      they should either changed in bfq_queue() or in this function,
 *      atomically.
 *      TDIO_LOCK: protect from dequeue() updating the budget by the
 *      maybe_timeout branch. (Not necessary, because we already hold the
 *      BFQ_LOCK, and no one else could change the budget of the tdio)
 *
 * refcount:
 *  the refcount of bfq_diskctx->bfq_active_tdio will decrease one
 *  after this function. (The counterpart increasing is in bfq_dequeue(),
 *  before resetting the callout alarm.)
 *
 * AS timeout:
 * during the waiting period, no bio is pushed by the being
 * waited tdio
 *
 * Calling path:
 * callout facility --> helper_msg_timeout --lwkt_msg--> helper thread
 *  --> me
 */
void
bfq_timeout(void *p)
{
	/* waiting time out:
	 * no deceptive idleness, and unblock dispatching
	 */
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)p;
	struct bfq_thread_io *bfq_tdio;

	BFQ_LOCK(bfq_diskctx);

	/*
	 * the timeout occurs after the thread
	 * pushing one more bio
	 */
	if (bfq_diskctx->bfq_blockon == NULL) {
		dsched_debug(BFQ_DEBUG_VERBOSE , "BFQ: fake AS timeout \n");
		goto rtn;
	}

	bfq_diskctx->bfq_as_miss++;

	KKASSERT(bfq_diskctx->bfq_active_tdio);
	bfq_tdio = bfq_diskctx->bfq_active_tdio;

	DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);

	bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL|BFQ_AS_STAT_ONLY_MISS);

	/* clear the wait state so the scheduler can serve someone else */
	bfq_diskctx->bfq_blockon = NULL;
	bfq_diskctx->bfq_active_tdio = NULL;
	dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: unblocked %p\n", bfq_tdio);

	/* charge the budget actually consumed during the slice */
	wf2q_update_vd(bfq_tdio, bfq_tdio->budget - bfq_diskctx->bfq_remaining_budget);
	/*
	 * the time slice expired before as timeout
	 * this should be REASON_TIMEOUT
	 */
	if (bfq_tdio->maybe_timeout) {
		bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TIMEOUT);
		dsched_debug(BFQ_DEBUG_VERBOSE, "%p time out in timeout()\n", bfq_tdio);
	} else {
		bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TOO_IDLE);
		dsched_debug(BFQ_DEBUG_VERBOSE, "%p too idle\n", bfq_tdio);
	}

	DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);

	/* ref'ed in dequeue(), before resetting callout */
	dsched_thread_io_unref(&bfq_tdio->head);
rtn:
	BFQ_UNLOCK(bfq_diskctx);
	helper_msg_dequeue(bfq_diskctx);
}
503
/*
 * bfq_queue(): .queue callback of the bfq policy.
 *
 * A thread calls this function to hand in its I/O requests (bio).
 * Their bios are stored in the per-thread queue, in tdio structure.
 * Currently, the sync/async bios are queued together, which may cause
 * some issues on performance.
 *
 * Besides queueing bios, this function also calculates the average
 * thinking time and average seek distance of a thread, using the
 * information in bio structure.
 *
 * If the calling thread is waiting by the bfq scheduler due to
 * the AS feature, this function will cancel the callout alarm
 * and resume the scheduler to continue serving this thread.
 *
 * lock:
 *   THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue()
 *   BFQ_LOCK: protect from other insertions/deletions in wf2q_augtree
 *   in bfq_queue() or bfq_dequeue().
 *
 * refcount:
 *   If the calling thread is waited by the scheduler, the refcount
 *   of the related tdio will decrease by 1 after this function. The
 *   counterpart increasing is in bfq_dequeue(), before resetting the
 *   callout alarm.
 *
 * Return value:
 *  EINVAL: if bio->bio_buf->b_cmd == BUF_CMD_FLUSH
 *  0: bio is queued successfully.
 */
static int
bfq_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
		struct	bio *bio)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
	struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
	int original_qlength;

	/* we do not handle flush requests. push it down to dsched */
	if (__predict_false(bio->bio_buf->b_cmd == BUF_CMD_FLUSH))
		return (EINVAL);

	DSCHED_THREAD_IO_LOCK(tdio);
	KKASSERT(tdio->debug_priv == 0xF00FF00F);
	dsched_debug(BFQ_DEBUG_NORMAL, "bfq: tdio %p pushes bio %p\n", bfq_tdio, bio);

	/* stash the tdio on the bio so bfq_bio_done() can find it */
	dsched_set_bio_priv(bio, tdio);
	dsched_thread_io_ref(tdio);

	/* seek stats only make sense for actual data transfers */
	if ((bio->bio_buf->b_cmd == BUF_CMD_READ) ||
	    (bio->bio_buf->b_cmd == BUF_CMD_WRITE)) {
		bfq_update_tdio_seek_avg(bfq_tdio, bio);
	}

	bfq_update_tdio_ttime_avg(bfq_tdio);

	/* update last_bio_pushed_time */
	getmicrotime(&bfq_tdio->last_bio_pushed_time);

	if ((bfq_tdio->seek_samples > BFQ_VALID_MIN_SAMPLES) &&
	    BFQ_TDIO_SEEKY(bfq_tdio))
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p is seeky\n", bfq_tdio);

	/*
	 * If a tdio takes too long to think, we disable the AS feature of it.
	 */
	if ((bfq_tdio->ttime_samples > BFQ_VALID_MIN_SAMPLES) &&
	    (bfq_tdio->ttime_avg > BFQ_T_WAIT * (1000 / hz) * 1000) &&
	    (bfq_tdio->service_received > bfq_tdio->budget / 8)) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p takes too long time to think\n", bfq_tdio);
		bfq_tdio->tdio_as_switch = 0;
	} else {
		bfq_tdio->tdio_as_switch = 1;
	}

	/* insert the bio into the tdio's own queue */
	KKASSERT(lockstatus(&tdio->lock, curthread) == LK_EXCLUSIVE);
	TAILQ_INSERT_TAIL(&tdio->queue, bio, link);
#if 0
	tdio->qlength++;
#endif
	original_qlength = atomic_fetchadd_int(&tdio->qlength, 1);
	DSCHED_THREAD_IO_UNLOCK(tdio);
	/*
	 * A new thread:
	 * In dequeue function, we remove the thread
	 * from the aug-tree if it has no further bios.
	 * Therefore "new" means a really new thread (a
	 * newly created thread or a thread that pushed no more
	 * bios when the scheduler was waiting for it) or
	 * one that was removed from the aug-tree earlier.
	 */
	if (original_qlength == 0) {
		/*
		 * a really new thread
		 */
		BFQ_LOCK(bfq_diskctx);
		if (bfq_tdio != bfq_diskctx->bfq_active_tdio) {
			/* insert the tdio into the wf2q queue */
			wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, bfq_tdio);
		} else {
			/*
			 * the thread being waited by the scheduler
			 */
			if (bfq_diskctx->bfq_blockon == bfq_tdio) {
				/*
				 * XXX: possible race condition here:
				 * if the callout function is triggered when
				 * the following code is executed, then after
				 * releasing the TDIO lock, the callout function
				 * will set the thread inactive and it will never
				 * be inserted into the aug-tree (so its bio pushed
				 * this time will not be dispatched) until it pushes
				 * further bios
				 */
				bfq_diskctx->bfq_as_hit++;
				bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL);

				if (callout_pending(&bfq_diskctx->bfq_callout))
					callout_stop(&bfq_diskctx->bfq_callout);
				bfq_diskctx->bfq_blockon = NULL;

				/* ref'ed in dequeue(), before resetting callout */
				dsched_thread_io_unref(&bfq_tdio->head);

				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p pushes a new bio when AS\n", bfq_tdio);
			}
		}

		BFQ_UNLOCK(bfq_diskctx);
	}

	helper_msg_dequeue(bfq_diskctx);

	return 0;
}
641
642 /*
643  * bfq_dequeue(): dispatch bios to the disk driver.
644  *
645  * This function will push as many bios as the number of free slots
646  * in the tag queue.
647  *
648  * In the progress of dispatching, the following events may happen:
649  *  - Current thread is timeout: Expire the current thread for
650  *    BFQ_REASON_TIMEOUT, and select a new thread to serve in the
651  *    wf2q tree.
652  *
653  *  - Current thread runs out of its budget: Expire the current thread
654  *    for BFQ_REASON_OUT_OF_BUDGET, and select a new thread to serve
655  *
656  *  - Current thread has no further bios in its queue: if the AS feature
657  *    is turned on, the bfq scheduler sets an alarm and starts to suspend.
658  *    The bfq_timeout() or bfq_queue() calls may resume the scheduler.
659  *
660  * Implementation note: The bios selected to be dispatched will first
661  * be stored in an array bio_do_dispatch. After this function releases
662  * all the locks it holds, it will call dsched_strategy_request_polling()
663  * for each bio stored.
664  *
665  * With the help of bfq_disk_ctx->pending_dequeue,
666  * there will be only one bfq_dequeue pending on the BFQ_LOCK.
667  *
668  * lock:
669  *      BFQ_LOCK: protect from wf2q_augtree operations in bfq_queue()
670  *      THREAD_IO_LOCK: locks the active_tdio. Protect from queue insertions
671  *      in bfq_queue; Protect the active_tdio->budget
672  *
673  * refcount:
674  *  If the scheduler decides to suspend, the refcount of active_tdio
675  *  increases by 1. The counterpart decreasing is in bfq_queue() and
676  *  bfq_timeout()
677  * blocking:
678  *  May be blocking on the disk driver lock. It depends on drivers.
679  *
680  * Calling path:
681  * The callers could be:
682  *      bfq_queue(), bfq_timeout() and the registered polling function.
683  *
684  *      caller --> helper_msg_dequeue --lwkt_msg--> helper_thread-> me
685  *
686  */
687 void
688 bfq_dequeue(struct dsched_disk_ctx *diskctx)
689 {
690         int free_slots,
691             bio_index = 0, i,
692             remaining_budget = 0;/* remaining budget of current active process */
693
694         struct bio *bio, *bio_to_dispatch[33];
695         struct bfq_thread_io *active_tdio = NULL;
696         struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
697
698         BFQ_LOCK(bfq_diskctx);
699         atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 1, 0);
700
701         /*
702          * The whole scheduler is waiting for further bios
703          * from process currently being served
704          */
705         if (bfq_diskctx->bfq_blockon != NULL)
706                 goto rtn;
707
708         remaining_budget = bfq_diskctx->bfq_remaining_budget;
709         active_tdio = bfq_diskctx->bfq_active_tdio;
710         dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: dequeue: Im in. active_tdio = %p\n", active_tdio);
711
712         free_slots = diskctx->max_tag_queue_depth - diskctx->current_tag_queue_depth;
713         KKASSERT(free_slots >= 0 && free_slots <= 32);
714
715         if (active_tdio)
716                 DSCHED_THREAD_IO_LOCK(&active_tdio->head);
717
718         while (free_slots) {
719                 /* Here active_tdio must be locked ! */
720                 if (active_tdio) {
721                         /*
722                          * the bio_done function has marked the current
723                          * tdio timeout
724                          */
725                         if (active_tdio->maybe_timeout) {
726                                 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p time out in dequeue()\n", active_tdio);
727                                 wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
728                                 bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_TIMEOUT);
729
730                                 /* there still exist bios not dispatched,
731                                  * reinsert the tdio into aug-tree*/
732                                 if (active_tdio->head.qlength > 0) {
733                                         wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
734                                         KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count);
735                                 }
736
737                                 active_tdio->maybe_timeout = 0;
738                                 DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
739                                 active_tdio = NULL;
740                                 continue;
741                         }
742
743                         /* select next bio to dispatch */
744                         /* TODO: a wiser slection */
745                         KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
746                         bio = TAILQ_FIRST(&active_tdio->head.queue);
747                         dsched_debug(BFQ_DEBUG_NORMAL, "bfq: the first bio in queue of active_tdio %p is %p\n", active_tdio, bio);
748
749                         dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p exists, remaining budget = %d, tdio budget = %d\n, qlength = %d, first bio = %p, first bio cmd = %d, first bio size = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength, bio, bio?bio->bio_buf->b_cmd:-1, bio?bio->bio_buf->b_bcount:-1);
750
751                         /*
752                          * The bio is not read or write, just
753                          * push it down.
754                          */
755                         if (bio && (bio->bio_buf->b_cmd != BUF_CMD_READ) &&
756                             (bio->bio_buf->b_cmd != BUF_CMD_WRITE)) {
757                                 dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
758                                 KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
759                                 TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
760                                 active_tdio->head.qlength--;
761                                 free_slots--;
762
763 #if 0
764                                 dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
765 #endif
766                                 bio_to_dispatch[bio_index++] = bio;
767                                 KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
768                                 continue;
769                         }
770                         /*
771                          * Run out of budget
772                          * But this is not because the size of bio is larger
773                          * than the complete budget.
774                          * If the size of bio is larger than the complete
775                          * budget, then use a complete budget to cover it.
776                          */
777                         if (bio && (remaining_budget < BIO_SIZE(bio)) &&
778                             (remaining_budget != active_tdio->budget)) {
779                                 /* charge budget used */
780                                 wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
781                                 bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
782                                 wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
783                                 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: thread %p ran out of budget\n", active_tdio);
784                                 DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
785                                 active_tdio = NULL;
786                         } else { /* if (bio && remaining_budget < BIO_SIZE(bio) && remaining_budget != active_tdio->budget) */
787
788                                 /*
789                                  * Having enough budget,
790                                  * or having a complete budget and the size of bio
791                                  * is larger than that.
792                                  */
793                                 if (bio) {
794                                         /* dispatch */
795                                         remaining_budget -= BIO_SIZE(bio);
796                                         /*
797                                          * The size of the first bio is larger
798                                          * than the whole budget, we should
799                                          * charge the extra part
800                                          */
801                                         if (remaining_budget < 0)
802                                                 wf2q_update_vd(active_tdio, -remaining_budget);
803                                         /* compensate */
804                                         wf2q_update_vd(active_tdio, -remaining_budget);
805                                         /*
806                                          * remaining_budget may be < 0,
807                                          * but to prevent the budget of current tdio
808                                          * to substract a negative number,
809                                          * the remaining_budget has to be >= 0
810                                          */
811                                         remaining_budget = MAX(0, remaining_budget);
812                                         dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
813                                         KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
814                                         TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
815                                         free_slots--;
816                                         active_tdio->head.qlength--;
817                                         active_tdio->bio_dispatched++;
818                                         wf2q_inc_tot_service(&bfq_diskctx->bfq_wf2q, BIO_SIZE(bio));
819                                         dsched_debug(BFQ_DEBUG_VERBOSE,
820                                             "BFQ: %p's bio dispatched, size=%d, remaining_budget = %d\n",
821                                             active_tdio, BIO_SIZE(bio), remaining_budget);
822 #if 0
823                                         dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
824 #endif
825                                         bio_to_dispatch[bio_index++] = bio;
826                                         KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
827
828                                 } else { /* if (bio) */
829
830                                         KKASSERT(active_tdio);
831                                         /*
832                                          * If AS feature is switched off,
833                                          * expire the tdio as well
834                                          */
835                                         if ((remaining_budget <= 0) ||
836                                             !(bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ||
837                                             !active_tdio->tdio_as_switch) {
838                                                 active_tdio->budget -= remaining_budget;
839                                                 wf2q_update_vd(active_tdio, active_tdio->budget);
840                                                 bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
841                                                 DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
842                                                 active_tdio = NULL;
843                                         } else {
844
845                                                 /* no further bio, wait for a while */
846                                                 bfq_diskctx->bfq_blockon = active_tdio;
847                                                 /*
848                                                  * Increase ref count to ensure that
849                                                  * tdio will not be destroyed during waiting.
850                                                  */
851                                                 dsched_thread_io_ref(&active_tdio->head);
						/*
						 * If the tdio is seeky but not thinking for
						 * too long, we wait for it a little shorter
						 */
856                                                 if (active_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES && BFQ_TDIO_SEEKY(active_tdio))
857                                                         callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT_MIN, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);
858                                                 else
859                                                         callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);
860
861                                                 /* save the start time of blocking */
862                                                 getmicrotime(&active_tdio->as_start_time);
863
864                                                 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: blocked on %p, remaining_budget = %d\n", active_tdio, remaining_budget);
865                                                 DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
866                                                 goto save_and_rtn;
867                                         }
868                                 }
869                         }
870                 } else { /* if (active_tdio) */
871                         /* there is no active tdio */
872
873                         /* no pending bios at all */
874                         active_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q);
875
876                         if (!active_tdio) {
877                                 KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count == 0);
878                                 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: no more eligible tdio!\n");
879                                 goto save_and_rtn;
880                         }
881
882                         /*
883                          * A new tdio is picked,
884                          * initialize the service related statistic data
885                          */
886                         DSCHED_THREAD_IO_LOCK(&active_tdio->head);
887                         active_tdio->service_received = 0;
888
			/*
			 * Reset the maybe_timeout flag, which
			 * may be set by a biodone after the service is done
			 */
893                         getmicrotime(&active_tdio->service_start_time);
894                         active_tdio->maybe_timeout = 0;
895
896                         remaining_budget = active_tdio->budget;
897                         dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p selected, remaining budget = %d, tdio budget = %d\n, qlength = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength);
898                 }
899
900         }/* while (free_slots) */
901
902         /* reach here only when free_slots == 0 */
903         if (active_tdio) /* && lockcount(&active_tdio->head.lock) > 0) */
904                 DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
905
906 save_and_rtn:
907         /* save the remaining budget */
908         bfq_diskctx->bfq_remaining_budget = remaining_budget;
909         bfq_diskctx->bfq_active_tdio = active_tdio;
910 rtn:
911         BFQ_UNLOCK(bfq_diskctx);
912         /*dispatch the planned bios*/
913         for (i = 0; i < bio_index; i++)
914                 dsched_strategy_request_polling(diskctx->dp, bio_to_dispatch[i], diskctx);
915
916 }
917
918 /*
919  * bfq_slow_tdio(): decide whether a tdio is slow
920  *
921  * This function decides whether a tdio is slow by the speed
922  * estimated from the current time slice start time: if the
923  * tdio is not fast enough to consume its budget (or 2/3
924  * its budget) within the time slice, it is judged slow.
925  *
926  * Called by bfq_expire()
927  *
928  * lock:
929  *  THREAD_IO_LOCK is expected to be held.
930  * refcount:
931  *      none
932  *
933  */
934 static int
935 bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
936 {
937         /**
938          * A tdio is considered slow if it can not finish its budget
939          * at its current average speed
940          */
941         uint64_t usec_elapsed, service_received, speed;
942         int expect;
943         struct timeval tv = bfq_tdio->last_request_done_time;
944
945         timevalsub (&tv, &bfq_tdio->service_start_time);
946         usec_elapsed = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);
947
948         /* discard absurd value */
949         if (usec_elapsed < 20000)
950                 return 0;
951
952         service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
953         speed = service_received / usec_elapsed;
954         expect = (speed * BFQ_SLICE_TIMEOUT * (1000 * 1000 / hz)) >> BFQ_FIXPOINT_SHIFT;
955
956         if (expect < 0) {
957                 dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: overflow on calculating slow_tdio\n");
958                 return 0;
959         }
960
961         if (expect < bfq_tdio->budget * 2 / 3) {
962                 dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: %p is judged slow\n", bfq_tdio);
963                 return 1;
964         }
965
966         return 0;
967 }
968
969 /*
970  * bfq_expire(): expire a tdio for a given reason.
971  *
 * A different amount of new budget is assigned to the expired
 * tdio according to the reason for expiry:
974  *
975  * BFQ_REASON_TIMEOUT:
976  *  The tdio does not consume its budget up within BFQ_SLICE_TIMEOUT ticks.
977  *  We shall update the disk peak rate if the tdio is not seeky. The new
978  *  budget will be the budget it actually consumes during this time
979  *  slice.
980  *
 * BFQ_REASON_TOO_IDLE:
 *  The tdio did not push any further bios while the scheduler was
 *  suspended. To ensure low global latency, this tdio should be
 *  punished by assigning it the minimum budget. But if the tdio
 *  pushed no bio because it was waiting for its dispatched bios
 *  to be done, we just keep the budget unchanged.
987  *
988  * BFQ_REASON_OUT_OF_BUDGET:
989  *      The tdio runs out of its budget within the time slice. It usually
990  *      indicates that the tdio is doing well. We increase the budget of it.
991  *
992  * lock:
993  *  THREAD_IO_LOCK is expected to be held.
994  *  BFQ_LOCK is expected to be held (needed by bfq_update_peak_rate()).
995  *
996  * refcount: none
997  *
998  * Callers: bfq_timeout(), bfq_dequeue()
999  *
1000  */
1001 static void
1002 bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason)
1003 {
1004         int max_budget = bfq_diskctx->bfq_max_budget,
1005                 budget_left,
1006                 bio_in_flight,
1007                 service_received;
1008
1009         service_received = bfq_tdio->service_received;
1010         budget_left = bfq_tdio->budget - bfq_tdio->service_received;
1011
1012         if (budget_left < 0) {
1013                 dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: budget down flow: %d, %d\n", bfq_tdio->budget, bfq_tdio->service_received);
1014                 budget_left = 0;
1015         }
1016
1017         KKASSERT(budget_left >= 0);
1018
1019         switch (reason) {
1020                 case BFQ_REASON_TIMEOUT:
1021                         /* the tdio is not seeky so that we can update
1022                          * the disk peak rate based on the service received
1023                          * by the tdio
1024                          */
1025                         if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
1026                             (!BFQ_TDIO_SEEKY(bfq_tdio)))
1027                                 bfq_update_peak_rate(bfq_diskctx, bfq_tdio);
1028
1029                         /* max_budget may be updated */
1030                         max_budget = bfq_diskctx->bfq_max_budget;
1031
1032                         /* update budget to service_received*/
1033                         bfq_tdio->budget = MAX(service_received, BFQ_DEFAULT_MIN_BUDGET);
1034
1035                         break;
1036
1037                 case BFQ_REASON_TOO_IDLE:
1038                         /*
1039                          * the tdio is too slow, charge full budget
1040                          */
1041                         if (bfq_slow_tdio(bfq_diskctx, bfq_tdio))
1042                                 wf2q_update_vd(bfq_tdio, budget_left);
1043
1044                         bio_in_flight = bfq_tdio->bio_dispatched - bfq_tdio->bio_completed;
1045                         KKASSERT(bio_in_flight >= 0);
1046                         /*
1047                          * maybe the tdio pushes no bio
1048                          * because it is waiting for some bios
1049                          * dispatched to be done, in this case
1050                          * we do not reduce the budget too harshly
1051                          */
1052                         if (bio_in_flight > 0) {
1053                                 bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, service_received);
1054                         } else {
1055 #if 0
1056                                 bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, bfq_diskctx->bfq_max_budget / BFQ_MIN_BUDGET_FACTOR);
1057 #endif
1058                                 bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
1059                         }
1060
1061                         break;
1062                 case BFQ_REASON_OUT_OF_BUDGET:
1063
1064                         if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
1065                             (!BFQ_TDIO_SEEKY(bfq_tdio)))
1066                                 bfq_update_peak_rate(bfq_diskctx, bfq_tdio);
1067
1068                         /* increase the budget */
1069                         if (bfq_tdio->budget < BFQ_BUDGET_MULTIPLE_THRESHOLD)
1070                                 bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget * 2);
1071                         else
1072                                 bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget + BFQ_BUDG_INC_STEP);
1073                         break;
1074                 default:
1075                         break;
1076         }
1077 }
1078
1079 /*
1080  * bfq_update_peak_rate(): update the peak disk speed by sampling
1081  * the throughput within a time slice.
1082  *
1083  * lock:
1084  *  BFQ_LOCK is expected to be held
1085  *
1086  * refcount:
1087  *      none
1088  *
1089  * Caller: bfq_expire()
1090  */
1091 static void
1092 bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
1093 {
1094         struct timeval tv = bfq_tdio->last_request_done_time;
1095         uint64_t usec, service_received, peak_rate;
1096
1097
1098         timevalsub (&tv, &bfq_tdio->service_start_time);
1099         usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);
1100
1101         /* discard absurd value */
1102         if (usec < 2000 || usec > (BFQ_SLICE_TIMEOUT * (1000 / hz) * 1000)) {
1103                 dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: absurd interval for peak rate\n");
1104                 return;
1105         }
1106
1107         service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
1108         peak_rate = service_received / usec;
1109         bfq_diskctx->bfq_peak_rate = (peak_rate + 7 * bfq_diskctx->bfq_peak_rate) / 8;
1110         bfq_diskctx->bfq_peak_rate_samples++;
1111
1112         /* update the max_budget according to the peak rate */
1113         if (bfq_diskctx->bfq_peak_rate_samples > BFQ_VALID_MIN_SAMPLES) {
1114                 bfq_diskctx->bfq_peak_rate_samples = BFQ_VALID_MIN_SAMPLES;
1115                 /*
1116                  * if the auto max budget adjust is disabled,
1117                  * the bfq_max_budget will always be BFQ_DEFAULT_MAX_BUDGET;
1118                  */
1119                 if (bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) {
1120                         bfq_diskctx->bfq_max_budget =
1121                                 (uint32_t)((BFQ_SLICE_TIMEOUT * (1000 / hz) * bfq_diskctx->bfq_peak_rate * 1000) >> BFQ_FIXPOINT_SHIFT);
1122                         dsched_debug(BFQ_DEBUG_NORMAL, "max budget updated to %d\n", bfq_diskctx->bfq_max_budget);
1123                 }
1124         }
1125 }
1126
1127 /*
1128  * bfq_update_tdio_seek_avg(): update the average seek distance of a
1129  * tdio.
1130  *
1131  * lock:
1132  *      THREAD_IO_LOCK is expected to be held.
1133  *
1134  * refcount:
1135  *  none
1136  *
1137  * Caller: bfq_queue()
1138  */
1139 static void
1140 bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp)
1141 {
1142         off_t seek;
1143
1144         /* the first bio it dispatches,
1145          * we do not calculate the seek_avg,
1146          * just update the last_seek_end
1147          */
1148         if (bfq_tdio->seek_samples == 0) {
1149                 ++bfq_tdio->seek_samples;
1150                 goto rtn;
1151         }
1152
1153         seek = ABS(bp->bio_offset - bfq_tdio->last_seek_end);
1154
1155         /*
1156          * we do not do seek_samples++,
1157          * because the seek_total may overflow if seek_total += seek,
1158          */
1159         bfq_tdio->seek_samples = (7 * bfq_tdio->seek_samples + 256) / 8;
1160         bfq_tdio->seek_total = (7 * bfq_tdio->seek_total + 256 * seek) / 8;
1161         bfq_tdio->seek_avg = (bfq_tdio->seek_total + bfq_tdio->seek_samples / 2) / bfq_tdio->seek_samples;
1162
1163         dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: tdio %p seek_avg updated to %" PRIu64 "\n", bfq_tdio, bfq_tdio->seek_avg);
1164
1165 rtn:
1166         bfq_tdio->last_seek_end = bp->bio_offset + BIO_SIZE(bp);
1167 }
1168
1169 /*
1170  * bfq_update_tdio_ttime_avg(): update the average thinking time
1171  * of a tdio.
1172  *
1173  * The thinking time is used to switch on / off the tdio's AS feature
1174  *
1175  * lock:
1176  *  THREAD_IO_LOCK is expected to be held.
1177  *
1178  * refcount:
1179  *  none
1180  *
1181  * Caller:
1182  *  bfq_queue()
1183  *
1184  */
static void
bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio)
{
	struct timeval tv, after_start;
	uint64_t usec;

	/* first sample: nothing to measure an interval against yet */
	if (bfq_tdio->ttime_samples == 0) {
		++bfq_tdio->ttime_samples;
		return;
	}

	getmicrotime(&tv);
	after_start = bfq_tdio->last_request_done_time;

#if 0
	timevalsub (&tv, &bfq_tdio->last_request_done_time);
#endif
	/*
	 * Use the interval between two pushed bios,
	 * instead of the interval between last_request_done_time and
	 * the current time.
	 */

	timevalsub (&tv, &bfq_tdio->last_bio_pushed_time);

	timevalsub (&after_start, &bfq_tdio->service_start_time);

	/*
	 * tv.tv_sec < 0 means the last request done time is
	 * after the current time;
	 * this may happen because the biodone function is not blocked.
	 *
	 * after_start.tv_sec < 0 means that the last bio done happened
	 * before the current service slice, and we should drop this value.
	 */
	if (tv.tv_sec < 0 || after_start.tv_sec < 0)
		return;

	usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);

	/* EWMA in 1/256 fixed point, new sample weighted 1/8 */
	bfq_tdio->ttime_samples = (7 * bfq_tdio->ttime_samples + 256) / 8;
	bfq_tdio->ttime_total = (7 * bfq_tdio->ttime_total + 256 * usec) / 8;
	bfq_tdio->ttime_avg = (bfq_tdio->ttime_total + 128) / bfq_tdio->ttime_samples;

}
1230
1231 /*
1232  * This function will also update the bfq_max_time_slice field
1233  *
1234  * tv: the timeval structure representing the length of time slice
1235  */
1236 static void
1237 bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv)
1238 {
1239         uint32_t msec;
1240
1241         msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 );
1242
1243         if (msec > 3 * BFQ_SLICE_TIMEOUT * (1000 / hz))
1244                 atomic_add_int(&bfq_diskctx->bfq_high_time_slice_count, 1);
1245
1246         bfq_diskctx->bfq_avg_time_slice =
1247                 (7 * bfq_diskctx->bfq_avg_time_slice + msec) / 8;
1248
1249         if (bfq_diskctx->bfq_max_time_slice < msec)
1250                 bfq_diskctx->bfq_max_time_slice = msec;
1251 }
1252 /*
1253  * This function will also update the bfq_as_max_wait field
1254  * flag: BFQ_AS_STAT_ALL, BFQ_AS_STAT_ONLY_MISS
1255  *
1256  */
1257 static void
1258 bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag)
1259 {
1260         struct timeval tv;
1261         uint32_t msec;
1262         getmicrotime(&tv);
1263         timevalsub (&tv, &bfq_tdio->as_start_time);
1264
1265         /* approximately divide 1000 by left shift 10 */
1266         msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 );
1267
1268         /* ridiculous value */
1269         if (msec > 10000) {
1270                 dsched_debug(BFQ_DEBUG_NORMAL, "bfq: ridiculous as wait time!\n");
1271                 return;
1272         }
1273
1274         if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz))
1275                 atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count, 1);
1276
1277         if (flag & BFQ_AS_STAT_ALL) {
1278                 bfq_diskctx->bfq_as_avg_wait_all =
1279                         (7 * bfq_diskctx->bfq_as_avg_wait_all + msec) / 8;
1280         }
1281
1282         if (flag & BFQ_AS_STAT_ONLY_MISS) {
1283                 bfq_diskctx->bfq_as_avg_wait_miss =
1284                         (7 * bfq_diskctx->bfq_as_avg_wait_miss + msec) / 8;
1285         }
1286
1287         /* update the maximum waiting time */
1288         if (bfq_diskctx->bfq_as_max_wait < msec)
1289                 bfq_diskctx->bfq_as_max_wait = msec;
1290
1291         return;
1292 }
1293
1294 static int
1295 bfq_mod_handler(module_t mod, int type, void *unused)
1296 {
1297         static struct sysctl_ctx_list sysctl_ctx;
1298         static struct sysctl_oid *oid;
1299         static char version[16];
1300         int error;
1301
1302         ksnprintf(version, sizeof(version), "%d.%d",
1303                         dsched_bfq_version_maj, dsched_bfq_version_min);
1304
1305         switch (type) {
1306         case MOD_LOAD:
1307                 bzero(&bfq_stats, sizeof(struct dsched_bfq_stats));
1308                 if ((error = dsched_register(&dsched_bfq_policy)))
1309                         return (error);
1310
1311                 sysctl_ctx_init(&sysctl_ctx);
1312                 oid = SYSCTL_ADD_NODE(&sysctl_ctx,
1313                     SYSCTL_STATIC_CHILDREN(_dsched),
1314                     OID_AUTO,
1315                     "bfq",
1316                     CTLFLAG_RD, 0, "");
1317                 bfq_mod_oid = oid;
1318
1319                 SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
1320                     OID_AUTO, "version", CTLFLAG_RD, version, 0, "bfq version");
1321                 helper_init_global();
1322
1323                 kprintf("BFQ scheduler policy version %d.%d loaded. sizeof(bfq_thread_io) = %zu\n",
1324                     dsched_bfq_version_maj, dsched_bfq_version_min, sizeof(struct bfq_thread_io));
1325                 break;
1326
1327         case MOD_UNLOAD:
1328                 if ((error = dsched_unregister(&dsched_bfq_policy)))
1329                         return (error);
1330                 sysctl_ctx_free(&sysctl_ctx);
1331                 kprintf("BFQ scheduler policy unloaded\n");
1332                 break;
1333
1334         default:
1335                 break;
1336         }
1337
1338         return 0;
1339 }
1340
1341 int
1342 bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS)
1343 {
1344         struct bfq_disk_ctx *bfq_diskctx = arg1;
1345         int as_switch, error;
1346
1347         as_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ? 1 : 0);
1348         error = sysctl_handle_int(oidp, &as_switch, 0, req);
1349         if (error || !req->newptr)
1350                 return error;
1351
1352         if (as_switch == 1)
1353                 bfq_diskctx->bfq_flag |= BFQ_FLAG_AS;
1354         else if (as_switch == 0)
1355                 bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AS);
1356         else
1357                 return 0;
1358
1359         return error;
1360 }
1361
1362 int
1363 bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS)
1364 {
1365         struct bfq_disk_ctx *bfq_diskctx = arg1;
1366         int auto_max_budget_switch, error;
1367         auto_max_budget_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) ? 1 : 0);
1368         error = sysctl_handle_int(oidp, &auto_max_budget_switch, 0, req);
1369         if (error || !req->newptr)
1370                 return error;
1371
1372         if (auto_max_budget_switch == 1)
1373                 bfq_diskctx->bfq_flag |= BFQ_FLAG_AUTO_MAX_BUDGET;
1374         else if (auto_max_budget_switch == 0)
1375                 bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AUTO_MAX_BUDGET);
1376         else
1377                 return 0;
1378
1379         return error;
1380 }
1381
/* Register the BFQ policy with the dsched framework, module version 1. */
DSCHED_POLICY_MODULE(dsched_bfq, bfq_mod_handler, 1);