kernel/dsched: Add a version parameter to the DSCHED_POLICY_MODULE macro.
[dragonfly.git] / sys / kern / dsched / bfq / bfq.c
/*
 * Copyright (c) 2011 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Brills Peng <brillsp@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * BFQ disk scheduler, the algorithm routines and the interfaces with the
 * dsched framework.
 */
#include <inttypes.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/diskslice.h>
#include <sys/disk.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/msgport.h>
#include <sys/msgport2.h>
#include <sys/buf2.h>
#include <sys/dsched.h>
#include <sys/fcntl.h>
#include <machine/varargs.h>

#include <kern/dsched/bfq/bfq.h>
#include <kern/dsched/bfq/bfq_helper_thread.h>

#define _DSCHED_BFQ_BFQ_C_
#include <kern/dsched/bfq/bfq_ktr.h>

/* Make sure our structs fit */
CTASSERT(sizeof(struct bfq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ);
CTASSERT(sizeof(struct bfq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ);


static dsched_prepare_t bfq_prepare;
static dsched_teardown_t bfq_teardown;
static dsched_cancel_t bfq_cancel_all;
static dsched_queue_t bfq_queue;
static dsched_new_tdio_t bfq_new_tdio;
static dsched_destroy_tdio_t bfq_destroy_tdio;
static dsched_bio_done_t bfq_bio_done;


static void bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
static int bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
static void bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason);
static void bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp);
static void bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio);
static void bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag);
static void bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv);


struct dsched_policy dsched_bfq_policy = {
	.name = "bfq",
	.prepare = bfq_prepare,
	.teardown = bfq_teardown,
	.cancel_all = bfq_cancel_all,
	.bio_queue = bfq_queue,
	.new_tdio = bfq_new_tdio,
	.destroy_tdio = bfq_destroy_tdio,
	.bio_done = bfq_bio_done,
	.polling_func = (void (*)(struct dsched_disk_ctx *))helper_msg_dequeue,
};


struct sysctl_oid *bfq_mod_oid;

struct dsched_bfq_stats bfq_stats;

static int dsched_bfq_version_maj = 1;
static int dsched_bfq_version_min = 0;
/*
 * bfq_prepare(): the .prepare callback of the bfq policy. Initialize
 * all fields in bfq_diskctx and initialize the corresponding helper
 * thread.
 *
 * lock: none
 * refcount: none
 *
 * Returns 0
 */
static int
bfq_prepare(struct dsched_disk_ctx *diskctx)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;

	BFQ_LOCKINIT(bfq_diskctx);

	bfq_diskctx->pending_dequeue = 0;

	wf2q_init(&bfq_diskctx->bfq_wf2q);

	callout_init_mp(&bfq_diskctx->bfq_callout);

	bfq_diskctx->bfq_blockon = NULL;
	bfq_diskctx->bfq_active_tdio = NULL;
	bfq_diskctx->bfq_remaining_budget = 0;

	bfq_diskctx->bfq_max_budget = BFQ_DEFAULT_MAX_BUDGET;
	bfq_diskctx->bfq_peak_rate_samples = 0;
	bfq_diskctx->bfq_peak_rate = 0;

#if 0
	bfq_diskctx->bfq_flag = BFQ_FLAG_AS | BFQ_FLAG_AUTO_MAX_BUDGET;
#endif
	bfq_diskctx->bfq_flag = BFQ_FLAG_AS;

	bfq_diskctx->bfq_as_miss = 0;
	bfq_diskctx->bfq_as_hit = 0;

	bfq_diskctx->bfq_as_avg_wait_miss = 0;
	bfq_diskctx->bfq_as_avg_wait_all = 0;
	bfq_diskctx->bfq_as_max_wait = 0;
	bfq_diskctx->bfq_as_max_wait2 = 0;
	bfq_diskctx->bfq_as_high_wait_count = 0;
	bfq_diskctx->bfq_as_high_wait_count2 = 0;

	bfq_diskctx->bfq_avg_time_slice = 0;
	bfq_diskctx->bfq_max_time_slice = 0;
	bfq_diskctx->bfq_high_time_slice_count = 0;

	/* initialize the helper thread */
	helper_init(bfq_diskctx);

	dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: initialized!\n");
	return 0;
}

/*
 * bfq_teardown(): the .teardown callback of the bfq policy. Send a kill
 * message to the helper thread and deallocate the resources used by
 * the helper thread (currently the objcache).
 *
 * XXX: may deadlock when the caller of bfq_teardown() and the
 * helper thread run on the same CPU.
 *
 * lock: none
 * refcount: none
 *
 */
static void
bfq_teardown(struct dsched_disk_ctx *diskctx)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
	KKASSERT(diskctx);

	helper_msg_kill(bfq_diskctx);

	tsleep(diskctx, 0, "teardn", hz * 3 / 2);

	helper_uninit(bfq_diskctx);
}

/*
 * bfq_cancel_all(): the .cancel_all callback of the bfq policy. Cancel
 * all bios queued in each bfq_thread_io structure in the
 * wf2q tree.
 *
 * lock:
 *	BFQ_LOCK: protects against the wf2q_insert operations in bfq_queue()
 *	and bfq_dequeue(), and the wf2q_get_next operation in bfq_dequeue()
 *	THREAD_IO_LOCK: protects against queue iteration in bfq_dequeue() and
 *	queue insertion in bfq_queue()
 *
 * refcount:
 *	unrefs the thread_io structures; they are referenced in queue(),
 *	when a bio is queued. The refcount may decrease to zero.
 *
 */
static void
bfq_cancel_all(struct dsched_disk_ctx *diskctx)
{
	struct bio *bio;
	struct bfq_thread_io *bfq_tdio;
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;

	BFQ_LOCK(bfq_diskctx);

	while ((bfq_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q))) {
		DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);
		KKASSERT(lockstatus(&bfq_tdio->head.lock, curthread) == LK_EXCLUSIVE);

		while ((bio = TAILQ_FIRST(&bfq_tdio->head.queue))) {
			bfq_tdio->head.qlength--;
			TAILQ_REMOVE(&bfq_tdio->head.queue, bio, link);
			dsched_cancel_bio(bio);
			dsched_thread_io_unref(&bfq_tdio->head);
		}

		KKASSERT(bfq_tdio->head.qlength == 0);
		DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);
	}

	BFQ_UNLOCK(bfq_diskctx);
}

/*
 * bfq_new_tdio(): the .new_tdio callback of the bfq policy. Initialize
 * the bfq_thread_io structure.
 *
 * lock: none
 * refcount: none
 */
static void
bfq_new_tdio(struct dsched_thread_io *tdio)
{
	struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *) tdio;

	/* the queue has to be initialized somewhere else */
	tdio->qlength = 0;

	tdio->debug_priv = 0xF00FF00F;

	bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
	bfq_tdio->weight = BFQ_DEFAULT_WEIGHT;

	bfq_tdio->tdio_as_switch = 1;
	bfq_tdio->maybe_timeout = 0;

	bfq_tdio->seek_samples = 0;
	bfq_tdio->seek_avg = 0;
	bfq_tdio->seek_total = 0;
	bfq_tdio->ttime_samples = 0;
	bfq_tdio->ttime_avg = 0;
	bfq_tdio->service_received = 0;
	bfq_tdio->bio_dispatched = 0;
	bfq_tdio->bio_completed = 0;

	KTR_LOG(dsched_bfq_thread_created, bfq_tdio);
}

/*
 * bfq_helper_destroy_tdio(): called after a thread_io struct is destroyed.
 * If the scheduler is anticipatorily waiting on the tdio being destroyed,
 * this function resumes the scheduler.
 *
 * lock:
 *	BFQ_LOCK: protects against bfq_timeout() nullifying
 *	bfq_diskctx->bfq_blockon / bfq_active_tdio
 *
 * refcount: none
 *
 * Calling path: bfq_destroy_tdio --lwkt_msg--> helper_thread --call--> me
 *
 */
void
bfq_helper_destroy_tdio(struct dsched_thread_io *tdio, struct bfq_disk_ctx *bfq_diskctx)
{
	KKASSERT(bfq_diskctx);

	BFQ_LOCK(bfq_diskctx);

	/*
	 * Test whether the scheduler is pending on the tdio to
	 * be destroyed.
	 */
	if (((struct dsched_thread_io *)bfq_diskctx->bfq_blockon == tdio) &&
	    callout_pending(&bfq_diskctx->bfq_callout)) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: pending on a thread being destroyed!\n");

		callout_stop(&bfq_diskctx->bfq_callout);

		bfq_diskctx->bfq_blockon = NULL;
		bfq_diskctx->bfq_active_tdio = NULL;

		BFQ_UNLOCK(bfq_diskctx);

		helper_msg_dequeue(bfq_diskctx);
		return;
	}
	BFQ_UNLOCK(bfq_diskctx);
}

/*
 * bfq_destroy_tdio(): the .destroy_tdio callback of the bfq policy
 *
 * Called immediately after a dsched_thread_io struct's refcount drops
 * to zero. This function records the seek_avg and ttime_avg of the
 * destroyed thread with the KTR facility.
 *
 * lock: none
 *
 * refcount: the tdio's refcount should be zero. It may be nuked, and
 * any read/write to the tdio is not safe by then.
 */
static void
bfq_destroy_tdio(struct dsched_thread_io *tdio)
{
	struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;

	/*
	 * do not log threads without I/O
	 */
	if (bfq_tdio->seek_samples != 0 || bfq_tdio->ttime_samples != 0) {
		KTR_LOG(dsched_bfq_thread_seek_avg, bfq_tdio, bfq_tdio->seek_avg);
		KTR_LOG(dsched_bfq_thread_ttime_avg, bfq_tdio, bfq_tdio->ttime_avg);
	}

	helper_msg_destroy_tdio((struct bfq_disk_ctx *)tdio->diskctx, tdio);
}

/*
 * bfq_bio_done(): the .bio_done callback of the bfq policy
 *
 * Called after a bio is done (by request_polling_biodone of dsched).
 * This function judges whether a thread has used up its time slice, and
 * if so, it sets the maybe_timeout flag in the bfq_tdio structure. Any
 * further action of that thread or the bfq scheduler will cause the
 * thread to be expired (in bfq_queue() or in bfq_dequeue()).
 *
 * This function requires the bfq_tdio pointer of the thread that pushes
 * bp to be stored by dsched_set_bio_priv() earlier. Currently it is
 * stored when bfq_queue() is called.
 *
 * lock: none. This function CANNOT be blocked by any lock
 *
 * refcount:
 *	the corresponding tdio's refcount should decrease by 1 after
 *	this function call. The counterpart increasing is in bfq_queue().
 *	For each bio pushed down, we increase the refcount of the pushing
 *	tdio.
 */
static void
bfq_bio_done(struct bio *bp)
{
	struct disk *dp = dsched_get_bio_dp(bp);
	struct bfq_thread_io *bfq_tdio = dsched_get_bio_priv(bp);
	struct bfq_disk_ctx *bfq_diskctx = dsched_get_disk_priv(dp);
	struct timeval tv;
	int ticks_expired;

	KKASSERT(bfq_tdio);

	dsched_thread_io_ref(&bfq_tdio->head);

	atomic_add_int(&bfq_tdio->bio_completed, 1);

	/* the tdio has already expired */
	if (bfq_tdio != bfq_diskctx->bfq_active_tdio)
		goto rtn;
	atomic_add_int(&bfq_tdio->service_received, BIO_SIZE(bp));

	/* current time */
	getmicrotime(&tv);
	bfq_tdio->last_request_done_time = tv;
	timevalsub(&tv, &bfq_tdio->service_start_time);
	ticks_expired = tvtohz_high(&tv);

	/* the thread has run out its time slice */
	if ((ticks_expired != 0x7fffffff) &&
	    (ticks_expired >= BFQ_SLICE_TIMEOUT)) {
		/*
		 * we cannot block here, so just set a flag
		 */
#if 0
		bfq_tdio->maybe_timeout = 1;
#endif
		if (atomic_cmpset_int(&bfq_tdio->maybe_timeout, 0, 1)) {
			bfq_update_avg_time_slice(bfq_diskctx, tv);
			dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p may time out\n", bfq_tdio);
		}
	}
rtn:
	dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in this function */
	dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in queue() */
}

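/*
 * Worked example for the time-slice check above (illustrative numbers,
 * assuming hz = 100 and a hypothetical BFQ_SLICE_TIMEOUT of 20 ticks):
 * if the tdio entered service 250 ms ago, tvtohz_high() yields 25 ticks,
 * 25 >= 20, so maybe_timeout is set and the next bfq_queue() or
 * bfq_dequeue() call expires the tdio with BFQ_REASON_TIMEOUT.
 */
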
/*
 * bfq_timeout(): called after the callout alarm strikes.
 *
 * This function getting called indicates that after waiting for
 * BFQ_T_WAIT / BFQ_T_WAIT_MIN ticks, the thread "active_tdio"
 * represents has not pushed any further bios. This tdio should
 * be expired with the reason BFQ_REASON_TOO_IDLE, but if the tdio
 * was marked as timed out (in bfq_biodone()) first, we expire it
 * for BFQ_REASON_TIMEOUT. The bfq scheduler should resume working
 * (and pick another thread to serve).
 *
 * It is possible that this function gets called a little after
 * the thread pushes a bio with bfq_queue(), and thus a "fake timeout"
 * happens. We treat it as if the callout had not struck, and continue
 * to serve the active_tdio.
 *
 * lock:
 *	BFQ_LOCK: protects bfq_diskctx->blockon and bfq_diskctx->active_tdio;
 *	they should only be changed in bfq_queue() or in this function,
 *	atomically.
 *	TDIO_LOCK: protects against dequeue() updating the budget via the
 *	maybe_timeout branch. (Not strictly necessary, because we already
 *	hold the BFQ_LOCK, and no one else could change the budget of the
 *	tdio.)
 *
 * refcount:
 *	the refcount of bfq_diskctx->bfq_active_tdio will decrease by one
 *	after this function. (The counterpart increasing is in bfq_dequeue(),
 *	before resetting the callout alarm.)
 *
 * AS timeout:
 *	during the waiting period, no bio is pushed by the tdio being
 *	waited on
 *
 * Calling path:
 *	callout facility --> helper_msg_timeout --lwkt_msg--> helper thread
 *	--> me
 */
void
bfq_timeout(void *p)
{
	/*
	 * waiting timed out:
	 * no deceptive idleness, so unblock dispatching
	 */
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)p;
	struct bfq_thread_io *bfq_tdio;

	BFQ_LOCK(bfq_diskctx);

	/*
	 * the timeout occurs after the thread
	 * pushed one more bio
	 */
	if (bfq_diskctx->bfq_blockon == NULL) {
		dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: fake AS timeout\n");
		goto rtn;
	}

	bfq_diskctx->bfq_as_miss++;

	KKASSERT(bfq_diskctx->bfq_active_tdio);
	bfq_tdio = bfq_diskctx->bfq_active_tdio;

	DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);

	bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL|BFQ_AS_STAT_ONLY_MISS);

	bfq_diskctx->bfq_blockon = NULL;
	bfq_diskctx->bfq_active_tdio = NULL;
	dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: unblocked %p\n", bfq_tdio);

	wf2q_update_vd(bfq_tdio, bfq_tdio->budget - bfq_diskctx->bfq_remaining_budget);
	/*
	 * the time slice expired before the AS timeout;
	 * this should be REASON_TIMEOUT
	 */
	if (bfq_tdio->maybe_timeout) {
		bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TIMEOUT);
		dsched_debug(BFQ_DEBUG_VERBOSE, "%p time out in timeout()\n", bfq_tdio);
	} else {
		bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TOO_IDLE);
		dsched_debug(BFQ_DEBUG_VERBOSE, "%p too idle\n", bfq_tdio);
	}

	DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);

	/* ref'ed in dequeue(), before resetting callout */
	dsched_thread_io_unref(&bfq_tdio->head);
rtn:
	BFQ_UNLOCK(bfq_diskctx);
	helper_msg_dequeue(bfq_diskctx);
}

/*
 * bfq_queue(): the .queue callback of the bfq policy.
 *
 * A thread calls this function to hand in its I/O requests (bios).
 * The bios are stored in the per-thread queue, in the tdio structure.
 * Currently, the sync/async bios are queued together, which may cause
 * some performance issues.
 *
 * Besides queueing bios, this function also calculates the average
 * thinking time and average seek distance of a thread, using the
 * information in the bio structure.
 *
 * If the calling thread is being waited on by the bfq scheduler due to
 * the AS feature, this function will cancel the callout alarm
 * and resume the scheduler so that it continues serving this thread.
 *
 * lock:
 *	THREAD_IO_LOCK: protects against queue iteration in bfq_dequeue()
 *	BFQ_LOCK: protects against other insertions/deletions in wf2q_augtree
 *	in bfq_queue() or bfq_dequeue().
 *
 * refcount:
 *	If the calling thread is being waited on by the scheduler, the
 *	refcount of the related tdio will decrease by 1 after this function.
 *	The counterpart increasing is in bfq_dequeue(), before resetting the
 *	callout alarm.
 *
 * Return value:
 *	EINVAL: if bio->bio_buf->b_cmd == BUF_CMD_FLUSH
 *	0: bio is queued successfully.
 */
static int
bfq_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
    struct bio *bio)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
	struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
	int original_qlength;

	/* we do not handle flush requests. push it down to dsched */
	if (__predict_false(bio->bio_buf->b_cmd == BUF_CMD_FLUSH))
		return (EINVAL);

	DSCHED_THREAD_IO_LOCK(tdio);
	KKASSERT(tdio->debug_priv == 0xF00FF00F);
	dsched_debug(BFQ_DEBUG_NORMAL, "bfq: tdio %p pushes bio %p\n", bfq_tdio, bio);

	dsched_set_bio_priv(bio, tdio);
	dsched_thread_io_ref(tdio);

	if ((bio->bio_buf->b_cmd == BUF_CMD_READ) ||
	    (bio->bio_buf->b_cmd == BUF_CMD_WRITE)) {
		bfq_update_tdio_seek_avg(bfq_tdio, bio);
	}

	bfq_update_tdio_ttime_avg(bfq_tdio);

	/* update last_bio_pushed_time */
	getmicrotime(&bfq_tdio->last_bio_pushed_time);

	if ((bfq_tdio->seek_samples > BFQ_VALID_MIN_SAMPLES) &&
	    BFQ_TDIO_SEEKY(bfq_tdio))
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p is seeky\n", bfq_tdio);

	/*
	 * If a tdio takes too long to think, we disable its AS feature.
	 */
	if ((bfq_tdio->ttime_samples > BFQ_VALID_MIN_SAMPLES) &&
	    (bfq_tdio->ttime_avg > BFQ_T_WAIT * (1000 / hz) * 1000) &&
	    (bfq_tdio->service_received > bfq_tdio->budget / 8)) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p takes too long to think\n", bfq_tdio);
		bfq_tdio->tdio_as_switch = 0;
	} else {
		bfq_tdio->tdio_as_switch = 1;
	}

	/* insert the bio into the tdio's own queue */
	KKASSERT(lockstatus(&tdio->lock, curthread) == LK_EXCLUSIVE);
	TAILQ_INSERT_TAIL(&tdio->queue, bio, link);
#if 0
	tdio->qlength++;
#endif
	original_qlength = atomic_fetchadd_int(&tdio->qlength, 1);
	DSCHED_THREAD_IO_UNLOCK(tdio);
	/*
	 * A new thread:
	 * In the dequeue function, we remove the thread
	 * from the aug-tree if it has no further bios.
	 * Therefore "new" means a really new thread (a
	 * newly created thread or a thread that pushed no more
	 * bios when the scheduler was waiting for it) or
	 * one that was removed from the aug-tree earlier.
	 */
	if (original_qlength == 0) {
		/*
		 * a really new thread
		 */
		BFQ_LOCK(bfq_diskctx);
		if (bfq_tdio != bfq_diskctx->bfq_active_tdio) {
			/* insert the tdio into the wf2q queue */
			wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, bfq_tdio);
		} else {
			/*
			 * the thread being waited on by the scheduler
			 */
			if (bfq_diskctx->bfq_blockon == bfq_tdio) {
				/*
				 * XXX: possible race condition here:
				 * if the callout function is triggered when
				 * the following code is executed, then after
				 * releasing the TDIO lock, the callout function
				 * will set the thread inactive and it will never
				 * be inserted into the aug-tree (so its bio pushed
				 * this time will not be dispatched) until it pushes
				 * further bios
				 */
				bfq_diskctx->bfq_as_hit++;
				bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL);

				if (callout_pending(&bfq_diskctx->bfq_callout))
					callout_stop(&bfq_diskctx->bfq_callout);
				bfq_diskctx->bfq_blockon = NULL;

				/* ref'ed in dequeue(), before resetting callout */
				dsched_thread_io_unref(&bfq_tdio->head);

				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p pushes a new bio when AS\n", bfq_tdio);
			}
		}

		BFQ_UNLOCK(bfq_diskctx);
	}

	helper_msg_dequeue(bfq_diskctx);

	return 0;
}

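/*
 * Unit check for the think-time test in bfq_queue() above (illustrative):
 * ttime_avg is kept in microseconds while BFQ_T_WAIT is in ticks, hence
 * the threshold BFQ_T_WAIT * (1000 / hz) ms/tick * 1000 us/ms. With
 * hz = 100 a tick is 10 ms, so a hypothetical BFQ_T_WAIT of 5 ticks
 * would give a 50000 us threshold.
 */
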
/*
 * bfq_dequeue(): dispatch bios to the disk driver.
 *
 * This function will push as many bios as the number of free slots
 * in the tag queue.
 *
 * In the progress of dispatching, the following events may happen:
 *  - Current thread times out: expire the current thread for
 *    BFQ_REASON_TIMEOUT, and select a new thread to serve from the
 *    wf2q tree.
 *
 *  - Current thread runs out of its budget: expire the current thread
 *    for BFQ_REASON_OUT_OF_BUDGET, and select a new thread to serve
 *
 *  - Current thread has no further bios in its queue: if the AS feature
 *    is turned on, the bfq scheduler sets an alarm and starts to suspend.
 *    The bfq_timeout() or bfq_queue() calls may resume the scheduler.
 *
 * Implementation note: The bios selected to be dispatched are first
 * stored in the array bio_to_dispatch. After this function releases
 * all the locks it holds, it calls dsched_strategy_request_polling()
 * for each bio stored.
 *
 * With the help of bfq_disk_ctx->pending_dequeue,
 * there will be only one bfq_dequeue pending on the BFQ_LOCK.
 *
 * lock:
 *	BFQ_LOCK: protects against wf2q_augtree operations in bfq_queue()
 *	THREAD_IO_LOCK: locks the active_tdio. Protects against queue
 *	insertions in bfq_queue; protects the active_tdio->budget
 *
 * refcount:
 *	If the scheduler decides to suspend, the refcount of active_tdio
 *	increases by 1. The counterpart decreasing is in bfq_queue() and
 *	bfq_timeout()
 * blocking:
 *	May block on the disk driver lock. It depends on the drivers.
 *
 * Calling path:
 *	The callers could be:
 *	bfq_queue(), bfq_timeout() and the registered polling function.
 *
 *	caller --> helper_msg_dequeue --lwkt_msg--> helper_thread --> me
 *
 */
void
bfq_dequeue(struct dsched_disk_ctx *diskctx)
{
	int free_slots,
	    bio_index = 0, i,
	    remaining_budget = 0;	/* remaining budget of current active process */

	struct bio *bio, *bio_to_dispatch[33];
	struct bfq_thread_io *active_tdio = NULL;
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;

	BFQ_LOCK(bfq_diskctx);
	atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 1, 0);

	/*
	 * The whole scheduler is waiting for further bios
	 * from the process currently being served
	 */
	if (bfq_diskctx->bfq_blockon != NULL)
		goto rtn;

	remaining_budget = bfq_diskctx->bfq_remaining_budget;
	active_tdio = bfq_diskctx->bfq_active_tdio;
	dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: dequeue: Im in. active_tdio = %p\n", active_tdio);

	free_slots = diskctx->max_tag_queue_depth - diskctx->current_tag_queue_depth;
	KKASSERT(free_slots >= 0 && free_slots <= 32);

	if (active_tdio)
		DSCHED_THREAD_IO_LOCK(&active_tdio->head);

	while (free_slots) {
		/* Here active_tdio must be locked! */
		if (active_tdio) {
			/*
			 * the bio_done function has marked the current
			 * tdio as timed out
			 */
			if (active_tdio->maybe_timeout) {
				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p time out in dequeue()\n", active_tdio);
				wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
				bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_TIMEOUT);

				/*
				 * there still exist bios not dispatched,
				 * reinsert the tdio into the aug-tree
				 */
				if (active_tdio->head.qlength > 0) {
					wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
					KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count);
				}

				active_tdio->maybe_timeout = 0;
				DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
				active_tdio = NULL;
				continue;
			}

			/* select the next bio to dispatch */
			/* TODO: a wiser selection */
			KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
			bio = TAILQ_FIRST(&active_tdio->head.queue);
			dsched_debug(BFQ_DEBUG_NORMAL, "bfq: the first bio in queue of active_tdio %p is %p\n", active_tdio, bio);

			dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p exists, remaining budget = %d, tdio budget = %d\n, qlength = %d, first bio = %p, first bio cmd = %d, first bio size = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength, bio, bio?bio->bio_buf->b_cmd:-1, bio?bio->bio_buf->b_bcount:-1);

			/*
			 * The bio is neither read nor write, just
			 * push it down.
			 */
			if (bio && (bio->bio_buf->b_cmd != BUF_CMD_READ) &&
			    (bio->bio_buf->b_cmd != BUF_CMD_WRITE)) {
				dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
				KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
				TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
				active_tdio->head.qlength--;
				free_slots--;

#if 0
				dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
#endif
				bio_to_dispatch[bio_index++] = bio;
				KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);
				continue;
			}
			/*
			 * Run out of budget.
			 * But this is not because the size of the bio is
			 * larger than the complete budget.
			 * If the size of the bio is larger than the complete
			 * budget, then use a complete budget to cover it.
			 */
			if (bio && (remaining_budget < BIO_SIZE(bio)) &&
			    (remaining_budget != active_tdio->budget)) {
				/* charge budget used */
				wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget);
				bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
				wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio);
				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: thread %p ran out of budget\n", active_tdio);
				DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
				active_tdio = NULL;
			} else { /* if (bio && remaining_budget < BIO_SIZE(bio) && remaining_budget != active_tdio->budget) */

				/*
				 * Having enough budget,
				 * or having a complete budget and the size
				 * of the bio is larger than that.
				 */
				if (bio) {
					/* dispatch */
					remaining_budget -= BIO_SIZE(bio);
					/*
					 * The size of the first bio is larger
					 * than the whole budget, we should
					 * charge the extra part
					 */
					if (remaining_budget < 0)
						wf2q_update_vd(active_tdio, -remaining_budget);
					/* compensate */
					wf2q_update_vd(active_tdio, -remaining_budget);
					/*
					 * remaining_budget may be < 0,
					 * but to prevent the budget of the
					 * current tdio from subtracting a
					 * negative number,
					 * the remaining_budget has to be >= 0
					 */
					remaining_budget = MAX(0, remaining_budget);
					dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio);
					KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE);
					TAILQ_REMOVE(&active_tdio->head.queue, bio, link);
					free_slots--;
					active_tdio->head.qlength--;
					active_tdio->bio_dispatched++;
					wf2q_inc_tot_service(&bfq_diskctx->bfq_wf2q, BIO_SIZE(bio));
					dsched_debug(BFQ_DEBUG_VERBOSE,
					    "BFQ: %p's bio dispatched, size=%d, remaining_budget = %d\n",
					    active_tdio, BIO_SIZE(bio), remaining_budget);
#if 0
					dsched_strategy_request_polling(diskctx->dp, bio, diskctx);
#endif
					bio_to_dispatch[bio_index++] = bio;
					KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth);

				} else { /* if (bio) */

					KKASSERT(active_tdio);
					/*
					 * If the AS feature is switched off,
					 * expire the tdio as well
					 */
					if ((remaining_budget <= 0) ||
					    !(bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ||
					    !active_tdio->tdio_as_switch) {
						active_tdio->budget -= remaining_budget;
						wf2q_update_vd(active_tdio, active_tdio->budget);
						bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET);
						DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
						active_tdio = NULL;
					} else {

						/* no further bio, wait for a while */
						bfq_diskctx->bfq_blockon = active_tdio;
						/*
						 * Increase the ref count to ensure that
						 * the tdio will not be destroyed during waiting.
						 */
						dsched_thread_io_ref(&active_tdio->head);
						/*
						 * If the tdio is seeky but not thinking for
						 * too long, we wait for it a little shorter
						 */
						if (active_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES && BFQ_TDIO_SEEKY(active_tdio))
							callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT_MIN, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);
						else
							callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx);

						/* save the start time of blocking */
						getmicrotime(&active_tdio->as_start_time);

						dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: blocked on %p, remaining_budget = %d\n", active_tdio, remaining_budget);
						DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);
						goto save_and_rtn;
					}
				}
			}
		} else { /* if (active_tdio) */
			/* there is no active tdio */

			/* no pending bios at all */
			active_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q);

			if (!active_tdio) {
				KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count == 0);
				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: no more eligible tdio!\n");
				goto save_and_rtn;
			}

			/*
			 * A new tdio is picked,
			 * initialize the service related statistic data
			 */
			DSCHED_THREAD_IO_LOCK(&active_tdio->head);
			active_tdio->service_received = 0;

			/*
			 * Reset the maybe_timeout flag, which
			 * may be set by a biodone after the service is done
			 */
			getmicrotime(&active_tdio->service_start_time);
			active_tdio->maybe_timeout = 0;

			remaining_budget = active_tdio->budget;
			dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p selected, remaining budget = %d, tdio budget = %d\n, qlength = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength);
		}
	} /* while (free_slots) */

	/* reach here only when free_slots == 0 */
	if (active_tdio) /* && lockcount(&active_tdio->head.lock) > 0) */
		DSCHED_THREAD_IO_UNLOCK(&active_tdio->head);

save_and_rtn:
	/* save the remaining budget */
	bfq_diskctx->bfq_remaining_budget = remaining_budget;
	bfq_diskctx->bfq_active_tdio = active_tdio;
rtn:
	BFQ_UNLOCK(bfq_diskctx);
	/* dispatch the planned bios */
	for (i = 0; i < bio_index; i++)
		dsched_strategy_request_polling(diskctx->dp, bio_to_dispatch[i], diskctx);
}

/*
 * bfq_slow_tdio(): decide whether a tdio is slow
 *
 * This function decides whether a tdio is slow by the speed
 * estimated since the start of the current time slice: if the
 * tdio is not fast enough to consume its budget (or 2/3 of
 * its budget) within the time slice, it is judged slow.
 *
 * Called by bfq_expire()
 *
 * lock:
 *	THREAD_IO_LOCK is expected to be held.
 * refcount:
 *	none
 *
 */
static int
bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
{
	/*
	 * A tdio is considered slow if it cannot finish its budget
	 * at its current average speed
	 */
	uint64_t usec_elapsed, service_received, speed;
	int expect;
	struct timeval tv = bfq_tdio->last_request_done_time;

	timevalsub(&tv, &bfq_tdio->service_start_time);
	usec_elapsed = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);

	/* discard absurd values */
	if (usec_elapsed < 20000)
		return 0;

	service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
	speed = service_received / usec_elapsed;
	expect = (speed * BFQ_SLICE_TIMEOUT * (1000 * 1000 / hz)) >> BFQ_FIXPOINT_SHIFT;

	if (expect < 0) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: overflow on calculating slow_tdio\n");
		return 0;
	}

	if (expect < bfq_tdio->budget * 2 / 3) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: %p is judged slow\n", bfq_tdio);
		return 1;
	}

	return 0;
}

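/*
 * Worked example for bfq_slow_tdio() (illustrative numbers): a tdio that
 * received 65536 bytes of service in 100000 us has
 * speed = (65536 << BFQ_FIXPOINT_SHIFT) / 100000. With hz = 100 and a
 * hypothetical BFQ_SLICE_TIMEOUT of 20 ticks (200000 us),
 * expect = (speed * 200000) >> BFQ_FIXPOINT_SHIFT = 131072, so the tdio
 * is judged slow only if 131072 < budget * 2 / 3.
 */
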
/*
 * bfq_expire(): expire a tdio for a given reason.
 *
 * A different amount of new budget is assigned to the expired
 * tdio depending on the following reasons:
 *
 * BFQ_REASON_TIMEOUT:
 *	The tdio does not use up its budget within BFQ_SLICE_TIMEOUT ticks.
 *	We shall update the disk peak rate if the tdio is not seeky. The new
 *	budget will be the budget it actually consumed during this time
 *	slice.
 *
 * BFQ_REASON_TOO_IDLE:
 *	The tdio pushed no further bios while the scheduler was
 *	suspending. To ensure low global latency, this tdio should be
 *	punished by assigning it the minimum budget. But if the tdio
 *	pushed no bio only because it was waiting for its dispatched
 *	bios to be done, we keep the budget unchanged.
 *
 * BFQ_REASON_OUT_OF_BUDGET:
 *	The tdio runs out of its budget within the time slice. It usually
 *	indicates that the tdio is doing well. We increase its budget.
 *
 * lock:
 *	THREAD_IO_LOCK is expected to be held.
 *	BFQ_LOCK is expected to be held (needed by bfq_update_peak_rate()).
 *
 * refcount: none
 *
 * Callers: bfq_timeout(), bfq_dequeue()
 *
 */
static void
bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason)
{
	int max_budget = bfq_diskctx->bfq_max_budget,
	    budget_left,
	    bio_in_flight,
	    service_received;

	service_received = bfq_tdio->service_received;
	budget_left = bfq_tdio->budget - bfq_tdio->service_received;

	if (budget_left < 0) {
		dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: budget underflow: %d, %d\n", bfq_tdio->budget, bfq_tdio->service_received);
		budget_left = 0;
	}

	KKASSERT(budget_left >= 0);

	switch (reason) {
	case BFQ_REASON_TIMEOUT:
		/*
		 * the tdio is not seeky, so we can update
		 * the disk peak rate based on the service received
		 * by the tdio
		 */
		if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
		    (!BFQ_TDIO_SEEKY(bfq_tdio)))
			bfq_update_peak_rate(bfq_diskctx, bfq_tdio);

		/* max_budget may be updated */
		max_budget = bfq_diskctx->bfq_max_budget;

		/* update budget to service_received */
		bfq_tdio->budget = MAX(service_received, BFQ_DEFAULT_MIN_BUDGET);

		break;

	case BFQ_REASON_TOO_IDLE:
		/*
		 * the tdio is too slow, charge the full budget
		 */
		if (bfq_slow_tdio(bfq_diskctx, bfq_tdio))
			wf2q_update_vd(bfq_tdio, budget_left);

		bio_in_flight = bfq_tdio->bio_dispatched - bfq_tdio->bio_completed;
		KKASSERT(bio_in_flight >= 0);
		/*
		 * maybe the tdio pushes no bio
		 * because it is waiting for some dispatched
		 * bios to be done; in this case
		 * we do not reduce the budget too harshly
		 */
		if (bio_in_flight > 0) {
			bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, service_received);
		} else {
#if 0
			bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, bfq_diskctx->bfq_max_budget / BFQ_MIN_BUDGET_FACTOR);
#endif
			bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET;
		}

		break;
	case BFQ_REASON_OUT_OF_BUDGET:

		if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) &&
		    (!BFQ_TDIO_SEEKY(bfq_tdio)))
			bfq_update_peak_rate(bfq_diskctx, bfq_tdio);

		/* increase the budget */
		if (bfq_tdio->budget < BFQ_BUDGET_MULTIPLE_THRESHOLD)
			bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget * 2);
		else
			bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget + BFQ_BUDG_INC_STEP);
		break;
	default:
		break;
	}
}

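/*
 * Budget-growth example for BFQ_REASON_OUT_OF_BUDGET (illustrative
 * numbers): a 64 KiB budget below BFQ_BUDGET_MULTIPLE_THRESHOLD doubles
 * to 128 KiB, while a budget already at or above the threshold only
 * grows linearly by BFQ_BUDG_INC_STEP; both paths are clamped to
 * max_budget.
 */
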
/*
 * bfq_update_peak_rate(): update the peak disk speed by sampling
 * the throughput within a time slice.
 *
 * lock:
 *	BFQ_LOCK is expected to be held
 *
 * refcount:
 *	none
 *
 * Caller: bfq_expire()
 */
static void
bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
{
	struct timeval tv = bfq_tdio->last_request_done_time;
	uint64_t usec, service_received, peak_rate;

	timevalsub(&tv, &bfq_tdio->service_start_time);
	usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);

	/* discard absurd values */
	if (usec < 2000 || usec > (BFQ_SLICE_TIMEOUT * (1000 / hz) * 1000)) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: absurd interval for peak rate\n");
		return;
	}

	service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
	peak_rate = service_received / usec;
	bfq_diskctx->bfq_peak_rate = (peak_rate + 7 * bfq_diskctx->bfq_peak_rate) / 8;
	bfq_diskctx->bfq_peak_rate_samples++;

	/* update the max_budget according to the peak rate */
	if (bfq_diskctx->bfq_peak_rate_samples > BFQ_VALID_MIN_SAMPLES) {
		bfq_diskctx->bfq_peak_rate_samples = BFQ_VALID_MIN_SAMPLES;
		/*
		 * if the auto max budget adjustment is disabled,
		 * bfq_max_budget will always be BFQ_DEFAULT_MAX_BUDGET
		 */
		if (bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) {
			bfq_diskctx->bfq_max_budget =
			    (uint32_t)((BFQ_SLICE_TIMEOUT * (1000 / hz) * bfq_diskctx->bfq_peak_rate * 1000) >> BFQ_FIXPOINT_SHIFT);
			dsched_debug(BFQ_DEBUG_NORMAL, "max budget updated to %d\n", bfq_diskctx->bfq_max_budget);
		}
	}
}

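/*
 * The peak-rate estimate above is an exponentially weighted moving
 * average with weight 1/8: new = (sample + 7 * old) / 8. Worked example
 * (illustrative numbers): old = 800, sample = 1600 gives
 * (1600 + 7 * 800) / 8 = 900.
 */
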
/*
 * bfq_update_tdio_seek_avg(): update the average seek distance of a
 * tdio.
 *
 * lock:
 *	THREAD_IO_LOCK is expected to be held.
 *
 * refcount:
 *	none
 *
 * Caller: bfq_queue()
 */
static void
bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp)
{
	off_t seek;

	/*
	 * On the first bio the tdio dispatches,
	 * we do not calculate the seek_avg,
	 * just update the last_seek_end
	 */
	if (bfq_tdio->seek_samples == 0) {
		++bfq_tdio->seek_samples;
		goto rtn;
	}

	seek = ABS(bp->bio_offset - bfq_tdio->last_seek_end);

	/*
	 * we do not do seek_samples++,
	 * because seek_total may overflow if seek_total += seek
	 */
	bfq_tdio->seek_samples = (7 * bfq_tdio->seek_samples + 256) / 8;
	bfq_tdio->seek_total = (7 * bfq_tdio->seek_total + 256 * seek) / 8;
	bfq_tdio->seek_avg = (bfq_tdio->seek_total + bfq_tdio->seek_samples / 2) / bfq_tdio->seek_samples;

	dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: tdio %p seek_avg updated to %" PRIu64 "\n", bfq_tdio, bfq_tdio->seek_avg);

rtn:
	bfq_tdio->last_seek_end = bp->bio_offset + BIO_SIZE(bp);
}

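/*
 * Note on the decayed counters above: seek_samples follows
 * s' = (7 * s + 256) / 8, whose fixed point is 256, so starting from
 * zero it climbs 32, 60, 84, ... and saturates at 256; seek_avg thus
 * remains the consistently scaled quotient seek_total / seek_samples.
 */
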
/*
 * bfq_update_tdio_ttime_avg(): update the average thinking time
 * of a tdio.
 *
 * The thinking time is used to switch the tdio's AS feature on / off
 *
 * lock:
 *	THREAD_IO_LOCK is expected to be held.
 *
 * refcount:
 *	none
 *
 * Caller:
 *	bfq_queue()
 *
 */
static void
bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio)
{
	struct timeval tv, after_start;
	uint64_t usec;

	if (bfq_tdio->ttime_samples == 0) {
		++bfq_tdio->ttime_samples;
		return;
	}

	getmicrotime(&tv);
	after_start = bfq_tdio->last_request_done_time;

#if 0
	timevalsub(&tv, &bfq_tdio->last_request_done_time);
#endif
	/*
	 * Use the interval between two bio pushes,
	 * instead of the one between last_request_done_time and
	 * the current time.
	 */
	timevalsub(&tv, &bfq_tdio->last_bio_pushed_time);

	timevalsub(&after_start, &bfq_tdio->service_start_time);

	/*
	 * tv.tv_sec < 0 means the last request done time is
	 * after the current time;
	 * this may happen because the biodone function is not blocked.
	 *
	 * after_start.tv_sec < 0 means that the last bio done happened
	 * before the current service slice, and we should drop this value.
	 */
	if (tv.tv_sec < 0 || after_start.tv_sec < 0)
		return;

	usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);

	bfq_tdio->ttime_samples = (7 * bfq_tdio->ttime_samples + 256) / 8;
	bfq_tdio->ttime_total = (7 * bfq_tdio->ttime_total + 256 * usec) / 8;
	bfq_tdio->ttime_avg = (bfq_tdio->ttime_total + 128) / bfq_tdio->ttime_samples;
}

/*
 * This function will also update the bfq_max_time_slice field
 *
 * tv: the timeval structure representing the length of the time slice
 */
static void
bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv)
{
	uint32_t msec;

	msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10);

	if (msec > 3 * BFQ_SLICE_TIMEOUT * (1000 / hz))
		atomic_add_int(&bfq_diskctx->bfq_high_time_slice_count, 1);

	bfq_diskctx->bfq_avg_time_slice =
	    (7 * bfq_diskctx->bfq_avg_time_slice + msec) / 8;

	if (bfq_diskctx->bfq_max_time_slice < msec)
		bfq_diskctx->bfq_max_time_slice = msec;
}
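
/*
 * The ">> 10" above divides by 1024 as a cheap approximation of dividing
 * by 1000 to convert microseconds to "msec", e.g. 1000000 us >> 10 = 976
 * rather than 1000, about 2.4% low.
 */
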
/*
 * This function will also update the bfq_as_max_wait field
 * flag: BFQ_AS_STAT_ALL, BFQ_AS_STAT_ONLY_MISS
 *
 */
static void
bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag)
{
	struct timeval tv;
	uint32_t msec;
	getmicrotime(&tv);
	timevalsub(&tv, &bfq_tdio->as_start_time);

	/* approximately divide by 1000 via a right shift by 10 */
	msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10);

	/* ridiculous value */
	if (msec > 10000) {
		dsched_debug(BFQ_DEBUG_NORMAL, "bfq: ridiculous as wait time!\n");
		return;
	}

	if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz))
		atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count, 1);

	if (flag & BFQ_AS_STAT_ALL) {
		bfq_diskctx->bfq_as_avg_wait_all =
		    (7 * bfq_diskctx->bfq_as_avg_wait_all + msec) / 8;
	}

	if (flag & BFQ_AS_STAT_ONLY_MISS) {
		bfq_diskctx->bfq_as_avg_wait_miss =
		    (7 * bfq_diskctx->bfq_as_avg_wait_miss + msec) / 8;
	}

	/* update the maximum waiting time */
	if (bfq_diskctx->bfq_as_max_wait < msec)
		bfq_diskctx->bfq_as_max_wait = msec;

	return;
}

static int
bfq_mod_handler(module_t mod, int type, void *unused)
{
	static struct sysctl_ctx_list sysctl_ctx;
	static struct sysctl_oid *oid;
	static char version[16];
	int error;

	ksnprintf(version, sizeof(version), "%d.%d",
	    dsched_bfq_version_maj, dsched_bfq_version_min);

	switch (type) {
	case MOD_LOAD:
		bzero(&bfq_stats, sizeof(struct dsched_bfq_stats));
		if ((error = dsched_register(&dsched_bfq_policy)))
			return (error);

		sysctl_ctx_init(&sysctl_ctx);
		oid = SYSCTL_ADD_NODE(&sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_dsched),
		    OID_AUTO,
		    "bfq",
		    CTLFLAG_RD, 0, "");
		bfq_mod_oid = oid;

		SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
		    OID_AUTO, "version", CTLFLAG_RD, version, 0, "bfq version");
		helper_init_global();

		kprintf("BFQ scheduler policy version %d.%d loaded. sizeof(bfq_thread_io) = %zu\n",
		    dsched_bfq_version_maj, dsched_bfq_version_min, sizeof(struct bfq_thread_io));
		break;

	case MOD_UNLOAD:
		if ((error = dsched_unregister(&dsched_bfq_policy)))
			return (error);
		sysctl_ctx_free(&sysctl_ctx);
		kprintf("BFQ scheduler policy unloaded\n");
		break;

	default:
		break;
	}

	return 0;
}

int
bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS)
{
	struct bfq_disk_ctx *bfq_diskctx = arg1;
	int as_switch, error;

	as_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ? 1 : 0);
	error = sysctl_handle_int(oidp, &as_switch, 0, req);
	if (error || !req->newptr)
		return error;

	if (as_switch == 1)
		bfq_diskctx->bfq_flag |= BFQ_FLAG_AS;
	else if (as_switch == 0)
		bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AS);
	else
		return 0;

	return error;
}

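/*
 * Sketch of how the handler above could be attached as a per-disk knob
 * (hypothetical sysctl context and parent oid; the actual hookup lives
 * outside this file):
 *
 *	SYSCTL_ADD_PROC(&ctx, SYSCTL_CHILDREN(parent), OID_AUTO, "as",
 *	    CTLTYPE_INT | CTLFLAG_RW, bfq_diskctx, 0,
 *	    bfq_sysctl_as_switch_handler, "I", "AS feature switch");
 *
 * Writing 1 sets BFQ_FLAG_AS, writing 0 clears it; any other value
 * leaves the flag untouched.
 */
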
int
bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS)
{
	struct bfq_disk_ctx *bfq_diskctx = arg1;
	int auto_max_budget_switch, error;

	auto_max_budget_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) ? 1 : 0);
	error = sysctl_handle_int(oidp, &auto_max_budget_switch, 0, req);
	if (error || !req->newptr)
		return error;

	if (auto_max_budget_switch == 1)
		bfq_diskctx->bfq_flag |= BFQ_FLAG_AUTO_MAX_BUDGET;
	else if (auto_max_budget_switch == 0)
		bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AUTO_MAX_BUDGET);
	else
		return 0;

	return error;
}

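/*
 * DSCHED_POLICY_MODULE(name, evh, version) ties the policy into the
 * module system; the trailing "1" is the module version parameter this
 * commit introduces, distinct from the dsched_bfq_version_maj/min pair
 * reported via the sysctl above.
 */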
DSCHED_POLICY_MODULE(dsched_bfq, bfq_mod_handler, 1);