Commit | Line | Data |
---|---|---|
aabeb187 BP |
1 | /* |
2 | * Copyright (c) 2011 The DragonFly Project. All rights reserved. | |
3 | * | |
4 | * This code is derived from software contributed to The DragonFly Project | |
5 | * by Brills Peng <brillsp@gmail.com> | |
6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | |
10 | * | |
11 | * 1. Redistributions of source code must retain the above copyright | |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * 2. Redistributions in binary form must reproduce the above copyright | |
14 | * notice, this list of conditions and the following disclaimer in | |
15 | * the documentation and/or other materials provided with the | |
16 | * distribution. | |
17 | * 3. Neither the name of The DragonFly Project nor the names of its | |
18 | * contributors may be used to endorse or promote products derived | |
19 | * from this software without specific, prior written permission. | |
20 | * | |
21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
32 | * SUCH DAMAGE. | |
33 | */ | |
34 | ||
35 | ||
36 | /* | |
37 | * BFQ disk scheduler, the algorithm routines and the interfaces with the | |
38 | * dsched framework. | |
39 | * | |
40 | */ | |
41 | ||
aabeb187 BP |
42 | #include <sys/systm.h> |
43 | #include <sys/kernel.h> | |
44 | #include <sys/proc.h> | |
45 | #include <sys/sysctl.h> | |
46 | #include <sys/buf.h> | |
47 | #include <sys/conf.h> | |
48 | #include <sys/diskslice.h> | |
49 | #include <sys/disk.h> | |
50 | #include <sys/malloc.h> | |
51 | #include <machine/md_var.h> | |
52 | #include <sys/ctype.h> | |
53 | #include <sys/syslog.h> | |
54 | #include <sys/device.h> | |
55 | #include <sys/msgport.h> | |
56 | #include <sys/msgport2.h> | |
57 | #include <sys/buf2.h> | |
58 | #include <sys/dsched.h> | |
59 | #include <sys/fcntl.h> | |
cff97b06 | 60 | #include <machine/inttypes.h> |
aabeb187 BP |
61 | #include <machine/varargs.h> |
62 | ||
63 | #include <kern/dsched/bfq/bfq.h> | |
64 | #include <kern/dsched/bfq/bfq_helper_thread.h> | |
65 | ||
66 | #define _DSCHED_BFQ_BFQ_C_ | |
67 | #include <kern/dsched/bfq/bfq_ktr.h> | |
68 | ||
/* Make sure our structs fit in the opaque storage dsched reserves for them */
CTASSERT(sizeof(struct bfq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ);
CTASSERT(sizeof(struct bfq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ);


/* dsched framework callbacks, implemented below */
static dsched_prepare_t bfq_prepare;
static dsched_teardown_t bfq_teardown;
static dsched_cancel_t bfq_cancel_all;
static dsched_queue_t bfq_queue;
static dsched_new_tdio_t bfq_new_tdio;
static dsched_destroy_tdio_t bfq_destroy_tdio;
static dsched_bio_done_t bfq_bio_done;


/* Internal helpers; their definitions appear later in this file */
static void bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
static int bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio);
static void bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason);
static void bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp);
static void bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio);
static void bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag);
static void bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv);



/*
 * Policy descriptor registered with the dsched framework.  It maps the
 * generic dsched entry points to the BFQ implementations in this file;
 * dequeueing is routed through the helper thread (helper_msg_dequeue).
 */
struct dsched_policy dsched_bfq_policy = {
	.name = "bfq",
	.prepare = bfq_prepare,
	.teardown = bfq_teardown,
	.cancel_all = bfq_cancel_all,
	.bio_queue = bfq_queue,
	.new_tdio = bfq_new_tdio,
	.destroy_tdio = bfq_destroy_tdio,
	.bio_done = bfq_bio_done,
	.polling_func = (void (*)(struct dsched_disk_ctx *))helper_msg_dequeue,
};


/* sysctl node under which the BFQ knobs/statistics are exported */
struct sysctl_oid *bfq_mod_oid;

/* global BFQ statistics, exported via sysctl */
struct dsched_bfq_stats bfq_stats;

static int dsched_bfq_version_maj = 1;
static int dsched_bfq_version_min = 0;

/*
 * bfq_prepare(): the .prepare callback of the bfq policy. Initialize
 * all fields in bfq_diskctx and initialize the corresponding helper
 * thread.
 *
 * lock: none
 * refcount: none
 *
 * Returns 0
 */
static int
bfq_prepare(struct dsched_disk_ctx *diskctx)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;

	BFQ_LOCKINIT(bfq_diskctx);

	bfq_diskctx->pending_dequeue = 0;

	/* the wf2q augmented tree that orders the per-thread queues */
	wf2q_init(&bfq_diskctx->bfq_wf2q);

	/* callout used to bound the anticipatory (AS) wait */
	callout_init_mp(&bfq_diskctx->bfq_callout);

	/* no thread is being anticipated or served yet */
	bfq_diskctx->bfq_blockon = NULL;
	bfq_diskctx->bfq_active_tdio = NULL;
	bfq_diskctx->bfq_remaining_budget = 0;

	/* peak-rate estimation state, used for auto max-budget tuning */
	bfq_diskctx->bfq_max_budget = BFQ_DEFAULT_MAX_BUDGET;
	bfq_diskctx->bfq_peak_rate_samples = 0;
	bfq_diskctx->bfq_peak_rate = 0;

#if 0
	bfq_diskctx->bfq_flag = BFQ_FLAG_AS | BFQ_FLAG_AUTO_MAX_BUDGET;
#endif
	/* AS on, automatic max-budget tuning off (see disabled line above) */
	bfq_diskctx->bfq_flag = BFQ_FLAG_AS;

	/* AS (anticipatory scheduling) hit/miss statistics */
	bfq_diskctx->bfq_as_miss = 0;
	bfq_diskctx->bfq_as_hit = 0;

	bfq_diskctx->bfq_as_avg_wait_miss = 0;
	bfq_diskctx->bfq_as_avg_wait_all = 0;
	bfq_diskctx->bfq_as_max_wait = 0;
	bfq_diskctx->bfq_as_max_wait2 = 0;
	bfq_diskctx->bfq_as_high_wait_count = 0;
	bfq_diskctx->bfq_as_high_wait_count2 = 0;

	/* time-slice statistics */
	bfq_diskctx->bfq_avg_time_slice = 0;
	bfq_diskctx->bfq_max_time_slice = 0;
	bfq_diskctx->bfq_high_time_slice_count = 0;

	/* initialize the helper thread */
	helper_init(bfq_diskctx);

	dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: initialized!\n");
	return 0;
}
169 | ||
/*
 * bfq_teardown(): .teardown callback of the bfq policy. Send message
 * of killing to the helper thread and deallocate resources used by
 * the helper thread (currently the objcache)
 *
 * XXX: deadlock causing when the caller of bfq_teardown() and the
 * helper thread are on the same CPU.
 *
 * lock: none
 * refcount: none
 *
 */

static void
bfq_teardown(struct dsched_disk_ctx *diskctx)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
	KKASSERT(diskctx);

	/* ask the helper thread to terminate itself */
	helper_msg_kill(bfq_diskctx);

	/*
	 * Give the helper thread up to 1.5 s to process the kill message
	 * before tearing its resources down; tsleep() returns early if
	 * anything wakes us on diskctx.
	 */
	tsleep(diskctx, 0, "teardn", hz * 3 / 2);

	helper_uninit(bfq_diskctx);
}
195 | ||
/*
 * bfq_cancel_all(): .cancel_all callback of the bfq policy. Cancel
 * all bios that queue in each bfq_thread_io structure in the
 * wf2q tree.
 *
 * lock:
 *	BFQ_LOCK: protect from wf2q_insert operation in bfq_queue() and
 *	bfq_dequeue(); wf2q_get_next operation in bfq_dequeue()
 *	THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue() and
 *	queue insertion in bfq_queue()
 *
 * refcount:
 *	unref thread_io structures; they are referenced in queue(),
 *	when a bio is queued. The refcount may decrease to zero.
 *
 */
static void
bfq_cancel_all(struct dsched_disk_ctx *diskctx)
{
	struct bio *bio;
	struct bfq_thread_io *bfq_tdio;
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;

	BFQ_LOCK(bfq_diskctx);

	/* drain the wf2q tree: pop one tdio at a time and empty its queue */
	while ((bfq_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q))) {
		DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);
		KKASSERT(lockstatus(&bfq_tdio->head.lock, curthread) == LK_EXCLUSIVE);

		while ((bio = TAILQ_FIRST(&bfq_tdio->head.queue))) {
			bfq_tdio->head.qlength--;
			TAILQ_REMOVE(&bfq_tdio->head.queue, bio, link);
			dsched_cancel_bio(bio);
			/* drop the per-bio reference taken in bfq_queue() */
			dsched_thread_io_unref(&bfq_tdio->head);
		}

		KKASSERT(bfq_tdio->head.qlength == 0);
		DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);
	}

	BFQ_UNLOCK(bfq_diskctx);
}
238 | ||
239 | /* | |
240 | * bfq_new_tdio(): .new_tdio callback of the bfq policy. Initialize | |
241 | * the bfq_thread_io structure. | |
242 | * | |
243 | * lock: none | |
244 | * refcount: none | |
245 | */ | |
246 | static void | |
247 | bfq_new_tdio(struct dsched_thread_io *tdio) | |
248 | { | |
249 | struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *) tdio; | |
250 | ||
251 | /* the queue has to be initialized some where else */ | |
252 | tdio->qlength = 0; | |
253 | ||
254 | tdio->debug_priv = 0xF00FF00F; | |
255 | ||
256 | bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET; | |
257 | bfq_tdio->weight = BFQ_DEFAULT_WEIGHT; | |
258 | ||
259 | bfq_tdio->tdio_as_switch = 1; | |
260 | bfq_tdio->maybe_timeout = 0; | |
261 | ||
262 | bfq_tdio->seek_samples = 0; | |
263 | bfq_tdio->seek_avg = 0; | |
264 | bfq_tdio->seek_total = 0; | |
265 | bfq_tdio->ttime_samples = 0; | |
266 | bfq_tdio->ttime_avg = 0; | |
267 | bfq_tdio->service_received = 0; | |
268 | bfq_tdio->bio_dispatched = 0; | |
269 | bfq_tdio->bio_completed = 0; | |
270 | ||
271 | KTR_LOG(dsched_bfq_thread_created, bfq_tdio); | |
272 | } | |
273 | ||
/*
 * bfq_helper_destroy_tdio(): called after a thread_io struct is destroyed.
 * if the scheduler is AS waiting for a destroyed tdio, this function resumes
 * the scheduler.
 *
 * lock:
 *	BFQ_LOCK: protect from nullify bfq_diskctx->bfq_blockon/bfq_active_tdio
 *	in bfq_timeout()
 *
 * refcount: none
 *
 * Calling path: bfq_destroy_tdio --lwkt_msg--> helper_thread --call--> me
 *
 */
void
bfq_helper_destroy_tdio(struct dsched_thread_io *tdio, struct bfq_disk_ctx *bfq_diskctx)
{
	KKASSERT(bfq_diskctx);

	BFQ_LOCK(bfq_diskctx);

	/*
	 * Test whether the scheduler is pending on the tdio to
	 * be destroyed.
	 */
	if (((struct dsched_thread_io *)bfq_diskctx->bfq_blockon == tdio) &&
	    callout_pending(&bfq_diskctx->bfq_callout)) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: pending on a being destroyed thread!\n");

		/* cancel the AS alarm; the waited-on thread no longer exists */
		callout_stop(&bfq_diskctx->bfq_callout);

		bfq_diskctx->bfq_blockon = NULL;
		bfq_diskctx->bfq_active_tdio = NULL;

		BFQ_UNLOCK(bfq_diskctx);

		/* resume dispatching with the next thread in the wf2q tree */
		helper_msg_dequeue(bfq_diskctx);
		return;
	}
	BFQ_UNLOCK(bfq_diskctx);

}
316 | ||
317 | /* | |
318 | * bfq_destroy_tdio(): .destroy_tdio callback of the bfq policy | |
319 | * | |
320 | * Called immediate after a dsched_thread_io struct's refcount decreases | |
321 | * to zero. This function will record the seek_avg and ttime_avg of the | |
322 | * destroyed thread with the KTR facility. | |
323 | * | |
324 | * lock: none | |
325 | * | |
326 | * refcount: the tdio's refcount should be zero. It may be nuked, and | |
327 | * any read/write to the tdio is not safe by then. | |
328 | */ | |
329 | static void | |
330 | bfq_destroy_tdio(struct dsched_thread_io *tdio) | |
331 | { | |
332 | struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio; | |
333 | ||
334 | /* | |
335 | * do not log threads without I/O | |
336 | */ | |
337 | if (bfq_tdio->seek_samples != 0 || bfq_tdio->ttime_samples != 0) { | |
338 | KTR_LOG(dsched_bfq_thread_seek_avg, bfq_tdio, bfq_tdio->seek_avg ); | |
339 | KTR_LOG(dsched_bfq_thread_ttime_avg, bfq_tdio, bfq_tdio->ttime_avg); | |
340 | } | |
341 | ||
342 | helper_msg_destroy_tdio((struct bfq_disk_ctx *)tdio->diskctx, tdio); | |
343 | } | |
344 | ||
/*
 * bfq_bio_done(): .bio_done callback of the bfq policy
 *
 * Called after a bio is done, (by request_polling_biodone of dsched).
 * This function judges whether a thread consumes up its time slice, and
 * if so, it will set the maybe_timeout flag in bfq_tdio structure. Any
 * further action of that thread or the bfq scheduler will cause the
 * thread to be expired. (in bfq_queue() or in bfq_dequeue())
 *
 * This function requires the bfq_tdio pointer of the thread that pushes
 * bp to be stored by dsched_set_bio_priv() earlier. Currently it is
 * stored when bfq_queue() is called.
 *
 * lock: none. This function CANNOT be blocked by any lock
 *
 * refcount:
 *	the corresponding tdio's refcount should decrease by 1 after
 *	this function call. The counterpart increasing is in bfq_queue().
 *	For each bio pushed down, we increase the refcount of the pushing
 *	tdio.
 */
static void
bfq_bio_done(struct bio *bp)
{
	struct disk *dp = dsched_get_bio_dp(bp);
	struct bfq_thread_io *bfq_tdio = dsched_get_bio_priv(bp);
	struct bfq_disk_ctx *bfq_diskctx = dsched_get_disk_priv(dp);
	struct timeval tv;
	int ticks_expired;

	KKASSERT(bfq_tdio);

	/* temporary self-reference, keeps the tdio alive while we touch it */
	dsched_thread_io_ref(&bfq_tdio->head);

	atomic_add_int(&bfq_tdio->bio_completed, 1);

	/* the tdio has already expired */
	if (bfq_tdio != bfq_diskctx->bfq_active_tdio)
		goto rtn;
	atomic_add_int(&bfq_tdio->service_received, BIO_SIZE(bp));

	/* current time */
	getmicrotime(&tv);
	bfq_tdio->last_request_done_time = tv;
	/* tv now holds the elapsed service time of this tdio */
	timevalsub (&tv, &bfq_tdio->service_start_time);
	ticks_expired = tvtohz_high(&tv);

	/*
	 * the thread has run out its time slice
	 * (0x7fffffff is presumably tvtohz_high()'s saturation value on
	 * overflow — TODO confirm against tvtohz_high() implementation)
	 */
	if ((ticks_expired != 0x7fffffff) &&
	    (ticks_expired >= BFQ_SLICE_TIMEOUT)) {
		/*
		 * we cannot block here, so just set a flag
		 */
#if 0
		bfq_tdio->maybe_timeout = 1;
#endif
		/* cmpset ensures only the first completion records the slice */
		if (atomic_cmpset_int(&bfq_tdio->maybe_timeout, 0, 1)) {
			bfq_update_avg_time_slice(bfq_diskctx, tv);
			dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p may time out\n", bfq_tdio);
		}
	}
rtn:
	dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in this function */
	dsched_thread_io_unref(&bfq_tdio->head); /* ref'ed in queue() */

}
411 | ||
/*
 * bfq_timeout(): called after the callout alarm strikes.
 *
 * This function getting called indicates that after waiting for
 * BFQ_T_WAIT / BFQ_T_WAIT_MIN ticks, the thread "active_tdio"
 * represents does not push any further bios. This tdio should
 * be expired with the reason BFQ_REASON_TOO_IDLE, but if the tdio
 * is marked as timeout (in bfq_biodone()) first, we expire it
 * for BFQ_REASON_TIMEOUT. The bfq scheduler should resume working
 * (and pick another thread to serve).
 *
 * It is possible that this function gets called a little after
 * the thread pushes a bio with bfq_queue(), and thus a "fake timeout"
 * happens. We treat it as the callout does not strike, and continue
 * to serve the active_tdio.
 *
 * lock:
 *	BFQ_LOCK: protect bfq_diskctx->blockon and bfq_diskctx->active_tdio
 *	they should either changed in bfq_queue() or in this function,
 *	atomically.
 *	TDIO_LOCK: protect from dequeue() updating the budget by the
 *	maybe_timeout branch. (Not necessary, because we already hold the
 *	BFQ_LOCK, and no one else could change the budget of the tdio)
 *
 * refcount:
 *	the refcount of bfq_diskctx->bfq_active_tdio will decrease one
 *	after this function. (The counterpart increasing is in bfq_dequeue(),
 *	before resetting the callout alarm.)
 *
 * AS timeout:
 *	during the waiting period, no bio is pushed by the being
 *	waited tdio
 *
 * Calling path:
 *	callout facility --> helper_msg_timeout --lwkt_msg--> helper thread
 *	--> me
 */
void
bfq_timeout(void *p)
{
	/* waiting time out:
	 * no deceptive idleness, and unblock dispatching
	 */
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)p;
	struct bfq_thread_io *bfq_tdio;

	BFQ_LOCK(bfq_diskctx);

	/*
	 * the timeout occurs after the thread
	 * pushing one more bio
	 */
	if (bfq_diskctx->bfq_blockon == NULL) {
		dsched_debug(BFQ_DEBUG_VERBOSE , "BFQ: fake AS timeout \n");
		goto rtn;
	}

	bfq_diskctx->bfq_as_miss++;

	KKASSERT(bfq_diskctx->bfq_active_tdio);
	bfq_tdio = bfq_diskctx->bfq_active_tdio;

	DSCHED_THREAD_IO_LOCK(&bfq_tdio->head);

	bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL|BFQ_AS_STAT_ONLY_MISS);

	/* stop anticipating; the scheduler is free to pick a new thread */
	bfq_diskctx->bfq_blockon = NULL;
	bfq_diskctx->bfq_active_tdio = NULL;
	dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: unblocked %p\n", bfq_tdio);

	/* charge the tdio's virtual time for the budget it actually used */
	wf2q_update_vd(bfq_tdio, bfq_tdio->budget - bfq_diskctx->bfq_remaining_budget);
	/*
	 * the time slice expired before as timeout
	 * this should be REASON_TIMEOUT
	 */
	if (bfq_tdio->maybe_timeout) {
		bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TIMEOUT);
		dsched_debug(BFQ_DEBUG_VERBOSE, "%p time out in timeout()\n", bfq_tdio);
	} else {
		bfq_expire(bfq_diskctx, bfq_tdio, BFQ_REASON_TOO_IDLE);
		dsched_debug(BFQ_DEBUG_VERBOSE, "%p too idle\n", bfq_tdio);
	}

	DSCHED_THREAD_IO_UNLOCK(&bfq_tdio->head);

	/* ref'ed in dequeue(), before resetting callout */
	dsched_thread_io_unref(&bfq_tdio->head);
rtn:
	BFQ_UNLOCK(bfq_diskctx);
	/* kick the scheduler so it serves the next thread */
	helper_msg_dequeue(bfq_diskctx);
}
503 | ||
/*
 * bfq_queue(): .queue callback of the bfq policy.
 *
 * A thread calls this function to hand in its I/O requests (bio).
 * Their bios are stored in the per-thread queue, in tdio structure.
 * Currently, the sync/async bios are queued together, which may cause
 * some issues on performance.
 *
 * Besides queueing bios, this function also calculates the average
 * thinking time and average seek distance of a thread, using the
 * information in bio structure.
 *
 * If the calling thread is waiting by the bfq scheduler due to
 * the AS feature, this function will cancel the callout alarm
 * and resume the scheduler to continue serving this thread.
 *
 * lock:
 *	THREAD_IO_LOCK: protect from queue iteration in bfq_dequeue()
 *	BFQ_LOCK: protect from other insertions/deletions in wf2q_augtree
 *	in bfq_queue() or bfq_dequeue().
 *
 * refcount:
 *	If the calling thread is waited by the scheduler, the refcount
 *	of the related tdio will decrease by 1 after this function. The
 *	counterpart increasing is in bfq_dequeue(), before resetting the
 *	callout alarm.
 *
 * Return value:
 *	EINVAL: if bio->bio_buf->b_cmd == BUF_CMD_FLUSH
 *	0: bio is queued successfully.
 */
static int
bfq_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
		struct bio *bio)
{
	struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx;
	struct bfq_thread_io *bfq_tdio = (struct bfq_thread_io *)tdio;
	int original_qlength;

	/* we do not handle flush requests. push it down to dsched */
	if (__predict_false(bio->bio_buf->b_cmd == BUF_CMD_FLUSH))
		return (EINVAL);

	DSCHED_THREAD_IO_LOCK(tdio);
	KKASSERT(tdio->debug_priv == 0xF00FF00F);
	dsched_debug(BFQ_DEBUG_NORMAL, "bfq: tdio %p pushes bio %p\n", bfq_tdio, bio);

	/* remember the owning tdio; bfq_bio_done() retrieves it from the bio */
	dsched_set_bio_priv(bio, tdio);
	/* one reference per queued bio, dropped in bfq_bio_done() */
	dsched_thread_io_ref(tdio);

	/* seek distance is only meaningful for data transfers */
	if ((bio->bio_buf->b_cmd == BUF_CMD_READ) ||
	    (bio->bio_buf->b_cmd == BUF_CMD_WRITE)) {
		bfq_update_tdio_seek_avg(bfq_tdio, bio);
	}

	bfq_update_tdio_ttime_avg(bfq_tdio);

	/* update last_bio_pushed_time */
	getmicrotime(&bfq_tdio->last_bio_pushed_time);

	if ((bfq_tdio->seek_samples > BFQ_VALID_MIN_SAMPLES) &&
	    BFQ_TDIO_SEEKY(bfq_tdio))
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p is seeky\n", bfq_tdio);

	/*
	 * If a tdio takes too long to think, we disable the AS feature of it.
	 */
	if ((bfq_tdio->ttime_samples > BFQ_VALID_MIN_SAMPLES) &&
	    (bfq_tdio->ttime_avg > BFQ_T_WAIT * (1000 / hz) * 1000) &&
	    (bfq_tdio->service_received > bfq_tdio->budget / 8)) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: tdio %p takes too long time to think\n", bfq_tdio);
		bfq_tdio->tdio_as_switch = 0;
	} else {
		bfq_tdio->tdio_as_switch = 1;
	}

	/* insert the bio into the tdio's own queue */
	KKASSERT(lockstatus(&tdio->lock, curthread) == LK_EXCLUSIVE);
	TAILQ_INSERT_TAIL(&tdio->queue, bio, link);
#if 0
	tdio->qlength++;
#endif
	original_qlength = atomic_fetchadd_int(&tdio->qlength, 1);
	DSCHED_THREAD_IO_UNLOCK(tdio);
	/*
	 * A new thread:
	 * In dequeue function, we remove the thread
	 * from the aug-tree if it has no further bios.
	 * Therefore "new" means a really new thread (a
	 * newly created thread or a thread that pushed no more
	 * bios when the scheduler was waiting for it) or
	 * one that was removed from the aug-tree earlier.
	 */
	if (original_qlength == 0) {
		/*
		 * a really new thread
		 */
		BFQ_LOCK(bfq_diskctx);
		if (bfq_tdio != bfq_diskctx->bfq_active_tdio) {
			/* insert the tdio into the wf2q queue */
			wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, bfq_tdio);
		} else {
			/*
			 * the thread being waited by the scheduler
			 */
			if (bfq_diskctx->bfq_blockon == bfq_tdio) {
				/*
				 * XXX: possible race condition here:
				 * if the callout function is triggered when
				 * the following code is executed, then after
				 * releasing the TDIO lock, the callout function
				 * will set the thread inactive and it will never
				 * be inserted into the aug-tree (so its bio pushed
				 * this time will not be dispatched) until it pushes
				 * further bios
				 */
				bfq_diskctx->bfq_as_hit++;
				bfq_update_as_avg_wait(bfq_diskctx, bfq_tdio, BFQ_AS_STAT_ALL);

				if (callout_pending(&bfq_diskctx->bfq_callout))
					callout_stop(&bfq_diskctx->bfq_callout);
				bfq_diskctx->bfq_blockon = NULL;

				/* ref'ed in dequeue(), before resetting callout */
				dsched_thread_io_unref(&bfq_tdio->head);

				dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p pushes a new bio when AS\n", bfq_tdio);
			}
		}

		BFQ_UNLOCK(bfq_diskctx);
	}

	/* let the helper thread try to dispatch the new work */
	helper_msg_dequeue(bfq_diskctx);

	return 0;
}
641 | ||
642 | /* | |
643 | * bfq_dequeue(): dispatch bios to the disk driver. | |
644 | * | |
645 | * This function will push as many bios as the number of free slots | |
646 | * in the tag queue. | |
647 | * | |
648 | * In the progress of dispatching, the following events may happen: | |
649 | * - Current thread is timeout: Expire the current thread for | |
650 | * BFQ_REASON_TIMEOUT, and select a new thread to serve in the | |
651 | * wf2q tree. | |
652 | * | |
653 | * - Current thread runs out of its budget: Expire the current thread | |
654 | * for BFQ_REASON_OUT_OF_BUDGET, and select a new thread to serve | |
655 | * | |
656 | * - Current thread has no further bios in its queue: if the AS feature | |
657 | * is turned on, the bfq scheduler sets an alarm and starts to suspend. | |
658 | * The bfq_timeout() or bfq_queue() calls may resume the scheduler. | |
659 | * | |
660 | * Implementation note: The bios selected to be dispatched will first | |
661 | * be stored in an array bio_do_dispatch. After this function releases | |
662 | * all the locks it holds, it will call dsched_strategy_request_polling() | |
663 | * for each bio stored. | |
664 | * | |
665 | * With the help of bfq_disk_ctx->pending_dequeue, | |
666 | * there will be only one bfq_dequeue pending on the BFQ_LOCK. | |
667 | * | |
668 | * lock: | |
669 | * BFQ_LOCK: protect from wf2q_augtree operations in bfq_queue() | |
670 | * THREAD_IO_LOCK: locks the active_tdio. Protect from queue insertions | |
671 | * in bfq_queue; Protect the active_tdio->budget | |
672 | * | |
673 | * refcount: | |
674 | * If the scheduler decides to suspend, the refcount of active_tdio | |
675 | * increases by 1. The counterpart decreasing is in bfq_queue() and | |
676 | * bfq_timeout() | |
677 | * blocking: | |
678 | * May be blocking on the disk driver lock. It depends on drivers. | |
679 | * | |
680 | * Calling path: | |
681 | * The callers could be: | |
682 | * bfq_queue(), bfq_timeout() and the registered polling function. | |
683 | * | |
684 | * caller --> helper_msg_dequeue --lwkt_msg--> helper_thread-> me | |
685 | * | |
686 | */ | |
687 | void | |
688 | bfq_dequeue(struct dsched_disk_ctx *diskctx) | |
689 | { | |
690 | int free_slots, | |
691 | bio_index = 0, i, | |
692 | remaining_budget = 0;/* remaining budget of current active process */ | |
693 | ||
694 | struct bio *bio, *bio_to_dispatch[33]; | |
695 | struct bfq_thread_io *active_tdio = NULL; | |
696 | struct bfq_disk_ctx *bfq_diskctx = (struct bfq_disk_ctx *)diskctx; | |
697 | ||
698 | BFQ_LOCK(bfq_diskctx); | |
699 | atomic_cmpset_int(&bfq_diskctx->pending_dequeue, 1, 0); | |
700 | ||
701 | /* | |
702 | * The whole scheduler is waiting for further bios | |
703 | * from process currently being served | |
704 | */ | |
705 | if (bfq_diskctx->bfq_blockon != NULL) | |
706 | goto rtn; | |
707 | ||
708 | remaining_budget = bfq_diskctx->bfq_remaining_budget; | |
709 | active_tdio = bfq_diskctx->bfq_active_tdio; | |
710 | dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: dequeue: Im in. active_tdio = %p\n", active_tdio); | |
711 | ||
712 | free_slots = diskctx->max_tag_queue_depth - diskctx->current_tag_queue_depth; | |
713 | KKASSERT(free_slots >= 0 && free_slots <= 32); | |
714 | ||
715 | if (active_tdio) | |
716 | DSCHED_THREAD_IO_LOCK(&active_tdio->head); | |
717 | ||
718 | while (free_slots) { | |
719 | /* Here active_tdio must be locked ! */ | |
720 | if (active_tdio) { | |
721 | /* | |
722 | * the bio_done function has marked the current | |
723 | * tdio timeout | |
724 | */ | |
725 | if (active_tdio->maybe_timeout) { | |
726 | dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: %p time out in dequeue()\n", active_tdio); | |
727 | wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget); | |
728 | bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_TIMEOUT); | |
729 | ||
730 | /* there still exist bios not dispatched, | |
731 | * reinsert the tdio into aug-tree*/ | |
732 | if (active_tdio->head.qlength > 0) { | |
733 | wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio); | |
734 | KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count); | |
735 | } | |
736 | ||
737 | active_tdio->maybe_timeout = 0; | |
738 | DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); | |
739 | active_tdio = NULL; | |
740 | continue; | |
741 | } | |
742 | ||
743 | /* select next bio to dispatch */ | |
744 | /* TODO: a wiser slection */ | |
745 | KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE); | |
746 | bio = TAILQ_FIRST(&active_tdio->head.queue); | |
747 | dsched_debug(BFQ_DEBUG_NORMAL, "bfq: the first bio in queue of active_tdio %p is %p\n", active_tdio, bio); | |
748 | ||
749 | dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p exists, remaining budget = %d, tdio budget = %d\n, qlength = %d, first bio = %p, first bio cmd = %d, first bio size = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength, bio, bio?bio->bio_buf->b_cmd:-1, bio?bio->bio_buf->b_bcount:-1); | |
750 | ||
751 | /* | |
752 | * The bio is not read or write, just | |
753 | * push it down. | |
754 | */ | |
755 | if (bio && (bio->bio_buf->b_cmd != BUF_CMD_READ) && | |
756 | (bio->bio_buf->b_cmd != BUF_CMD_WRITE)) { | |
757 | dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio); | |
758 | KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE); | |
759 | TAILQ_REMOVE(&active_tdio->head.queue, bio, link); | |
760 | active_tdio->head.qlength--; | |
761 | free_slots--; | |
762 | ||
763 | #if 0 | |
764 | dsched_strategy_request_polling(diskctx->dp, bio, diskctx); | |
765 | #endif | |
766 | bio_to_dispatch[bio_index++] = bio; | |
767 | KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth); | |
768 | continue; | |
769 | } | |
770 | /* | |
771 | * Run out of budget | |
772 | * But this is not because the size of bio is larger | |
773 | * than the complete budget. | |
774 | * If the size of bio is larger than the complete | |
775 | * budget, then use a complete budget to cover it. | |
776 | */ | |
777 | if (bio && (remaining_budget < BIO_SIZE(bio)) && | |
778 | (remaining_budget != active_tdio->budget)) { | |
779 | /* charge budget used */ | |
780 | wf2q_update_vd(active_tdio, active_tdio->budget - remaining_budget); | |
781 | bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET); | |
782 | wf2q_insert_thread_io(&bfq_diskctx->bfq_wf2q, active_tdio); | |
783 | dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: thread %p ran out of budget\n", active_tdio); | |
784 | DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); | |
785 | active_tdio = NULL; | |
786 | } else { /* if (bio && remaining_budget < BIO_SIZE(bio) && remaining_budget != active_tdio->budget) */ | |
787 | ||
788 | /* | |
789 | * Having enough budget, | |
790 | * or having a complete budget and the size of bio | |
791 | * is larger than that. | |
792 | */ | |
793 | if (bio) { | |
794 | /* dispatch */ | |
795 | remaining_budget -= BIO_SIZE(bio); | |
796 | /* | |
797 | * The size of the first bio is larger | |
798 | * than the whole budget, we should | |
799 | * charge the extra part | |
800 | */ | |
801 | if (remaining_budget < 0) | |
802 | wf2q_update_vd(active_tdio, -remaining_budget); | |
803 | /* compensate */ | |
804 | wf2q_update_vd(active_tdio, -remaining_budget); | |
805 | /* | |
806 | * remaining_budget may be < 0, | |
807 | * but to prevent the budget of current tdio | |
808 | * to substract a negative number, | |
809 | * the remaining_budget has to be >= 0 | |
810 | */ | |
811 | remaining_budget = MAX(0, remaining_budget); | |
812 | dsched_debug(BFQ_DEBUG_NORMAL, "bfq: remove bio %p from the queue of %p\n", bio, active_tdio); | |
813 | KKASSERT(lockstatus(&active_tdio->head.lock, curthread) == LK_EXCLUSIVE); | |
814 | TAILQ_REMOVE(&active_tdio->head.queue, bio, link); | |
815 | free_slots--; | |
816 | active_tdio->head.qlength--; | |
817 | active_tdio->bio_dispatched++; | |
818 | wf2q_inc_tot_service(&bfq_diskctx->bfq_wf2q, BIO_SIZE(bio)); | |
819 | dsched_debug(BFQ_DEBUG_VERBOSE, | |
820 | "BFQ: %p's bio dispatched, size=%d, remaining_budget = %d\n", | |
821 | active_tdio, BIO_SIZE(bio), remaining_budget); | |
822 | #if 0 | |
823 | dsched_strategy_request_polling(diskctx->dp, bio, diskctx); | |
824 | #endif | |
825 | bio_to_dispatch[bio_index++] = bio; | |
826 | KKASSERT(bio_index <= bfq_diskctx->head.max_tag_queue_depth); | |
827 | ||
828 | } else { /* if (bio) */ | |
829 | ||
830 | KKASSERT(active_tdio); | |
831 | /* | |
832 | * If AS feature is switched off, | |
833 | * expire the tdio as well | |
834 | */ | |
835 | if ((remaining_budget <= 0) || | |
836 | !(bfq_diskctx->bfq_flag & BFQ_FLAG_AS) || | |
837 | !active_tdio->tdio_as_switch) { | |
838 | active_tdio->budget -= remaining_budget; | |
839 | wf2q_update_vd(active_tdio, active_tdio->budget); | |
840 | bfq_expire(bfq_diskctx, active_tdio, BFQ_REASON_OUT_OF_BUDGET); | |
841 | DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); | |
842 | active_tdio = NULL; | |
843 | } else { | |
844 | ||
845 | /* no further bio, wait for a while */ | |
846 | bfq_diskctx->bfq_blockon = active_tdio; | |
847 | /* | |
848 | * Increase ref count to ensure that | |
849 | * tdio will not be destroyed during waiting. | |
850 | */ | |
851 | dsched_thread_io_ref(&active_tdio->head); | |
852 | /* | |
 * If the tdio is seeky but not thinking for
854 | * too long, we wait for it a little shorter | |
855 | */ | |
856 | if (active_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES && BFQ_TDIO_SEEKY(active_tdio)) | |
857 | callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT_MIN, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx); | |
858 | else | |
859 | callout_reset(&bfq_diskctx->bfq_callout, BFQ_T_WAIT, (void (*) (void *))helper_msg_as_timeout, bfq_diskctx); | |
860 | ||
861 | /* save the start time of blocking */ | |
862 | getmicrotime(&active_tdio->as_start_time); | |
863 | ||
864 | dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: blocked on %p, remaining_budget = %d\n", active_tdio, remaining_budget); | |
865 | DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); | |
866 | goto save_and_rtn; | |
867 | } | |
868 | } | |
869 | } | |
870 | } else { /* if (active_tdio) */ | |
871 | /* there is no active tdio */ | |
872 | ||
873 | /* no pending bios at all */ | |
874 | active_tdio = wf2q_get_next_thread_io(&bfq_diskctx->bfq_wf2q); | |
875 | ||
876 | if (!active_tdio) { | |
877 | KKASSERT(bfq_diskctx->bfq_wf2q.wf2q_tdio_count == 0); | |
878 | dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: no more eligible tdio!\n"); | |
879 | goto save_and_rtn; | |
880 | } | |
881 | ||
882 | /* | |
883 | * A new tdio is picked, | |
884 | * initialize the service related statistic data | |
885 | */ | |
886 | DSCHED_THREAD_IO_LOCK(&active_tdio->head); | |
887 | active_tdio->service_received = 0; | |
888 | ||
889 | /* | |
890 | * Reset the maybe_timeout flag, which | |
891 | * may be set by a biodone after the the service is done | |
892 | */ | |
893 | getmicrotime(&active_tdio->service_start_time); | |
894 | active_tdio->maybe_timeout = 0; | |
895 | ||
896 | remaining_budget = active_tdio->budget; | |
897 | dsched_debug(BFQ_DEBUG_VERBOSE, "bfq: active_tdio %p selected, remaining budget = %d, tdio budget = %d\n, qlength = %d\n", active_tdio, remaining_budget, active_tdio->budget, active_tdio->head.qlength); | |
898 | } | |
899 | ||
900 | }/* while (free_slots) */ | |
901 | ||
902 | /* reach here only when free_slots == 0 */ | |
903 | if (active_tdio) /* && lockcount(&active_tdio->head.lock) > 0) */ | |
904 | DSCHED_THREAD_IO_UNLOCK(&active_tdio->head); | |
905 | ||
906 | save_and_rtn: | |
907 | /* save the remaining budget */ | |
908 | bfq_diskctx->bfq_remaining_budget = remaining_budget; | |
909 | bfq_diskctx->bfq_active_tdio = active_tdio; | |
910 | rtn: | |
911 | BFQ_UNLOCK(bfq_diskctx); | |
912 | /*dispatch the planned bios*/ | |
913 | for (i = 0; i < bio_index; i++) | |
914 | dsched_strategy_request_polling(diskctx->dp, bio_to_dispatch[i], diskctx); | |
915 | ||
916 | } | |
917 | ||
918 | /* | |
919 | * bfq_slow_tdio(): decide whether a tdio is slow | |
920 | * | |
921 | * This function decides whether a tdio is slow by the speed | |
922 | * estimated from the current time slice start time: if the | |
923 | * tdio is not fast enough to consume its budget (or 2/3 | |
924 | * its budget) within the time slice, it is judged slow. | |
925 | * | |
926 | * Called by bfq_expire() | |
927 | * | |
928 | * lock: | |
929 | * THREAD_IO_LOCK is expected to be held. | |
930 | * refcount: | |
931 | * none | |
932 | * | |
933 | */ | |
static int
bfq_slow_tdio(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio)
{
	/*
	 * A tdio is considered slow if it can not finish its budget
	 * at its current average speed.
	 */
	uint64_t usec_elapsed, service_received, speed;
	int expect;
	/* elapsed: from the start of the service slice to the last biodone */
	struct timeval tv = bfq_tdio->last_request_done_time;

	timevalsub (&tv, &bfq_tdio->service_start_time);
	usec_elapsed = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);

	/* discard absurd value: intervals under 20ms give noisy speed samples */
	if (usec_elapsed < 20000)
		return 0;

	/* fixed-point average speed: service received per microsecond */
	service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT;
	speed = service_received / usec_elapsed;
	/*
	 * Service the tdio could receive at this speed over a full time
	 * slice (BFQ_SLICE_TIMEOUT ticks converted to microseconds).
	 */
	expect = (speed * BFQ_SLICE_TIMEOUT * (1000 * 1000 / hz)) >> BFQ_FIXPOINT_SHIFT;

	/* the unsigned product may have wrapped into the int's sign bit; give up */
	if (expect < 0) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: overflow on calculating slow_tdio\n");
		return 0;
	}

	/* slow: it would not even consume 2/3 of its budget within a slice */
	if (expect < bfq_tdio->budget * 2 / 3) {
		dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: %p is judged slow\n", bfq_tdio);
		return 1;
	}

	return 0;
}
968 | ||
969 | /* | |
970 | * bfq_expire(): expire a tdio for a given reason. | |
971 | * | |
 * Different amounts of new budget will be assigned to the expired
973 | * tdio according to the following reasons: | |
974 | * | |
975 | * BFQ_REASON_TIMEOUT: | |
976 | * The tdio does not consume its budget up within BFQ_SLICE_TIMEOUT ticks. | |
977 | * We shall update the disk peak rate if the tdio is not seeky. The new | |
978 | * budget will be the budget it actually consumes during this time | |
979 | * slice. | |
980 | * | |
981 | * BFQ_REASON_TOO_IDLE: | |
982 | * The tdio does not push any further bios during the scheduler is | |
983 | * suspending. To ensure low global latency, this tdio should be | |
 * punished by assigning it the minimum budget. But if the tdio
 * is not pushing bios because it is waiting for its dispatched
 * bios to be done, we just keep the budget unchanged.
987 | * | |
988 | * BFQ_REASON_OUT_OF_BUDGET: | |
989 | * The tdio runs out of its budget within the time slice. It usually | |
990 | * indicates that the tdio is doing well. We increase the budget of it. | |
991 | * | |
992 | * lock: | |
993 | * THREAD_IO_LOCK is expected to be held. | |
994 | * BFQ_LOCK is expected to be held (needed by bfq_update_peak_rate()). | |
995 | * | |
996 | * refcount: none | |
997 | * | |
998 | * Callers: bfq_timeout(), bfq_dequeue() | |
999 | * | |
1000 | */ | |
1001 | static void | |
1002 | bfq_expire(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, enum bfq_expire_reason reason) | |
1003 | { | |
1004 | int max_budget = bfq_diskctx->bfq_max_budget, | |
1005 | budget_left, | |
1006 | bio_in_flight, | |
1007 | service_received; | |
1008 | ||
1009 | service_received = bfq_tdio->service_received; | |
1010 | budget_left = bfq_tdio->budget - bfq_tdio->service_received; | |
1011 | ||
1012 | if (budget_left < 0) { | |
1013 | dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: budget down flow: %d, %d\n", bfq_tdio->budget, bfq_tdio->service_received); | |
1014 | budget_left = 0; | |
1015 | } | |
1016 | ||
1017 | KKASSERT(budget_left >= 0); | |
1018 | ||
1019 | switch (reason) { | |
1020 | case BFQ_REASON_TIMEOUT: | |
1021 | /* the tdio is not seeky so that we can update | |
1022 | * the disk peak rate based on the service received | |
1023 | * by the tdio | |
1024 | */ | |
1025 | if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) && | |
1026 | (!BFQ_TDIO_SEEKY(bfq_tdio))) | |
1027 | bfq_update_peak_rate(bfq_diskctx, bfq_tdio); | |
1028 | ||
1029 | /* max_budget may be updated */ | |
1030 | max_budget = bfq_diskctx->bfq_max_budget; | |
1031 | ||
1032 | /* update budget to service_received*/ | |
1033 | bfq_tdio->budget = MAX(service_received, BFQ_DEFAULT_MIN_BUDGET); | |
1034 | ||
1035 | break; | |
1036 | ||
1037 | case BFQ_REASON_TOO_IDLE: | |
1038 | /* | |
1039 | * the tdio is too slow, charge full budget | |
1040 | */ | |
1041 | if (bfq_slow_tdio(bfq_diskctx, bfq_tdio)) | |
1042 | wf2q_update_vd(bfq_tdio, budget_left); | |
1043 | ||
1044 | bio_in_flight = bfq_tdio->bio_dispatched - bfq_tdio->bio_completed; | |
1045 | KKASSERT(bio_in_flight >= 0); | |
1046 | /* | |
1047 | * maybe the tdio pushes no bio | |
1048 | * because it is waiting for some bios | |
1049 | * dispatched to be done, in this case | |
1050 | * we do not reduce the budget too harshly | |
1051 | */ | |
1052 | if (bio_in_flight > 0) { | |
1053 | bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, service_received); | |
1054 | } else { | |
1055 | #if 0 | |
1056 | bfq_tdio->budget = MAX(BFQ_DEFAULT_MIN_BUDGET, bfq_diskctx->bfq_max_budget / BFQ_MIN_BUDGET_FACTOR); | |
1057 | #endif | |
1058 | bfq_tdio->budget = BFQ_DEFAULT_MIN_BUDGET; | |
1059 | } | |
1060 | ||
1061 | break; | |
1062 | case BFQ_REASON_OUT_OF_BUDGET: | |
1063 | ||
1064 | if ((bfq_tdio->seek_samples >= BFQ_VALID_MIN_SAMPLES) && | |
1065 | (!BFQ_TDIO_SEEKY(bfq_tdio))) | |
1066 | bfq_update_peak_rate(bfq_diskctx, bfq_tdio); | |
1067 | ||
1068 | /* increase the budget */ | |
1069 | if (bfq_tdio->budget < BFQ_BUDGET_MULTIPLE_THRESHOLD) | |
1070 | bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget * 2); | |
1071 | else | |
1072 | bfq_tdio->budget = MIN(max_budget, bfq_tdio->budget + BFQ_BUDG_INC_STEP); | |
1073 | break; | |
1074 | default: | |
1075 | break; | |
1076 | } | |
1077 | } | |
1078 | ||
1079 | /* | |
1080 | * bfq_update_peak_rate(): update the peak disk speed by sampling | |
1081 | * the throughput within a time slice. | |
1082 | * | |
1083 | * lock: | |
1084 | * BFQ_LOCK is expected to be held | |
1085 | * | |
1086 | * refcount: | |
1087 | * none | |
1088 | * | |
1089 | * Caller: bfq_expire() | |
1090 | */ | |
1091 | static void | |
1092 | bfq_update_peak_rate(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio) | |
1093 | { | |
1094 | struct timeval tv = bfq_tdio->last_request_done_time; | |
1095 | uint64_t usec, service_received, peak_rate; | |
1096 | ||
1097 | ||
1098 | timevalsub (&tv, &bfq_tdio->service_start_time); | |
1099 | usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec); | |
1100 | ||
1101 | /* discard absurd value */ | |
1102 | if (usec < 2000 || usec > (BFQ_SLICE_TIMEOUT * (1000 / hz) * 1000)) { | |
1103 | dsched_debug(BFQ_DEBUG_NORMAL, "BFQ: absurd interval for peak rate\n"); | |
1104 | return; | |
1105 | } | |
1106 | ||
1107 | service_received = (uint64_t)bfq_tdio->service_received << BFQ_FIXPOINT_SHIFT; | |
1108 | peak_rate = service_received / usec; | |
1109 | bfq_diskctx->bfq_peak_rate = (peak_rate + 7 * bfq_diskctx->bfq_peak_rate) / 8; | |
1110 | bfq_diskctx->bfq_peak_rate_samples++; | |
1111 | ||
1112 | /* update the max_budget according to the peak rate */ | |
1113 | if (bfq_diskctx->bfq_peak_rate_samples > BFQ_VALID_MIN_SAMPLES) { | |
1114 | bfq_diskctx->bfq_peak_rate_samples = BFQ_VALID_MIN_SAMPLES; | |
1115 | /* | |
1116 | * if the auto max budget adjust is disabled, | |
1117 | * the bfq_max_budget will always be BFQ_DEFAULT_MAX_BUDGET; | |
1118 | */ | |
1119 | if (bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) { | |
1120 | bfq_diskctx->bfq_max_budget = | |
1121 | (uint32_t)((BFQ_SLICE_TIMEOUT * (1000 / hz) * bfq_diskctx->bfq_peak_rate * 1000) >> BFQ_FIXPOINT_SHIFT); | |
1122 | dsched_debug(BFQ_DEBUG_NORMAL, "max budget updated to %d\n", bfq_diskctx->bfq_max_budget); | |
1123 | } | |
1124 | } | |
1125 | } | |
1126 | ||
1127 | /* | |
1128 | * bfq_update_tdio_seek_avg(): update the average seek distance of a | |
1129 | * tdio. | |
1130 | * | |
1131 | * lock: | |
1132 | * THREAD_IO_LOCK is expected to be held. | |
1133 | * | |
1134 | * refcount: | |
1135 | * none | |
1136 | * | |
1137 | * Caller: bfq_queue() | |
1138 | */ | |
1139 | static void | |
1140 | bfq_update_tdio_seek_avg(struct bfq_thread_io *bfq_tdio, struct bio *bp) | |
1141 | { | |
1142 | off_t seek; | |
1143 | ||
1144 | /* the first bio it dispatches, | |
1145 | * we do not calculate the seek_avg, | |
1146 | * just update the last_seek_end | |
1147 | */ | |
1148 | if (bfq_tdio->seek_samples == 0) { | |
1149 | ++bfq_tdio->seek_samples; | |
1150 | goto rtn; | |
1151 | } | |
1152 | ||
1153 | seek = ABS(bp->bio_offset - bfq_tdio->last_seek_end); | |
1154 | ||
1155 | /* | |
1156 | * we do not do seek_samples++, | |
1157 | * because the seek_total may overflow if seek_total += seek, | |
1158 | */ | |
1159 | bfq_tdio->seek_samples = (7 * bfq_tdio->seek_samples + 256) / 8; | |
1160 | bfq_tdio->seek_total = (7 * bfq_tdio->seek_total + 256 * seek) / 8; | |
1161 | bfq_tdio->seek_avg = (bfq_tdio->seek_total + bfq_tdio->seek_samples / 2) / bfq_tdio->seek_samples; | |
1162 | ||
1163 | dsched_debug(BFQ_DEBUG_VERBOSE, "BFQ: tdio %p seek_avg updated to %" PRIu64 "\n", bfq_tdio, bfq_tdio->seek_avg); | |
1164 | ||
1165 | rtn: | |
1166 | bfq_tdio->last_seek_end = bp->bio_offset + BIO_SIZE(bp); | |
1167 | } | |
1168 | ||
1169 | /* | |
1170 | * bfq_update_tdio_ttime_avg(): update the average thinking time | |
1171 | * of a tdio. | |
1172 | * | |
1173 | * The thinking time is used to switch on / off the tdio's AS feature | |
1174 | * | |
1175 | * lock: | |
1176 | * THREAD_IO_LOCK is expected to be held. | |
1177 | * | |
1178 | * refcount: | |
1179 | * none | |
1180 | * | |
1181 | * Caller: | |
1182 | * bfq_queue() | |
1183 | * | |
1184 | */ | |
static void
bfq_update_tdio_ttime_avg(struct bfq_thread_io *bfq_tdio)
{
	struct timeval tv, after_start;
	uint64_t usec;

	/* first sample: nothing to measure an interval against yet */
	if (bfq_tdio->ttime_samples == 0) {
		++bfq_tdio->ttime_samples;
		return;
	}

	getmicrotime(&tv);
	after_start = bfq_tdio->last_request_done_time;

#if 0
	timevalsub (&tv, &bfq_tdio->last_request_done_time);
#endif
	/*
	 * Measure the interval between two bios being pushed,
	 * instead of between last_request_done_time and
	 * the current time (the #if 0 alternative above).
	 */

	timevalsub (&tv, &bfq_tdio->last_bio_pushed_time);

	timevalsub (&after_start, &bfq_tdio->service_start_time);

	/*
	 * tv.tv_sec < 0 means the last request done time is
	 * after the current time;
	 * this may happen because the biodone function is not blocked.
	 *
	 * after_start.tv_sec < 0 means the last bio done happened
	 * before the current service slice, so we should drop this value.
	 */
	if (tv.tv_sec < 0 || after_start.tv_sec < 0)
		return;

	usec = (uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec);

	/* fixed-point EWMA, one sample = 256 (same scheme as seek_avg) */
	bfq_tdio->ttime_samples = (7 * bfq_tdio->ttime_samples + 256) / 8;
	bfq_tdio->ttime_total = (7 * bfq_tdio->ttime_total + 256 * usec) / 8;
	bfq_tdio->ttime_avg = (bfq_tdio->ttime_total + 128) / bfq_tdio->ttime_samples;

}
1230 | ||
1231 | /* | |
1232 | * This function will also update the bfq_max_time_slice field | |
1233 | * | |
1234 | * tv: the timeval structure representing the length of time slice | |
1235 | */ | |
1236 | static void | |
1237 | bfq_update_avg_time_slice(struct bfq_disk_ctx *bfq_diskctx, struct timeval tv) | |
1238 | { | |
1239 | uint32_t msec; | |
1240 | ||
1241 | msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 ); | |
1242 | ||
1243 | if (msec > 3 * BFQ_SLICE_TIMEOUT * (1000 / hz)) | |
1244 | atomic_add_int(&bfq_diskctx->bfq_high_time_slice_count, 1); | |
1245 | ||
1246 | bfq_diskctx->bfq_avg_time_slice = | |
1247 | (7 * bfq_diskctx->bfq_avg_time_slice + msec) / 8; | |
1248 | ||
1249 | if (bfq_diskctx->bfq_max_time_slice < msec) | |
1250 | bfq_diskctx->bfq_max_time_slice = msec; | |
1251 | } | |
1252 | /* | |
1253 | * This function will also update the bfq_as_max_wait field | |
1254 | * flag: BFQ_AS_STAT_ALL, BFQ_AS_STAT_ONLY_MISS | |
1255 | * | |
1256 | */ | |
1257 | static void | |
1258 | bfq_update_as_avg_wait(struct bfq_disk_ctx *bfq_diskctx, struct bfq_thread_io *bfq_tdio, int flag) | |
1259 | { | |
1260 | struct timeval tv; | |
1261 | uint32_t msec; | |
1262 | getmicrotime(&tv); | |
1263 | timevalsub (&tv, &bfq_tdio->as_start_time); | |
1264 | ||
1265 | /* approximately divide 1000 by left shift 10 */ | |
1266 | msec = ((uint64_t)(1000000 * (uint64_t)tv.tv_sec + tv.tv_usec) >> 10 ); | |
1267 | ||
1268 | /* ridiculous value */ | |
1269 | if (msec > 10000) { | |
1270 | dsched_debug(BFQ_DEBUG_NORMAL, "bfq: ridiculous as wait time!\n"); | |
1271 | return; | |
1272 | } | |
1273 | ||
1274 | if (msec > 5 * BFQ_T_WAIT_MIN * (1000 / hz)) | |
1275 | atomic_add_int(&bfq_diskctx->bfq_as_high_wait_count, 1); | |
1276 | ||
1277 | if (flag & BFQ_AS_STAT_ALL) { | |
1278 | bfq_diskctx->bfq_as_avg_wait_all = | |
1279 | (7 * bfq_diskctx->bfq_as_avg_wait_all + msec) / 8; | |
1280 | } | |
1281 | ||
1282 | if (flag & BFQ_AS_STAT_ONLY_MISS) { | |
1283 | bfq_diskctx->bfq_as_avg_wait_miss = | |
1284 | (7 * bfq_diskctx->bfq_as_avg_wait_miss + msec) / 8; | |
1285 | } | |
1286 | ||
1287 | /* update the maximum waiting time */ | |
1288 | if (bfq_diskctx->bfq_as_max_wait < msec) | |
1289 | bfq_diskctx->bfq_as_max_wait = msec; | |
1290 | ||
1291 | return; | |
1292 | } | |
1293 | ||
/*
 * bfq_mod_handler(): kernel module event handler for the BFQ policy.
 *
 * MOD_LOAD: clears the global statistics, registers the policy with
 * dsched, creates the dsched.bfq sysctl node (with a version string
 * leaf) and initializes the helper thread machinery.
 * MOD_UNLOAD: unregisters the policy and frees the sysctl tree.
 */
static int
bfq_mod_handler(module_t mod, int type, void *unused)
{
	/* static: this state must persist between MOD_LOAD and MOD_UNLOAD */
	static struct sysctl_ctx_list sysctl_ctx;
	static struct sysctl_oid *oid;
	static char version[16];
	int error;

	ksnprintf(version, sizeof(version), "%d.%d",
	    dsched_bfq_version_maj, dsched_bfq_version_min);

	switch (type) {
	case MOD_LOAD:
		bzero(&bfq_stats, sizeof(struct dsched_bfq_stats));
		if ((error = dsched_register(&dsched_bfq_policy)))
			return (error);

		sysctl_ctx_init(&sysctl_ctx);
		oid = SYSCTL_ADD_NODE(&sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_dsched),
		    OID_AUTO,
		    "bfq",
		    CTLFLAG_RD, 0, "");
		/* remember the node so per-disk handlers can hang off it */
		bfq_mod_oid = oid;

		SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
		    OID_AUTO, "version", CTLFLAG_RD, version, 0, "bfq version");
		helper_init_global();

		kprintf("BFQ scheduler policy version %d.%d loaded. sizeof(bfq_thread_io) = %zu\n",
		    dsched_bfq_version_maj, dsched_bfq_version_min, sizeof(struct bfq_thread_io));
		break;

	case MOD_UNLOAD:
		/* if unregistering fails, keep the sysctl tree alive */
		if ((error = dsched_unregister(&dsched_bfq_policy)))
			return (error);
		sysctl_ctx_free(&sysctl_ctx);
		kprintf("BFQ scheduler policy unloaded\n");
		break;

	default:
		break;
	}

	return 0;
}
1340 | ||
1341 | int | |
1342 | bfq_sysctl_as_switch_handler(SYSCTL_HANDLER_ARGS) | |
1343 | { | |
1344 | struct bfq_disk_ctx *bfq_diskctx = arg1; | |
1345 | int as_switch, error; | |
1346 | ||
1347 | as_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AS) ? 1 : 0); | |
1348 | error = sysctl_handle_int(oidp, &as_switch, 0, req); | |
1349 | if (error || !req->newptr) | |
1350 | return error; | |
1351 | ||
1352 | if (as_switch == 1) | |
1353 | bfq_diskctx->bfq_flag |= BFQ_FLAG_AS; | |
1354 | else if (as_switch == 0) | |
1355 | bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AS); | |
1356 | else | |
1357 | return 0; | |
1358 | ||
1359 | return error; | |
1360 | } | |
1361 | ||
1362 | int | |
1363 | bfq_sysctl_auto_max_budget_handler(SYSCTL_HANDLER_ARGS) | |
1364 | { | |
1365 | struct bfq_disk_ctx *bfq_diskctx = arg1; | |
1366 | int auto_max_budget_switch, error; | |
1367 | auto_max_budget_switch = ((bfq_diskctx->bfq_flag & BFQ_FLAG_AUTO_MAX_BUDGET) ? 1 : 0); | |
1368 | error = sysctl_handle_int(oidp, &auto_max_budget_switch, 0, req); | |
1369 | if (error || !req->newptr) | |
1370 | return error; | |
1371 | ||
1372 | if (auto_max_budget_switch == 1) | |
1373 | bfq_diskctx->bfq_flag |= BFQ_FLAG_AUTO_MAX_BUDGET; | |
1374 | else if (auto_max_budget_switch == 0) | |
1375 | bfq_diskctx->bfq_flag &= ~(BFQ_FLAG_AUTO_MAX_BUDGET); | |
1376 | else | |
1377 | return 0; | |
1378 | ||
1379 | return error; | |
1380 | } | |
1381 | ||
/* Register dsched_bfq as a dsched policy kernel module (version 1). */
DSCHED_POLICY_MODULE(dsched_bfq, bfq_mod_handler, 1);