Commit | Line | Data |
---|---|---|
74ce043b AH |
1 | /* |
2 | * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved. | |
3 | * | |
4 | * This code is derived from software contributed to The DragonFly Project | |
5 | * by Alex Hornung <ahornung@gmail.com> | |
6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | |
10 | * | |
11 | * 1. Redistributions of source code must retain the above copyright | |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * 2. Redistributions in binary form must reproduce the above copyright | |
14 | * notice, this list of conditions and the following disclaimer in | |
15 | * the documentation and/or other materials provided with the | |
16 | * distribution. | |
17 | * 3. Neither the name of The DragonFly Project nor the names of its | |
18 | * contributors may be used to endorse or promote products derived | |
19 | * from this software without specific, prior written permission. | |
20 | * | |
21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
32 | * SUCH DAMAGE. | |
33 | */ | |
34 | #include <sys/param.h> | |
35 | #include <sys/systm.h> | |
36 | #include <sys/kernel.h> | |
37 | #include <sys/proc.h> | |
38 | #include <sys/sysctl.h> | |
39 | #include <sys/buf.h> | |
40 | #include <sys/conf.h> | |
41 | #include <sys/diskslice.h> | |
42 | #include <sys/disk.h> | |
43 | #include <machine/atomic.h> | |
74ce043b AH |
44 | #include <sys/thread.h> |
45 | #include <sys/thread2.h> | |
74ce043b | 46 | #include <sys/ctype.h> |
74ce043b | 47 | #include <sys/buf2.h> |
e02e815e | 48 | #include <sys/syslog.h> |
74ce043b | 49 | #include <sys/dsched.h> |
74ce043b AH |
50 | #include <machine/param.h> |
51 | ||
c6d593c9 | 52 | #include <kern/dsched/fq/fq.h> |
74ce043b | 53 | |
e02e815e | 54 | static int dsched_fq_version_maj = 1; |
009da86a | 55 | static int dsched_fq_version_min = 1; |
74ce043b | 56 | |
e02e815e AH |
57 | /* Make sure our structs fit */ |
58 | CTASSERT(sizeof(struct fq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ); | |
59 | CTASSERT(sizeof(struct fq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ); | |
74ce043b AH |
60 | |
61 | struct dsched_fq_stats fq_stats; | |
62 | ||
0160356d | 63 | extern struct dsched_policy dsched_fq_policy; |
74ce043b | 64 | |
74ce043b | 65 | void |
0160356d | 66 | fq_dispatcher(struct fq_disk_ctx *diskctx) |
74ce043b | 67 | { |
7c76e73a | 68 | struct dispatch_prep *dispatch_ary; |
e02e815e AH |
69 | struct dsched_thread_io *ds_tdio, *ds_tdio2; |
70 | struct fq_thread_io *tdio; | |
74ce043b | 71 | struct bio *bio, *bio2; |
0f0e78e2 | 72 | int idle; |
009da86a | 73 | int i, prepd_io; |
74ce043b | 74 | |
7c76e73a MD |
75 | /* |
76 | * Array is dangerously big for an on-stack declaration, allocate | |
77 | * it instead. | |
78 | */ | |
79 | dispatch_ary = kmalloc(sizeof(*dispatch_ary) * FQ_DISPATCH_ARRAY_SZ, | |
80 | M_TEMP, M_INTWAIT | M_ZERO); | |
81 | ||
aa166ad1 | 82 | /* |
0160356d | 83 | * We need to manually assign an tdio to the tdctx of this thread |
aa166ad1 AH |
84 | * since it isn't assigned one during fq_prepare, as the disk |
85 | * is not set up yet. | |
86 | */ | |
89dabacd AH |
87 | tdio = (struct fq_thread_io *)dsched_new_policy_thread_tdio(&diskctx->head, |
88 | &dsched_fq_policy); | |
aa166ad1 | 89 | |
e02e815e | 90 | DSCHED_DISK_CTX_LOCK(&diskctx->head); |
74ce043b | 91 | for(;;) { |
9cc004d0 | 92 | idle = 0; |
7c76e73a MD |
93 | /* |
94 | * sleep ~60 ms, failsafe low hz rates. | |
95 | */ | |
96 | if ((lksleep(diskctx, &diskctx->head.lock, 0, | |
97 | "fq_dispatcher", (hz + 14) / 15) == 0)) { | |
0f0e78e2 AH |
98 | /* |
99 | * We've been woken up; this either means that we are | |
100 | * supposed to die away nicely or that the disk is idle. | |
101 | */ | |
102 | ||
0160356d | 103 | if (__predict_false(diskctx->die == 1)) { |
173c72b7 | 104 | /* If we are supposed to die, drain all queues */ |
0160356d | 105 | fq_drain(diskctx, FQ_DRAIN_FLUSH); |
173c72b7 AH |
106 | |
107 | /* Now we can safely unlock and exit */ | |
e02e815e | 108 | DSCHED_DISK_CTX_UNLOCK(&diskctx->head); |
9cc004d0 AH |
109 | kprintf("fq_dispatcher is peacefully dying\n"); |
110 | lwkt_exit(); | |
111 | /* NOTREACHED */ | |
9cc004d0 | 112 | } |
0f0e78e2 AH |
113 | |
114 | /* | |
115 | * We have been awakened because the disk is idle. | |
116 | * So let's get ready to dispatch some extra bios. | |
117 | */ | |
118 | idle = 1; | |
74ce043b AH |
119 | } |
120 | ||
0f0e78e2 | 121 | /* Maybe the disk is idle and we just didn't get the wakeup */ |
9cc004d0 | 122 | if (idle == 0) |
0160356d | 123 | idle = diskctx->idle; |
9cc004d0 | 124 | |
009da86a AH |
125 | /* Set the number of prepared requests to 0 */ |
126 | i = 0; | |
127 | ||
0f0e78e2 AH |
128 | /* |
129 | * XXX: further room for improvements here. It would be better | |
0160356d | 130 | * to dispatch a few requests from each tdio as to ensure |
0f0e78e2 AH |
131 | * real fairness. |
132 | */ | |
e02e815e AH |
133 | TAILQ_FOREACH_MUTABLE(ds_tdio, &diskctx->head.tdio_list, dlink, ds_tdio2) { |
134 | tdio = (struct fq_thread_io *)ds_tdio; | |
135 | if (tdio->head.qlength == 0) | |
0f0e78e2 AH |
136 | continue; |
137 | ||
e02e815e | 138 | DSCHED_THREAD_IO_LOCK(&tdio->head); |
0160356d AH |
139 | if (atomic_cmpset_int(&tdio->rebalance, 1, 0)) |
140 | fq_balance_self(tdio); | |
0f0e78e2 AH |
141 | /* |
142 | * XXX: why 5 extra? should probably be dynamic, | |
143 | * relying on information on latency. | |
144 | */ | |
0160356d AH |
145 | if ((tdio->max_tp > 0) && idle && |
146 | (tdio->issued >= tdio->max_tp)) { | |
147 | tdio->max_tp += 5; | |
0f0e78e2 AH |
148 | } |
149 | ||
009da86a | 150 | prepd_io = 0; |
e02e815e | 151 | TAILQ_FOREACH_MUTABLE(bio, &tdio->head.queue, link, bio2) { |
0160356d AH |
152 | if (atomic_cmpset_int(&tdio->rebalance, 1, 0)) |
153 | fq_balance_self(tdio); | |
009da86a AH |
154 | if (((tdio->max_tp > 0) && |
155 | (tdio->issued + prepd_io >= tdio->max_tp)) || | |
156 | (i == FQ_DISPATCH_ARRAY_SZ)) | |
0f0e78e2 AH |
157 | break; |
158 | ||
e02e815e AH |
159 | TAILQ_REMOVE(&tdio->head.queue, bio, link); |
160 | --tdio->head.qlength; | |
74ce043b | 161 | |
9cc004d0 | 162 | /* |
0160356d | 163 | * beware that we do have an tdio reference |
0f0e78e2 | 164 | * from the queueing |
009da86a AH |
165 | * |
166 | * XXX: note that here we don't dispatch it yet | |
167 | * but just prepare it for dispatch so | |
168 | * that no locks are held when calling | |
169 | * into the drivers. | |
9cc004d0 | 170 | */ |
009da86a AH |
171 | dispatch_ary[i].bio = bio; |
172 | dispatch_ary[i].tdio = tdio; | |
173 | ++i; | |
174 | ++prepd_io; | |
74ce043b | 175 | } |
e02e815e | 176 | DSCHED_THREAD_IO_UNLOCK(&tdio->head); |
0f0e78e2 | 177 | |
74ce043b | 178 | } |
009da86a AH |
179 | |
180 | dsched_disk_ctx_ref(&diskctx->head); | |
181 | DSCHED_DISK_CTX_UNLOCK(&diskctx->head); | |
182 | ||
183 | /* | |
184 | * Dispatch all the previously prepared bios, now without | |
185 | * holding any locks. | |
186 | */ | |
187 | for (--i; i >= 0; i--) { | |
188 | bio = dispatch_ary[i].bio; | |
189 | tdio = dispatch_ary[i].tdio; | |
190 | fq_dispatch(diskctx, bio, tdio); | |
191 | } | |
192 | ||
193 | DSCHED_DISK_CTX_LOCK(&diskctx->head); | |
194 | dsched_disk_ctx_unref(&diskctx->head); | |
74ce043b AH |
195 | } |
196 | } | |
197 | ||
74ce043b | 198 | void |
0160356d | 199 | fq_balance_thread(struct fq_disk_ctx *diskctx) |
74ce043b | 200 | { |
e02e815e AH |
201 | struct dsched_thread_io *ds_tdio; |
202 | struct fq_thread_io *tdio; | |
654a51ea | 203 | struct timeval tv, old_tv; |
0746e160 | 204 | int64_t total_budget, product; |
38f2331e | 205 | int64_t budget[FQ_PRIO_MAX+1]; |
e6c2b48a | 206 | int n, i, sum, total_disk_time; |
0746e160 | 207 | int lost_bits; |
74ce043b | 208 | |
e02e815e | 209 | DSCHED_DISK_CTX_LOCK(&diskctx->head); |
654a51ea AH |
210 | |
211 | getmicrotime(&diskctx->start_interval); | |
212 | ||
81b5f250 AH |
213 | for (;;) { |
214 | /* sleep ~1s */ | |
e02e815e | 215 | if ((lksleep(curthread, &diskctx->head.lock, 0, "fq_balancer", hz/2) == 0)) { |
0160356d | 216 | if (__predict_false(diskctx->die)) { |
e02e815e | 217 | DSCHED_DISK_CTX_UNLOCK(&diskctx->head); |
81b5f250 AH |
218 | lwkt_exit(); |
219 | } | |
220 | } | |
221 | ||
38f2331e | 222 | bzero(budget, sizeof(budget)); |
81b5f250 AH |
223 | total_budget = 0; |
224 | n = 0; | |
225 | ||
654a51ea | 226 | old_tv = diskctx->start_interval; |
81b5f250 | 227 | getmicrotime(&tv); |
ef46c87b | 228 | |
ef46c87b AH |
229 | total_disk_time = (int)(1000000*((tv.tv_sec - old_tv.tv_sec)) + |
230 | (tv.tv_usec - old_tv.tv_usec)); | |
41b0c7c3 AH |
231 | |
232 | if (total_disk_time == 0) | |
233 | total_disk_time = 1; | |
234 | ||
ef46c87b | 235 | dsched_debug(LOG_INFO, "total_disk_time = %d\n", total_disk_time); |
ef46c87b | 236 | |
654a51ea | 237 | diskctx->start_interval = tv; |
ef46c87b | 238 | |
0160356d AH |
239 | diskctx->disk_busy = (100*(total_disk_time - diskctx->idle_time)) / total_disk_time; |
240 | if (diskctx->disk_busy < 0) | |
241 | diskctx->disk_busy = 0; | |
74ce043b | 242 | |
0160356d | 243 | diskctx->idle_time = 0; |
0746e160 | 244 | lost_bits = 0; |
d161bce9 | 245 | |
e02e815e AH |
246 | TAILQ_FOREACH(ds_tdio, &diskctx->head.tdio_list, dlink) { |
247 | tdio = (struct fq_thread_io *)ds_tdio; | |
0160356d AH |
248 | tdio->interval_avg_latency = tdio->avg_latency; |
249 | tdio->interval_transactions = tdio->transactions; | |
250 | if (tdio->interval_transactions > 0) { | |
51a4bb9f SW |
251 | product = (int64_t)tdio->interval_avg_latency * |
252 | tdio->interval_transactions; | |
0746e160 AH |
253 | product >>= lost_bits; |
254 | while(total_budget >= INT64_MAX - product) { | |
255 | ++lost_bits; | |
256 | product >>= 1; | |
257 | total_budget >>= 1; | |
258 | } | |
259 | total_budget += product; | |
e02e815e | 260 | ++budget[(tdio->head.p) ? tdio->head.p->p_ionice : 0]; |
38f2331e | 261 | KKASSERT(total_budget >= 0); |
81b5f250 AH |
262 | dsched_debug(LOG_INFO, |
263 | "%d) avg_latency = %d, transactions = %d, ioprio = %d\n", | |
0160356d | 264 | n, tdio->interval_avg_latency, tdio->interval_transactions, |
e02e815e | 265 | (tdio->head.p) ? tdio->head.p->p_ionice : 0); |
81b5f250 AH |
266 | ++n; |
267 | } else { | |
0160356d | 268 | tdio->max_tp = 0; |
81b5f250 | 269 | } |
0160356d AH |
270 | tdio->rebalance = 0; |
271 | tdio->transactions = 0; | |
272 | tdio->avg_latency = 0; | |
273 | tdio->issued = 0; | |
74ce043b | 274 | } |
74ce043b | 275 | |
81b5f250 | 276 | dsched_debug(LOG_INFO, "%d procs competing for disk\n" |
760d1e3d AHJ |
277 | "total_budget = %jd (lost bits = %d)\n" |
278 | "incomplete tp = %d\n", n, (intmax_t)total_budget, | |
0160356d | 279 | lost_bits, diskctx->incomplete_tp); |
d161bce9 | 280 | |
81b5f250 | 281 | if (n == 0) |
aa166ad1 | 282 | continue; |
3ee00e04 | 283 | |
81b5f250 | 284 | sum = 0; |
3ee00e04 | 285 | |
81b5f250 | 286 | for (i = 0; i < FQ_PRIO_MAX+1; i++) { |
38f2331e | 287 | if (budget[i] == 0) |
81b5f250 | 288 | continue; |
38f2331e | 289 | sum += (FQ_PRIO_BIAS+i)*budget[i]; |
81b5f250 | 290 | } |
74ce043b | 291 | |
81b5f250 AH |
292 | if (sum == 0) |
293 | sum = 1; | |
aa166ad1 | 294 | |
81b5f250 | 295 | dsched_debug(LOG_INFO, "sum = %d\n", sum); |
aa166ad1 | 296 | |
81b5f250 | 297 | for (i = 0; i < FQ_PRIO_MAX+1; i++) { |
38f2331e | 298 | if (budget[i] == 0) |
81b5f250 | 299 | continue; |
aa166ad1 | 300 | |
0746e160 AH |
301 | /* |
302 | * XXX: if we still overflow here, we really need to switch to | |
303 | * some more advanced mechanism such as compound int128 or | |
304 | * storing the lost bits so they can be used in the | |
305 | * fq_balance_self. | |
306 | */ | |
0160356d AH |
307 | diskctx->budgetpb[i] = ((FQ_PRIO_BIAS+i)*total_budget/sum) << lost_bits; |
308 | KKASSERT(diskctx->budgetpb[i] >= 0); | |
81b5f250 | 309 | } |
d161bce9 | 310 | |
0160356d | 311 | dsched_debug(4, "disk is %d%% busy\n", diskctx->disk_busy); |
e02e815e AH |
312 | TAILQ_FOREACH(ds_tdio, &diskctx->head.tdio_list, dlink) { |
313 | tdio = (struct fq_thread_io *)ds_tdio; | |
0160356d | 314 | tdio->rebalance = 1; |
e6c2b48a | 315 | } |
74ce043b | 316 | |
0160356d AH |
317 | diskctx->prev_full = diskctx->last_full; |
318 | diskctx->last_full = (diskctx->disk_busy >= 90)?1:0; | |
e6c2b48a AH |
319 | } |
320 | } | |
81b5f250 | 321 | |
81b5f250 | 322 | |
e6c2b48a AH |
323 | /* |
324 | * fq_balance_self should be called from all sorts of dispatchers. It basically | |
325 | * offloads some of the heavier calculations on throttling onto the process that | |
326 | * wants to do I/O instead of doing it in the fq_balance thread. | |
0160356d | 327 | * - should be called with diskctx lock held |
e6c2b48a AH |
328 | */ |
329 | void | |
0160356d AH |
330 | fq_balance_self(struct fq_thread_io *tdio) { |
331 | struct fq_disk_ctx *diskctx; | |
81b5f250 | 332 | |
e6c2b48a AH |
333 | int64_t budget, used_budget; |
334 | int64_t avg_latency; | |
335 | int64_t transactions; | |
74ce043b | 336 | |
0160356d AH |
337 | transactions = (int64_t)tdio->interval_transactions; |
338 | avg_latency = (int64_t)tdio->interval_avg_latency; | |
e02e815e | 339 | diskctx = (struct fq_disk_ctx *)tdio->head.diskctx; |
0160356d AH |
340 | |
341 | #if 0 | |
342 | /* XXX: do we really require the lock? */ | |
e02e815e | 343 | DSCHED_DISK_CTX_LOCK_ASSERT(diskctx); |
0160356d | 344 | #endif |
e6c2b48a | 345 | |
51295aee | 346 | used_budget = avg_latency * transactions; |
e02e815e | 347 | budget = diskctx->budgetpb[(tdio->head.p) ? tdio->head.p->p_ionice : 0]; |
e6c2b48a AH |
348 | |
349 | if (used_budget > 0) { | |
350 | dsched_debug(LOG_INFO, | |
760d1e3d AHJ |
351 | "info: used_budget = %jd, budget = %jd\n", |
352 | (intmax_t)used_budget, budget); | |
e6c2b48a AH |
353 | } |
354 | ||
0160356d | 355 | if ((used_budget > budget) && (diskctx->disk_busy >= 90)) { |
e6c2b48a AH |
356 | KKASSERT(avg_latency != 0); |
357 | ||
0160356d | 358 | tdio->max_tp = budget/(avg_latency); |
e6c2b48a AH |
359 | atomic_add_int(&fq_stats.procs_limited, 1); |
360 | ||
361 | dsched_debug(LOG_INFO, | |
0160356d | 362 | "rate limited to %d transactions\n", tdio->max_tp); |
e6c2b48a | 363 | |
0160356d AH |
364 | } else if (((used_budget*2 < budget) || (diskctx->disk_busy < 80)) && |
365 | (!diskctx->prev_full && !diskctx->last_full)) { | |
366 | tdio->max_tp = 0; | |
81b5f250 | 367 | } |
74ce043b AH |
368 | } |
369 | ||
370 | ||
371 | static int | |
372 | do_fqstats(SYSCTL_HANDLER_ARGS) | |
373 | { | |
374 | return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req)); | |
375 | } | |
376 | ||
e02e815e AH |
377 | static int |
378 | fq_mod_handler(module_t mod, int type, void *unused) | |
74ce043b | 379 | { |
e02e815e AH |
380 | static struct sysctl_ctx_list sysctl_ctx; |
381 | static struct sysctl_oid *oid; | |
382 | static char version[16]; | |
383 | int error; | |
74ce043b | 384 | |
e02e815e | 385 | ksnprintf(version, sizeof(version), "%d.%d", |
74ce043b | 386 | dsched_fq_version_maj, dsched_fq_version_min); |
74ce043b | 387 | |
e02e815e AH |
388 | switch (type) { |
389 | case MOD_LOAD: | |
390 | bzero(&fq_stats, sizeof(struct dsched_fq_stats)); | |
391 | if ((error = dsched_register(&dsched_fq_policy))) | |
392 | return (error); | |
393 | ||
394 | sysctl_ctx_init(&sysctl_ctx); | |
395 | oid = SYSCTL_ADD_NODE(&sysctl_ctx, | |
396 | SYSCTL_STATIC_CHILDREN(_dsched), | |
397 | OID_AUTO, | |
398 | "fq", | |
399 | CTLFLAG_RD, 0, ""); | |
400 | ||
401 | SYSCTL_ADD_PROC(&sysctl_ctx, SYSCTL_CHILDREN(oid), | |
402 | OID_AUTO, "stats", CTLTYPE_OPAQUE|CTLFLAG_RD, | |
403 | 0, 0, do_fqstats, "S,dsched_fq_stats", "fq statistics"); | |
404 | ||
405 | SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid), | |
406 | OID_AUTO, "version", CTLFLAG_RD, version, 0, "fq version"); | |
407 | ||
408 | kprintf("FQ scheduler policy version %d.%d loaded\n", | |
409 | dsched_fq_version_maj, dsched_fq_version_min); | |
410 | break; | |
411 | ||
412 | case MOD_UNLOAD: | |
413 | if ((error = dsched_unregister(&dsched_fq_policy))) | |
414 | return (error); | |
415 | sysctl_ctx_free(&sysctl_ctx); | |
416 | kprintf("FQ scheduler policy unloaded\n"); | |
417 | break; | |
418 | ||
419 | default: | |
420 | break; | |
421 | } | |
74ce043b | 422 | |
e02e815e AH |
423 | return 0; |
424 | } | |
74ce043b | 425 | |
e02e815e | 426 | DSCHED_POLICY_MODULE(dsched_fq, fq_mod_handler); |