2 * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Alex Hornung <ahornung@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
41 #include <sys/diskslice.h>
43 #include <machine/atomic.h>
44 #include <sys/thread.h>
45 #include <sys/thread2.h>
46 #include <sys/ctype.h>
48 #include <sys/syslog.h>
49 #include <sys/dsched.h>
50 #include <machine/param.h>
52 #include <kern/dsched/fq/fq.h>
/*
 * Policy version. fq_mod_handler formats these as maj.min into the string
 * behind the version sysctl and prints them when the policy is loaded.
 */
54 static int dsched_fq_version_maj = 1;
55 static int dsched_fq_version_min = 1;
57 /* Compile-time check that the fq structs do not exceed the DSCHED_*_MAX_SZ limits */
58 CTASSERT(sizeof(struct fq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ);
59 CTASSERT(sizeof(struct fq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ);
/*
 * Global fq statistics. Zeroed in fq_mod_handler, exported read-only through
 * do_fqstats, and procs_limited is bumped in fq_balance_self.
 */
61 struct dsched_fq_stats fq_stats;
/* Policy descriptor defined elsewhere; passed to dsched_register and dsched_unregister in fq_mod_handler. */
63 extern struct dsched_policy dsched_fq_policy;
/*
 * fq_dispatcher: dispatcher for one fq disk context.
 *
 * With the disk context lock held it sleeps on diskctx (timeout hz/15).
 * It then walks diskctx->head.tdio_list and, under each per-thread lock,
 * removes queued bios from tdio->head.queue and records the (bio, tdio)
 * pair in dispatch_ary. The per-bio check is true once
 * issued + prepd_io reaches max_tp (when max_tp > 0) or once the index
 * equals FQ_DISPATCH_ARRAY_SZ. After the walk the disk context is
 * referenced and unlocked, the recorded bios are handed to fq_dispatch
 * from the highest filled index down to 0, and the lock is retaken before
 * the reference is dropped again.
 *
 * When diskctx->die is 1 after a wakeup, the queues are flushed with
 * fq_drain(diskctx, FQ_DRAIN_FLUSH), the lock is released and a message
 * is printed.
 *
 * NOTE(review): several lines of this function (braces, loop headers,
 * declarations of idle, i and prepd_io, and the statements guarded by
 * some of the conditions) are not visible in this copy. This description
 * covers only the statements that are; the guarded statements are
 * presumably continue or break - confirm against the full file.
 */
66 fq_dispatcher(struct fq_disk_ctx *diskctx)
68 struct dispatch_prep dispatch_ary[FQ_DISPATCH_ARRAY_SZ];
69 struct dsched_thread_io *ds_tdio, *ds_tdio2;
70 struct fq_thread_io *tdio;
71 struct bio *bio, *bio2;
76 * We need to manually assign an tdio to the tdctx of this thread
77 * since it isn't assigned one during fq_prepare, as the disk
80 tdio = (struct fq_thread_io *)dsched_new_policy_thread_tdio(&diskctx->head,
83 DSCHED_DISK_CTX_LOCK(&diskctx->head);
/* Sleep on diskctx with the disk context lock as interlock; a 0 return is handled as an explicit wakeup. */
87 if ((lksleep(diskctx, &diskctx->head.lock, 0, "fq_dispatcher", hz/15) == 0)) {
89 * We've been woken up; this either means that we are
90 * supposed to die away nicely or that the disk is idle.
93 if (__predict_false(diskctx->die == 1)) {
94 /* If we are supposed to die, drain all queues */
95 fq_drain(diskctx, FQ_DRAIN_FLUSH);
97 /* Now we can safely unlock and exit */
98 DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
99 kprintf("fq_dispatcher is peacefully dying\n");
105 * We have been awakened because the disk is idle.
106 * So let's get ready to dispatch some extra bios.
111 /* Maybe the disk is idle and we just didn't get the wakeup */
113 idle = diskctx->idle;
115 /* Set the number of prepared requests to 0 */
119 * XXX: further room for improvements here. It would be better
120 * to dispatch a few requests from each tdio as to ensure
123 TAILQ_FOREACH_MUTABLE(ds_tdio, &diskctx->head.tdio_list, dlink, ds_tdio2) {
124 tdio = (struct fq_thread_io *)ds_tdio;
125 if (tdio->head.qlength == 0)
128 DSCHED_THREAD_IO_LOCK(&tdio->head);
/* Consume a pending rebalance request (flag flipped from 1 to 0) by recomputing the limit for this tdio. */
129 if (atomic_cmpset_int(&tdio->rebalance, 1, 0))
130 fq_balance_self(tdio);
132 * XXX: why 5 extra? should probably be dynamic,
133 * relying on information on latency.
135 if ((tdio->max_tp > 0) && idle &&
136 (tdio->issued >= tdio->max_tp)) {
141 TAILQ_FOREACH_MUTABLE(bio, &tdio->head.queue, link, bio2) {
/* The rebalance flag is checked again for every queued bio. */
142 if (atomic_cmpset_int(&tdio->rebalance, 1, 0))
143 fq_balance_self(tdio);
/* True when a limit is set and issued plus prepared I/O has reached it, or when dispatch_ary is full. */
144 if (((tdio->max_tp > 0) &&
145 (tdio->issued + prepd_io >= tdio->max_tp)) ||
146 (i == FQ_DISPATCH_ARRAY_SZ))
/* Take the bio off the per-thread queue and keep qlength in step. */
149 TAILQ_REMOVE(&tdio->head.queue, bio, link);
150 --tdio->head.qlength;
153 * beware that we do have an tdio reference
156 * XXX: note that here we don't dispatch it yet
157 * but just prepare it for dispatch so
158 * that no locks are held when calling
161 dispatch_ary[i].bio = bio;
162 dispatch_ary[i].tdio = tdio;
166 DSCHED_THREAD_IO_UNLOCK(&tdio->head);
170 dsched_disk_ctx_ref(&diskctx->head);
171 DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
174 * Dispatch all the previously prepared bios, now without
177 for (--i; i >= 0; i--) {
178 bio = dispatch_ary[i].bio;
179 tdio = dispatch_ary[i].tdio;
180 fq_dispatch(diskctx, bio, tdio);
183 DSCHED_DISK_CTX_LOCK(&diskctx->head);
184 dsched_disk_ctx_unref(&diskctx->head);
/*
 * fq_balance_thread: periodic budget calculation for one fq disk context.
 *
 * With the disk context lock held it sleeps on curthread (timeout hz/2).
 * For each interval it computes the elapsed time, derives
 * diskctx->disk_busy as a percentage, snapshots and resets the per-thread
 * latency and transaction counters, accumulates the total used budget,
 * and writes a per-priority share into diskctx->budgetpb[]. Finally it
 * shifts last_full into prev_full and sets last_full from disk_busy.
 *
 * When the sleep returns 0 and diskctx->die is set, the lock is released.
 *
 * NOTE(review): several lines (loop headers, the call that fills tv, the
 * body of the overflow loop, initialisation of n, sum, total_budget and
 * lost_bits, and the per-thread work in the final list walk) are not
 * visible in this copy; the comments below cover only what is.
 */
189 fq_balance_thread(struct fq_disk_ctx *diskctx)
191 struct dsched_thread_io *ds_tdio;
192 struct fq_thread_io *tdio;
193 struct timeval tv, old_tv;
194 int64_t total_budget, product;
195 int64_t budget[FQ_PRIO_MAX+1];
196 int n, i, sum, total_disk_time;
199 DSCHED_DISK_CTX_LOCK(&diskctx->head);
201 getmicrotime(&diskctx->start_interval);
205 if ((lksleep(curthread, &diskctx->head.lock, 0, "fq_balancer", hz/2) == 0)) {
206 if (__predict_false(diskctx->die)) {
207 DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
/* budget[] is reused below as a per-priority count of threads that did I/O in this interval. */
212 bzero(budget, sizeof(budget));
216 old_tv = diskctx->start_interval;
/* Elapsed time between old_tv and tv in microseconds, truncated to int. */
219 total_disk_time = (int)(1000000*((tv.tv_sec - old_tv.tv_sec)) +
220 (tv.tv_usec - old_tv.tv_usec));
/* total_disk_time is the divisor of the disk_busy computation below. */
222 if (total_disk_time == 0)
225 dsched_debug(LOG_INFO, "total_disk_time = %d\n", total_disk_time);
227 diskctx->start_interval = tv;
/* Percentage of the interval that was not counted as idle_time, clamped to 0 from below. */
229 diskctx->disk_busy = (100*(total_disk_time - diskctx->idle_time)) / total_disk_time;
230 if (diskctx->disk_busy < 0)
231 diskctx->disk_busy = 0;
233 diskctx->idle_time = 0;
236 TAILQ_FOREACH(ds_tdio, &diskctx->head.tdio_list, dlink) {
237 tdio = (struct fq_thread_io *)ds_tdio;
/* Snapshot the running counters; fq_balance_self reads the interval_* copies. */
238 tdio->interval_avg_latency = tdio->avg_latency;
239 tdio->interval_transactions = tdio->transactions;
240 if (tdio->interval_transactions > 0) {
/* Budget used by this thread: average latency times transaction count, scaled down by lost_bits. */
241 product = (int64_t)tdio->interval_avg_latency *
242 tdio->interval_transactions;
243 product >>= lost_bits;
/* Runs while adding product would reach INT64_MAX; NOTE(review): loop body not visible here. */
244 while(total_budget >= INT64_MAX - product) {
249 total_budget += product;
/* Count this thread under its ionice level, or level 0 when the tdio has no process. */
250 ++budget[(tdio->head.p) ? tdio->head.p->p_ionice : 0];
251 KKASSERT(total_budget >= 0);
252 dsched_debug(LOG_INFO,
253 "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
254 n, tdio->interval_avg_latency, tdio->interval_transactions,
255 (tdio->head.p) ? tdio->head.p->p_ionice : 0);
/* Restart the running counters for the next interval. */
261 tdio->transactions = 0;
262 tdio->avg_latency = 0;
266 dsched_debug(LOG_INFO, "%d procs competing for disk\n"
267 "total_budget = %jd (lost bits = %d)\n"
268 "incomplete tp = %d\n", n, (intmax_t)total_budget,
269 lost_bits, diskctx->incomplete_tp);
/* Weighted sum: each counted thread at priority i contributes FQ_PRIO_BIAS + i. */
276 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
279 sum += (FQ_PRIO_BIAS+i)*budget[i];
285 dsched_debug(LOG_INFO, "sum = %d\n", sum);
287 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
292 * XXX: if we still overflow here, we really need to switch to
293 * some more advanced mechanism such as compound int128 or
294 * storing the lost bits so they can be used in the
/* Share for priority i: weight times total_budget over the weighted sum, shifted back up by lost_bits. */
297 diskctx->budgetpb[i] = ((FQ_PRIO_BIAS+i)*total_budget/sum) << lost_bits;
298 KKASSERT(diskctx->budgetpb[i] >= 0);
301 dsched_debug(4, "disk is %d%% busy\n", diskctx->disk_busy);
302 TAILQ_FOREACH(ds_tdio, &diskctx->head.tdio_list, dlink) {
303 tdio = (struct fq_thread_io *)ds_tdio;
/* Keep the full/not-full state of the last two intervals; full means disk_busy of at least 90. */
307 diskctx->prev_full = diskctx->last_full;
308 diskctx->last_full = (diskctx->disk_busy >= 90)?1:0;
314 * fq_balance_self should be called from all sorts of dispatchers. It basically
315 * offloads some of the heavier calculations on throttling onto the process that
316 * wants to do I/O instead of doing it in the fq_balance thread.
317 * - should be called with diskctx lock held
/*
 * Recompute the throttling limit of one tdio from its last interval
 * snapshot. used_budget is interval_avg_latency times
 * interval_transactions; budget is the diskctx->budgetpb[] entry for the
 * ionice level of the owning process (entry 0 when there is no process).
 * When used_budget exceeds budget and disk_busy is at least 90, max_tp is
 * set to budget / avg_latency and fq_stats.procs_limited is incremented.
 *
 * NOTE(review): the declaration of avg_latency and the body of the final
 * else-if branch are not visible in this copy.
 */
320 fq_balance_self(struct fq_thread_io *tdio) {
321 struct fq_disk_ctx *diskctx;
323 int64_t budget, used_budget;
325 int64_t transactions;
327 transactions = (int64_t)tdio->interval_transactions;
328 avg_latency = (int64_t)tdio->interval_avg_latency;
329 diskctx = (struct fq_disk_ctx *)tdio->head.diskctx;
332 /* XXX: do we really require the lock? */
333 DSCHED_DISK_CTX_LOCK_ASSERT(diskctx);
336 used_budget = ((int64_t)avg_latency * transactions);
337 budget = diskctx->budgetpb[(tdio->head.p) ? tdio->head.p->p_ionice : 0];
339 if (used_budget > 0) {
/* NOTE(review): budget is passed to %jd without the (intmax_t) cast that used_budget gets. */
340 dsched_debug(LOG_INFO,
341 "info: used_budget = %jd, budget = %jd\n",
342 (intmax_t)used_budget, budget);
/* Over budget while the disk is at least 90 percent busy: cap the transaction count. */
345 if ((used_budget > budget) && (diskctx->disk_busy >= 90)) {
/* avg_latency is the divisor below. */
346 KKASSERT(avg_latency != 0);
348 tdio->max_tp = budget/(avg_latency);
349 atomic_add_int(&fq_stats.procs_limited, 1);
351 dsched_debug(LOG_INFO,
352 "rate limited to %d transactions\n", tdio->max_tp);
/* Used under half the budget or disk under 80 percent busy, and neither of the last two intervals was full. */
354 } else if (((used_budget*2 < budget) || (diskctx->disk_busy < 80)) &&
355 (!diskctx->prev_full && !diskctx->last_full)) {
/*
 * Sysctl handler for the stats node: hands the whole fq_stats structure to
 * sysctl_handle_opaque and returns its result.
 */
362 do_fqstats(SYSCTL_HANDLER_ARGS)
364 return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
/*
 * Module event handler for the fq policy.
 *
 * Visible load-side steps: format the version string, zero fq_stats,
 * register dsched_fq_policy, create a sysctl node under _dsched with a
 * read-only stats entry (served by do_fqstats) and a read-only version
 * string, then print a loaded message.
 * Visible unload-side steps: unregister dsched_fq_policy, free the sysctl
 * context and print an unloaded message.
 *
 * NOTE(review): the switch on type, its case labels, the declaration of
 * error and the return statements are not visible in this copy, so which
 * event runs which steps is presumed from the printed messages.
 */
368 fq_mod_handler(module_t mod, int type, void *unused)
/* Static so the sysctl context, node and version buffer outlive a single handler call. */
370 static struct sysctl_ctx_list sysctl_ctx;
371 static struct sysctl_oid *oid;
372 static char version[16];
375 ksnprintf(version, sizeof(version), "%d.%d",
376 dsched_fq_version_maj, dsched_fq_version_min);
380 bzero(&fq_stats, sizeof(struct dsched_fq_stats));
/* A non-zero result from dsched_register is kept in error. */
381 if ((error = dsched_register(&dsched_fq_policy)))
384 sysctl_ctx_init(&sysctl_ctx);
385 oid = SYSCTL_ADD_NODE(&sysctl_ctx,
386 SYSCTL_STATIC_CHILDREN(_dsched),
391 SYSCTL_ADD_PROC(&sysctl_ctx, SYSCTL_CHILDREN(oid),
392 OID_AUTO, "stats", CTLTYPE_OPAQUE|CTLFLAG_RD,
393 0, 0, do_fqstats, "S,dsched_fq_stats", "fq statistics");
395 SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
396 OID_AUTO, "version", CTLFLAG_RD, version, 0, "fq version");
398 kprintf("FQ scheduler policy version %d.%d loaded\n",
399 dsched_fq_version_maj, dsched_fq_version_min);
/* A non-zero result from dsched_unregister is kept in error. */
403 if ((error = dsched_unregister(&dsched_fq_policy)))
405 sysctl_ctx_free(&sysctl_ctx);
406 kprintf("FQ scheduler policy unloaded\n");
/* Ties fq_mod_handler to the dsched_fq module name; NOTE(review): the macro is not defined in the visible source. */
416 DSCHED_POLICY_MODULE(dsched_fq, fq_mod_handler);