kernel/dsched: Add a version parameter to the DSCHED_POLICY_MODULE macro.
[dragonfly.git] / sys / kern / dsched / fq / fq_core.c
CommitLineData
74ce043b
AH
1/*
2 * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Alex Hornung <ahornung@gmail.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34#include <sys/param.h>
35#include <sys/systm.h>
36#include <sys/kernel.h>
37#include <sys/proc.h>
38#include <sys/sysctl.h>
39#include <sys/buf.h>
40#include <sys/conf.h>
41#include <sys/diskslice.h>
42#include <sys/disk.h>
43#include <machine/atomic.h>
74ce043b
AH
44#include <sys/thread.h>
45#include <sys/thread2.h>
74ce043b 46#include <sys/ctype.h>
74ce043b 47#include <sys/buf2.h>
e02e815e 48#include <sys/syslog.h>
74ce043b 49#include <sys/dsched.h>
74ce043b
AH
50#include <machine/param.h>
51
c6d593c9 52#include <kern/dsched/fq/fq.h>
74ce043b 53
e02e815e 54static int dsched_fq_version_maj = 1;
009da86a 55static int dsched_fq_version_min = 1;
74ce043b 56
e02e815e
AH
57/* Make sure our structs fit */
58CTASSERT(sizeof(struct fq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ);
59CTASSERT(sizeof(struct fq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ);
74ce043b
AH
60
61struct dsched_fq_stats fq_stats;
62
0160356d 63extern struct dsched_policy dsched_fq_policy;
74ce043b 64
74ce043b 65void
0160356d 66fq_dispatcher(struct fq_disk_ctx *diskctx)
74ce043b 67{
7c76e73a 68 struct dispatch_prep *dispatch_ary;
e02e815e
AH
69 struct dsched_thread_io *ds_tdio, *ds_tdio2;
70 struct fq_thread_io *tdio;
74ce043b 71 struct bio *bio, *bio2;
0f0e78e2 72 int idle;
009da86a 73 int i, prepd_io;
74ce043b 74
aa166ad1 75 /*
7c76e73a
MD
76 * Array is dangerously big for an on-stack declaration, allocate
77 * it instead.
78 */
79 dispatch_ary = kmalloc(sizeof(*dispatch_ary) * FQ_DISPATCH_ARRAY_SZ,
80 M_TEMP, M_INTWAIT | M_ZERO);
81
82 /*
0160356d 83 * We need to manually assign an tdio to the tdctx of this thread
aa166ad1
AH
84 * since it isn't assigned one during fq_prepare, as the disk
85 * is not set up yet.
86 */
89dabacd
AH
87 tdio = (struct fq_thread_io *)dsched_new_policy_thread_tdio(&diskctx->head,
88 &dsched_fq_policy);
aa166ad1 89
e02e815e 90 DSCHED_DISK_CTX_LOCK(&diskctx->head);
74ce043b 91 for(;;) {
9cc004d0 92 idle = 0;
7c76e73a
MD
93 /*
94 * sleep ~60 ms, failsafe low hz rates.
95 */
96 if ((lksleep(diskctx, &diskctx->head.lock, 0,
97 "fq_dispatcher", (hz + 14) / 15) == 0)) {
0f0e78e2
AH
98 /*
99 * We've been woken up; this either means that we are
100 * supposed to die away nicely or that the disk is idle.
101 */
102
4cda7147
MD
103 if (__predict_false(diskctx->die == 1))
104 break;
0f0e78e2
AH
105
106 /*
107 * We have been awakened because the disk is idle.
108 * So let's get ready to dispatch some extra bios.
109 */
110 idle = 1;
74ce043b
AH
111 }
112
0f0e78e2 113 /* Maybe the disk is idle and we just didn't get the wakeup */
9cc004d0 114 if (idle == 0)
0160356d 115 idle = diskctx->idle;
9cc004d0 116
009da86a
AH
117 /* Set the number of prepared requests to 0 */
118 i = 0;
119
0f0e78e2
AH
120 /*
121 * XXX: further room for improvements here. It would be better
0160356d 122 * to dispatch a few requests from each tdio as to ensure
0f0e78e2
AH
123 * real fairness.
124 */
e02e815e
AH
125 TAILQ_FOREACH_MUTABLE(ds_tdio, &diskctx->head.tdio_list, dlink, ds_tdio2) {
126 tdio = (struct fq_thread_io *)ds_tdio;
127 if (tdio->head.qlength == 0)
0f0e78e2
AH
128 continue;
129
e02e815e 130 DSCHED_THREAD_IO_LOCK(&tdio->head);
0160356d
AH
131 if (atomic_cmpset_int(&tdio->rebalance, 1, 0))
132 fq_balance_self(tdio);
0f0e78e2
AH
133 /*
134 * XXX: why 5 extra? should probably be dynamic,
135 * relying on information on latency.
136 */
0160356d
AH
137 if ((tdio->max_tp > 0) && idle &&
138 (tdio->issued >= tdio->max_tp)) {
139 tdio->max_tp += 5;
0f0e78e2
AH
140 }
141
009da86a 142 prepd_io = 0;
e02e815e 143 TAILQ_FOREACH_MUTABLE(bio, &tdio->head.queue, link, bio2) {
0160356d
AH
144 if (atomic_cmpset_int(&tdio->rebalance, 1, 0))
145 fq_balance_self(tdio);
009da86a
AH
146 if (((tdio->max_tp > 0) &&
147 (tdio->issued + prepd_io >= tdio->max_tp)) ||
148 (i == FQ_DISPATCH_ARRAY_SZ))
0f0e78e2
AH
149 break;
150
e02e815e
AH
151 TAILQ_REMOVE(&tdio->head.queue, bio, link);
152 --tdio->head.qlength;
74ce043b 153
9cc004d0 154 /*
0160356d 155 * beware that we do have an tdio reference
0f0e78e2 156 * from the queueing
009da86a
AH
157 *
158 * XXX: note that here we don't dispatch it yet
159 * but just prepare it for dispatch so
160 * that no locks are held when calling
161 * into the drivers.
9cc004d0 162 */
009da86a
AH
163 dispatch_ary[i].bio = bio;
164 dispatch_ary[i].tdio = tdio;
165 ++i;
166 ++prepd_io;
74ce043b 167 }
e02e815e 168 DSCHED_THREAD_IO_UNLOCK(&tdio->head);
0f0e78e2 169
74ce043b 170 }
009da86a
AH
171
172 dsched_disk_ctx_ref(&diskctx->head);
173 DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
174
175 /*
176 * Dispatch all the previously prepared bios, now without
177 * holding any locks.
178 */
179 for (--i; i >= 0; i--) {
180 bio = dispatch_ary[i].bio;
181 tdio = dispatch_ary[i].tdio;
182 fq_dispatch(diskctx, bio, tdio);
183 }
184
185 DSCHED_DISK_CTX_LOCK(&diskctx->head);
186 dsched_disk_ctx_unref(&diskctx->head);
74ce043b 187 }
4cda7147
MD
188
189 /*
190 * If we are supposed to die, drain all queues, then
191 * unlock and exit.
192 */
193 fq_drain(diskctx, FQ_DRAIN_FLUSH);
194 DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
195 kfree(dispatch_ary, M_TEMP);
196
197 kprintf("fq_dispatcher is peacefully dying\n");
198 lwkt_exit();
199 /* NOTREACHED */
74ce043b
AH
200}
201
74ce043b 202void
0160356d 203fq_balance_thread(struct fq_disk_ctx *diskctx)
74ce043b 204{
e02e815e
AH
205 struct dsched_thread_io *ds_tdio;
206 struct fq_thread_io *tdio;
654a51ea 207 struct timeval tv, old_tv;
0746e160 208 int64_t total_budget, product;
38f2331e 209 int64_t budget[FQ_PRIO_MAX+1];
e6c2b48a 210 int n, i, sum, total_disk_time;
0746e160 211 int lost_bits;
74ce043b 212
e02e815e 213 DSCHED_DISK_CTX_LOCK(&diskctx->head);
654a51ea
AH
214
215 getmicrotime(&diskctx->start_interval);
216
81b5f250
AH
217 for (;;) {
218 /* sleep ~1s */
e02e815e 219 if ((lksleep(curthread, &diskctx->head.lock, 0, "fq_balancer", hz/2) == 0)) {
0160356d 220 if (__predict_false(diskctx->die)) {
e02e815e 221 DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
81b5f250
AH
222 lwkt_exit();
223 }
224 }
225
38f2331e 226 bzero(budget, sizeof(budget));
81b5f250
AH
227 total_budget = 0;
228 n = 0;
229
654a51ea 230 old_tv = diskctx->start_interval;
81b5f250 231 getmicrotime(&tv);
ef46c87b 232
ef46c87b
AH
233 total_disk_time = (int)(1000000*((tv.tv_sec - old_tv.tv_sec)) +
234 (tv.tv_usec - old_tv.tv_usec));
41b0c7c3
AH
235
236 if (total_disk_time == 0)
237 total_disk_time = 1;
238
ef46c87b 239 dsched_debug(LOG_INFO, "total_disk_time = %d\n", total_disk_time);
ef46c87b 240
654a51ea 241 diskctx->start_interval = tv;
ef46c87b 242
0160356d
AH
243 diskctx->disk_busy = (100*(total_disk_time - diskctx->idle_time)) / total_disk_time;
244 if (diskctx->disk_busy < 0)
245 diskctx->disk_busy = 0;
74ce043b 246
0160356d 247 diskctx->idle_time = 0;
0746e160 248 lost_bits = 0;
d161bce9 249
e02e815e
AH
250 TAILQ_FOREACH(ds_tdio, &diskctx->head.tdio_list, dlink) {
251 tdio = (struct fq_thread_io *)ds_tdio;
0160356d
AH
252 tdio->interval_avg_latency = tdio->avg_latency;
253 tdio->interval_transactions = tdio->transactions;
254 if (tdio->interval_transactions > 0) {
51a4bb9f
SW
255 product = (int64_t)tdio->interval_avg_latency *
256 tdio->interval_transactions;
0746e160
AH
257 product >>= lost_bits;
258 while(total_budget >= INT64_MAX - product) {
259 ++lost_bits;
260 product >>= 1;
261 total_budget >>= 1;
262 }
263 total_budget += product;
e02e815e 264 ++budget[(tdio->head.p) ? tdio->head.p->p_ionice : 0];
38f2331e 265 KKASSERT(total_budget >= 0);
81b5f250
AH
266 dsched_debug(LOG_INFO,
267 "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
0160356d 268 n, tdio->interval_avg_latency, tdio->interval_transactions,
e02e815e 269 (tdio->head.p) ? tdio->head.p->p_ionice : 0);
81b5f250
AH
270 ++n;
271 } else {
0160356d 272 tdio->max_tp = 0;
81b5f250 273 }
0160356d
AH
274 tdio->rebalance = 0;
275 tdio->transactions = 0;
276 tdio->avg_latency = 0;
277 tdio->issued = 0;
74ce043b 278 }
74ce043b 279
81b5f250 280 dsched_debug(LOG_INFO, "%d procs competing for disk\n"
760d1e3d
AHJ
281 "total_budget = %jd (lost bits = %d)\n"
282 "incomplete tp = %d\n", n, (intmax_t)total_budget,
0160356d 283 lost_bits, diskctx->incomplete_tp);
d161bce9 284
81b5f250 285 if (n == 0)
aa166ad1 286 continue;
3ee00e04 287
81b5f250 288 sum = 0;
3ee00e04 289
81b5f250 290 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
38f2331e 291 if (budget[i] == 0)
81b5f250 292 continue;
38f2331e 293 sum += (FQ_PRIO_BIAS+i)*budget[i];
81b5f250 294 }
74ce043b 295
81b5f250
AH
296 if (sum == 0)
297 sum = 1;
aa166ad1 298
81b5f250 299 dsched_debug(LOG_INFO, "sum = %d\n", sum);
aa166ad1 300
81b5f250 301 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
38f2331e 302 if (budget[i] == 0)
81b5f250 303 continue;
aa166ad1 304
0746e160
AH
305 /*
306 * XXX: if we still overflow here, we really need to switch to
307 * some more advanced mechanism such as compound int128 or
308 * storing the lost bits so they can be used in the
309 * fq_balance_self.
310 */
0160356d
AH
311 diskctx->budgetpb[i] = ((FQ_PRIO_BIAS+i)*total_budget/sum) << lost_bits;
312 KKASSERT(diskctx->budgetpb[i] >= 0);
81b5f250 313 }
d161bce9 314
0160356d 315 dsched_debug(4, "disk is %d%% busy\n", diskctx->disk_busy);
e02e815e
AH
316 TAILQ_FOREACH(ds_tdio, &diskctx->head.tdio_list, dlink) {
317 tdio = (struct fq_thread_io *)ds_tdio;
0160356d 318 tdio->rebalance = 1;
e6c2b48a 319 }
74ce043b 320
0160356d
AH
321 diskctx->prev_full = diskctx->last_full;
322 diskctx->last_full = (diskctx->disk_busy >= 90)?1:0;
e6c2b48a
AH
323 }
324}
81b5f250 325
81b5f250 326
e6c2b48a
AH
327/*
328 * fq_balance_self should be called from all sorts of dispatchers. It basically
329 * offloads some of the heavier calculations on throttling onto the process that
330 * wants to do I/O instead of doing it in the fq_balance thread.
0160356d 331 * - should be called with diskctx lock held
e6c2b48a
AH
332 */
333void
0160356d
AH
334fq_balance_self(struct fq_thread_io *tdio) {
335 struct fq_disk_ctx *diskctx;
81b5f250 336
e6c2b48a
AH
337 int64_t budget, used_budget;
338 int64_t avg_latency;
339 int64_t transactions;
74ce043b 340
0160356d
AH
341 transactions = (int64_t)tdio->interval_transactions;
342 avg_latency = (int64_t)tdio->interval_avg_latency;
e02e815e 343 diskctx = (struct fq_disk_ctx *)tdio->head.diskctx;
0160356d
AH
344
345#if 0
346 /* XXX: do we really require the lock? */
e02e815e 347 DSCHED_DISK_CTX_LOCK_ASSERT(diskctx);
0160356d 348#endif
e6c2b48a 349
51295aee 350 used_budget = avg_latency * transactions;
e02e815e 351 budget = diskctx->budgetpb[(tdio->head.p) ? tdio->head.p->p_ionice : 0];
e6c2b48a
AH
352
353 if (used_budget > 0) {
354 dsched_debug(LOG_INFO,
760d1e3d
AHJ
355 "info: used_budget = %jd, budget = %jd\n",
356 (intmax_t)used_budget, budget);
e6c2b48a
AH
357 }
358
0160356d 359 if ((used_budget > budget) && (diskctx->disk_busy >= 90)) {
e6c2b48a
AH
360 KKASSERT(avg_latency != 0);
361
0160356d 362 tdio->max_tp = budget/(avg_latency);
e6c2b48a
AH
363 atomic_add_int(&fq_stats.procs_limited, 1);
364
365 dsched_debug(LOG_INFO,
0160356d 366 "rate limited to %d transactions\n", tdio->max_tp);
e6c2b48a 367
0160356d
AH
368 } else if (((used_budget*2 < budget) || (diskctx->disk_busy < 80)) &&
369 (!diskctx->prev_full && !diskctx->last_full)) {
370 tdio->max_tp = 0;
81b5f250 371 }
74ce043b
AH
372}
373
374
375static int
376do_fqstats(SYSCTL_HANDLER_ARGS)
377{
378 return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
379}
380
e02e815e
AH
381static int
382fq_mod_handler(module_t mod, int type, void *unused)
74ce043b 383{
e02e815e
AH
384 static struct sysctl_ctx_list sysctl_ctx;
385 static struct sysctl_oid *oid;
386 static char version[16];
387 int error;
74ce043b 388
e02e815e 389 ksnprintf(version, sizeof(version), "%d.%d",
74ce043b 390 dsched_fq_version_maj, dsched_fq_version_min);
74ce043b 391
e02e815e
AH
392 switch (type) {
393 case MOD_LOAD:
394 bzero(&fq_stats, sizeof(struct dsched_fq_stats));
395 if ((error = dsched_register(&dsched_fq_policy)))
396 return (error);
397
398 sysctl_ctx_init(&sysctl_ctx);
399 oid = SYSCTL_ADD_NODE(&sysctl_ctx,
400 SYSCTL_STATIC_CHILDREN(_dsched),
401 OID_AUTO,
402 "fq",
403 CTLFLAG_RD, 0, "");
404
405 SYSCTL_ADD_PROC(&sysctl_ctx, SYSCTL_CHILDREN(oid),
406 OID_AUTO, "stats", CTLTYPE_OPAQUE|CTLFLAG_RD,
407 0, 0, do_fqstats, "S,dsched_fq_stats", "fq statistics");
408
409 SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
410 OID_AUTO, "version", CTLFLAG_RD, version, 0, "fq version");
411
412 kprintf("FQ scheduler policy version %d.%d loaded\n",
413 dsched_fq_version_maj, dsched_fq_version_min);
414 break;
415
416 case MOD_UNLOAD:
417 if ((error = dsched_unregister(&dsched_fq_policy)))
418 return (error);
419 sysctl_ctx_free(&sysctl_ctx);
420 kprintf("FQ scheduler policy unloaded\n");
421 break;
422
423 default:
424 break;
425 }
74ce043b 426
e02e815e
AH
427 return 0;
428}
74ce043b 429
1f509c0d 430DSCHED_POLICY_MODULE(dsched_fq, fq_mod_handler, 1);