2 * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Alex Hornung <ahornung@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
41 #include <sys/diskslice.h>
43 #include <machine/atomic.h>
44 #include <sys/malloc.h>
45 #include <sys/thread.h>
46 #include <sys/thread2.h>
47 #include <sys/sysctl.h>
48 #include <sys/spinlock2.h>
49 #include <machine/md_var.h>
50 #include <sys/ctype.h>
51 #include <sys/syslog.h>
52 #include <sys/device.h>
53 #include <sys/msgport.h>
54 #include <sys/msgport2.h>
56 #include <sys/dsched.h>
57 #include <machine/varargs.h>
58 #include <machine/param.h>
60 #include <dsched/fq/dsched_fq.h>
/* Malloc type handed to the objcache argument structs below. */
62 MALLOC_DECLARE(M_DSCHEDFQ);
/* Policy version, printed as major.minor when the policy is loaded. */
64 static int dsched_fq_version_maj = 0;
65 static int dsched_fq_version_min = 7;
/* Global counters; exported read-only through the fq_stats sysctl handler. */
67 struct dsched_fq_stats fq_stats;
/*
 * Allocation parameters (object size, malloc type) passed to
 * objcache_create() for the dpriv, priv (fqp) and mpriv (fqmp) caches.
 */
69 struct objcache_malloc_args dsched_fq_dpriv_malloc_args = {
70 sizeof(struct dsched_fq_dpriv), M_DSCHEDFQ };
71 struct objcache_malloc_args dsched_fq_priv_malloc_args = {
72 sizeof(struct dsched_fq_priv), M_DSCHEDFQ };
73 struct objcache_malloc_args dsched_fq_mpriv_malloc_args = {
74 sizeof(struct dsched_fq_mpriv), M_DSCHEDFQ };
/* Object caches backing the three structure kinds above. */
76 static struct objcache *fq_dpriv_cache;
77 static struct objcache *fq_mpriv_cache;
78 static struct objcache *fq_priv_cache;
/* Global list of every fqmp; entries are added in fq_alloc_mpriv(). */
80 TAILQ_HEAD(, dsched_fq_mpriv) dsched_fqmp_list =
81 TAILQ_HEAD_INITIALIZER(dsched_fqmp_list);
/* NOTE(review): presumably the lock behind FQ_GLOBAL_FQMP_LOCK(); confirm in dsched_fq.h. */
83 struct spinlock fq_fqmp_lock;
/* Callout used by fq_balance_thread() to re-arm itself. */
84 struct callout fq_callout;
/* Policy operations table, defined in another translation unit. */
86 extern struct dsched_ops dsched_fq_ops;
/*
 * Take one reference on a per-disk structure (dpriv).
 *
 * The counter is incremented atomically and the value captured from
 * atomic_fetchadd_int() is asserted to be non-negative.
 * NOTE(review): the dereference path subtracts 3 from the counter to mark
 * an object as in destruction; presumably this assert catches a reference
 * taken on such an object - confirm.
 */
89 fq_reference_dpriv(struct dsched_fq_dpriv *dpriv)
93 refcount = atomic_fetchadd_int(&dpriv->refcount, 1);
95 KKASSERT(refcount >= 0);
/*
 * Take one reference on an fqp.
 *
 * The counter is incremented atomically and the value captured from
 * atomic_fetchadd_int() is asserted to be non-negative.
 * NOTE(review): the dereference path subtracts 3 from the counter to mark
 * an object as in destruction; presumably this assert catches a reference
 * taken on such an object - confirm.
 */
99 fq_reference_priv(struct dsched_fq_priv *fqp)
103 refcount = atomic_fetchadd_int(&fqp->refcount, 1);
105 KKASSERT(refcount >= 0);
/*
 * Take one reference on an fqmp.
 *
 * The counter is incremented atomically and the value captured from
 * atomic_fetchadd_int() is asserted to be non-negative.
 * NOTE(review): the dereference path subtracts 3 from the counter to mark
 * an object as in destruction; presumably this assert catches a reference
 * taken on such an object - confirm.
 */
109 fq_reference_mpriv(struct dsched_fq_mpriv *fqmp)
113 refcount = atomic_fetchadd_int(&fqmp->refcount, 1);
115 KKASSERT(refcount >= 0);
/*
 * Drop one reference on a per-disk structure (dpriv).
 *
 * The teardown sequence visible below marks the counter as in destruction,
 * unlinks every fqp still on dpriv->fq_priv_list (dropping one reference
 * on each) and hands the dpriv back to fq_dpriv_cache.
 * NOTE(review): presumably the teardown only runs when the last reference
 * is dropped; confirm against the guarding condition.
 */
119 fq_dereference_dpriv(struct dsched_fq_dpriv *dpriv)
121 struct dsched_fq_priv *fqp, *fqp2;
124 refcount = atomic_fetchadd_int(&dpriv->refcount, -1);
/* Accepts counts down to -3, the offset applied by the destruction marker. */
127 KKASSERT(refcount >= -3);
130 atomic_subtract_int(&dpriv->refcount, 3); /* mark as: in destruction */
132 kprintf("dpriv (%p) destruction started, trace:\n", dpriv);
135 spin_lock_wr(&dpriv->lock);
/* The _MUTABLE variant is used because entries are removed while walking. */
136 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
137 TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
/*
 * Clear the link flag before dropping the reference: fq_dereference_priv()
 * only touches the dpriv list (and its lock) while FQP_LINKED_DPRIV is set.
 */
138 fqp->flags &= ~FQP_LINKED_DPRIV;
139 fq_dereference_priv(fqp);
141 spin_unlock_wr(&dpriv->lock);
/* Return the object to its cache and keep the allocation counter in step. */
143 objcache_put(fq_dpriv_cache, dpriv);
144 atomic_subtract_int(&fq_stats.dpriv_allocations, 1);
/*
 * Drop one reference on an fqp.
 *
 * The teardown visible below marks the counter as in destruction, unlinks
 * the fqp from the dpriv list and from the fqmp list when the matching
 * FQP_LINKED_* flag is set, returns it to fq_priv_cache and finally drops
 * one reference on the dpriv.
 * NOTE(review): presumably the teardown is only entered when the last
 * reference goes away, and dpriv / fqmp are loaded from the fqp before the
 * asserts on them; confirm against the full function.
 */
149 fq_dereference_priv(struct dsched_fq_priv *fqp)
151 struct dsched_fq_mpriv *fqmp;
152 struct dsched_fq_dpriv *dpriv;
155 refcount = atomic_fetchadd_int(&fqp->refcount, -1);
/* Accepts counts down to -3, the offset applied by the destruction marker. */
157 KKASSERT(refcount >= -3);
160 atomic_subtract_int(&fqp->refcount, 3); /* mark as: in destruction */
162 kprintf("fqp (%p) destruction started, trace:\n", fqp);
166 KKASSERT(dpriv != NULL);
168 spin_lock_wr(&fqp->lock);
/* An fqp must not be destroyed while it still has queued requests. */
170 KKASSERT(fqp->qlength == 0);
/* Unlink from the per-disk list, under the dpriv lock, if still linked. */
172 if (fqp->flags & FQP_LINKED_DPRIV) {
173 spin_lock_wr(&dpriv->lock);
175 TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
176 fqp->flags &= ~FQP_LINKED_DPRIV;
178 spin_unlock_wr(&dpriv->lock);
/* Unlink from the fqmp list, under the fqmp lock, if still linked. */
181 if (fqp->flags & FQP_LINKED_FQMP) {
183 KKASSERT(fqmp != NULL);
185 spin_lock_wr(&fqmp->lock);
187 TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
188 fqp->flags &= ~FQP_LINKED_FQMP;
190 spin_unlock_wr(&fqmp->lock);
193 spin_unlock_wr(&fqp->lock);
/* Return the object to its cache and keep the allocation counter in step. */
195 objcache_put(fq_priv_cache, fqp);
196 atomic_subtract_int(&fq_stats.fqp_allocations, 1);
/*
 * Uses the local dpriv pointer since the fqp has already been returned to
 * the cache; fq_alloc_priv() takes a dpriv reference for each fqp created.
 */
198 fq_dereference_dpriv(dpriv);
/*
 * Drop one reference on an fqmp.
 *
 * The teardown visible below marks the counter as in destruction, unlinks
 * every fqp on fqmp->fq_priv_list (dropping one reference on each), takes
 * the fqmp off the global dsched_fqmp_list and returns it to
 * fq_mpriv_cache.
 * NOTE(review): presumably the teardown only runs when the last reference
 * is dropped; confirm against the guarding condition.
 */
204 fq_dereference_mpriv(struct dsched_fq_mpriv *fqmp)
206 struct dsched_fq_priv *fqp, *fqp2;
209 refcount = atomic_fetchadd_int(&fqmp->refcount, -1);
/* Accepts counts down to -3, the offset applied by the destruction marker. */
211 KKASSERT(refcount >= -3);
214 atomic_subtract_int(&fqmp->refcount, 3); /* mark as: in destruction */
216 kprintf("fqmp (%p) destruction started, trace:\n", fqmp);
/* Lock order here: global fqmp list lock first, then the fqmp lock. */
219 FQ_GLOBAL_FQMP_LOCK();
220 spin_lock_wr(&fqmp->lock);
/* The _MUTABLE variant is used because entries are removed while walking. */
222 TAILQ_FOREACH_MUTABLE(fqp, &fqmp->fq_priv_list, link, fqp2) {
223 TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
/*
 * Clear the link flag before dropping the reference: fq_dereference_priv()
 * only touches the fqmp list (and its lock) while FQP_LINKED_FQMP is set.
 */
224 fqp->flags &= ~FQP_LINKED_FQMP;
225 fq_dereference_priv(fqp);
/* Take this fqmp off the global list. */
227 TAILQ_REMOVE(&dsched_fqmp_list, fqmp, link);
229 spin_unlock_wr(&fqmp->lock);
230 FQ_GLOBAL_FQMP_UNLOCK();
/* Return the object to its cache and keep the allocation counter in step. */
232 objcache_put(fq_mpriv_cache, fqmp);
233 atomic_subtract_int(&fq_stats.fqmp_allocations, 1);
/*
 * Allocate and initialise an fqp for disk dp.
 *
 * Takes a reference on the dpriv returned by dsched_get_disk_priv(dp),
 * obtains a zeroed fqp from fq_priv_cache, gives it an initial reference,
 * links it onto the dpriv list and bumps the allocation counter.
 * Declared to return a struct dsched_fq_priv pointer.
 */
238 struct dsched_fq_priv *
239 fq_alloc_priv(struct disk *dp)
241 struct dsched_fq_priv *fqp;
/* This reference is released at the end of fq_dereference_priv(). */
243 fq_reference_dpriv(dsched_get_disk_priv(dp));
/* M_WAITOK: the allocation is allowed to sleep. */
245 fqp = objcache_get(fq_priv_cache, M_WAITOK);
246 bzero(fqp, sizeof(struct dsched_fq_priv));
248 /* XXX: maybe we do need another ref for the disk list for fqp */
249 fq_reference_priv(fqp);
251 FQ_FQP_LOCKINIT(fqp);
255 fqp->dpriv = dsched_get_disk_priv(dp);
257 TAILQ_INIT(&fqp->queue);
/*
 * NOTE(review): other list updates on dpriv->fq_priv_list are done under
 * dpriv->lock; confirm that the lock is held around this insertion.
 */
258 TAILQ_INSERT_TAIL(&fqp->dpriv->fq_priv_list, fqp, dlink);
259 fqp->flags |= FQP_LINKED_DPRIV;
261 atomic_add_int(&fq_stats.fqp_allocations, 1);
/*
 * Allocate and initialise a per-disk structure (dpriv).
 *
 * Obtains a zeroed dpriv from fq_dpriv_cache, gives it an initial
 * reference, initialises its lock and its (empty) fqp list and bumps the
 * allocation counter. Declared to return a struct dsched_fq_dpriv pointer.
 * NOTE(review): dp is not used by any statement visible here; confirm
 * whether the caller or a later statement stores it in the dpriv.
 */
267 struct dsched_fq_dpriv *
268 fq_alloc_dpriv(struct disk *dp)
270 struct dsched_fq_dpriv *dpriv;
/* M_WAITOK: the allocation is allowed to sleep. */
272 dpriv = objcache_get(fq_dpriv_cache, M_WAITOK);
273 bzero(dpriv, sizeof(struct dsched_fq_dpriv));
274 fq_reference_dpriv(dpriv);
/* Both fields are already zero after the bzero() above. */
276 dpriv->avg_rq_time = 0;
277 dpriv->incomplete_tp = 0;
278 FQ_DPRIV_LOCKINIT(dpriv);
279 TAILQ_INIT(&dpriv->fq_priv_list);
281 atomic_add_int(&fq_stats.dpriv_allocations, 1);
/*
 * Allocate and initialise an fqmp.
 *
 * Obtains a zeroed fqmp from fq_mpriv_cache, gives it an initial
 * reference, then allocates one fqp for every disk yielded by
 * dsched_disk_enumerate() and links it onto fqmp->fq_priv_list. The fqmp
 * is finally added to the global dsched_fqmp_list.
 * Declared to return a struct dsched_fq_mpriv pointer.
 * NOTE(review): p is not used by any statement visible here; confirm
 * where the process is associated with the fqmp.
 */
286 struct dsched_fq_mpriv *
287 fq_alloc_mpriv(struct proc *p)
289 struct dsched_fq_mpriv *fqmp;
290 struct dsched_fq_priv *fqp;
291 struct disk *dp = NULL;
/* M_WAITOK: the allocation is allowed to sleep. */
293 fqmp = objcache_get(fq_mpriv_cache, M_WAITOK);
294 bzero(fqmp, sizeof(struct dsched_fq_mpriv));
295 fq_reference_mpriv(fqmp);
297 kprintf("fq_alloc_mpriv, new fqmp = %p\n", fqmp);
299 FQ_FQMP_LOCKINIT(fqmp);
301 TAILQ_INIT(&fqmp->fq_priv_list);
/*
 * dp starts as NULL and is fed back into the enumerator on each pass.
 * NOTE(review): presumably this visits every disk using the FQ policy;
 * confirm against dsched_disk_enumerate().
 */
303 while ((dp = dsched_disk_enumerate(dp, &dsched_fq_ops))) {
304 fqp = fq_alloc_priv(dp);
/* Extra reference on top of the one taken inside fq_alloc_priv(). */
307 fq_reference_priv(fqp);
310 TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
311 fqp->flags |= FQP_LINKED_FQMP;
/* Publish the fqmp on the global list under the global list lock. */
314 FQ_GLOBAL_FQMP_LOCK();
315 TAILQ_INSERT_TAIL(&dsched_fqmp_list, fqmp, link);
316 FQ_GLOBAL_FQMP_UNLOCK();
/* NOTE(review): the matching lock acquisition is not visible here; confirm pairing. */
317 FQ_FQMP_UNLOCK(fqmp);
319 atomic_add_int(&fq_stats.fqmp_allocations, 1);
/*
 * Dispatcher for one disk (dpriv).
 *
 * Visible behaviour: the calling thread first gets its own fqp for this
 * disk, linked onto the fqmp of curthread. It then sleeps on dpriv with a
 * timeout of hz/15 ticks. When ssleep() returns 0 and dpriv->die is set,
 * all queues are drained with fq_drain(FQ_DRAIN_FLUSH) and the dpriv lock
 * is released. Otherwise every fqp on the disk list is walked and queued
 * bios are handed to fq_dispatch(), subject to the max_tp / issued checks.
 */
325 fq_dispatcher(struct dsched_fq_dpriv *dpriv)
327 struct dsched_fq_mpriv *fqmp;
328 struct dsched_fq_priv *fqp, *fqp2;
329 struct bio *bio, *bio2;
333 * We need to manually assign an fqp to the fqmp of this thread
334 * since it isn't assigned one during fq_prepare, as the disk
337 fqmp = dsched_get_thread_priv(curthread);
338 KKASSERT(fqmp != NULL);
340 fqp = fq_alloc_priv(dpriv->dp);
343 fq_reference_priv(fqp);
345 TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
346 FQ_FQMP_UNLOCK(fqmp);
349 FQ_DPRIV_LOCK(dpriv);
353 if ((ssleep(dpriv, &dpriv->lock, 0, "fq_dispatcher", hz/15) == 0)) {
355 * We've been woken up; this either means that we are
356 * supposed to die away nicely or that the disk is idle.
359 if (__predict_false(dpriv->die == 1)) {
360 /* If we are supposed to die, drain all queues */
361 fq_drain(dpriv, FQ_DRAIN_FLUSH);
363 /* Now we can safely unlock and exit */
364 FQ_DPRIV_UNLOCK(dpriv);
365 kprintf("fq_dispatcher is peacefully dying\n");
371 * We have been awakened because the disk is idle.
372 * So let's get ready to dispatch some extra bios.
377 /* Maybe the disk is idle and we just didn't get the wakeup */
382 * XXX: further room for improvements here. It would be better
383 * to dispatch a few requests from each fqp as to ensure
/* Walk every fqp attached to this disk. */
386 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
/* NOTE(review): presumably an fqp with an empty queue is skipped; confirm. */
387 if (fqp->qlength == 0)
393 * XXX: why 5 extra? should probably be dynamic,
394 * relying on information on latency.
396 if ((fqp->max_tp > 0) && idle &&
397 (fqp->issued >= fqp->max_tp)) {
/* The _MUTABLE variant is used because bios are removed while walking. */
401 TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) {
/* A max_tp of zero or less means no limit is applied by this check. */
402 if ((fqp->max_tp > 0) &&
403 ((fqp->issued >= fqp->max_tp)))
406 TAILQ_REMOVE(&fqp->queue, bio, link);
410 * beware that we do have an fqp reference
413 fq_dispatch(dpriv, bio, fqp);
/*
 * Periodic rebalancing pass for one disk (dpriv).
 *
 * Visible steps:
 *  - derive total_disk_time from the time elapsed since the previous run
 *    (FQ_TOTAL_DISK_TIME on the first run) and compute disk_busy as a
 *    percentage using dpriv->idle_time;
 *  - sum avg_latency * transactions over every fqp with transactions > 0
 *    into total_budget, and count such fqps per I/O priority in budgetpb[];
 *  - turn those counts into a per-priority budget weighted by
 *    (FQ_PRIO_BIAS + priority);
 *  - set max_tp (rate limit) on any fqp whose used budget exceeds its
 *    budget while the disk is at least 90% busy;
 *  - re-arm fq_callout so that this function runs again after
 *    hz * FQ_REBALANCE_TIMEOUT ticks.
 *
 * NOTE(review): the function-static state below is shared by every dpriv
 * this function is invoked for; confirm that only one disk is balanced
 * through it at a time.
 */
422 fq_balance_thread(struct dsched_fq_dpriv *dpriv)
424 struct dsched_fq_priv *fqp, *fqp2;
425 static struct timeval old_tv;
428 static int last_full = 0, prev_full = 0;
429 static int limited_procs = 0;
430 static int first_run = 1;
433 int64_t budget, total_budget, used_budget;
434 int64_t budgetpb[FQ_PRIO_MAX+1];
437 bzero(budgetpb, sizeof(budgetpb));
442 if (__predict_false(first_run)) {
443 total_disk_time = FQ_TOTAL_DISK_TIME;
/* Elapsed time since the previous run, in microseconds. */
446 total_disk_time = (int)(1000000*((tv.tv_sec - old_tv.tv_sec)) +
447 (tv.tv_usec - old_tv.tv_usec));
448 dsched_debug(LOG_INFO, "total_disk_time = %d\n", total_disk_time);
451 FQ_DPRIV_LOCK(dpriv);
/* NOTE(review): divides by total_disk_time; confirm it can never be zero. */
453 disk_busy = (100*(total_disk_time - dpriv->idle_time)) / total_disk_time;
457 dpriv->idle_time = 0;
/* First pass: accumulate the total budget and per-priority counts. */
459 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
460 if (fqp->transactions > 0 /* 30 */) {
461 total_budget += (fqp->avg_latency * fqp->transactions);
/* An fqp without an owning process is counted under priority 0. */
462 ++budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
464 dsched_debug(LOG_INFO,
465 "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
466 n, fqp->avg_latency, fqp->transactions,
467 (fqp->p) ? fqp->p->p_ionice : 0);
471 fqp->avg_latency = 0;
475 dsched_debug(LOG_INFO, "%d procs competing for disk\n"
476 "total_budget = %lld\n"
477 "incomplete tp = %d\n", n, total_budget, dpriv->incomplete_tp);
/* Weighted sum of active fqps, used as the divisor for the budgets below. */
484 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
485 if (budgetpb[i] == 0)
487 sum += (FQ_PRIO_BIAS+i)*budgetpb[i];
493 dsched_debug(LOG_INFO, "sum = %d\n", sum);
/* Replace each per-priority count with that priority's budget (scaled by 10). */
495 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
496 if (budgetpb[i] == 0)
499 budgetpb[i] = ((FQ_PRIO_BIAS+i)*10)*total_budget/sum;
/* Track the largest total budget seen so far. */
502 if (total_budget > dpriv->max_budget)
503 dpriv->max_budget = total_budget;
/* %% emits a literal percent sign; a backslash-percent is not a valid C escape. */
507 dsched_debug(4, "disk is %d%% busy\n", disk_busy);
510 * XXX: eventually remove all the silly *10...
/* Second pass: compare each fqp against the budget for its priority. */
512 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
513 budget = budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
515 used_budget = ((int64_t)10*(int64_t)fqp->avg_latency *
516 (int64_t)fqp->transactions);
517 if (used_budget > 0) {
518 dsched_debug(LOG_INFO,
519 "info: used_budget = %lld, budget = %lld\n", used_budget,
524 * process is exceeding its fair share; rate-limit it, but only
525 * if the disk is being used at 90+% of capacity
527 if ((used_budget > budget) && (disk_busy >= 90)) {
528 KKASSERT(fqp->avg_latency != 0);
/* The factor of 10 cancels the scaling applied to the budgets above. */
530 fqp->max_tp = budget/(10*fqp->avg_latency);
532 dsched_debug(LOG_INFO,
533 "rate limited to %d transactions\n", fqp->max_tp);
534 atomic_add_int(&fq_stats.procs_limited, 1);
535 } else if (((used_budget*2 < budget) || (disk_busy < 90)) &&
536 (!prev_full && !last_full)) {
538 * process is really using little of its timeslice, or the
539 * disk is not busy, so let's reset the rate-limit.
540 * Without this, exceeding processes will get an unlimited
541 * slice every other slice.
542 * XXX: this still doesn't quite fix the issue, but maybe
543 * it's good that way, so that heavy writes are interleaved.
547 fqp->transactions = 0;
548 fqp->avg_latency = 0;
/* Remember whether the disk was saturated in the last two passes. */
552 prev_full = last_full;
553 last_full = (disk_busy >= 90)?1:0;
556 FQ_DPRIV_UNLOCK(dpriv);
/* Re-arm the callout so that the next pass runs for the same dpriv. */
557 callout_reset(&fq_callout, hz * FQ_REBALANCE_TIMEOUT,
558 (void (*)(void *))fq_balance_thread, dpriv);
/*
 * Sysctl handler for the FQ statistics.
 *
 * Passes the whole global fq_stats structure to sysctl_handle_opaque()
 * and returns that call's result.
 */
563 do_fqstats(SYSCTL_HANDLER_ARGS)
565 return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
/*
 * Registers fq_stats under the _kern sysctl node as a read-only opaque
 * value (CTLTYPE_OPAQUE|CTLFLAG_RD) served by do_fqstats().
 */
569 SYSCTL_PROC(_kern, OID_AUTO, fq_stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
570 0, sizeof(struct dsched_fq_stats), do_fqstats, "fq_stats",
571 "dsched_fq statistics");
/*
 * Policy start-up sequence: create the three object caches, initialise the
 * global fqmp lock, zero the statistics, register the policy operations,
 * initialise the rebalance callout and log the version.
 * NOTE(review): the enclosing function header is not visible here;
 * presumably this is one of the init hooks registered with SYSINIT -
 * confirm which one.
 */
/* Cache for fqp objects. */
591 fq_priv_cache = objcache_create("fq-priv-cache", 0, 0,
593 objcache_malloc_alloc,
594 objcache_malloc_free,
595 &dsched_fq_priv_malloc_args );
/* Cache for fqmp objects. */
597 fq_mpriv_cache = objcache_create("fq-mpriv-cache", 0, 0,
599 objcache_malloc_alloc,
600 objcache_malloc_free,
601 &dsched_fq_mpriv_malloc_args );
603 FQ_GLOBAL_FQMP_LOCKINIT();
/* Cache for dpriv objects. */
605 fq_dpriv_cache = objcache_create("fq-dpriv-cache", 0, 0,
607 objcache_malloc_alloc,
608 objcache_malloc_free,
609 &dsched_fq_dpriv_malloc_args );
/* Start with all statistics counters at zero. */
611 bzero(&fq_stats, sizeof(struct dsched_fq_stats));
/* Make the FQ policy known to the dsched framework. */
613 dsched_register(&dsched_fq_ops);
614 callout_init_mp(&fq_callout);
616 kprintf("FQ scheduler policy version %d.%d loaded\n",
617 dsched_fq_version_maj, dsched_fq_version_min);
/*
 * Policy shutdown: stop and deactivate the rebalance callout.
 * NOTE(review): the enclosing function header is not visible here;
 * presumably this is one of the uninit hooks registered with SYSUNINIT -
 * confirm which one.
 */
623 callout_stop(&fq_callout);
624 callout_deactivate(&fq_callout);
/*
 * Boot-time hooks: fq_init / fq_uninit are attached at SI_SUB_PRE_DRIVERS,
 * fq_earlyinit / fq_earlyuninit at SI_SUB_CREATE_INIT-1.
 */
628 SYSINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, fq_init, NULL);
629 SYSUNINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, fq_uninit, NULL);
631 SYSINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, fq_earlyinit, NULL);
632 SYSUNINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, fq_earlyuninit, NULL);