2 * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Alex Hornung <ahornung@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
41 #include <sys/diskslice.h>
43 #include <machine/atomic.h>
44 #include <sys/malloc.h>
45 #include <sys/thread.h>
46 #include <sys/thread2.h>
47 #include <sys/sysctl.h>
48 #include <sys/spinlock2.h>
49 #include <machine/md_var.h>
50 #include <sys/ctype.h>
51 #include <sys/syslog.h>
52 #include <sys/device.h>
53 #include <sys/msgport.h>
54 #include <sys/msgport2.h>
56 #include <sys/dsched.h>
57 #include <machine/varargs.h>
58 #include <machine/param.h>
60 #include <dsched/fq/dsched_fq.h>
/*
 * Module-global state for the FQ (fair-queuing) dsched disk scheduler policy.
 * NOTE(review): this extract has embedded original line numbers and elided
 * lines (numbering is non-contiguous); declarations are shown as-is.
 */
62 MALLOC_DECLARE(M_DSCHEDFQ);
/* Policy version, printed at load time (see the init routine below). */
64 static int dsched_fq_version_maj = 0;
65 static int dsched_fq_version_min = 8;
/* Global allocation/limiting counters; exported via the fq_stats sysctl. */
67 struct dsched_fq_stats fq_stats;
/* malloc-backed objcache parameters for the three per-object types. */
69 struct objcache_malloc_args dsched_fq_dpriv_malloc_args = {
70 sizeof(struct dsched_fq_dpriv), M_DSCHEDFQ };
71 struct objcache_malloc_args dsched_fq_priv_malloc_args = {
72 sizeof(struct dsched_fq_priv), M_DSCHEDFQ };
73 struct objcache_malloc_args dsched_fq_mpriv_malloc_args = {
74 sizeof(struct dsched_fq_mpriv), M_DSCHEDFQ };
/* Object caches backing fq_alloc_dpriv/fq_alloc_priv/fq_alloc_mpriv. */
76 static struct objcache *fq_dpriv_cache;
77 static struct objcache *fq_mpriv_cache;
78 static struct objcache *fq_priv_cache;
/* Global list of all live fqmp (per-process) structures. */
80 TAILQ_HEAD(, dsched_fq_mpriv) dsched_fqmp_list =
81 TAILQ_HEAD_INITIALIZER(dsched_fqmp_list);
/* Lock protecting dsched_fqmp_list (see FQ_GLOBAL_FQMP_LOCK users below). */
83 struct lock fq_fqmp_lock;
84 struct callout fq_callout;
/* Policy ops vector, defined elsewhere in this module. */
86 extern struct dsched_ops dsched_fq_ops;
/*
 * Take a reference on a per-disk private structure (dpriv).
 * NOTE(review): the return type, braces and the local `refcount`
 * declaration are elided from this extract -- shown as-is.
 */
89 fq_reference_dpriv(struct dsched_fq_dpriv *dpriv)
/* atomic_fetchadd_int returns the value *before* the increment. */
93 refcount = atomic_fetchadd_int(&dpriv->refcount, 1);
/* A negative prior count would mean referencing an object in destruction. */
95 KKASSERT(refcount >= 0);
/*
 * Take a reference on a per-(process,disk) private structure (fqp).
 * NOTE(review): return type/braces/local decl elided from this extract.
 */
99 fq_reference_priv(struct dsched_fq_priv *fqp)
/* Pre-increment value; must not already be negative (in destruction). */
103 refcount = atomic_fetchadd_int(&fqp->refcount, 1);
105 KKASSERT(refcount >= 0);
/*
 * Take a reference on a per-process private structure (fqmp).
 * NOTE(review): return type/braces/local decl elided from this extract.
 */
109 fq_reference_mpriv(struct dsched_fq_mpriv *fqmp)
/* Pre-increment value; must not already be negative (in destruction). */
113 refcount = atomic_fetchadd_int(&fqmp->refcount, 1);
115 KKASSERT(refcount >= 0);
/*
 * Drop a reference on a dpriv; on last release, unlink and release every
 * fqp on the disk's list and return the dpriv to its objcache.
 * NOTE(review): the "last reference" guard and several structural lines
 * (braces, local decls) are elided from this extract -- shown as-is.
 */
119 fq_dereference_dpriv(struct dsched_fq_dpriv *dpriv)
121 struct dsched_fq_priv *fqp, *fqp2;
124 refcount = atomic_fetchadd_int(&dpriv->refcount, -1);
/* Valid states: still referenced (>= 0) or already marked in-destruction. */
127 KKASSERT(refcount >= 0 || refcount <= -0x400);
/* Push the count deeply negative so concurrent derefs see destruction. */
130 atomic_subtract_int(&dpriv->refcount, 0x400); /* mark as: in destruction */
132 kprintf("dpriv (%p) destruction started, trace:\n", dpriv);
/* Detach and release every fqp still linked to this disk. */
135 lockmgr(&dpriv->lock, LK_EXCLUSIVE);
136 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
137 TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
138 fqp->flags &= ~FQP_LINKED_DPRIV;
139 fq_dereference_priv(fqp);
141 lockmgr(&dpriv->lock, LK_RELEASE);
/* Return the object to the cache and update the global stats counter. */
143 objcache_put(fq_dpriv_cache, dpriv);
144 atomic_subtract_int(&fq_stats.dpriv_allocations, 1);
/*
 * Drop a reference on an fqp; on last release, unlink it from both its
 * disk (dpriv) and process (fqmp) lists, free it, and drop the dpriv
 * reference it held.
 * NOTE(review): the last-reference guard, the assignments reading
 * fqp->dpriv / fqp->fqmp, and some braces are elided from this extract.
 */
149 fq_dereference_priv(struct dsched_fq_priv *fqp)
151 struct dsched_fq_mpriv *fqmp;
152 struct dsched_fq_dpriv *dpriv;
155 refcount = atomic_fetchadd_int(&fqp->refcount, -1);
/* Valid states: still referenced (>= 0) or already marked in-destruction. */
157 KKASSERT(refcount >= 0 || refcount <= -0x400);
/* Push the count deeply negative so concurrent derefs see destruction. */
160 atomic_subtract_int(&fqp->refcount, 0x400); /* mark as: in destruction */
162 kprintf("fqp (%p) destruction started, trace:\n", fqp);
/* Must not destroy while bios are still queued on this fqp. */
166 KKASSERT(dpriv != NULL);
167 KKASSERT(fqp->qlength == 0);
/* Unlink from the disk's per-disk fqp list, if still linked. */
169 if (fqp->flags & FQP_LINKED_DPRIV) {
170 lockmgr(&dpriv->lock, LK_EXCLUSIVE);
172 TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
173 fqp->flags &= ~FQP_LINKED_DPRIV;
175 lockmgr(&dpriv->lock, LK_RELEASE);
/* Unlink from the owning process' fqp list, if still linked. */
178 if (fqp->flags & FQP_LINKED_FQMP) {
180 KKASSERT(fqmp != NULL);
182 spin_lock_wr(&fqmp->lock);
184 TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
185 fqp->flags &= ~FQP_LINKED_FQMP;
187 spin_unlock_wr(&fqmp->lock);
/* Free the fqp, then release the dpriv reference it was holding. */
190 objcache_put(fq_priv_cache, fqp);
191 atomic_subtract_int(&fq_stats.fqp_allocations, 1);
193 fq_dereference_dpriv(dpriv);
/*
 * Drop a reference on an fqmp; on last release, detach all of its fqps,
 * remove it from the global fqmp list, and return it to its objcache.
 * NOTE(review): the last-reference guard and some braces are elided from
 * this extract -- shown as-is.
 */
199 fq_dereference_mpriv(struct dsched_fq_mpriv *fqmp)
201 struct dsched_fq_priv *fqp, *fqp2;
204 refcount = atomic_fetchadd_int(&fqmp->refcount, -1);
/* Valid states: still referenced (>= 0) or already marked in-destruction. */
206 KKASSERT(refcount >= 0 || refcount <= -0x400);
/* Push the count deeply negative so concurrent derefs see destruction. */
209 atomic_subtract_int(&fqmp->refcount, 0x400); /* mark as: in destruction */
211 kprintf("fqmp (%p) destruction started, trace:\n", fqmp);
/* Under the global lock: release every fqp, then unlink this fqmp. */
214 FQ_GLOBAL_FQMP_LOCK();
216 TAILQ_FOREACH_MUTABLE(fqp, &fqmp->fq_priv_list, link, fqp2) {
217 TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
218 fqp->flags &= ~FQP_LINKED_FQMP;
219 fq_dereference_priv(fqp);
221 TAILQ_REMOVE(&dsched_fqmp_list, fqmp, link);
223 FQ_GLOBAL_FQMP_UNLOCK();
225 objcache_put(fq_mpriv_cache, fqmp);
226 atomic_subtract_int(&fq_stats.fqmp_allocations, 1);
/*
 * Allocate and initialize an fqp for disk `dp` on behalf of process
 * context `fqmp`, linking it onto both the disk's and the process' lists.
 * Also takes a dpriv reference on behalf of the new fqp.
 * NOTE(review): braces, the FQ_FQMP_LOCK acquisition paired with the
 * unlock at original line 260, and the return statement are elided from
 * this extract -- shown as-is.
 */
231 struct dsched_fq_priv *
232 fq_alloc_priv(struct disk *dp, struct dsched_fq_mpriv *fqmp)
234 struct dsched_fq_priv *fqp;
/* Hold the disk's dpriv alive for as long as this fqp exists. */
236 fq_reference_dpriv(dsched_get_disk_priv(dp));
238 fqp = objcache_get(fq_priv_cache, M_WAITOK);
239 bzero(fqp, sizeof(struct dsched_fq_priv));
241 /* XXX: maybe we do need another ref for the disk list for fqp */
242 fq_reference_priv(fqp);
244 FQ_FQP_LOCKINIT(fqp);
247 fqp->dpriv = dsched_get_disk_priv(dp);
248 TAILQ_INIT(&fqp->queue);
/* Link onto the disk's list of fqps. */
250 TAILQ_INSERT_TAIL(&fqp->dpriv->fq_priv_list, fqp, dlink);
251 fqp->flags |= FQP_LINKED_DPRIV;
257 /* Put the fqp in the fqmp list */
259 TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
260 FQ_FQMP_UNLOCK(fqmp);
261 fqp->flags |= FQP_LINKED_FQMP;
264 atomic_add_int(&fq_stats.fqp_allocations, 1);
/*
 * Allocate and initialize the per-disk private structure for `dp`.
 * NOTE(review): braces, the dpriv->dp assignment and the return statement
 * are elided from this extract -- shown as-is.
 */
269 struct dsched_fq_dpriv *
270 fq_alloc_dpriv(struct disk *dp)
272 struct dsched_fq_dpriv *dpriv;
274 dpriv = objcache_get(fq_dpriv_cache, M_WAITOK);
275 bzero(dpriv, sizeof(struct dsched_fq_dpriv));
/* Caller receives the structure with one reference already held. */
276 fq_reference_dpriv(dpriv);
278 dpriv->avg_rq_time = 0;
279 dpriv->incomplete_tp = 0;
280 FQ_DPRIV_LOCKINIT(dpriv);
281 TAILQ_INIT(&dpriv->fq_priv_list);
283 atomic_add_int(&fq_stats.dpriv_allocations, 1);
/*
 * Allocate the per-process private structure and pre-create an fqp for
 * every disk currently managed by this policy, then publish the fqmp on
 * the global list.
 * NOTE(review): braces, the fqmp->p assignment and the return statement
 * are elided from this extract -- shown as-is.
 */
288 struct dsched_fq_mpriv *
289 fq_alloc_mpriv(struct proc *p)
291 struct dsched_fq_mpriv *fqmp;
292 struct dsched_fq_priv *fqp;
293 struct disk *dp = NULL;
295 fqmp = objcache_get(fq_mpriv_cache, M_WAITOK);
296 bzero(fqmp, sizeof(struct dsched_fq_mpriv));
/* Caller receives the structure with one reference already held. */
297 fq_reference_mpriv(fqmp);
299 kprintf("fq_alloc_mpriv, new fqmp = %p\n", fqmp);
301 FQ_FQMP_LOCKINIT(fqmp);
302 TAILQ_INIT(&fqmp->fq_priv_list);
/* One fqp per disk currently attached to the fq policy. */
305 while ((dp = dsched_disk_enumerate(dp, &dsched_fq_ops))) {
306 fqp = fq_alloc_priv(dp, fqmp);
308 fq_reference_priv(fqp);
/* Make the new fqmp globally visible. */
312 FQ_GLOBAL_FQMP_LOCK();
313 TAILQ_INSERT_TAIL(&dsched_fqmp_list, fqmp, link);
314 FQ_GLOBAL_FQMP_UNLOCK();
316 atomic_add_int(&fq_stats.fqmp_allocations, 1);
/*
 * Per-disk dispatcher thread body: sleeps on the dpriv, and when woken
 * (or on timeout) either tears down on dpriv->die or dispatches extra
 * queued bios from each fqp while honoring per-fqp throughput caps
 * (max_tp/issued) set by the balancer.
 * NOTE(review): this extract elides the enclosing loop structure, braces,
 * `continue`/`break` statements, the computation of `idle`, and the exit
 * path after the "peacefully dying" message -- shown as-is; comment
 * fragments below are from the original source.
 */
322 fq_dispatcher(struct dsched_fq_dpriv *dpriv)
324 struct dsched_fq_mpriv *fqmp;
325 struct dsched_fq_priv *fqp, *fqp2;
326 struct bio *bio, *bio2;
330 * We need to manually assign an fqp to the fqmp of this thread
331 * since it isn't assigned one during fq_prepare, as the disk
334 fqmp = dsched_get_thread_priv(curthread);
335 KKASSERT(fqmp != NULL);
337 fqp = fq_alloc_priv(dpriv->dp, fqmp);
339 fq_reference_priv(fqp);
/* Sleep loop runs with the dpriv lock held; lksleep drops/reacquires it. */
342 FQ_DPRIV_LOCK(dpriv);
346 if ((lksleep(dpriv, &dpriv->lock, 0, "fq_dispatcher", hz/15) == 0)) {
348 * We've been woken up; this either means that we are
349 * supposed to die away nicely or that the disk is idle.
352 if (__predict_false(dpriv->die == 1)) {
353 /* If we are supposed to die, drain all queues */
354 fq_drain(dpriv, FQ_DRAIN_FLUSH);
356 /* Now we can safely unlock and exit */
357 FQ_DPRIV_UNLOCK(dpriv);
358 kprintf("fq_dispatcher is peacefully dying\n");
364 * We have been awakened because the disk is idle.
365 * So let's get ready to dispatch some extra bios.
370 /* Maybe the disk is idle and we just didn't get the wakeup */
375 * XXX: further room for improvements here. It would be better
376 * to dispatch a few requests from each fqp as to ensure
/* Walk each fqp with pending bios and dispatch within its budget. */
379 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
380 if (fqp->qlength == 0)
/* Consume a pending rebalance request before checking limits. */
384 if (atomic_cmpset_int(&fqp->rebalance, 1, 0))
385 fq_balance_self(fqp);
387 * XXX: why 5 extra? should probably be dynamic,
388 * relying on information on latency.
390 if ((fqp->max_tp > 0) && idle &&
391 (fqp->issued >= fqp->max_tp)) {
/* Drain this fqp's queue until its throughput cap is reached. */
395 TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) {
396 if (atomic_cmpset_int(&fqp->rebalance, 1, 0))
397 fq_balance_self(fqp);
398 if ((fqp->max_tp > 0) &&
399 ((fqp->issued >= fqp->max_tp)))
402 TAILQ_REMOVE(&fqp->queue, bio, link);
406 * beware that we do have an fqp reference
409 fq_dispatch(dpriv, bio, fqp);
/*
 * Per-disk balancer thread body: every ~hz/2 ticks it measures disk
 * business, sums each fqp's latency*transaction product into a total
 * budget, and splits that budget across ionice priority levels
 * (dpriv->budgetpb[]) which fq_balance_self later uses to rate-limit
 * individual fqps.
 * NOTE(review): this extract elides the enclosing loop, several local
 * declarations (`tv`, `lost_bits`, `avg_latency`, `n`-increment lines),
 * the overflow-handling body at original lines 469-473 and the per-fqp
 * rebalance marking at 530+ -- shown as-is.
 */
418 fq_balance_thread(struct dsched_fq_dpriv *dpriv)
420 struct dsched_fq_priv *fqp, *fqp2;
421 static struct timeval old_tv;
423 int64_t total_budget, product;
424 int64_t budget[FQ_PRIO_MAX+1];
425 int n, i, sum, total_disk_time;
428 getmicrotime(&old_tv);
430 FQ_DPRIV_LOCK(dpriv);
/* A zero return from lksleep means we were explicitly woken: check die. */
433 if ((lksleep(curthread, &dpriv->lock, 0, "fq_balancer", hz/2) == 0)) {
434 if (__predict_false(dpriv->die)) {
435 FQ_DPRIV_UNLOCK(dpriv);
440 bzero(budget, sizeof(budget));
/* Elapsed wall time (usec) since the previous balancing pass. */
446 total_disk_time = (int)(1000000*((tv.tv_sec - old_tv.tv_sec)) +
447 (tv.tv_usec - old_tv.tv_usec));
449 if (total_disk_time == 0)
452 dsched_debug(LOG_INFO, "total_disk_time = %d\n", total_disk_time);
/* Busy percentage = time not spent idle, clamped at 0. */
456 dpriv->disk_busy = (100*(total_disk_time - dpriv->idle_time)) / total_disk_time;
457 if (dpriv->disk_busy < 0)
458 dpriv->disk_busy = 0;
460 dpriv->idle_time = 0;
/* Snapshot each fqp's stats and accumulate the global budget. */
463 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
464 fqp->s_avg_latency = fqp->avg_latency;
465 fqp->s_transactions = fqp->transactions;
466 if (fqp->s_transactions > 0 /* 30 */) {
467 product = fqp->s_avg_latency * fqp->s_transactions;
/* Scale down to avoid 64-bit overflow of the running total. */
468 product >>= lost_bits;
469 while(total_budget >= INT64_MAX - product) {
474 total_budget += product;
/* Count one competitor at the process' ionice priority (0 if no proc). */
475 ++budget[(fqp->p) ? fqp->p->p_ionice : 0];
476 KKASSERT(total_budget >= 0);
477 dsched_debug(LOG_INFO,
478 "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
479 n, fqp->s_avg_latency, fqp->s_transactions,
480 (fqp->p) ? fqp->p->p_ionice : 0);
/* Reset live counters for the next measurement interval. */
486 fqp->transactions = 0;
487 fqp->avg_latency = 0;
491 dsched_debug(LOG_INFO, "%d procs competing for disk\n"
492 "total_budget = %jd (lost bits = %d)\n"
493 "incomplete tp = %d\n", n, (intmax_t)total_budget,
494 lost_bits, dpriv->incomplete_tp);
/* Weighted denominator: higher ionice levels get more budget shares. */
501 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
504 sum += (FQ_PRIO_BIAS+i)*budget[i];
510 dsched_debug(LOG_INFO, "sum = %d\n", sum);
512 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
517 * XXX: if we still overflow here, we really need to switch to
518 * some more advanced mechanism such as compound int128 or
519 * storing the lost bits so they can be used in the
/* Per-priority budget, re-scaled by the bits dropped above. */
522 dpriv->budgetpb[i] = ((FQ_PRIO_BIAS+i)*total_budget/sum) << lost_bits;
523 KKASSERT(dpriv->budgetpb[i] >= 0);
526 if (total_budget > dpriv->max_budget)
527 dpriv->max_budget = total_budget;
529 dsched_debug(4, "disk is %d%% busy\n", dpriv->disk_busy);
530 TAILQ_FOREACH(fqp, &dpriv->fq_priv_list, dlink) {
/* Remember the last two busy states for fq_balance_self's hysteresis. */
534 dpriv->prev_full = dpriv->last_full;
535 dpriv->last_full = (dpriv->disk_busy >= 90)?1:0;
/*
541 * fq_balance_self should be called from all sorts of dispatchers. It basically
542 * offloads some of the heavier calculations on throttling onto the process that
543 * wants to do I/O instead of doing it in the fq_balance thread.
544 * - should be called with dpriv lock held
 *
 * NOTE(review): return type, braces, the dpriv = fqp->dpriv assignment,
 * the `avg_latency` declaration and the else-branch body (original lines
 * after 577) are elided from this extract -- shown as-is.
 */
547 fq_balance_self(struct dsched_fq_priv *fqp) {
548 struct dsched_fq_dpriv *dpriv;
550 int64_t budget, used_budget;
552 int64_t transactions;
/* Work from the balancer's last snapshot of this fqp's stats. */
554 transactions = (int64_t)fqp->s_transactions;
555 avg_latency = (int64_t)fqp->s_avg_latency;
/* Budget consumed last interval vs. the allotment for our ionice level. */
558 used_budget = ((int64_t)avg_latency * transactions);
559 budget = dpriv->budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
561 if (used_budget > 0) {
562 dsched_debug(LOG_INFO,
563 "info: used_budget = %jd, budget = %jd\n",
564 (intmax_t)used_budget, budget);
/* Over budget on a busy (>= 90%) disk: impose a transaction cap. */
567 if ((used_budget > budget) && (dpriv->disk_busy >= 90)) {
568 KKASSERT(avg_latency != 0);
570 fqp->max_tp = budget/(avg_latency);
571 atomic_add_int(&fq_stats.procs_limited, 1);
573 dsched_debug(LOG_INFO,
574 "rate limited to %d transactions\n", fqp->max_tp);
/* Well under budget, or the disk has not been persistently busy. */
576 } else if (((used_budget*2 < budget) || (dpriv->disk_busy < 80)) &&
577 (!dpriv->prev_full && !dpriv->last_full)) {
/*
 * Sysctl handler: expose the raw fq_stats structure read-only as an
 * opaque blob under kern.fq_stats.
 * NOTE(review): the handler's `static int` line and braces are elided
 * from this extract -- shown as-is.
 */
584 do_fqstats(SYSCTL_HANDLER_ARGS)
586 return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
590 SYSCTL_PROC(_kern, OID_AUTO, fq_stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
591 0, sizeof(struct dsched_fq_stats), do_fqstats, "fq_stats",
592 "dsched_fq statistics");
/*
 * NOTE(review): the enclosing function's header is elided from this
 * extract; given the SYSINIT below, this is presumably the body of
 * fq_init -- TODO confirm against the full source. It creates the three
 * object caches, initializes the global fqmp lock and stats, registers
 * the policy with dsched, and initializes the callout.
 */
610 fq_priv_cache = objcache_create("fq-priv-cache", 0, 0,
612 objcache_malloc_alloc,
613 objcache_malloc_free,
614 &dsched_fq_priv_malloc_args );
616 fq_mpriv_cache = objcache_create("fq-mpriv-cache", 0, 0,
618 objcache_malloc_alloc,
619 objcache_malloc_free,
620 &dsched_fq_mpriv_malloc_args );
622 FQ_GLOBAL_FQMP_LOCKINIT();
624 fq_dpriv_cache = objcache_create("fq-dpriv-cache", 0, 0,
626 objcache_malloc_alloc,
627 objcache_malloc_free,
628 &dsched_fq_dpriv_malloc_args );
630 bzero(&fq_stats, sizeof(struct dsched_fq_stats));
/* Hook this policy into the dsched framework. */
632 dsched_register(&dsched_fq_ops);
633 callout_init_mp(&fq_callout);
635 kprintf("FQ scheduler policy version %d.%d loaded\n",
636 dsched_fq_version_maj, dsched_fq_version_min);
/*
 * NOTE(review): function header elided from this extract; given the
 * SYSUNINIT below this is presumably part of the fq teardown routine --
 * TODO confirm. Stops and deactivates the policy's callout.
 */
642 callout_stop(&fq_callout);
643 callout_deactivate(&fq_callout);
/* Register init/uninit hooks: fq_init/fq_uninit run at driver-setup time,
 * the early pair runs just before process creation is initialized. */
647 SYSINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, fq_init, NULL);
648 SYSUNINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, fq_uninit, NULL);
650 SYSINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, fq_earlyinit, NULL);
651 SYSUNINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, fq_earlyuninit, NULL);