2 * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Alex Hornung <ahornung@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
41 #include <sys/diskslice.h>
43 #include <machine/atomic.h>
44 #include <sys/malloc.h>
45 #include <sys/thread.h>
46 #include <sys/thread2.h>
47 #include <sys/sysctl.h>
48 #include <sys/spinlock2.h>
49 #include <machine/md_var.h>
50 #include <sys/ctype.h>
51 #include <sys/syslog.h>
52 #include <sys/device.h>
53 #include <sys/msgport.h>
54 #include <sys/msgport2.h>
56 #include <sys/dsched.h>
57 #include <machine/varargs.h>
58 #include <machine/param.h>
60 #include <dsched/fq/dsched_fq.h>
62 MALLOC_DECLARE(M_DSCHEDFQ);
64 static int dsched_fq_version_maj = 0;
65 static int dsched_fq_version_min = 8;
67 struct dsched_fq_stats fq_stats;
69 struct objcache_malloc_args dsched_fq_dpriv_malloc_args = {
70 sizeof(struct dsched_fq_dpriv), M_DSCHEDFQ };
71 struct objcache_malloc_args dsched_fq_priv_malloc_args = {
72 sizeof(struct dsched_fq_priv), M_DSCHEDFQ };
73 struct objcache_malloc_args dsched_fq_mpriv_malloc_args = {
74 sizeof(struct dsched_fq_mpriv), M_DSCHEDFQ };
76 static struct objcache *fq_dpriv_cache;
77 static struct objcache *fq_mpriv_cache;
78 static struct objcache *fq_priv_cache;
80 TAILQ_HEAD(, dsched_fq_mpriv) dsched_fqmp_list =
81 TAILQ_HEAD_INITIALIZER(dsched_fqmp_list);
83 struct spinlock fq_fqmp_lock;
84 struct callout fq_callout;
86 extern struct dsched_ops dsched_fq_ops;
89 fq_reference_dpriv(struct dsched_fq_dpriv *dpriv)
93 refcount = atomic_fetchadd_int(&dpriv->refcount, 1);
95 KKASSERT(refcount >= 0);
99 fq_reference_priv(struct dsched_fq_priv *fqp)
103 refcount = atomic_fetchadd_int(&fqp->refcount, 1);
105 KKASSERT(refcount >= 0);
109 fq_reference_mpriv(struct dsched_fq_mpriv *fqmp)
113 refcount = atomic_fetchadd_int(&fqmp->refcount, 1);
115 KKASSERT(refcount >= 0);
119 fq_dereference_dpriv(struct dsched_fq_dpriv *dpriv)
121 struct dsched_fq_priv *fqp, *fqp2;
124 refcount = atomic_fetchadd_int(&dpriv->refcount, -1);
127 KKASSERT(refcount >= 0 || refcount <= -0x400);
130 atomic_subtract_int(&dpriv->refcount, 0x400); /* mark as: in destruction */
132 kprintf("dpriv (%p) destruction started, trace:\n", dpriv);
135 spin_lock_wr(&dpriv->lock);
136 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
137 TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
138 fqp->flags &= ~FQP_LINKED_DPRIV;
139 fq_dereference_priv(fqp);
141 spin_unlock_wr(&dpriv->lock);
143 objcache_put(fq_dpriv_cache, dpriv);
144 atomic_subtract_int(&fq_stats.dpriv_allocations, 1);
149 fq_dereference_priv(struct dsched_fq_priv *fqp)
151 struct dsched_fq_mpriv *fqmp;
152 struct dsched_fq_dpriv *dpriv;
155 refcount = atomic_fetchadd_int(&fqp->refcount, -1);
157 KKASSERT(refcount >= 0 || refcount <= -0x400);
160 atomic_subtract_int(&fqp->refcount, 0x400); /* mark as: in destruction */
162 kprintf("fqp (%p) destruction started, trace:\n", fqp);
166 KKASSERT(dpriv != NULL);
168 spin_lock_wr(&fqp->lock);
170 KKASSERT(fqp->qlength == 0);
172 if (fqp->flags & FQP_LINKED_DPRIV) {
173 spin_lock_wr(&dpriv->lock);
175 TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
176 fqp->flags &= ~FQP_LINKED_DPRIV;
178 spin_unlock_wr(&dpriv->lock);
181 if (fqp->flags & FQP_LINKED_FQMP) {
183 KKASSERT(fqmp != NULL);
185 spin_lock_wr(&fqmp->lock);
187 TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
188 fqp->flags &= ~FQP_LINKED_FQMP;
190 spin_unlock_wr(&fqmp->lock);
193 spin_unlock_wr(&fqp->lock);
195 objcache_put(fq_priv_cache, fqp);
196 atomic_subtract_int(&fq_stats.fqp_allocations, 1);
198 fq_dereference_dpriv(dpriv);
204 fq_dereference_mpriv(struct dsched_fq_mpriv *fqmp)
206 struct dsched_fq_priv *fqp, *fqp2;
209 refcount = atomic_fetchadd_int(&fqmp->refcount, -1);
211 KKASSERT(refcount >= 0 || refcount <= -0x400);
214 atomic_subtract_int(&fqmp->refcount, 0x400); /* mark as: in destruction */
216 kprintf("fqmp (%p) destruction started, trace:\n", fqmp);
219 FQ_GLOBAL_FQMP_LOCK();
220 spin_lock_wr(&fqmp->lock);
222 TAILQ_FOREACH_MUTABLE(fqp, &fqmp->fq_priv_list, link, fqp2) {
223 TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
224 fqp->flags &= ~FQP_LINKED_FQMP;
225 fq_dereference_priv(fqp);
227 TAILQ_REMOVE(&dsched_fqmp_list, fqmp, link);
229 spin_unlock_wr(&fqmp->lock);
230 FQ_GLOBAL_FQMP_UNLOCK();
232 objcache_put(fq_mpriv_cache, fqmp);
233 atomic_subtract_int(&fq_stats.fqmp_allocations, 1);
238 struct dsched_fq_priv *
239 fq_alloc_priv(struct disk *dp, struct dsched_fq_mpriv *fqmp)
241 struct dsched_fq_priv *fqp;
243 fq_reference_dpriv(dsched_get_disk_priv(dp));
245 fqp = objcache_get(fq_priv_cache, M_WAITOK);
246 bzero(fqp, sizeof(struct dsched_fq_priv));
248 /* XXX: maybe we do need another ref for the disk list for fqp */
249 fq_reference_priv(fqp);
251 FQ_FQP_LOCKINIT(fqp);
255 fqp->dpriv = dsched_get_disk_priv(dp);
261 /* Put the fqp in the fqmp list */
263 TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
264 FQ_FQMP_UNLOCK(fqmp);
265 fqp->flags |= FQP_LINKED_FQMP;
268 TAILQ_INIT(&fqp->queue);
269 TAILQ_INSERT_TAIL(&fqp->dpriv->fq_priv_list, fqp, dlink);
270 fqp->flags |= FQP_LINKED_DPRIV;
272 atomic_add_int(&fq_stats.fqp_allocations, 1);
278 struct dsched_fq_dpriv *
279 fq_alloc_dpriv(struct disk *dp)
281 struct dsched_fq_dpriv *dpriv;
283 dpriv = objcache_get(fq_dpriv_cache, M_WAITOK);
284 bzero(dpriv, sizeof(struct dsched_fq_dpriv));
285 fq_reference_dpriv(dpriv);
287 dpriv->avg_rq_time = 0;
288 dpriv->incomplete_tp = 0;
289 FQ_DPRIV_LOCKINIT(dpriv);
290 TAILQ_INIT(&dpriv->fq_priv_list);
292 atomic_add_int(&fq_stats.dpriv_allocations, 1);
297 struct dsched_fq_mpriv *
298 fq_alloc_mpriv(struct proc *p)
300 struct dsched_fq_mpriv *fqmp;
301 struct dsched_fq_priv *fqp;
302 struct disk *dp = NULL;
304 fqmp = objcache_get(fq_mpriv_cache, M_WAITOK);
305 bzero(fqmp, sizeof(struct dsched_fq_mpriv));
306 fq_reference_mpriv(fqmp);
308 kprintf("fq_alloc_mpriv, new fqmp = %p\n", fqmp);
310 FQ_FQMP_LOCKINIT(fqmp);
311 TAILQ_INIT(&fqmp->fq_priv_list);
314 while ((dp = dsched_disk_enumerate(dp, &dsched_fq_ops))) {
315 fqp = fq_alloc_priv(dp, fqmp);
317 fq_reference_priv(fqp);
321 FQ_GLOBAL_FQMP_LOCK();
322 TAILQ_INSERT_TAIL(&dsched_fqmp_list, fqmp, link);
323 FQ_GLOBAL_FQMP_UNLOCK();
325 atomic_add_int(&fq_stats.fqmp_allocations, 1);
331 fq_dispatcher(struct dsched_fq_dpriv *dpriv)
333 struct dsched_fq_mpriv *fqmp;
334 struct dsched_fq_priv *fqp, *fqp2;
335 struct bio *bio, *bio2;
339 * We need to manually assign an fqp to the fqmp of this thread
340 * since it isn't assigned one during fq_prepare, as the disk
343 fqmp = dsched_get_thread_priv(curthread);
344 KKASSERT(fqmp != NULL);
346 fqp = fq_alloc_priv(dpriv->dp, fqmp);
348 fq_reference_priv(fqp);
351 FQ_DPRIV_LOCK(dpriv);
355 if ((ssleep(dpriv, &dpriv->lock, 0, "fq_dispatcher", hz/15) == 0)) {
357 * We've been woken up; this either means that we are
358 * supposed to die away nicely or that the disk is idle.
361 if (__predict_false(dpriv->die == 1)) {
362 /* If we are supposed to die, drain all queues */
363 fq_drain(dpriv, FQ_DRAIN_FLUSH);
365 /* Now we can safely unlock and exit */
366 FQ_DPRIV_UNLOCK(dpriv);
367 kprintf("fq_dispatcher is peacefully dying\n");
373 * We have been awakened because the disk is idle.
374 * So let's get ready to dispatch some extra bios.
379 /* Maybe the disk is idle and we just didn't get the wakeup */
384 * XXX: further room for improvements here. It would be better
385 * to dispatch a few requests from each fqp as to ensure
388 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
389 if (fqp->qlength == 0)
393 if (atomic_cmpset_int(&fqp->rebalance, 1, 0))
394 fq_balance_self(fqp);
396 * XXX: why 5 extra? should probably be dynamic,
397 * relying on information on latency.
399 if ((fqp->max_tp > 0) && idle &&
400 (fqp->issued >= fqp->max_tp)) {
404 TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) {
405 if (atomic_cmpset_int(&fqp->rebalance, 1, 0))
406 fq_balance_self(fqp);
407 if ((fqp->max_tp > 0) &&
408 ((fqp->issued >= fqp->max_tp)))
411 TAILQ_REMOVE(&fqp->queue, bio, link);
415 * beware that we do have an fqp reference
418 fq_dispatch(dpriv, bio, fqp);
427 fq_balance_thread(struct dsched_fq_dpriv *dpriv)
429 struct dsched_fq_priv *fqp, *fqp2;
430 static struct timeval old_tv;
432 int64_t total_budget, product;
433 int64_t budget[FQ_PRIO_MAX+1];
434 int n, i, sum, total_disk_time;
437 getmicrotime(&old_tv);
439 FQ_DPRIV_LOCK(dpriv);
442 if ((ssleep(curthread, &dpriv->lock, 0, "fq_balancer", hz/2) == 0)) {
443 if (__predict_false(dpriv->die)) {
444 FQ_DPRIV_UNLOCK(dpriv);
449 bzero(budget, sizeof(budget));
455 total_disk_time = (int)(1000000*((tv.tv_sec - old_tv.tv_sec)) +
456 (tv.tv_usec - old_tv.tv_usec));
458 if (total_disk_time == 0)
461 dsched_debug(LOG_INFO, "total_disk_time = %d\n", total_disk_time);
465 dpriv->disk_busy = (100*(total_disk_time - dpriv->idle_time)) / total_disk_time;
466 if (dpriv->disk_busy < 0)
467 dpriv->disk_busy = 0;
469 dpriv->idle_time = 0;
472 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
473 fqp->s_avg_latency = fqp->avg_latency;
474 fqp->s_transactions = fqp->transactions;
475 if (fqp->s_transactions > 0 /* 30 */) {
476 product = fqp->s_avg_latency * fqp->s_transactions;
477 product >>= lost_bits;
478 while(total_budget >= INT64_MAX - product) {
483 total_budget += product;
484 ++budget[(fqp->p) ? fqp->p->p_ionice : 0];
485 KKASSERT(total_budget >= 0);
486 dsched_debug(LOG_INFO,
487 "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
488 n, fqp->s_avg_latency, fqp->s_transactions,
489 (fqp->p) ? fqp->p->p_ionice : 0);
495 fqp->transactions = 0;
496 fqp->avg_latency = 0;
500 dsched_debug(LOG_INFO, "%d procs competing for disk\n"
501 "total_budget = %lld (lost bits = %d)\n"
502 "incomplete tp = %d\n", n, total_budget, lost_bits,
503 dpriv->incomplete_tp);
510 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
513 sum += (FQ_PRIO_BIAS+i)*budget[i];
519 dsched_debug(LOG_INFO, "sum = %d\n", sum);
521 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
526 * XXX: if we still overflow here, we really need to switch to
527 * some more advanced mechanism such as compound int128 or
528 * storing the lost bits so they can be used in the
531 dpriv->budgetpb[i] = ((FQ_PRIO_BIAS+i)*total_budget/sum) << lost_bits;
532 KKASSERT(dpriv->budgetpb[i] >= 0);
535 if (total_budget > dpriv->max_budget)
536 dpriv->max_budget = total_budget;
538 dsched_debug(4, "disk is %d\% busy\n", dpriv->disk_busy);
539 TAILQ_FOREACH(fqp, &dpriv->fq_priv_list, dlink) {
543 dpriv->prev_full = dpriv->last_full;
544 dpriv->last_full = (dpriv->disk_busy >= 90)?1:0;
550 * fq_balance_self should be called from all sorts of dispatchers. It basically
551 * offloads some of the heavier calculations on throttling onto the process that
552 * wants to do I/O instead of doing it in the fq_balance thread.
553 * - should be called with dpriv lock held
556 fq_balance_self(struct dsched_fq_priv *fqp) {
557 struct dsched_fq_dpriv *dpriv;
559 int64_t budget, used_budget;
561 int64_t transactions;
563 transactions = (int64_t)fqp->s_transactions;
564 avg_latency = (int64_t)fqp->s_avg_latency;
567 used_budget = ((int64_t)avg_latency * transactions);
568 budget = dpriv->budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
570 if (used_budget > 0) {
571 dsched_debug(LOG_INFO,
572 "info: used_budget = %lld, budget = %lld\n", used_budget,
576 if ((used_budget > budget) && (dpriv->disk_busy >= 90)) {
577 KKASSERT(avg_latency != 0);
579 fqp->max_tp = budget/(avg_latency);
580 atomic_add_int(&fq_stats.procs_limited, 1);
582 dsched_debug(LOG_INFO,
583 "rate limited to %d transactions\n", fqp->max_tp);
585 } else if (((used_budget*2 < budget) || (dpriv->disk_busy < 80)) &&
586 (!dpriv->prev_full && !dpriv->last_full)) {
593 do_fqstats(SYSCTL_HANDLER_ARGS)
595 return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
599 SYSCTL_PROC(_kern, OID_AUTO, fq_stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
600 0, sizeof(struct dsched_fq_stats), do_fqstats, "fq_stats",
601 "dsched_fq statistics");
619 fq_priv_cache = objcache_create("fq-priv-cache", 0, 0,
621 objcache_malloc_alloc,
622 objcache_malloc_free,
623 &dsched_fq_priv_malloc_args );
625 fq_mpriv_cache = objcache_create("fq-mpriv-cache", 0, 0,
627 objcache_malloc_alloc,
628 objcache_malloc_free,
629 &dsched_fq_mpriv_malloc_args );
631 FQ_GLOBAL_FQMP_LOCKINIT();
633 fq_dpriv_cache = objcache_create("fq-dpriv-cache", 0, 0,
635 objcache_malloc_alloc,
636 objcache_malloc_free,
637 &dsched_fq_dpriv_malloc_args );
639 bzero(&fq_stats, sizeof(struct dsched_fq_stats));
641 dsched_register(&dsched_fq_ops);
642 callout_init_mp(&fq_callout);
644 kprintf("FQ scheduler policy version %d.%d loaded\n",
645 dsched_fq_version_maj, dsched_fq_version_min);
651 callout_stop(&fq_callout);
652 callout_deactivate(&fq_callout);
656 SYSINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, fq_init, NULL);
657 SYSUNINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, fq_uninit, NULL);
659 SYSINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, fq_earlyinit, NULL);
660 SYSUNINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, fq_earlyuninit, NULL);