2 * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Alex Hornung <ahornung@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
41 #include <sys/diskslice.h>
43 #include <machine/atomic.h>
44 #include <sys/malloc.h>
45 #include <sys/thread.h>
46 #include <sys/thread2.h>
47 #include <sys/sysctl.h>
48 #include <sys/spinlock2.h>
49 #include <machine/md_var.h>
50 #include <sys/ctype.h>
51 #include <sys/syslog.h>
52 #include <sys/device.h>
53 #include <sys/msgport.h>
54 #include <sys/msgport2.h>
56 #include <sys/dsched.h>
57 #include <machine/varargs.h>
58 #include <machine/param.h>
60 #include <dsched/fq/dsched_fq.h>
62 MALLOC_DECLARE(M_DSCHEDFQ);
64 static int dsched_fq_version_maj = 0;
65 static int dsched_fq_version_min = 7;
67 struct dsched_fq_stats fq_stats;
69 struct objcache_malloc_args dsched_fq_dpriv_malloc_args = {
70 sizeof(struct dsched_fq_dpriv), M_DSCHEDFQ };
71 struct objcache_malloc_args dsched_fq_priv_malloc_args = {
72 sizeof(struct dsched_fq_priv), M_DSCHEDFQ };
73 struct objcache_malloc_args dsched_fq_mpriv_malloc_args = {
74 sizeof(struct dsched_fq_mpriv), M_DSCHEDFQ };
76 static struct objcache *fq_dpriv_cache;
77 static struct objcache *fq_mpriv_cache;
78 static struct objcache *fq_priv_cache;
80 TAILQ_HEAD(, dsched_fq_mpriv) dsched_fqmp_list =
81 TAILQ_HEAD_INITIALIZER(dsched_fqmp_list);
83 struct spinlock fq_fqmp_lock;
84 struct callout fq_callout;
86 extern struct dsched_ops dsched_fq_ops;
88 void db_stack_trace_cmd(int addr, boolean_t have_addr, int count,char *modif);
89 void fq_print_backtrace(void);
92 fq_print_backtrace(void)
96 __asm __volatile("movl %%ebp, %0" : "=r" (ebp));
97 db_stack_trace_cmd(ebp, 1, 4, NULL);
101 fq_reference_dpriv(struct dsched_fq_dpriv *dpriv)
105 refcount = atomic_fetchadd_int(&dpriv->refcount, 1);
107 KKASSERT(refcount >= 0);
111 fq_reference_priv(struct dsched_fq_priv *fqp)
115 refcount = atomic_fetchadd_int(&fqp->refcount, 1);
117 KKASSERT(refcount >= 0);
121 fq_reference_mpriv(struct dsched_fq_mpriv *fqmp)
125 refcount = atomic_fetchadd_int(&fqmp->refcount, 1);
127 KKASSERT(refcount >= 0);
131 fq_dereference_dpriv(struct dsched_fq_dpriv *dpriv)
133 struct dsched_fq_priv *fqp, *fqp2;
136 refcount = atomic_fetchadd_int(&dpriv->refcount, -1);
139 KKASSERT(refcount >= -3);
142 atomic_subtract_int(&dpriv->refcount, 3); /* mark as: in destruction */
144 kprintf("dpriv (%p) destruction started, trace:\n", dpriv);
145 fq_print_backtrace();
147 spin_lock_wr(&dpriv->lock);
148 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
149 TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
150 fqp->flags &= ~FQP_LINKED_DPRIV;
151 fq_dereference_priv(fqp);
153 spin_unlock_wr(&dpriv->lock);
155 objcache_put(fq_dpriv_cache, dpriv);
156 atomic_subtract_int(&fq_stats.dpriv_allocations, 1);
161 fq_dereference_priv(struct dsched_fq_priv *fqp)
163 struct dsched_fq_mpriv *fqmp;
164 struct dsched_fq_dpriv *dpriv;
167 refcount = atomic_fetchadd_int(&fqp->refcount, -1);
169 KKASSERT(refcount >= -3);
172 atomic_subtract_int(&fqp->refcount, 3); /* mark as: in destruction */
174 kprintf("fqp (%p) destruction started, trace:\n", fqp);
175 fq_print_backtrace();
178 KKASSERT(dpriv != NULL);
180 spin_lock_wr(&fqp->lock);
182 KKASSERT(fqp->qlength == 0);
184 if (fqp->flags & FQP_LINKED_DPRIV) {
185 spin_lock_wr(&dpriv->lock);
187 TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
188 fqp->flags &= ~FQP_LINKED_DPRIV;
190 spin_unlock_wr(&dpriv->lock);
193 if (fqp->flags & FQP_LINKED_FQMP) {
195 KKASSERT(fqmp != NULL);
197 spin_lock_wr(&fqmp->lock);
199 TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
200 fqp->flags &= ~FQP_LINKED_FQMP;
202 spin_unlock_wr(&fqmp->lock);
205 spin_unlock_wr(&fqp->lock);
207 objcache_put(fq_priv_cache, fqp);
208 atomic_subtract_int(&fq_stats.fqp_allocations, 1);
210 fq_dereference_dpriv(dpriv);
216 fq_dereference_mpriv(struct dsched_fq_mpriv *fqmp)
218 struct dsched_fq_priv *fqp, *fqp2;
221 refcount = atomic_fetchadd_int(&fqmp->refcount, -1);
223 KKASSERT(refcount >= -3);
226 atomic_subtract_int(&fqmp->refcount, 3); /* mark as: in destruction */
228 kprintf("fqmp (%p) destruction started, trace:\n", fqmp);
229 fq_print_backtrace();
231 FQ_GLOBAL_FQMP_LOCK();
232 spin_lock_wr(&fqmp->lock);
234 TAILQ_FOREACH_MUTABLE(fqp, &fqmp->fq_priv_list, link, fqp2) {
235 TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
236 fqp->flags &= ~FQP_LINKED_FQMP;
237 fq_dereference_priv(fqp);
239 TAILQ_REMOVE(&dsched_fqmp_list, fqmp, link);
241 spin_unlock_wr(&fqmp->lock);
242 FQ_GLOBAL_FQMP_UNLOCK();
244 objcache_put(fq_mpriv_cache, fqmp);
245 atomic_subtract_int(&fq_stats.fqmp_allocations, 1);
250 struct dsched_fq_priv *
251 fq_alloc_priv(struct disk *dp)
253 struct dsched_fq_priv *fqp;
255 fq_reference_dpriv(dsched_get_disk_priv(dp));
257 fqp = objcache_get(fq_priv_cache, M_WAITOK);
258 bzero(fqp, sizeof(struct dsched_fq_priv));
260 /* XXX: maybe we do need another ref for the disk list for fqp */
261 fq_reference_priv(fqp);
263 FQ_FQP_LOCKINIT(fqp);
267 fqp->dpriv = dsched_get_disk_priv(dp);
269 TAILQ_INIT(&fqp->queue);
270 TAILQ_INSERT_TAIL(&fqp->dpriv->fq_priv_list, fqp, dlink);
271 fqp->flags |= FQP_LINKED_DPRIV;
273 atomic_add_int(&fq_stats.fqp_allocations, 1);
279 struct dsched_fq_dpriv *
280 fq_alloc_dpriv(struct disk *dp)
282 struct dsched_fq_dpriv *dpriv;
284 dpriv = objcache_get(fq_dpriv_cache, M_WAITOK);
285 bzero(dpriv, sizeof(struct dsched_fq_dpriv));
286 fq_reference_dpriv(dpriv);
288 dpriv->avg_rq_time = 0;
289 dpriv->incomplete_tp = 0;
290 FQ_DPRIV_LOCKINIT(dpriv);
291 TAILQ_INIT(&dpriv->fq_priv_list);
293 atomic_add_int(&fq_stats.dpriv_allocations, 1);
298 struct dsched_fq_mpriv *
299 fq_alloc_mpriv(struct proc *p)
301 struct dsched_fq_mpriv *fqmp;
302 struct dsched_fq_priv *fqp;
303 struct disk *dp = NULL;
305 fqmp = objcache_get(fq_mpriv_cache, M_WAITOK);
306 bzero(fqmp, sizeof(struct dsched_fq_mpriv));
307 fq_reference_mpriv(fqmp);
309 kprintf("fq_alloc_mpriv, new fqmp = %p\n", fqmp);
311 FQ_FQMP_LOCKINIT(fqmp);
313 TAILQ_INIT(&fqmp->fq_priv_list);
315 while ((dp = dsched_disk_enumerate(dp, &dsched_fq_ops))) {
316 fqp = fq_alloc_priv(dp);
319 fq_reference_priv(fqp);
322 TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
323 fqp->flags |= FQP_LINKED_FQMP;
326 FQ_GLOBAL_FQMP_LOCK();
327 TAILQ_INSERT_TAIL(&dsched_fqmp_list, fqmp, link);
328 FQ_GLOBAL_FQMP_UNLOCK();
329 FQ_FQMP_UNLOCK(fqmp);
331 atomic_add_int(&fq_stats.fqmp_allocations, 1);
337 fq_dispatcher(struct dsched_fq_dpriv *dpriv)
339 struct dsched_fq_mpriv *fqmp;
340 struct dsched_fq_priv *fqp, *fqp2;
341 struct bio *bio, *bio2;
345 * We need to manually assign an fqp to the fqmp of this thread
346 * since it isn't assigned one during fq_prepare, as the disk
349 fqmp = dsched_get_thread_priv(curthread);
350 KKASSERT(fqmp != NULL);
352 fqp = fq_alloc_priv(dpriv->dp);
355 fq_reference_priv(fqp);
357 TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
358 FQ_FQMP_UNLOCK(fqmp);
361 FQ_DPRIV_LOCK(dpriv);
365 if ((ssleep(dpriv, &dpriv->lock, 0, "fq_dispatcher", hz/15) == 0)) {
367 * We've been woken up; this either means that we are
368 * supposed to die away nicely or that the disk is idle.
371 if (dpriv->die == 1) {
372 FQ_DPRIV_UNLOCK(dpriv);
373 kprintf("fq_dispatcher is peacefully dying\n");
379 * We have been awakened because the disk is idle.
380 * So let's get ready to dispatch some extra bios.
385 /* Maybe the disk is idle and we just didn't get the wakeup */
390 * XXX: further room for improvements here. It would be better
391 * to dispatch a few requests from each fqp as to ensure
394 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
395 if (fqp->qlength == 0)
401 * XXX: why 5 extra? should probably be dynamic,
402 * relying on information on latency.
404 if ((fqp->max_tp > 0) && idle &&
405 (fqp->issued >= fqp->max_tp)) {
407 ++fqp->idle_generation;
410 TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) {
411 if ((fqp->max_tp > 0) &&
412 ((fqp->issued >= fqp->max_tp)))
415 TAILQ_REMOVE(&fqp->queue, bio, link);
419 * beware that we do have an fqp reference
422 fq_dispatch(dpriv, bio, fqp);
431 fq_balance_thread(struct dsched_fq_dpriv *dpriv)
433 struct dsched_fq_priv *fqp, *fqp2;
435 static int last_full = 0, prev_full = 0;
436 static int limited_procs = 0;
439 int64_t budget, total_budget, used_budget;
440 int64_t budgetpb[FQ_PRIO_MAX+1];
443 bzero(budgetpb, sizeof(budgetpb));
446 FQ_DPRIV_LOCK(dpriv);
447 disk_busy = (100*(FQ_TOTAL_DISK_TIME - dpriv->idle_time)) /
451 incomplete_tp = dpriv->incomplete_tp;
452 dpriv->idle_time = 0;
454 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
455 if (fqp->transactions > 0 /* 30 */) {
457 total_budget += (fqp->avg_latency * fqp->transactions);
459 * XXX: while the code below really sucked, the problem needs to
460 * be addressed eventually. Some processes take up their "fair"
461 * slice, but don't really need even a 10th of it.
462 * This kills performance for those that do need the
467 * This is *very* hackish. It basically tries to avoid that
468 * processes that do only very few tps take away more bandwidth
471 if ((limited_procs >= 1) && (fqp->transactions < 25) &&
472 (budgetpb[(fqp->p) ? fqp->p->p_ionice : 0] >= 1))
476 ++budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
478 dsched_debug(LOG_INFO,
479 "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
480 n, fqp->avg_latency, fqp->transactions,
481 (fqp->p) ? fqp->p->p_ionice : 0);
485 fqp->avg_latency = 0;
489 dsched_debug(LOG_INFO, "%d procs competing for disk\n"
490 "total_budget = %lld\n"
491 "incomplete tp = %d\n", n, total_budget, incomplete_tp);
497 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
498 if (budgetpb[i] == 0)
500 sum += (FQ_PRIO_BIAS+i)*budgetpb[i];
504 dsched_debug(LOG_INFO, "sum = %d\n", sum);
506 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
507 if (budgetpb[i] == 0)
510 budgetpb[i] = ((FQ_PRIO_BIAS+i)*10)*total_budget/sum;
513 if (total_budget > dpriv->max_budget)
514 dpriv->max_budget = total_budget;
518 dsched_debug(4, "disk is %d\% busy\n", disk_busy);
521 * XXX: eventually remove all the silly *10...
523 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
524 budget = budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
526 used_budget = ((int64_t)10*(int64_t)fqp->avg_latency *
527 (int64_t)fqp->transactions);
528 if (used_budget > 0) {
529 dsched_debug(LOG_INFO,
530 "info: used_budget = %lld, budget = %lld\n", used_budget,
535 * process is exceeding its fair share; rate-limit it, but only
536 * if the disk is being used at 90+% of capacity
538 if ((used_budget > budget) && (disk_busy >= 90) &&
539 (incomplete_tp > n*2)) {
540 KKASSERT(fqp->avg_latency != 0);
542 fqp->max_tp = budget/(10*fqp->avg_latency);
544 dsched_debug(LOG_INFO,
545 "rate limited to %d transactions\n", fqp->max_tp);
546 atomic_add_int(&fq_stats.procs_limited, 1);
547 } else if (((used_budget*2 < budget) || (disk_busy < 90)) &&
548 (!prev_full && !last_full)) {
550 * process is really using little of its timeslice, or the
551 * disk is not busy, so let's reset the rate-limit.
552 * Without this, exceeding processes will get an unlimited
553 * slice every other slice.
554 * XXX: this still doesn't quite fix the issue, but maybe
555 * it's good that way, so that heavy writes are interleaved.
559 fqp->transactions = 0;
560 fqp->avg_latency = 0;
564 prev_full = last_full;
565 last_full = (disk_busy >= 90)?1:0;
568 FQ_DPRIV_UNLOCK(dpriv);
569 callout_reset(&fq_callout, hz * FQ_REBALANCE_TIMEOUT,
570 (void (*)(void *))fq_balance_thread, dpriv);
575 do_fqstats(SYSCTL_HANDLER_ARGS)
577 return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
581 SYSCTL_PROC(_kern, OID_AUTO, fq_stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
582 0, sizeof(struct dsched_fq_stats), do_fqstats, "fq_stats",
583 "dsched_fq statistics");
603 fq_priv_cache = objcache_create("fq-priv-cache", 0, 0,
605 objcache_malloc_alloc,
606 objcache_malloc_free,
607 &dsched_fq_priv_malloc_args );
609 fq_mpriv_cache = objcache_create("fq-mpriv-cache", 0, 0,
611 objcache_malloc_alloc,
612 objcache_malloc_free,
613 &dsched_fq_mpriv_malloc_args );
615 FQ_GLOBAL_FQMP_LOCKINIT();
617 fq_dpriv_cache = objcache_create("fq-dpriv-cache", 0, 0,
619 objcache_malloc_alloc,
620 objcache_malloc_free,
621 &dsched_fq_dpriv_malloc_args );
623 bzero(&fq_stats, sizeof(struct dsched_fq_stats));
625 dsched_register(&dsched_fq_ops);
626 callout_init_mp(&fq_callout);
628 kprintf("FQ scheduler policy version %d.%d loaded\n",
629 dsched_fq_version_maj, dsched_fq_version_min);
635 callout_stop(&fq_callout);
636 callout_deactivate(&fq_callout);
640 SYSINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, fq_init, NULL);
641 SYSUNINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, fq_uninit, NULL);
643 SYSINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, fq_earlyinit, NULL);
644 SYSUNINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, fq_earlyuninit, NULL);