2 * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Alex Hornung <ahornung@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
41 #include <sys/diskslice.h>
43 #include <machine/atomic.h>
44 #include <sys/malloc.h>
45 #include <sys/thread.h>
46 #include <sys/thread2.h>
47 #include <sys/sysctl.h>
48 #include <sys/spinlock2.h>
49 #include <machine/md_var.h>
50 #include <sys/ctype.h>
51 #include <sys/syslog.h>
52 #include <sys/device.h>
53 #include <sys/msgport.h>
54 #include <sys/msgport2.h>
56 #include <sys/dsched.h>
57 #include <machine/varargs.h>
58 #include <machine/param.h>
60 #include <dsched/fq/dsched_fq.h>
62 MALLOC_DECLARE(M_DSCHEDFQ);
64 static int dsched_fq_version_maj = 0;
65 static int dsched_fq_version_min = 7;
67 struct dsched_fq_stats fq_stats;
69 struct objcache_malloc_args dsched_fq_dpriv_malloc_args = {
70 sizeof(struct dsched_fq_dpriv), M_DSCHEDFQ };
71 struct objcache_malloc_args dsched_fq_priv_malloc_args = {
72 sizeof(struct dsched_fq_priv), M_DSCHEDFQ };
73 struct objcache_malloc_args dsched_fq_mpriv_malloc_args = {
74 sizeof(struct dsched_fq_mpriv), M_DSCHEDFQ };
76 static struct objcache *fq_dpriv_cache;
77 static struct objcache *fq_mpriv_cache;
78 static struct objcache *fq_priv_cache;
80 TAILQ_HEAD(, dsched_fq_mpriv) dsched_fqmp_list =
81 TAILQ_HEAD_INITIALIZER(dsched_fqmp_list);
83 struct spinlock fq_fqmp_lock;
84 struct callout fq_callout;
86 extern struct dsched_ops dsched_fq_ops;
88 void db_stack_trace_cmd(int addr, boolean_t have_addr, int count,char *modif);
89 void fq_print_backtrace(void);
92 fq_print_backtrace(void)
96 __asm __volatile("movl %%ebp, %0" : "=r" (ebp));
97 db_stack_trace_cmd(ebp, 1, 4, NULL);
101 fq_reference_dpriv(struct dsched_fq_dpriv *dpriv)
105 refcount = atomic_fetchadd_int(&dpriv->refcount, 1);
107 KKASSERT(refcount >= 0);
111 fq_reference_priv(struct dsched_fq_priv *fqp)
115 refcount = atomic_fetchadd_int(&fqp->refcount, 1);
117 KKASSERT(refcount >= 0);
121 fq_reference_mpriv(struct dsched_fq_mpriv *fqmp)
125 refcount = atomic_fetchadd_int(&fqmp->refcount, 1);
127 KKASSERT(refcount >= 0);
131 fq_dereference_dpriv(struct dsched_fq_dpriv *dpriv)
133 struct dsched_fq_priv *fqp, *fqp2;
136 refcount = atomic_fetchadd_int(&dpriv->refcount, -1);
139 KKASSERT(refcount >= -3);
142 atomic_subtract_int(&dpriv->refcount, 3); /* mark as: in destruction */
144 kprintf("dpriv (%p) destruction started, trace:\n", dpriv);
145 fq_print_backtrace();
147 spin_lock_wr(&dpriv->lock);
148 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
149 TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
150 fqp->flags &= ~FQP_LINKED_DPRIV;
151 fq_dereference_priv(fqp);
153 spin_unlock_wr(&dpriv->lock);
155 objcache_put(fq_dpriv_cache, dpriv);
156 atomic_subtract_int(&fq_stats.dpriv_allocations, 1);
161 fq_dereference_priv(struct dsched_fq_priv *fqp)
163 struct dsched_fq_mpriv *fqmp;
164 struct dsched_fq_dpriv *dpriv;
167 refcount = atomic_fetchadd_int(&fqp->refcount, -1);
169 KKASSERT(refcount >= -3);
172 atomic_subtract_int(&fqp->refcount, 3); /* mark as: in destruction */
174 kprintf("fqp (%p) destruction started, trace:\n", fqp);
175 fq_print_backtrace();
178 KKASSERT(dpriv != NULL);
180 spin_lock_wr(&fqp->lock);
182 KKASSERT(fqp->qlength == 0);
184 if (fqp->flags & FQP_LINKED_DPRIV) {
185 spin_lock_wr(&dpriv->lock);
187 TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
188 fqp->flags &= ~FQP_LINKED_DPRIV;
190 spin_unlock_wr(&dpriv->lock);
193 if (fqp->flags & FQP_LINKED_FQMP) {
195 KKASSERT(fqmp != NULL);
197 spin_lock_wr(&fqmp->lock);
199 TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
200 fqp->flags &= ~FQP_LINKED_FQMP;
202 spin_unlock_wr(&fqmp->lock);
205 spin_unlock_wr(&fqp->lock);
207 objcache_put(fq_priv_cache, fqp);
208 atomic_subtract_int(&fq_stats.fqp_allocations, 1);
210 fq_dereference_dpriv(dpriv);
216 fq_dereference_mpriv(struct dsched_fq_mpriv *fqmp)
218 struct dsched_fq_priv *fqp, *fqp2;
221 refcount = atomic_fetchadd_int(&fqmp->refcount, -1);
223 KKASSERT(refcount >= -3);
226 atomic_subtract_int(&fqmp->refcount, 3); /* mark as: in destruction */
228 kprintf("fqmp (%p) destruction started, trace:\n", fqmp);
229 fq_print_backtrace();
231 FQ_GLOBAL_FQMP_LOCK();
232 spin_lock_wr(&fqmp->lock);
234 TAILQ_FOREACH_MUTABLE(fqp, &fqmp->fq_priv_list, link, fqp2) {
235 TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
236 fqp->flags &= ~FQP_LINKED_FQMP;
237 fq_dereference_priv(fqp);
239 TAILQ_REMOVE(&dsched_fqmp_list, fqmp, link);
241 spin_unlock_wr(&fqmp->lock);
242 FQ_GLOBAL_FQMP_UNLOCK();
244 objcache_put(fq_mpriv_cache, fqmp);
245 atomic_subtract_int(&fq_stats.fqmp_allocations, 1);
250 struct dsched_fq_priv *
251 fq_alloc_priv(struct disk *dp)
253 struct dsched_fq_priv *fqp;
255 fq_reference_dpriv(dsched_get_disk_priv(dp));
257 fqp = objcache_get(fq_priv_cache, M_WAITOK);
258 bzero(fqp, sizeof(struct dsched_fq_priv));
260 /* XXX: maybe we do need another ref for the disk list for fqp */
261 fq_reference_priv(fqp);
263 FQ_FQP_LOCKINIT(fqp);
267 fqp->dpriv = dsched_get_disk_priv(dp);
269 TAILQ_INIT(&fqp->queue);
270 TAILQ_INSERT_TAIL(&fqp->dpriv->fq_priv_list, fqp, dlink);
271 fqp->flags |= FQP_LINKED_DPRIV;
273 atomic_add_int(&fq_stats.fqp_allocations, 1);
279 struct dsched_fq_dpriv *
280 fq_alloc_dpriv(struct disk *dp)
282 struct dsched_fq_dpriv *dpriv;
284 dpriv = objcache_get(fq_dpriv_cache, M_WAITOK);
285 bzero(dpriv, sizeof(struct dsched_fq_dpriv));
286 fq_reference_dpriv(dpriv);
288 dpriv->avg_rq_time = 0;
289 dpriv->incomplete_tp = 0;
290 FQ_DPRIV_LOCKINIT(dpriv);
291 TAILQ_INIT(&dpriv->fq_priv_list);
293 atomic_add_int(&fq_stats.dpriv_allocations, 1);
298 struct dsched_fq_mpriv *
301 struct dsched_fq_mpriv *fqmp;
302 struct dsched_fq_priv *fqp;
303 struct disk *dp = NULL;
305 fqmp = objcache_get(fq_mpriv_cache, M_WAITOK);
306 bzero(fqmp, sizeof(struct dsched_fq_mpriv));
307 fq_reference_mpriv(fqmp);
309 kprintf("fq_alloc_mpriv, new fqmp = %p\n", fqmp);
311 FQ_FQMP_LOCKINIT(fqmp);
313 TAILQ_INIT(&fqmp->fq_priv_list);
315 while ((dp = dsched_disk_enumerate(dp, &dsched_fq_ops))) {
316 fqp = fq_alloc_priv(dp);
319 fq_reference_priv(fqp);
322 TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
323 fqp->flags |= FQP_LINKED_FQMP;
326 FQ_GLOBAL_FQMP_LOCK();
327 TAILQ_INSERT_TAIL(&dsched_fqmp_list, fqmp, link);
328 FQ_GLOBAL_FQMP_UNLOCK();
329 FQ_FQMP_UNLOCK(fqmp);
331 atomic_add_int(&fq_stats.fqmp_allocations, 1);
337 fq_dispatcher(struct dsched_fq_dpriv *dpriv)
339 struct dsched_fq_priv *fqp, *fqp2;
340 struct bio *bio, *bio2;
343 FQ_DPRIV_LOCK(dpriv);
346 if (ssleep(dpriv, &dpriv->lock, 0, "fq_dispatcher", hz/15) == 0) {
347 FQ_DPRIV_UNLOCK(dpriv);
348 kprintf("fq_dispatcher is peacefully dying\n");
352 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
353 if (fqp->qlength > 0) {
357 TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) {
358 if ((fqp->max_tp > 0) &&
359 ((count >= fqp->max_tp) ||
360 (fqp->transactions >= fqp->max_tp)))
362 TAILQ_REMOVE(&fqp->queue, bio, link);
365 KKASSERT(fqp != NULL);
367 * beware that we do have an fqp reference
370 dsched_strategy_async(dpriv->dp, bio,
372 atomic_add_int(&dpriv->incomplete_tp, 1);
373 atomic_add_int(&fq_stats.transactions, 1);
384 fq_balance_thread(struct dsched_fq_dpriv *dpriv)
386 struct dsched_fq_priv *fqp, *fqp2;
388 static int last_full = 0, prev_full = 0;
390 int64_t total_budget, use_pct, avail_pct;
393 FQ_DPRIV_LOCK(dpriv);
394 incomplete_tp = dpriv->incomplete_tp;
396 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
397 if (fqp->transactions > 0 /* 30 */) {
398 total_budget += (fqp->avg_latency * fqp->transactions);
399 dsched_debug(LOG_INFO,
400 "%d) avg_latency = %d, transactions = %d\n",
401 n, fqp->avg_latency, fqp->transactions);
405 fqp->avg_latency = 0;
409 dsched_debug(LOG_INFO, "%d procs competing for disk\n"
410 "total_budget = %lld\n"
411 "incomplete tp = %d\n", n, total_budget, incomplete_tp);
418 * XXX: hack. don't know why total_budget can be zero here
419 * -> this doesn't apply anymore. total_budget is never 0 now
421 if (total_budget == 0)
425 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
426 /* XXX: proportional to scheduler class! */
427 avail_pct = (int64_t)1000/(int64_t)n;
429 /* XXX: 100/(sum of scheduler priorities) * scheduler priority */
430 /* XXX: but need to process queues of fqp on buckets or so...*/
432 use_pct = ((int64_t)1000* (int64_t)fqp->avg_latency *
433 (int64_t)fqp->transactions)/(int64_t)total_budget;
435 /* process is exceeding its fair share; rate-limit it */
436 if ((use_pct > avail_pct) && (incomplete_tp > n*2)) {
437 /* kprintf("here we are, use_pct > avail_pct\n"); */
438 /* fqp->max_tp = avail_pct * fqp->avg_latency; */
439 fqp->max_tp = total_budget/(n * fqp->avg_latency);
440 dsched_debug(LOG_INFO,
441 "rate limited to %d transactions\n", fqp->max_tp);
442 atomic_add_int(&fq_stats.procs_limited, 1);
443 } else if (((use_pct < avail_pct/2) || (incomplete_tp < n*2)) &&
444 (!prev_full && !last_full)) {
446 * process is really using little of its timeslice, or the
447 * disk is not busy, so let's reset the rate-limit.
448 * Without this, exceeding processes will get an unlimited
449 * slice every other slice.
450 * XXX: this still doesn't quite fix the issue, but maybe,
451 * it's good that way so that heavy writes are interleaved.
455 fqp->transactions = 0;
456 fqp->avg_latency = 0;
459 prev_full = last_full;
460 last_full = (incomplete_tp > n*2)?1:0;
463 FQ_DPRIV_UNLOCK(dpriv);
464 callout_reset(&fq_callout, hz * FQ_REBALANCE_TIMEOUT,
465 (void (*)(void *))fq_balance_thread, dpriv);
470 do_fqstats(SYSCTL_HANDLER_ARGS)
472 return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
476 SYSCTL_PROC(_kern, OID_AUTO, fq_stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
477 0, sizeof(struct dsched_fq_stats), do_fqstats, "fq_stats",
478 "dsched_fq statistics");
498 fq_priv_cache = objcache_create("fq-priv-cache", 0, 0,
500 objcache_malloc_alloc,
501 objcache_malloc_free,
502 &dsched_fq_priv_malloc_args );
504 fq_mpriv_cache = objcache_create("fq-mpriv-cache", 0, 0,
506 objcache_malloc_alloc,
507 objcache_malloc_free,
508 &dsched_fq_mpriv_malloc_args );
510 FQ_GLOBAL_FQMP_LOCKINIT();
512 fq_dpriv_cache = objcache_create("fq-dpriv-cache", 0, 0,
514 objcache_malloc_alloc,
515 objcache_malloc_free,
516 &dsched_fq_dpriv_malloc_args );
518 bzero(&fq_stats, sizeof(struct dsched_fq_stats));
520 dsched_register(&dsched_fq_ops);
521 callout_init_mp(&fq_callout);
523 kprintf("FQ scheduler policy version %d.%d loaded\n",
524 dsched_fq_version_maj, dsched_fq_version_min);
530 callout_stop(&fq_callout);
531 callout_deactivate(&fq_callout);
535 SYSINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, fq_init, NULL);
536 SYSUNINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, fq_uninit, NULL);
538 SYSINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, fq_earlyinit, NULL);
539 SYSUNINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, fq_earlyuninit, NULL);