dsched - Add the FQ policy
[dragonfly.git] / sys / dsched / fq / dsched_fq_core.c
1 /*
2  * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Alex Hornung <ahornung@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/buf.h>
40 #include <sys/conf.h>
41 #include <sys/diskslice.h>
42 #include <sys/disk.h>
43 #include <machine/atomic.h>
44 #include <sys/malloc.h>
45 #include <sys/thread.h>
46 #include <sys/thread2.h>
47 #include <sys/sysctl.h>
48 #include <sys/spinlock2.h>
49 #include <machine/md_var.h>
50 #include <sys/ctype.h>
51 #include <sys/syslog.h>
52 #include <sys/device.h>
53 #include <sys/msgport.h>
54 #include <sys/msgport2.h>
55 #include <sys/buf2.h>
56 #include <sys/dsched.h>
57 #include <machine/varargs.h>
58 #include <machine/param.h>
59
60 #include <dsched/fq/dsched_fq.h>
61
62 MALLOC_DECLARE(M_DSCHEDFQ);
63
64 static int      dsched_fq_version_maj = 0;
65 static int      dsched_fq_version_min = 7;
66
67 struct dsched_fq_stats  fq_stats;
68
69 struct objcache_malloc_args dsched_fq_dpriv_malloc_args = {
70         sizeof(struct dsched_fq_dpriv), M_DSCHEDFQ };
71 struct objcache_malloc_args dsched_fq_priv_malloc_args = {
72         sizeof(struct dsched_fq_priv), M_DSCHEDFQ };
73 struct objcache_malloc_args dsched_fq_mpriv_malloc_args = {
74         sizeof(struct dsched_fq_mpriv), M_DSCHEDFQ };
75
76 static struct objcache  *fq_dpriv_cache;
77 static struct objcache  *fq_mpriv_cache;
78 static struct objcache  *fq_priv_cache;
79
80 TAILQ_HEAD(, dsched_fq_mpriv)   dsched_fqmp_list =
81                 TAILQ_HEAD_INITIALIZER(dsched_fqmp_list);
82
83 struct spinlock fq_fqmp_lock;
84 struct callout  fq_callout;
85
86 extern struct dsched_ops dsched_fq_ops;
87
88 void db_stack_trace_cmd(int addr, boolean_t have_addr, int count,char *modif);
89 void fq_print_backtrace(void);
90
91 void
92 fq_print_backtrace(void)
93 {
94         register_t  ebp;
95
96         __asm __volatile("movl %%ebp, %0" : "=r" (ebp));
97         db_stack_trace_cmd(ebp, 1, 4, NULL);
98 }
99
100 void
101 fq_reference_dpriv(struct dsched_fq_dpriv *dpriv)
102 {
103         int refcount;
104
105         refcount = atomic_fetchadd_int(&dpriv->refcount, 1);
106
107         KKASSERT(refcount >= 0);
108 }
109
110 void
111 fq_reference_priv(struct dsched_fq_priv *fqp)
112 {
113         int refcount;
114
115         refcount = atomic_fetchadd_int(&fqp->refcount, 1);
116
117         KKASSERT(refcount >= 0);
118 }
119
120 void
121 fq_reference_mpriv(struct dsched_fq_mpriv *fqmp)
122 {
123         int refcount;
124
125         refcount = atomic_fetchadd_int(&fqmp->refcount, 1);
126
127         KKASSERT(refcount >= 0);
128 }
129
130 void
131 fq_dereference_dpriv(struct dsched_fq_dpriv *dpriv)
132 {
133         struct dsched_fq_priv   *fqp, *fqp2;
134         int refcount;
135
136         refcount = atomic_fetchadd_int(&dpriv->refcount, -1);
137
138
139         KKASSERT(refcount >= -3);
140
141         if (refcount == 1) {
142                 atomic_subtract_int(&dpriv->refcount, 3); /* mark as: in destruction */
143 #if 1
144                 kprintf("dpriv (%p) destruction started, trace:\n", dpriv);
145                 fq_print_backtrace();
146 #endif
147                 spin_lock_wr(&dpriv->lock);
148                 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
149                         TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
150                         fqp->flags &= ~FQP_LINKED_DPRIV;
151                         fq_dereference_priv(fqp);
152                 }
153                 spin_unlock_wr(&dpriv->lock);
154
155                 objcache_put(fq_dpriv_cache, dpriv);
156                 atomic_subtract_int(&fq_stats.dpriv_allocations, 1);
157         }
158 }
159
160 void
161 fq_dereference_priv(struct dsched_fq_priv *fqp)
162 {
163         struct dsched_fq_mpriv  *fqmp;
164         struct dsched_fq_dpriv  *dpriv;
165         int refcount;
166
167         refcount = atomic_fetchadd_int(&fqp->refcount, -1);
168
169         KKASSERT(refcount >= -3);
170
171         if (refcount == 1) {
172                 atomic_subtract_int(&fqp->refcount, 3); /* mark as: in destruction */
173 #if 0
174                 kprintf("fqp (%p) destruction started, trace:\n", fqp);
175                 fq_print_backtrace();
176 #endif
177                 dpriv = fqp->dpriv;
178                 KKASSERT(dpriv != NULL);
179
180                 spin_lock_wr(&fqp->lock);
181
182                 KKASSERT(fqp->qlength == 0);
183
184                 if (fqp->flags & FQP_LINKED_DPRIV) {
185                         spin_lock_wr(&dpriv->lock);
186
187                         TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
188                         fqp->flags &= ~FQP_LINKED_DPRIV;
189
190                         spin_unlock_wr(&dpriv->lock);
191                 }
192
193                 if (fqp->flags & FQP_LINKED_FQMP) {
194                         fqmp = fqp->fqmp;
195                         KKASSERT(fqmp != NULL);
196
197                         spin_lock_wr(&fqmp->lock);
198
199                         TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
200                         fqp->flags &= ~FQP_LINKED_FQMP;
201
202                         spin_unlock_wr(&fqmp->lock);
203                 }
204
205                 spin_unlock_wr(&fqp->lock);
206
207                 objcache_put(fq_priv_cache, fqp);
208                 atomic_subtract_int(&fq_stats.fqp_allocations, 1);
209 #if 0
210                 fq_dereference_dpriv(dpriv);
211 #endif
212         }
213 }
214
215 void
216 fq_dereference_mpriv(struct dsched_fq_mpriv *fqmp)
217 {
218         struct dsched_fq_priv   *fqp, *fqp2;
219         int refcount;
220
221         refcount = atomic_fetchadd_int(&fqmp->refcount, -1);
222
223         KKASSERT(refcount >= -3);
224
225         if (refcount == 1) {
226                 atomic_subtract_int(&fqmp->refcount, 3); /* mark as: in destruction */
227 #if 0
228                 kprintf("fqmp (%p) destruction started, trace:\n", fqmp);
229                 fq_print_backtrace();
230 #endif
231                 FQ_GLOBAL_FQMP_LOCK();
232                 spin_lock_wr(&fqmp->lock);
233
234                 TAILQ_FOREACH_MUTABLE(fqp, &fqmp->fq_priv_list, link, fqp2) {
235                         TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
236                         fqp->flags &= ~FQP_LINKED_FQMP;
237                         fq_dereference_priv(fqp);
238                 }
239                 TAILQ_REMOVE(&dsched_fqmp_list, fqmp, link);
240
241                 spin_unlock_wr(&fqmp->lock);
242                 FQ_GLOBAL_FQMP_UNLOCK();
243
244                 objcache_put(fq_mpriv_cache, fqmp);
245                 atomic_subtract_int(&fq_stats.fqmp_allocations, 1);
246         }
247 }
248
249
250 struct dsched_fq_priv *
251 fq_alloc_priv(struct disk *dp)
252 {
253         struct dsched_fq_priv   *fqp;
254 #if 0
255         fq_reference_dpriv(dsched_get_disk_priv(dp));
256 #endif
257         fqp = objcache_get(fq_priv_cache, M_WAITOK);
258         bzero(fqp, sizeof(struct dsched_fq_priv));
259
260         /* XXX: maybe we do need another ref for the disk list for fqp */
261         fq_reference_priv(fqp);
262
263         FQ_FQP_LOCKINIT(fqp);
264         FQ_FQP_LOCK(fqp);
265         fqp->dp = dp;
266
267         fqp->dpriv = dsched_get_disk_priv(dp);
268
269         TAILQ_INIT(&fqp->queue);
270         TAILQ_INSERT_TAIL(&fqp->dpriv->fq_priv_list, fqp, dlink);
271         fqp->flags |= FQP_LINKED_DPRIV;
272
273         atomic_add_int(&fq_stats.fqp_allocations, 1);
274         FQ_FQP_UNLOCK(fqp);
275         return fqp;
276 }
277
278
279 struct dsched_fq_dpriv *
280 fq_alloc_dpriv(struct disk *dp)
281 {
282         struct dsched_fq_dpriv *dpriv;
283
284         dpriv = objcache_get(fq_dpriv_cache, M_WAITOK);
285         bzero(dpriv, sizeof(struct dsched_fq_dpriv));
286         fq_reference_dpriv(dpriv);
287         dpriv->dp = dp;
288         dpriv->avg_rq_time = 0;
289         dpriv->incomplete_tp = 0;
290         FQ_DPRIV_LOCKINIT(dpriv);
291         TAILQ_INIT(&dpriv->fq_priv_list);
292
293         atomic_add_int(&fq_stats.dpriv_allocations, 1);
294         return dpriv;
295 }
296
297
298 struct dsched_fq_mpriv *
299 fq_alloc_mpriv()
300 {
301         struct dsched_fq_mpriv  *fqmp;
302         struct dsched_fq_priv   *fqp;
303         struct disk     *dp = NULL;
304
305         fqmp = objcache_get(fq_mpriv_cache, M_WAITOK);
306         bzero(fqmp, sizeof(struct dsched_fq_mpriv));
307         fq_reference_mpriv(fqmp);
308 #if 0
309         kprintf("fq_alloc_mpriv, new fqmp = %p\n", fqmp);
310 #endif
311         FQ_FQMP_LOCKINIT(fqmp);
312         FQ_FQMP_LOCK(fqmp);
313         TAILQ_INIT(&fqmp->fq_priv_list);
314
315         while ((dp = dsched_disk_enumerate(dp, &dsched_fq_ops))) {
316                 fqp = fq_alloc_priv(dp);
317
318 #if 0
319                 fq_reference_priv(fqp);
320 #endif
321                 fqp->fqmp = fqmp;
322                 TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
323                 fqp->flags |= FQP_LINKED_FQMP;
324         }
325
326         FQ_GLOBAL_FQMP_LOCK();
327         TAILQ_INSERT_TAIL(&dsched_fqmp_list, fqmp, link);
328         FQ_GLOBAL_FQMP_UNLOCK();
329         FQ_FQMP_UNLOCK(fqmp);
330
331         atomic_add_int(&fq_stats.fqmp_allocations, 1);
332         return fqmp;
333 }
334
335
336 void
337 fq_dispatcher(struct dsched_fq_dpriv *dpriv)
338 {
339         struct dsched_fq_priv   *fqp, *fqp2;
340         struct bio *bio, *bio2;
341         int count;
342
343         FQ_DPRIV_LOCK(dpriv);
344         for(;;) {
345                 /* sleep ~60 ms */
346                 if (ssleep(dpriv, &dpriv->lock, 0, "fq_dispatcher", hz/15) == 0) {
347                         FQ_DPRIV_UNLOCK(dpriv);
348                         kprintf("fq_dispatcher is peacefully dying\n");
349                         lwkt_exit();
350                 }
351
352                 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
353                         if (fqp->qlength > 0) {
354                                 FQ_FQP_LOCK(fqp);
355                                 count = 0;
356
357                                 TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) {
358                                         if ((fqp->max_tp > 0) &&
359                                             ((count >= fqp->max_tp) ||
360                                             (fqp->transactions >= fqp->max_tp)))
361                                                 break;
362                                         TAILQ_REMOVE(&fqp->queue, bio, link);
363
364                                         --fqp->qlength;
365                                         KKASSERT(fqp != NULL);
366                                         /*
367                                          * beware that we do have an fqp reference
368                                          * from the queueing
369                                          */
370                                         dsched_strategy_async(dpriv->dp, bio,
371                                             fq_completed, fqp);
372                                         atomic_add_int(&dpriv->incomplete_tp, 1);
373                                         atomic_add_int(&fq_stats.transactions, 1);
374                                         ++count;
375                                 }
376                                 FQ_FQP_UNLOCK(fqp);
377                         }
378                 }
379         }
380 }
381
382
383 void
384 fq_balance_thread(struct dsched_fq_dpriv *dpriv)
385 {
386         struct  dsched_fq_priv  *fqp, *fqp2;
387         int     n = 0;
388         static int last_full = 0, prev_full = 0;
389         int     incomplete_tp;
390         int64_t total_budget, use_pct, avail_pct;
391         total_budget = 0;
392
393         FQ_DPRIV_LOCK(dpriv);
394         incomplete_tp = dpriv->incomplete_tp;
395
396         TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
397                 if (fqp->transactions > 0 /* 30 */) {
398                         total_budget += (fqp->avg_latency * fqp->transactions);
399                         dsched_debug(LOG_INFO,
400                             "%d) avg_latency = %d, transactions = %d\n",
401                             n, fqp->avg_latency, fqp->transactions);
402                         ++n;
403                 } else {
404                         fqp->max_tp = 0;
405                         fqp->avg_latency = 0;
406                 }
407         }
408
409         dsched_debug(LOG_INFO, "%d procs competing for disk\n"
410             "total_budget = %lld\n"
411             "incomplete tp = %d\n", n, total_budget, incomplete_tp);
412
413         if (n == 0)
414                 goto done;
415
416 #if 0
417         /*
418          * XXX: hack. don't know why total_budget can be zero here
419          * -> this doesn't apply anymore. total_budget is never 0 now
420          */
421         if (total_budget == 0)
422                 total_budget = 1;
423 #endif
424
425         TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
426                 /* XXX: proportional to scheduler class! */
427                 avail_pct = (int64_t)1000/(int64_t)n;
428
429                 /* XXX: 100/(sum of scheduler priorities) * scheduler priority */
430                 /* XXX: but need to process queues of fqp on buckets or so...*/
431
432                 use_pct = ((int64_t)1000* (int64_t)fqp->avg_latency *
433                     (int64_t)fqp->transactions)/(int64_t)total_budget;
434
435                 /* process is exceeding its fair share; rate-limit it */
436                 if ((use_pct > avail_pct) && (incomplete_tp > n*2)) {
437                         /* kprintf("here we are, use_pct > avail_pct\n"); */
438                         /* fqp->max_tp = avail_pct * fqp->avg_latency; */
439                         fqp->max_tp = total_budget/(n * fqp->avg_latency);
440                         dsched_debug(LOG_INFO,
441                             "rate limited to %d transactions\n", fqp->max_tp);
442                         atomic_add_int(&fq_stats.procs_limited, 1);
443                 } else if (((use_pct < avail_pct/2) || (incomplete_tp < n*2)) &&
444                     (!prev_full && !last_full)) {
445                         /*
446                          * process is really using little of its timeslice, or the
447                          * disk is not busy, so let's reset the rate-limit.
448                          * Without this, exceeding processes will get an unlimited
449                          * slice every other slice.
450                          * XXX: this still doesn't quite fix the issue, but maybe,
451                          * it's good that way so that heavy writes are interleaved.
452                          */
453                         fqp->max_tp = 0;
454                 }
455                 fqp->transactions = 0;
456                 fqp->avg_latency = 0;
457         }
458
459         prev_full = last_full;
460         last_full = (incomplete_tp > n*2)?1:0;
461
462 done:
463         FQ_DPRIV_UNLOCK(dpriv);
464         callout_reset(&fq_callout, hz * FQ_REBALANCE_TIMEOUT,
465             (void (*)(void *))fq_balance_thread, dpriv);
466 }
467
468
469 static int
470 do_fqstats(SYSCTL_HANDLER_ARGS)
471 {
472         return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
473 }
474
475
476 SYSCTL_PROC(_kern, OID_AUTO, fq_stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
477     0, sizeof(struct dsched_fq_stats), do_fqstats, "fq_stats",
478     "dsched_fq statistics");
479
480
481
482
483 static void
484 fq_init(void)
485 {
486
487 }
488
489 static void
490 fq_uninit(void)
491 {
492
493 }
494
495 static void
496 fq_earlyinit(void)
497 {
498         fq_priv_cache = objcache_create("fq-priv-cache", 0, 0,
499                                            NULL, NULL, NULL,
500                                            objcache_malloc_alloc,
501                                            objcache_malloc_free,
502                                            &dsched_fq_priv_malloc_args );
503
504         fq_mpriv_cache = objcache_create("fq-mpriv-cache", 0, 0,
505                                            NULL, NULL, NULL,
506                                            objcache_malloc_alloc,
507                                            objcache_malloc_free,
508                                            &dsched_fq_mpriv_malloc_args );
509
510         FQ_GLOBAL_FQMP_LOCKINIT();
511
512         fq_dpriv_cache = objcache_create("fq-dpriv-cache", 0, 0,
513                                            NULL, NULL, NULL,
514                                            objcache_malloc_alloc,
515                                            objcache_malloc_free,
516                                            &dsched_fq_dpriv_malloc_args );
517
518         bzero(&fq_stats, sizeof(struct dsched_fq_stats));
519
520         dsched_register(&dsched_fq_ops);
521         callout_init_mp(&fq_callout);
522
523         kprintf("FQ scheduler policy version %d.%d loaded\n",
524             dsched_fq_version_maj, dsched_fq_version_min);
525 }
526
527 static void
528 fq_earlyuninit(void)
529 {
530         callout_stop(&fq_callout);
531         callout_deactivate(&fq_callout);
532         return;
533 }
534
535 SYSINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, fq_init, NULL);
536 SYSUNINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, fq_uninit, NULL);
537
538 SYSINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, fq_earlyinit, NULL);
539 SYSUNINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, fq_earlyuninit, NULL);