dsched - Implement priorities and other improvements
[dragonfly.git] / sys / dsched / fq / dsched_fq_core.c
1 /*
2  * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Alex Hornung <ahornung@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/buf.h>
40 #include <sys/conf.h>
41 #include <sys/diskslice.h>
42 #include <sys/disk.h>
43 #include <machine/atomic.h>
44 #include <sys/malloc.h>
45 #include <sys/thread.h>
46 #include <sys/thread2.h>
47 #include <sys/sysctl.h>
48 #include <sys/spinlock2.h>
49 #include <machine/md_var.h>
50 #include <sys/ctype.h>
51 #include <sys/syslog.h>
52 #include <sys/device.h>
53 #include <sys/msgport.h>
54 #include <sys/msgport2.h>
55 #include <sys/buf2.h>
56 #include <sys/dsched.h>
57 #include <machine/varargs.h>
58 #include <machine/param.h>
59
60 #include <dsched/fq/dsched_fq.h>
61
62 MALLOC_DECLARE(M_DSCHEDFQ);
63
64 static int      dsched_fq_version_maj = 0;
65 static int      dsched_fq_version_min = 7;
66
67 struct dsched_fq_stats  fq_stats;
68
69 struct objcache_malloc_args dsched_fq_dpriv_malloc_args = {
70         sizeof(struct dsched_fq_dpriv), M_DSCHEDFQ };
71 struct objcache_malloc_args dsched_fq_priv_malloc_args = {
72         sizeof(struct dsched_fq_priv), M_DSCHEDFQ };
73 struct objcache_malloc_args dsched_fq_mpriv_malloc_args = {
74         sizeof(struct dsched_fq_mpriv), M_DSCHEDFQ };
75
76 static struct objcache  *fq_dpriv_cache;
77 static struct objcache  *fq_mpriv_cache;
78 static struct objcache  *fq_priv_cache;
79
80 TAILQ_HEAD(, dsched_fq_mpriv)   dsched_fqmp_list =
81                 TAILQ_HEAD_INITIALIZER(dsched_fqmp_list);
82
83 struct spinlock fq_fqmp_lock;
84 struct callout  fq_callout;
85
86 extern struct dsched_ops dsched_fq_ops;
87
88 void db_stack_trace_cmd(int addr, boolean_t have_addr, int count,char *modif);
89 void fq_print_backtrace(void);
90
91 void
92 fq_print_backtrace(void)
93 {
94         register_t  ebp;
95
96         __asm __volatile("movl %%ebp, %0" : "=r" (ebp));
97         db_stack_trace_cmd(ebp, 1, 4, NULL);
98 }
99
100 void
101 fq_reference_dpriv(struct dsched_fq_dpriv *dpriv)
102 {
103         int refcount;
104
105         refcount = atomic_fetchadd_int(&dpriv->refcount, 1);
106
107         KKASSERT(refcount >= 0);
108 }
109
110 void
111 fq_reference_priv(struct dsched_fq_priv *fqp)
112 {
113         int refcount;
114
115         refcount = atomic_fetchadd_int(&fqp->refcount, 1);
116
117         KKASSERT(refcount >= 0);
118 }
119
120 void
121 fq_reference_mpriv(struct dsched_fq_mpriv *fqmp)
122 {
123         int refcount;
124
125         refcount = atomic_fetchadd_int(&fqmp->refcount, 1);
126
127         KKASSERT(refcount >= 0);
128 }
129
130 void
131 fq_dereference_dpriv(struct dsched_fq_dpriv *dpriv)
132 {
133         struct dsched_fq_priv   *fqp, *fqp2;
134         int refcount;
135
136         refcount = atomic_fetchadd_int(&dpriv->refcount, -1);
137
138
139         KKASSERT(refcount >= -3);
140
141         if (refcount == 1) {
142                 atomic_subtract_int(&dpriv->refcount, 3); /* mark as: in destruction */
143 #if 1
144                 kprintf("dpriv (%p) destruction started, trace:\n", dpriv);
145                 fq_print_backtrace();
146 #endif
147                 spin_lock_wr(&dpriv->lock);
148                 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
149                         TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
150                         fqp->flags &= ~FQP_LINKED_DPRIV;
151                         fq_dereference_priv(fqp);
152                 }
153                 spin_unlock_wr(&dpriv->lock);
154
155                 objcache_put(fq_dpriv_cache, dpriv);
156                 atomic_subtract_int(&fq_stats.dpriv_allocations, 1);
157         }
158 }
159
160 void
161 fq_dereference_priv(struct dsched_fq_priv *fqp)
162 {
163         struct dsched_fq_mpriv  *fqmp;
164         struct dsched_fq_dpriv  *dpriv;
165         int refcount;
166
167         refcount = atomic_fetchadd_int(&fqp->refcount, -1);
168
169         KKASSERT(refcount >= -3);
170
171         if (refcount == 1) {
172                 atomic_subtract_int(&fqp->refcount, 3); /* mark as: in destruction */
173 #if 0
174                 kprintf("fqp (%p) destruction started, trace:\n", fqp);
175                 fq_print_backtrace();
176 #endif
177                 dpriv = fqp->dpriv;
178                 KKASSERT(dpriv != NULL);
179
180                 spin_lock_wr(&fqp->lock);
181
182                 KKASSERT(fqp->qlength == 0);
183
184                 if (fqp->flags & FQP_LINKED_DPRIV) {
185                         spin_lock_wr(&dpriv->lock);
186
187                         TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
188                         fqp->flags &= ~FQP_LINKED_DPRIV;
189
190                         spin_unlock_wr(&dpriv->lock);
191                 }
192
193                 if (fqp->flags & FQP_LINKED_FQMP) {
194                         fqmp = fqp->fqmp;
195                         KKASSERT(fqmp != NULL);
196
197                         spin_lock_wr(&fqmp->lock);
198
199                         TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
200                         fqp->flags &= ~FQP_LINKED_FQMP;
201
202                         spin_unlock_wr(&fqmp->lock);
203                 }
204
205                 spin_unlock_wr(&fqp->lock);
206
207                 objcache_put(fq_priv_cache, fqp);
208                 atomic_subtract_int(&fq_stats.fqp_allocations, 1);
209 #if 0
210                 fq_dereference_dpriv(dpriv);
211 #endif
212         }
213 }
214
215 void
216 fq_dereference_mpriv(struct dsched_fq_mpriv *fqmp)
217 {
218         struct dsched_fq_priv   *fqp, *fqp2;
219         int refcount;
220
221         refcount = atomic_fetchadd_int(&fqmp->refcount, -1);
222
223         KKASSERT(refcount >= -3);
224
225         if (refcount == 1) {
226                 atomic_subtract_int(&fqmp->refcount, 3); /* mark as: in destruction */
227 #if 0
228                 kprintf("fqmp (%p) destruction started, trace:\n", fqmp);
229                 fq_print_backtrace();
230 #endif
231                 FQ_GLOBAL_FQMP_LOCK();
232                 spin_lock_wr(&fqmp->lock);
233
234                 TAILQ_FOREACH_MUTABLE(fqp, &fqmp->fq_priv_list, link, fqp2) {
235                         TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
236                         fqp->flags &= ~FQP_LINKED_FQMP;
237                         fq_dereference_priv(fqp);
238                 }
239                 TAILQ_REMOVE(&dsched_fqmp_list, fqmp, link);
240
241                 spin_unlock_wr(&fqmp->lock);
242                 FQ_GLOBAL_FQMP_UNLOCK();
243
244                 objcache_put(fq_mpriv_cache, fqmp);
245                 atomic_subtract_int(&fq_stats.fqmp_allocations, 1);
246         }
247 }
248
249
250 struct dsched_fq_priv *
251 fq_alloc_priv(struct disk *dp)
252 {
253         struct dsched_fq_priv   *fqp;
254 #if 0
255         fq_reference_dpriv(dsched_get_disk_priv(dp));
256 #endif
257         fqp = objcache_get(fq_priv_cache, M_WAITOK);
258         bzero(fqp, sizeof(struct dsched_fq_priv));
259
260         /* XXX: maybe we do need another ref for the disk list for fqp */
261         fq_reference_priv(fqp);
262
263         FQ_FQP_LOCKINIT(fqp);
264         FQ_FQP_LOCK(fqp);
265         fqp->dp = dp;
266
267         fqp->dpriv = dsched_get_disk_priv(dp);
268
269         TAILQ_INIT(&fqp->queue);
270         TAILQ_INSERT_TAIL(&fqp->dpriv->fq_priv_list, fqp, dlink);
271         fqp->flags |= FQP_LINKED_DPRIV;
272
273         atomic_add_int(&fq_stats.fqp_allocations, 1);
274         FQ_FQP_UNLOCK(fqp);
275         return fqp;
276 }
277
278
279 struct dsched_fq_dpriv *
280 fq_alloc_dpriv(struct disk *dp)
281 {
282         struct dsched_fq_dpriv *dpriv;
283
284         dpriv = objcache_get(fq_dpriv_cache, M_WAITOK);
285         bzero(dpriv, sizeof(struct dsched_fq_dpriv));
286         fq_reference_dpriv(dpriv);
287         dpriv->dp = dp;
288         dpriv->avg_rq_time = 0;
289         dpriv->incomplete_tp = 0;
290         FQ_DPRIV_LOCKINIT(dpriv);
291         TAILQ_INIT(&dpriv->fq_priv_list);
292
293         atomic_add_int(&fq_stats.dpriv_allocations, 1);
294         return dpriv;
295 }
296
297
298 struct dsched_fq_mpriv *
299 fq_alloc_mpriv(struct proc *p)
300 {
301         struct dsched_fq_mpriv  *fqmp;
302         struct dsched_fq_priv   *fqp;
303         struct disk     *dp = NULL;
304
305         fqmp = objcache_get(fq_mpriv_cache, M_WAITOK);
306         bzero(fqmp, sizeof(struct dsched_fq_mpriv));
307         fq_reference_mpriv(fqmp);
308 #if 0
309         kprintf("fq_alloc_mpriv, new fqmp = %p\n", fqmp);
310 #endif
311         FQ_FQMP_LOCKINIT(fqmp);
312         FQ_FQMP_LOCK(fqmp);
313         TAILQ_INIT(&fqmp->fq_priv_list);
314
315         while ((dp = dsched_disk_enumerate(dp, &dsched_fq_ops))) {
316                 fqp = fq_alloc_priv(dp);
317                 fqp->p = p;
318 #if 0
319                 fq_reference_priv(fqp);
320 #endif
321                 fqp->fqmp = fqmp;
322                 TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
323                 fqp->flags |= FQP_LINKED_FQMP;
324         }
325
326         FQ_GLOBAL_FQMP_LOCK();
327         TAILQ_INSERT_TAIL(&dsched_fqmp_list, fqmp, link);
328         FQ_GLOBAL_FQMP_UNLOCK();
329         FQ_FQMP_UNLOCK(fqmp);
330
331         atomic_add_int(&fq_stats.fqmp_allocations, 1);
332         return fqmp;
333 }
334
335
336 void
337 fq_dispatcher(struct dsched_fq_dpriv *dpriv)
338 {
339         struct dsched_fq_mpriv  *fqmp;
340         struct dsched_fq_priv   *fqp, *fqp2;
341         struct bio *bio, *bio2;
342         int count;
343
344         /*
345          * We need to manually assign an fqp to the fqmp of this thread
346          * since it isn't assigned one during fq_prepare, as the disk
347          * is not set up yet.
348          */
349         fqmp = dsched_get_thread_priv(curthread);
350         /* If fqmp is NULL, something went seriously wrong */
351         KKASSERT(fqmp != NULL);
352         fqp = fq_alloc_priv(dpriv->dp);
353         FQ_FQMP_LOCK(fqmp);
354 #if 0
355         fq_reference_priv(fqp);
356 #endif
357         TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
358         FQ_FQMP_UNLOCK(fqmp);
359
360
361         FQ_DPRIV_LOCK(dpriv);
362         for(;;) {
363                 /* sleep ~60 ms */
364                 if (ssleep(dpriv, &dpriv->lock, 0, "fq_dispatcher", hz/15) == 0) {
365                         FQ_DPRIV_UNLOCK(dpriv);
366                         kprintf("fq_dispatcher is peacefully dying\n");
367                         lwkt_exit();
368                 }
369
370                 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
371                         if (fqp->qlength > 0) {
372                                 FQ_FQP_LOCK(fqp);
373                                 count = 0;
374
375                                 TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) {
376                                         if ((fqp->max_tp > 0) &&
377                                             ((fqp->issued >= fqp->max_tp)))
378                                                 break;
379                                         TAILQ_REMOVE(&fqp->queue, bio, link);
380
381                                         --fqp->qlength;
382                                         KKASSERT(fqp != NULL);
383                                         /*
384                                          * beware that we do have an fqp reference
385                                          * from the queueing
386                                          */
387                                         dsched_strategy_async(dpriv->dp, bio,
388                                             fq_completed, fqp);
389                                         atomic_add_int(&fqp->issued, 1);
390                                         atomic_add_int(&dpriv->incomplete_tp, 1);
391                                         atomic_add_int(&fq_stats.transactions, 1);
392                                 }
393                                 FQ_FQP_UNLOCK(fqp);
394                         }
395                 }
396         }
397 }
398
399 void
400 fq_balance_thread(struct dsched_fq_dpriv *dpriv)
401 {
402         struct  dsched_fq_priv  *fqp, *fqp2;
403         int     n = 0;
404         static int last_full = 0, prev_full = 0;
405         static int limited_procs = 0;
406         int     incomplete_tp;
407         int64_t budget, total_budget, used_budget;
408         int64_t budgetpb[FQ_PRIO_MAX+1];
409         int sum, i;
410
411         bzero(budgetpb, sizeof(budgetpb));
412         total_budget = 0;
413         
414         FQ_DPRIV_LOCK(dpriv);
415         incomplete_tp = dpriv->incomplete_tp;
416
417         TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
418                 if (fqp->transactions > 0 /* 30 */) {
419
420                         total_budget += (fqp->avg_latency * fqp->transactions);
421                         /*
422                          * XXX: while the code below really sucked, the problem needs to
423                          *      be addressed eventually. Some processes take up their "fair"
424                          *      slice, but don't really need even a 10th of it.
425                          *      This kills performance for those that do need the
426                          *      performance.
427                          */
428 #if 0
429                         /*
430                          * This is *very* hackish. It basically tries to avoid that
431                          * processes that do only very few tps take away more bandwidth
432                          * than they should.
433                          */
434                         if ((limited_procs >= 1) && (fqp->transactions < 25) &&
435                             (budgetpb[(fqp->p) ? fqp->p->p_ionice : 0] >= 1))
436                                 continue;
437 #endif
438
439                         ++budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
440                         
441                         dsched_debug(LOG_INFO,
442                             "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
443                             n, fqp->avg_latency, fqp->transactions,
444                             (fqp->p) ? fqp->p->p_ionice : 0);
445                         ++n;
446                 } else {
447                         fqp->max_tp = 0;
448                         fqp->avg_latency = 0;
449                 }
450         }
451
452         dsched_debug(LOG_INFO, "%d procs competing for disk\n"
453             "total_budget = %lld\n"
454             "incomplete tp = %d\n", n, total_budget, incomplete_tp);
455
456         if (n == 0)
457                 goto done;      
458         
459         sum = 0;
460         for (i = 0; i < FQ_PRIO_MAX+1; i++) {
461                 if (budgetpb[i] == 0)
462                         continue;
463                 sum += (FQ_PRIO_BIAS+i)*budgetpb[i];
464         }
465         if (sum == 0)
466                 sum = 1;
467         dsched_debug(LOG_INFO, "sum = %d\n", sum);
468
469         for (i = 0; i < FQ_PRIO_MAX+1; i++) {
470                 if (budgetpb[i] == 0)
471                         continue;
472
473                 budgetpb[i] = ((FQ_PRIO_BIAS+i)*10)*total_budget/sum;
474         }
475
476         if (total_budget > dpriv->max_budget)
477                 dpriv->max_budget = total_budget;
478
479         limited_procs = 0;
480         /*
481          * XXX: eventually remove all the silly *10...
482          */
483         TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
484                 budget = budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
485
486                 used_budget = ((int64_t)10*(int64_t)fqp->avg_latency *
487                     (int64_t)fqp->transactions);
488                 if (used_budget > 0) {
489                         dsched_debug(LOG_INFO,
490                             "info: used_budget = %lld, budget = %lld\n", used_budget,
491                             budget);
492                 }
493
494                 /*
495                  * process is exceeding its fair share; rate-limit it, but only
496                  * if the disk is actually fully used.
497                  */
498                 if ((used_budget > budget) && (incomplete_tp > n*2)) {
499                         /* kprintf("here we are, use_pct > avail_pct\n"); */
500                         /* fqp->max_tp = avail_pct * fqp->avg_latency; */
501                         KKASSERT(fqp->avg_latency != 0);
502
503                         /*
504                          * If the disk has not been fully used lately, augment the
505                          * budget.
506                          */
507                         if (total_budget*3 < dpriv->max_budget*2) {
508                                 budget *= 2;
509                                 budget /= 3;
510                         }
511
512                         fqp->max_tp = budget/(10*fqp->avg_latency);
513                         ++limited_procs;
514                         dsched_debug(LOG_INFO,
515                             "rate limited to %d transactions\n", fqp->max_tp);
516                         atomic_add_int(&fq_stats.procs_limited, 1);
517                 } else if (((used_budget*2 < budget) || (incomplete_tp < n*2)) &&
518                     (!prev_full && !last_full)) {
519                         /*
520                          * process is really using little of its timeslice, or the
521                          * disk is not busy, so let's reset the rate-limit.
522                          * Without this, exceeding processes will get an unlimited
523                          * slice every other slice.
524                          * XXX: this still doesn't quite fix the issue, but maybe
525                          * it's good that way, so that heavy writes are interleaved.
526                          */
527                         fqp->max_tp = 0;
528                 }
529                 fqp->transactions = 0;
530                 fqp->avg_latency = 0;
531                 fqp->issued = 0;
532         }
533
534         prev_full = last_full;
535         last_full = (incomplete_tp > n*2)?1:0;
536
537 done:
538         FQ_DPRIV_UNLOCK(dpriv);
539         callout_reset(&fq_callout, hz * FQ_REBALANCE_TIMEOUT,
540             (void (*)(void *))fq_balance_thread, dpriv);
541 }
542
543
544 static int
545 do_fqstats(SYSCTL_HANDLER_ARGS)
546 {
547         return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
548 }
549
550
551 SYSCTL_PROC(_kern, OID_AUTO, fq_stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
552     0, sizeof(struct dsched_fq_stats), do_fqstats, "fq_stats",
553     "dsched_fq statistics");
554
555
556
557
558 static void
559 fq_init(void)
560 {
561
562 }
563
564 static void
565 fq_uninit(void)
566 {
567
568 }
569
570 static void
571 fq_earlyinit(void)
572 {
573         fq_priv_cache = objcache_create("fq-priv-cache", 0, 0,
574                                            NULL, NULL, NULL,
575                                            objcache_malloc_alloc,
576                                            objcache_malloc_free,
577                                            &dsched_fq_priv_malloc_args );
578
579         fq_mpriv_cache = objcache_create("fq-mpriv-cache", 0, 0,
580                                            NULL, NULL, NULL,
581                                            objcache_malloc_alloc,
582                                            objcache_malloc_free,
583                                            &dsched_fq_mpriv_malloc_args );
584
585         FQ_GLOBAL_FQMP_LOCKINIT();
586
587         fq_dpriv_cache = objcache_create("fq-dpriv-cache", 0, 0,
588                                            NULL, NULL, NULL,
589                                            objcache_malloc_alloc,
590                                            objcache_malloc_free,
591                                            &dsched_fq_dpriv_malloc_args );
592
593         bzero(&fq_stats, sizeof(struct dsched_fq_stats));
594
595         dsched_register(&dsched_fq_ops);
596         callout_init_mp(&fq_callout);
597
598         kprintf("FQ scheduler policy version %d.%d loaded\n",
599             dsched_fq_version_maj, dsched_fq_version_min);
600 }
601
602 static void
603 fq_earlyuninit(void)
604 {
605         callout_stop(&fq_callout);
606         callout_deactivate(&fq_callout);
607         return;
608 }
609
610 SYSINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, fq_init, NULL);
611 SYSUNINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, fq_uninit, NULL);
612
613 SYSINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, fq_earlyinit, NULL);
614 SYSUNINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, fq_earlyuninit, NULL);