dsched_fq - Improve disk busy-% calculation
[games.git] / sys / dsched / fq / dsched_fq_core.c
1 /*
2  * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Alex Hornung <ahornung@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/buf.h>
40 #include <sys/conf.h>
41 #include <sys/diskslice.h>
42 #include <sys/disk.h>
43 #include <machine/atomic.h>
44 #include <sys/malloc.h>
45 #include <sys/thread.h>
46 #include <sys/thread2.h>
47 #include <sys/sysctl.h>
48 #include <sys/spinlock2.h>
49 #include <machine/md_var.h>
50 #include <sys/ctype.h>
51 #include <sys/syslog.h>
52 #include <sys/device.h>
53 #include <sys/msgport.h>
54 #include <sys/msgport2.h>
55 #include <sys/buf2.h>
56 #include <sys/dsched.h>
57 #include <machine/varargs.h>
58 #include <machine/param.h>
59
60 #include <dsched/fq/dsched_fq.h>
61
62 MALLOC_DECLARE(M_DSCHEDFQ);
63
64 static int      dsched_fq_version_maj = 0;
65 static int      dsched_fq_version_min = 7;
66
67 struct dsched_fq_stats  fq_stats;
68
69 struct objcache_malloc_args dsched_fq_dpriv_malloc_args = {
70         sizeof(struct dsched_fq_dpriv), M_DSCHEDFQ };
71 struct objcache_malloc_args dsched_fq_priv_malloc_args = {
72         sizeof(struct dsched_fq_priv), M_DSCHEDFQ };
73 struct objcache_malloc_args dsched_fq_mpriv_malloc_args = {
74         sizeof(struct dsched_fq_mpriv), M_DSCHEDFQ };
75
76 static struct objcache  *fq_dpriv_cache;
77 static struct objcache  *fq_mpriv_cache;
78 static struct objcache  *fq_priv_cache;
79
80 TAILQ_HEAD(, dsched_fq_mpriv)   dsched_fqmp_list =
81                 TAILQ_HEAD_INITIALIZER(dsched_fqmp_list);
82
83 struct spinlock fq_fqmp_lock;
84 struct callout  fq_callout;
85
86 extern struct dsched_ops dsched_fq_ops;
87
88 void db_stack_trace_cmd(int addr, boolean_t have_addr, int count,char *modif);
89 void fq_print_backtrace(void);
90
91 void
92 fq_print_backtrace(void)
93 {
94         register_t  ebp;
95
96         __asm __volatile("movl %%ebp, %0" : "=r" (ebp));
97         db_stack_trace_cmd(ebp, 1, 4, NULL);
98 }
99
100 void
101 fq_reference_dpriv(struct dsched_fq_dpriv *dpriv)
102 {
103         int refcount;
104
105         refcount = atomic_fetchadd_int(&dpriv->refcount, 1);
106
107         KKASSERT(refcount >= 0);
108 }
109
110 void
111 fq_reference_priv(struct dsched_fq_priv *fqp)
112 {
113         int refcount;
114
115         refcount = atomic_fetchadd_int(&fqp->refcount, 1);
116
117         KKASSERT(refcount >= 0);
118 }
119
120 void
121 fq_reference_mpriv(struct dsched_fq_mpriv *fqmp)
122 {
123         int refcount;
124
125         refcount = atomic_fetchadd_int(&fqmp->refcount, 1);
126
127         KKASSERT(refcount >= 0);
128 }
129
130 void
131 fq_dereference_dpriv(struct dsched_fq_dpriv *dpriv)
132 {
133         struct dsched_fq_priv   *fqp, *fqp2;
134         int refcount;
135
136         refcount = atomic_fetchadd_int(&dpriv->refcount, -1);
137
138
139         KKASSERT(refcount >= -3);
140
141         if (refcount == 1) {
142                 atomic_subtract_int(&dpriv->refcount, 3); /* mark as: in destruction */
143 #if 1
144                 kprintf("dpriv (%p) destruction started, trace:\n", dpriv);
145                 fq_print_backtrace();
146 #endif
147                 spin_lock_wr(&dpriv->lock);
148                 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
149                         TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
150                         fqp->flags &= ~FQP_LINKED_DPRIV;
151                         fq_dereference_priv(fqp);
152                 }
153                 spin_unlock_wr(&dpriv->lock);
154
155                 objcache_put(fq_dpriv_cache, dpriv);
156                 atomic_subtract_int(&fq_stats.dpriv_allocations, 1);
157         }
158 }
159
160 void
161 fq_dereference_priv(struct dsched_fq_priv *fqp)
162 {
163         struct dsched_fq_mpriv  *fqmp;
164         struct dsched_fq_dpriv  *dpriv;
165         int refcount;
166
167         refcount = atomic_fetchadd_int(&fqp->refcount, -1);
168
169         KKASSERT(refcount >= -3);
170
171         if (refcount == 1) {
172                 atomic_subtract_int(&fqp->refcount, 3); /* mark as: in destruction */
173 #if 0
174                 kprintf("fqp (%p) destruction started, trace:\n", fqp);
175                 fq_print_backtrace();
176 #endif
177                 dpriv = fqp->dpriv;
178                 KKASSERT(dpriv != NULL);
179
180                 spin_lock_wr(&fqp->lock);
181
182                 KKASSERT(fqp->qlength == 0);
183
184                 if (fqp->flags & FQP_LINKED_DPRIV) {
185                         spin_lock_wr(&dpriv->lock);
186
187                         TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
188                         fqp->flags &= ~FQP_LINKED_DPRIV;
189
190                         spin_unlock_wr(&dpriv->lock);
191                 }
192
193                 if (fqp->flags & FQP_LINKED_FQMP) {
194                         fqmp = fqp->fqmp;
195                         KKASSERT(fqmp != NULL);
196
197                         spin_lock_wr(&fqmp->lock);
198
199                         TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
200                         fqp->flags &= ~FQP_LINKED_FQMP;
201
202                         spin_unlock_wr(&fqmp->lock);
203                 }
204
205                 spin_unlock_wr(&fqp->lock);
206
207                 objcache_put(fq_priv_cache, fqp);
208                 atomic_subtract_int(&fq_stats.fqp_allocations, 1);
209 #if 0
210                 fq_dereference_dpriv(dpriv);
211 #endif
212         }
213 }
214
215 void
216 fq_dereference_mpriv(struct dsched_fq_mpriv *fqmp)
217 {
218         struct dsched_fq_priv   *fqp, *fqp2;
219         int refcount;
220
221         refcount = atomic_fetchadd_int(&fqmp->refcount, -1);
222
223         KKASSERT(refcount >= -3);
224
225         if (refcount == 1) {
226                 atomic_subtract_int(&fqmp->refcount, 3); /* mark as: in destruction */
227 #if 0
228                 kprintf("fqmp (%p) destruction started, trace:\n", fqmp);
229                 fq_print_backtrace();
230 #endif
231                 FQ_GLOBAL_FQMP_LOCK();
232                 spin_lock_wr(&fqmp->lock);
233
234                 TAILQ_FOREACH_MUTABLE(fqp, &fqmp->fq_priv_list, link, fqp2) {
235                         TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
236                         fqp->flags &= ~FQP_LINKED_FQMP;
237                         fq_dereference_priv(fqp);
238                 }
239                 TAILQ_REMOVE(&dsched_fqmp_list, fqmp, link);
240
241                 spin_unlock_wr(&fqmp->lock);
242                 FQ_GLOBAL_FQMP_UNLOCK();
243
244                 objcache_put(fq_mpriv_cache, fqmp);
245                 atomic_subtract_int(&fq_stats.fqmp_allocations, 1);
246         }
247 }
248
249
250 struct dsched_fq_priv *
251 fq_alloc_priv(struct disk *dp)
252 {
253         struct dsched_fq_priv   *fqp;
254 #if 0
255         fq_reference_dpriv(dsched_get_disk_priv(dp));
256 #endif
257         fqp = objcache_get(fq_priv_cache, M_WAITOK);
258         bzero(fqp, sizeof(struct dsched_fq_priv));
259
260         /* XXX: maybe we do need another ref for the disk list for fqp */
261         fq_reference_priv(fqp);
262
263         FQ_FQP_LOCKINIT(fqp);
264         FQ_FQP_LOCK(fqp);
265         fqp->dp = dp;
266
267         fqp->dpriv = dsched_get_disk_priv(dp);
268
269         TAILQ_INIT(&fqp->queue);
270         TAILQ_INSERT_TAIL(&fqp->dpriv->fq_priv_list, fqp, dlink);
271         fqp->flags |= FQP_LINKED_DPRIV;
272
273         atomic_add_int(&fq_stats.fqp_allocations, 1);
274         FQ_FQP_UNLOCK(fqp);
275         return fqp;
276 }
277
278
279 struct dsched_fq_dpriv *
280 fq_alloc_dpriv(struct disk *dp)
281 {
282         struct dsched_fq_dpriv *dpriv;
283
284         dpriv = objcache_get(fq_dpriv_cache, M_WAITOK);
285         bzero(dpriv, sizeof(struct dsched_fq_dpriv));
286         fq_reference_dpriv(dpriv);
287         dpriv->dp = dp;
288         dpriv->avg_rq_time = 0;
289         dpriv->incomplete_tp = 0;
290         FQ_DPRIV_LOCKINIT(dpriv);
291         TAILQ_INIT(&dpriv->fq_priv_list);
292
293         atomic_add_int(&fq_stats.dpriv_allocations, 1);
294         return dpriv;
295 }
296
297
298 struct dsched_fq_mpriv *
299 fq_alloc_mpriv(struct proc *p)
300 {
301         struct dsched_fq_mpriv  *fqmp;
302         struct dsched_fq_priv   *fqp;
303         struct disk     *dp = NULL;
304
305         fqmp = objcache_get(fq_mpriv_cache, M_WAITOK);
306         bzero(fqmp, sizeof(struct dsched_fq_mpriv));
307         fq_reference_mpriv(fqmp);
308 #if 0
309         kprintf("fq_alloc_mpriv, new fqmp = %p\n", fqmp);
310 #endif
311         FQ_FQMP_LOCKINIT(fqmp);
312         FQ_FQMP_LOCK(fqmp);
313         TAILQ_INIT(&fqmp->fq_priv_list);
314
315         while ((dp = dsched_disk_enumerate(dp, &dsched_fq_ops))) {
316                 fqp = fq_alloc_priv(dp);
317                 fqp->p = p;
318 #if 0
319                 fq_reference_priv(fqp);
320 #endif
321                 fqp->fqmp = fqmp;
322                 TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
323                 fqp->flags |= FQP_LINKED_FQMP;
324         }
325
326         FQ_GLOBAL_FQMP_LOCK();
327         TAILQ_INSERT_TAIL(&dsched_fqmp_list, fqmp, link);
328         FQ_GLOBAL_FQMP_UNLOCK();
329         FQ_FQMP_UNLOCK(fqmp);
330
331         atomic_add_int(&fq_stats.fqmp_allocations, 1);
332         return fqmp;
333 }
334
335
336 void
337 fq_dispatcher(struct dsched_fq_dpriv *dpriv)
338 {
339         struct dsched_fq_mpriv  *fqmp;
340         struct dsched_fq_priv   *fqp, *fqp2;
341         struct bio *bio, *bio2;
342         int count;
343
344         /*
345          * We need to manually assign an fqp to the fqmp of this thread
346          * since it isn't assigned one during fq_prepare, as the disk
347          * is not set up yet.
348          */
349         fqmp = dsched_get_thread_priv(curthread);
350         /* If fqmp is NULL, something went seriously wrong */
351         KKASSERT(fqmp != NULL);
352         fqp = fq_alloc_priv(dpriv->dp);
353         FQ_FQMP_LOCK(fqmp);
354 #if 0
355         fq_reference_priv(fqp);
356 #endif
357         TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
358         FQ_FQMP_UNLOCK(fqmp);
359
360
361         FQ_DPRIV_LOCK(dpriv);
362         for(;;) {
363                 /* sleep ~60 ms */
364                 if (ssleep(dpriv, &dpriv->lock, 0, "fq_dispatcher", hz/15) == 0) {
365                         FQ_DPRIV_UNLOCK(dpriv);
366                         kprintf("fq_dispatcher is peacefully dying\n");
367                         lwkt_exit();
368                 }
369
370                 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
371                         if (fqp->qlength > 0) {
372                                 FQ_FQP_LOCK(fqp);
373                                 count = 0;
374
375                                 TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) {
376                                         if ((fqp->max_tp > 0) &&
377                                             ((fqp->issued >= fqp->max_tp)))
378                                                 break;
379                                         TAILQ_REMOVE(&fqp->queue, bio, link);
380
381                                         --fqp->qlength;
382                                         KKASSERT(fqp != NULL);
383                                         /*
384                                          * beware that we do have an fqp reference
385                                          * from the queueing
386                                          */
387                                         fq_dispatch(dpriv, bio, fqp);
388                                 }
389                                 FQ_FQP_UNLOCK(fqp);
390                         }
391                 }
392         }
393 }
394
395 void
396 fq_balance_thread(struct dsched_fq_dpriv *dpriv)
397 {
398         struct  dsched_fq_priv  *fqp, *fqp2;
399         int     n = 0;
400         static int last_full = 0, prev_full = 0;
401         static int limited_procs = 0;
402         int     incomplete_tp;
403         int     disk_busy;
404         int64_t budget, total_budget, used_budget;
405         int64_t budgetpb[FQ_PRIO_MAX+1];
406         int sum, i;
407
408         bzero(budgetpb, sizeof(budgetpb));
409         total_budget = 0;
410
411         FQ_DPRIV_LOCK(dpriv);
412         disk_busy = (100*(FQ_TOTAL_DISK_TIME - dpriv->idle_time)) /
413             FQ_TOTAL_DISK_TIME;
414         if (disk_busy < 0)
415                 disk_busy = 0;
416         incomplete_tp = dpriv->incomplete_tp;
417         dpriv->idle_time = 0;
418
419         TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
420                 if (fqp->transactions > 0 /* 30 */) {
421
422                         total_budget += (fqp->avg_latency * fqp->transactions);
423                         /*
424                          * XXX: while the code below really sucked, the problem needs to
425                          *      be addressed eventually. Some processes take up their "fair"
426                          *      slice, but don't really need even a 10th of it.
427                          *      This kills performance for those that do need the
428                          *      performance.
429                          */
430 #if 0
431                         /*
432                          * This is *very* hackish. It basically tries to avoid that
433                          * processes that do only very few tps take away more bandwidth
434                          * than they should.
435                          */
436                         if ((limited_procs >= 1) && (fqp->transactions < 25) &&
437                             (budgetpb[(fqp->p) ? fqp->p->p_ionice : 0] >= 1))
438                                 continue;
439 #endif
440
441                         ++budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
442
443                         dsched_debug(LOG_INFO,
444                             "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
445                             n, fqp->avg_latency, fqp->transactions,
446                             (fqp->p) ? fqp->p->p_ionice : 0);
447                         ++n;
448                 } else {
449                         fqp->max_tp = 0;
450                         fqp->avg_latency = 0;
451                 }
452         }
453
454         dsched_debug(LOG_INFO, "%d procs competing for disk\n"
455             "total_budget = %lld\n"
456             "incomplete tp = %d\n", n, total_budget, incomplete_tp);
457
458         if (n == 0)
459                 goto done;
460
461         sum = 0;
462         for (i = 0; i < FQ_PRIO_MAX+1; i++) {
463                 if (budgetpb[i] == 0)
464                         continue;
465                 sum += (FQ_PRIO_BIAS+i)*budgetpb[i];
466         }
467         if (sum == 0)
468                 sum = 1;
469         dsched_debug(LOG_INFO, "sum = %d\n", sum);
470
471         for (i = 0; i < FQ_PRIO_MAX+1; i++) {
472                 if (budgetpb[i] == 0)
473                         continue;
474
475                 budgetpb[i] = ((FQ_PRIO_BIAS+i)*10)*total_budget/sum;
476         }
477
478         if (total_budget > dpriv->max_budget)
479                 dpriv->max_budget = total_budget;
480
481         limited_procs = 0;
482
483         dsched_debug(4, "disk is %d\% busy\n", disk_busy);
484
485         /*
486          * XXX: eventually remove all the silly *10...
487          */
488         TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
489                 budget = budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
490
491                 used_budget = ((int64_t)10*(int64_t)fqp->avg_latency *
492                     (int64_t)fqp->transactions);
493                 if (used_budget > 0) {
494                         dsched_debug(LOG_INFO,
495                             "info: used_budget = %lld, budget = %lld\n", used_budget,
496                             budget);
497                 }
498
499                 /*
500                  * process is exceeding its fair share; rate-limit it, but only
501                  * if the disk is being used at 90+% of capacity
502                  */
503                 if ((used_budget > budget) && (disk_busy >= 90) &&
504                     (incomplete_tp > n*2)) {
505                         KKASSERT(fqp->avg_latency != 0);
506
507                         fqp->max_tp = budget/(10*fqp->avg_latency);
508                         ++limited_procs;
509                         dsched_debug(LOG_INFO,
510                             "rate limited to %d transactions\n", fqp->max_tp);
511                         atomic_add_int(&fq_stats.procs_limited, 1);
512                 } else if (((used_budget*2 < budget) || (disk_busy < 90)) &&
513                     (!prev_full && !last_full)) {
514                         /*
515                          * process is really using little of its timeslice, or the
516                          * disk is not busy, so let's reset the rate-limit.
517                          * Without this, exceeding processes will get an unlimited
518                          * slice every other slice.
519                          * XXX: this still doesn't quite fix the issue, but maybe
520                          * it's good that way, so that heavy writes are interleaved.
521                          */
522                         fqp->max_tp = 0;
523                 }
524                 fqp->transactions = 0;
525                 fqp->avg_latency = 0;
526                 fqp->issued = 0;
527         }
528
529         prev_full = last_full;
530         last_full = (disk_busy >= 90)?1:0;
531
532 done:
533         FQ_DPRIV_UNLOCK(dpriv);
534         callout_reset(&fq_callout, hz * FQ_REBALANCE_TIMEOUT,
535             (void (*)(void *))fq_balance_thread, dpriv);
536 }
537
538
539 static int
540 do_fqstats(SYSCTL_HANDLER_ARGS)
541 {
542         return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
543 }
544
545
546 SYSCTL_PROC(_kern, OID_AUTO, fq_stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
547     0, sizeof(struct dsched_fq_stats), do_fqstats, "fq_stats",
548     "dsched_fq statistics");
549
550
551
552
553 static void
554 fq_init(void)
555 {
556
557 }
558
559 static void
560 fq_uninit(void)
561 {
562
563 }
564
565 static void
566 fq_earlyinit(void)
567 {
568         fq_priv_cache = objcache_create("fq-priv-cache", 0, 0,
569                                            NULL, NULL, NULL,
570                                            objcache_malloc_alloc,
571                                            objcache_malloc_free,
572                                            &dsched_fq_priv_malloc_args );
573
574         fq_mpriv_cache = objcache_create("fq-mpriv-cache", 0, 0,
575                                            NULL, NULL, NULL,
576                                            objcache_malloc_alloc,
577                                            objcache_malloc_free,
578                                            &dsched_fq_mpriv_malloc_args );
579
580         FQ_GLOBAL_FQMP_LOCKINIT();
581
582         fq_dpriv_cache = objcache_create("fq-dpriv-cache", 0, 0,
583                                            NULL, NULL, NULL,
584                                            objcache_malloc_alloc,
585                                            objcache_malloc_free,
586                                            &dsched_fq_dpriv_malloc_args );
587
588         bzero(&fq_stats, sizeof(struct dsched_fq_stats));
589
590         dsched_register(&dsched_fq_ops);
591         callout_init_mp(&fq_callout);
592
593         kprintf("FQ scheduler policy version %d.%d loaded\n",
594             dsched_fq_version_maj, dsched_fq_version_min);
595 }
596
597 static void
598 fq_earlyuninit(void)
599 {
600         callout_stop(&fq_callout);
601         callout_deactivate(&fq_callout);
602         return;
603 }
604
605 SYSINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, fq_init, NULL);
606 SYSUNINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, fq_uninit, NULL);
607
608 SYSINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, fq_earlyinit, NULL);
609 SYSUNINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, fq_earlyuninit, NULL);