dsched_fq - Finalize switch to new disk-busy calc
[dragonfly.git] / sys / dsched / fq / dsched_fq_core.c
1 /*
2  * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Alex Hornung <ahornung@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/buf.h>
40 #include <sys/conf.h>
41 #include <sys/diskslice.h>
42 #include <sys/disk.h>
43 #include <machine/atomic.h>
44 #include <sys/malloc.h>
45 #include <sys/thread.h>
46 #include <sys/thread2.h>
47 #include <sys/sysctl.h>
48 #include <sys/spinlock2.h>
49 #include <machine/md_var.h>
50 #include <sys/ctype.h>
51 #include <sys/syslog.h>
52 #include <sys/device.h>
53 #include <sys/msgport.h>
54 #include <sys/msgport2.h>
55 #include <sys/buf2.h>
56 #include <sys/dsched.h>
57 #include <machine/varargs.h>
58 #include <machine/param.h>
59
60 #include <dsched/fq/dsched_fq.h>
61
MALLOC_DECLARE(M_DSCHEDFQ);

/* Policy version, printed at load time in fq_earlyinit(). */
static int	dsched_fq_version_maj = 0;
static int	dsched_fq_version_min = 7;

/* Global allocation/limit counters, exported via the kern.fq_stats sysctl. */
struct dsched_fq_stats	fq_stats;

/* objcache backing-store descriptors for the three structure types below. */
struct objcache_malloc_args dsched_fq_dpriv_malloc_args = {
	sizeof(struct dsched_fq_dpriv), M_DSCHEDFQ };
struct objcache_malloc_args dsched_fq_priv_malloc_args = {
	sizeof(struct dsched_fq_priv), M_DSCHEDFQ };
struct objcache_malloc_args dsched_fq_mpriv_malloc_args = {
	sizeof(struct dsched_fq_mpriv), M_DSCHEDFQ };

/* Caches created in fq_earlyinit(); used by the fq_alloc_* functions. */
static struct objcache	*fq_dpriv_cache;
static struct objcache	*fq_mpriv_cache;
static struct objcache	*fq_priv_cache;

/* Global list of all per-process (mpriv) structures, guarded by fq_fqmp_lock. */
TAILQ_HEAD(, dsched_fq_mpriv)	dsched_fqmp_list =
		TAILQ_HEAD_INITIALIZER(dsched_fqmp_list);

struct spinlock	fq_fqmp_lock;		/* protects dsched_fqmp_list */
struct callout	fq_callout;		/* drives fq_balance_thread() */

extern struct dsched_ops dsched_fq_ops;
87
88 void
89 fq_reference_dpriv(struct dsched_fq_dpriv *dpriv)
90 {
91         int refcount;
92
93         refcount = atomic_fetchadd_int(&dpriv->refcount, 1);
94
95         KKASSERT(refcount >= 0);
96 }
97
98 void
99 fq_reference_priv(struct dsched_fq_priv *fqp)
100 {
101         int refcount;
102
103         refcount = atomic_fetchadd_int(&fqp->refcount, 1);
104
105         KKASSERT(refcount >= 0);
106 }
107
108 void
109 fq_reference_mpriv(struct dsched_fq_mpriv *fqmp)
110 {
111         int refcount;
112
113         refcount = atomic_fetchadd_int(&fqmp->refcount, 1);
114
115         KKASSERT(refcount >= 0);
116 }
117
118 void
119 fq_dereference_dpriv(struct dsched_fq_dpriv *dpriv)
120 {
121         struct dsched_fq_priv   *fqp, *fqp2;
122         int refcount;
123
124         refcount = atomic_fetchadd_int(&dpriv->refcount, -1);
125
126
127         KKASSERT(refcount >= -3);
128
129         if (refcount == 1) {
130                 atomic_subtract_int(&dpriv->refcount, 3); /* mark as: in destruction */
131 #if 1
132                 kprintf("dpriv (%p) destruction started, trace:\n", dpriv);
133                 print_backtrace(4);
134 #endif
135                 spin_lock_wr(&dpriv->lock);
136                 TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
137                         TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
138                         fqp->flags &= ~FQP_LINKED_DPRIV;
139                         fq_dereference_priv(fqp);
140                 }
141                 spin_unlock_wr(&dpriv->lock);
142
143                 objcache_put(fq_dpriv_cache, dpriv);
144                 atomic_subtract_int(&fq_stats.dpriv_allocations, 1);
145         }
146 }
147
/*
 * Drop a reference on a per-process-per-disk (fqp) structure.  When the
 * last reference is released the fqp is unlinked from both the disk's
 * (dpriv) and the process's (fqmp) lists, as indicated by its link
 * flags, and returned to its objcache.
 *
 * Lock order on teardown: fqp->lock, then the dpriv/fqmp locks nested
 * inside it as needed.
 */
void
fq_dereference_priv(struct dsched_fq_priv *fqp)
{
	struct dsched_fq_mpriv	*fqmp;
	struct dsched_fq_dpriv	*dpriv;
	int refcount;

	/* refcount is the value *before* the decrement */
	refcount = atomic_fetchadd_int(&fqp->refcount, -1);

	/* -3 is the "in destruction" marker applied below */
	KKASSERT(refcount >= -3);

	if (refcount == 1) {
		atomic_subtract_int(&fqp->refcount, 3); /* mark as: in destruction */
#if 0
		kprintf("fqp (%p) destruction started, trace:\n", fqp);
		print_backtrace(8);
#endif
		dpriv = fqp->dpriv;
		KKASSERT(dpriv != NULL);

		spin_lock_wr(&fqp->lock);

		/* All queued bios must have been dispatched/drained by now. */
		KKASSERT(fqp->qlength == 0);

		if (fqp->flags & FQP_LINKED_DPRIV) {
			spin_lock_wr(&dpriv->lock);

			TAILQ_REMOVE(&dpriv->fq_priv_list, fqp, dlink);
			fqp->flags &= ~FQP_LINKED_DPRIV;

			spin_unlock_wr(&dpriv->lock);
		}

		if (fqp->flags & FQP_LINKED_FQMP) {
			fqmp = fqp->fqmp;
			KKASSERT(fqmp != NULL);

			spin_lock_wr(&fqmp->lock);

			TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
			fqp->flags &= ~FQP_LINKED_FQMP;

			spin_unlock_wr(&fqmp->lock);
		}

		spin_unlock_wr(&fqp->lock);

		objcache_put(fq_priv_cache, fqp);
		atomic_subtract_int(&fq_stats.fqp_allocations, 1);
#if 0
		fq_dereference_dpriv(dpriv);
#endif
	}
}
202
/*
 * Drop a reference on a per-process (mpriv) structure.  When the last
 * reference is released every fqp on the process's list is unlinked
 * and dereferenced, the fqmp is removed from the global fqmp list, and
 * it is returned to its objcache.
 *
 * Lock order: global fqmp list lock, then fqmp->lock nested inside.
 */
void
fq_dereference_mpriv(struct dsched_fq_mpriv *fqmp)
{
	struct dsched_fq_priv	*fqp, *fqp2;
	int refcount;

	/* refcount is the value *before* the decrement */
	refcount = atomic_fetchadd_int(&fqmp->refcount, -1);

	/* -3 is the "in destruction" marker applied below */
	KKASSERT(refcount >= -3);

	if (refcount == 1) {
		atomic_subtract_int(&fqmp->refcount, 3); /* mark as: in destruction */
#if 0
		kprintf("fqmp (%p) destruction started, trace:\n", fqmp);
		print_backtrace(8);
#endif
		FQ_GLOBAL_FQMP_LOCK();
		spin_lock_wr(&fqmp->lock);

		/* Detach and release every fqp owned by this process. */
		TAILQ_FOREACH_MUTABLE(fqp, &fqmp->fq_priv_list, link, fqp2) {
			TAILQ_REMOVE(&fqmp->fq_priv_list, fqp, link);
			fqp->flags &= ~FQP_LINKED_FQMP;
			fq_dereference_priv(fqp);
		}
		TAILQ_REMOVE(&dsched_fqmp_list, fqmp, link);

		spin_unlock_wr(&fqmp->lock);
		FQ_GLOBAL_FQMP_UNLOCK();

		objcache_put(fq_mpriv_cache, fqmp);
		atomic_subtract_int(&fq_stats.fqmp_allocations, 1);
	}
}
236
237
238 struct dsched_fq_priv *
239 fq_alloc_priv(struct disk *dp)
240 {
241         struct dsched_fq_priv   *fqp;
242 #if 0
243         fq_reference_dpriv(dsched_get_disk_priv(dp));
244 #endif
245         fqp = objcache_get(fq_priv_cache, M_WAITOK);
246         bzero(fqp, sizeof(struct dsched_fq_priv));
247
248         /* XXX: maybe we do need another ref for the disk list for fqp */
249         fq_reference_priv(fqp);
250
251         FQ_FQP_LOCKINIT(fqp);
252         FQ_FQP_LOCK(fqp);
253         fqp->dp = dp;
254
255         fqp->dpriv = dsched_get_disk_priv(dp);
256
257         TAILQ_INIT(&fqp->queue);
258         TAILQ_INSERT_TAIL(&fqp->dpriv->fq_priv_list, fqp, dlink);
259         fqp->flags |= FQP_LINKED_DPRIV;
260
261         atomic_add_int(&fq_stats.fqp_allocations, 1);
262         FQ_FQP_UNLOCK(fqp);
263         return fqp;
264 }
265
266
267 struct dsched_fq_dpriv *
268 fq_alloc_dpriv(struct disk *dp)
269 {
270         struct dsched_fq_dpriv *dpriv;
271
272         dpriv = objcache_get(fq_dpriv_cache, M_WAITOK);
273         bzero(dpriv, sizeof(struct dsched_fq_dpriv));
274         fq_reference_dpriv(dpriv);
275         dpriv->dp = dp;
276         dpriv->avg_rq_time = 0;
277         dpriv->incomplete_tp = 0;
278         FQ_DPRIV_LOCKINIT(dpriv);
279         TAILQ_INIT(&dpriv->fq_priv_list);
280
281         atomic_add_int(&fq_stats.dpriv_allocations, 1);
282         return dpriv;
283 }
284
285
/*
 * Allocate and initialize a per-process (mpriv) structure for process p.
 * One fqp is allocated for every disk currently managed by the fq
 * policy and linked onto the new fqmp's list; the fqmp itself is added
 * to the global fqmp list.  Returned with one reference held for the
 * caller.
 */
struct dsched_fq_mpriv *
fq_alloc_mpriv(struct proc *p)
{
	struct dsched_fq_mpriv	*fqmp;
	struct dsched_fq_priv	*fqp;
	struct disk	*dp = NULL;

	fqmp = objcache_get(fq_mpriv_cache, M_WAITOK);
	bzero(fqmp, sizeof(struct dsched_fq_mpriv));
	fq_reference_mpriv(fqmp);
#if 0
	kprintf("fq_alloc_mpriv, new fqmp = %p\n", fqmp);
#endif
	FQ_FQMP_LOCKINIT(fqmp);
	FQ_FQMP_LOCK(fqmp);
	TAILQ_INIT(&fqmp->fq_priv_list);

	/* Attach one fqp per fq-managed disk to this process. */
	while ((dp = dsched_disk_enumerate(dp, &dsched_fq_ops))) {
		fqp = fq_alloc_priv(dp);
		fqp->p = p;
#if 0
		fq_reference_priv(fqp);
#endif
		fqp->fqmp = fqmp;
		TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
		fqp->flags |= FQP_LINKED_FQMP;
	}

	FQ_GLOBAL_FQMP_LOCK();
	TAILQ_INSERT_TAIL(&dsched_fqmp_list, fqmp, link);
	FQ_GLOBAL_FQMP_UNLOCK();
	FQ_FQMP_UNLOCK(fqmp);

	atomic_add_int(&fq_stats.fqmp_allocations, 1);
	return fqmp;
}
322
323
/*
 * Per-disk dispatcher thread.  Sleeps on the dpriv (woken either when
 * the disk goes idle or when it is told to die) and pushes queued bios
 * from each fqp down to the disk, honoring each fqp's max_tp rate
 * limit.  Runs until dpriv->die is set, at which point all queues are
 * flushed and the thread exits.
 */
void
fq_dispatcher(struct dsched_fq_dpriv *dpriv)
{
	struct dsched_fq_mpriv	*fqmp;
	struct dsched_fq_priv	*fqp, *fqp2;
	struct bio *bio, *bio2;
	int idle;

	/*
	 * We need to manually assign an fqp to the fqmp of this thread
	 * since it isn't assigned one during fq_prepare, as the disk
	 * is not set up yet.
	 */
	fqmp = dsched_get_thread_priv(curthread);
	KKASSERT(fqmp != NULL);

	fqp = fq_alloc_priv(dpriv->dp);
	FQ_FQMP_LOCK(fqmp);
#if 0
	fq_reference_priv(fqp);
#endif
	/*
	 * NOTE(review): unlike fq_alloc_mpriv(), this insert does not set
	 * fqp->fqmp or FQP_LINKED_FQMP -- confirm this asymmetry is
	 * intentional (fq_dereference_priv() will not unlink it from the
	 * fqmp list without the flag).
	 */
	TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
	FQ_FQMP_UNLOCK(fqmp);


	FQ_DPRIV_LOCK(dpriv);
	for(;;) {
		idle = 0;
		/* sleep ~60 ms; ssleep drops dpriv->lock while sleeping */
		if ((ssleep(dpriv, &dpriv->lock, 0, "fq_dispatcher", hz/15) == 0)) {
			/*
			 * We've been woken up; this either means that we are
			 * supposed to die away nicely or that the disk is idle.
			 */

			if (__predict_false(dpriv->die == 1)) {
				/* If we are supposed to die, drain all queues */
				fq_drain(dpriv, FQ_DRAIN_FLUSH);

				/* Now we can safely unlock and exit */
				FQ_DPRIV_UNLOCK(dpriv);
				kprintf("fq_dispatcher is peacefully dying\n");
				lwkt_exit();
				/* NOTREACHED */
			}

			/*
			 * We have been awakened because the disk is idle.
			 * So let's get ready to dispatch some extra bios.
			 */
			idle = 1;
		}

		/* Maybe the disk is idle and we just didn't get the wakeup */
		if (idle == 0)
			idle = dpriv->idle;

		/*
		 * XXX: further room for improvements here. It would be better
		 *      to dispatch a few requests from each fqp as to ensure
		 *      real fairness.
		 */
		TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
			if (fqp->qlength == 0)
				continue;

			FQ_FQP_LOCK(fqp);

			/*
			 * XXX: why 5 extra? should probably be dynamic,
			 *      relying on information on latency.
			 */
			/* Idle disk: let a rate-limited fqp issue a few more */
			if ((fqp->max_tp > 0) && idle &&
			    (fqp->issued >= fqp->max_tp)) {
				fqp->max_tp += 5;
			}

			/* Dispatch queued bios until the fqp hits its limit */
			TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) {
				if ((fqp->max_tp > 0) &&
				    ((fqp->issued >= fqp->max_tp)))
					break;

				TAILQ_REMOVE(&fqp->queue, bio, link);
				--fqp->qlength;

				/*
				 * beware that we do have an fqp reference
				 * from the queueing
				 */
				fq_dispatch(dpriv, bio, fqp);
			}
			FQ_FQP_UNLOCK(fqp);

		}
	}
}
420
421 void
422 fq_balance_thread(struct dsched_fq_dpriv *dpriv)
423 {
424         struct  dsched_fq_priv  *fqp, *fqp2;
425         static struct timeval old_tv;
426         struct timeval tv;
427         int     n = 0;
428         static int last_full = 0, prev_full = 0;
429         static int limited_procs = 0;
430         static int first_run = 1;
431         int     disk_busy;
432         int     total_disk_time;
433         int64_t budget, total_budget, used_budget;
434         int64_t budgetpb[FQ_PRIO_MAX+1];
435         int sum, i;
436
437         bzero(budgetpb, sizeof(budgetpb));
438         total_budget = 0;
439
440         getmicrotime(&tv);
441
442         if (__predict_false(first_run)) {
443                 total_disk_time = FQ_TOTAL_DISK_TIME;
444                 first_run = 0;
445         } else {
446                 total_disk_time = (int)(1000000*((tv.tv_sec - old_tv.tv_sec)) +
447                     (tv.tv_usec - old_tv.tv_usec));
448                 dsched_debug(LOG_INFO, "total_disk_time = %d\n", total_disk_time);
449         }
450         old_tv = tv;
451         FQ_DPRIV_LOCK(dpriv);
452
453         disk_busy = (100*(total_disk_time - dpriv->idle_time)) / total_disk_time;
454         if (disk_busy < 0)
455                 disk_busy = 0;
456
457         dpriv->idle_time = 0;
458
459         TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
460                 if (fqp->transactions > 0 /* 30 */) {
461                         total_budget += (fqp->avg_latency * fqp->transactions);
462                         ++budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
463
464                         dsched_debug(LOG_INFO,
465                             "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
466                             n, fqp->avg_latency, fqp->transactions,
467                             (fqp->p) ? fqp->p->p_ionice : 0);
468                         ++n;
469                 } else {
470                         fqp->max_tp = 0;
471                         fqp->avg_latency = 0;
472                 }
473         }
474
475         dsched_debug(LOG_INFO, "%d procs competing for disk\n"
476             "total_budget = %lld\n"
477             "incomplete tp = %d\n", n, total_budget, dpriv->incomplete_tp);
478
479         if (n == 0)
480                 goto done;
481
482         sum = 0;
483
484         for (i = 0; i < FQ_PRIO_MAX+1; i++) {
485                 if (budgetpb[i] == 0)
486                         continue;
487                 sum += (FQ_PRIO_BIAS+i)*budgetpb[i];
488         }
489
490         if (sum == 0)
491                 sum = 1;
492
493         dsched_debug(LOG_INFO, "sum = %d\n", sum);
494
495         for (i = 0; i < FQ_PRIO_MAX+1; i++) {
496                 if (budgetpb[i] == 0)
497                         continue;
498
499                 budgetpb[i] = ((FQ_PRIO_BIAS+i)*10)*total_budget/sum;
500         }
501
502         if (total_budget > dpriv->max_budget)
503                 dpriv->max_budget = total_budget;
504
505         limited_procs = 0;
506
507         dsched_debug(4, "disk is %d\% busy\n", disk_busy);
508
509         /*
510          * XXX: eventually remove all the silly *10...
511          */
512         TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
513                 budget = budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
514
515                 used_budget = ((int64_t)10*(int64_t)fqp->avg_latency *
516                     (int64_t)fqp->transactions);
517                 if (used_budget > 0) {
518                         dsched_debug(LOG_INFO,
519                             "info: used_budget = %lld, budget = %lld\n", used_budget,
520                             budget);
521                 }
522
523                 /*
524                  * process is exceeding its fair share; rate-limit it, but only
525                  * if the disk is being used at 90+% of capacity
526                  */
527                 if ((used_budget > budget) && (disk_busy >= 90)) {
528                         KKASSERT(fqp->avg_latency != 0);
529
530                         fqp->max_tp = budget/(10*fqp->avg_latency);
531                         ++limited_procs;
532                         dsched_debug(LOG_INFO,
533                             "rate limited to %d transactions\n", fqp->max_tp);
534                         atomic_add_int(&fq_stats.procs_limited, 1);
535                 } else if (((used_budget*2 < budget) || (disk_busy < 90)) &&
536                     (!prev_full && !last_full)) {
537                         /*
538                          * process is really using little of its timeslice, or the
539                          * disk is not busy, so let's reset the rate-limit.
540                          * Without this, exceeding processes will get an unlimited
541                          * slice every other slice.
542                          * XXX: this still doesn't quite fix the issue, but maybe
543                          * it's good that way, so that heavy writes are interleaved.
544                          */
545                         fqp->max_tp = 0;
546                 }
547                 fqp->transactions = 0;
548                 fqp->avg_latency = 0;
549                 fqp->issued = 0;
550         }
551
552         prev_full = last_full;
553         last_full = (disk_busy >= 90)?1:0;
554
555 done:
556         FQ_DPRIV_UNLOCK(dpriv);
557         callout_reset(&fq_callout, hz * FQ_REBALANCE_TIMEOUT,
558             (void (*)(void *))fq_balance_thread, dpriv);
559 }
560
561
562 static int
563 do_fqstats(SYSCTL_HANDLER_ARGS)
564 {
565         return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
566 }
567
568
569 SYSCTL_PROC(_kern, OID_AUTO, fq_stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
570     0, sizeof(struct dsched_fq_stats), do_fqstats, "fq_stats",
571     "dsched_fq statistics");
572
573
574
575
/*
 * SI_SUB_PRE_DRIVERS SYSINIT hook; intentionally empty -- all real
 * initialization happens earlier in fq_earlyinit().
 */
static void
fq_init(void)
{

}
581
/*
 * SI_SUB_PRE_DRIVERS SYSUNINIT hook; intentionally empty -- teardown
 * happens later in fq_earlyuninit().
 */
static void
fq_uninit(void)
{

}
587
/*
 * Early (pre-driver) initialization: create the objcaches backing the
 * three fq structure types, initialize the global fqmp list lock and
 * statistics, register the policy with dsched and initialize the
 * balancer callout.
 */
static void
fq_earlyinit(void)
{
	fq_priv_cache = objcache_create("fq-priv-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_fq_priv_malloc_args );

	fq_mpriv_cache = objcache_create("fq-mpriv-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_fq_mpriv_malloc_args );

	FQ_GLOBAL_FQMP_LOCKINIT();

	fq_dpriv_cache = objcache_create("fq-dpriv-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_fq_dpriv_malloc_args );

	bzero(&fq_stats, sizeof(struct dsched_fq_stats));

	dsched_register(&dsched_fq_ops);
	/* MP-safe callout; armed per-disk by fq_balance_thread() */
	callout_init_mp(&fq_callout);

	kprintf("FQ scheduler policy version %d.%d loaded\n",
	    dsched_fq_version_maj, dsched_fq_version_min);
}
619
620 static void
621 fq_earlyuninit(void)
622 {
623         callout_stop(&fq_callout);
624         callout_deactivate(&fq_callout);
625         return;
626 }
627
628 SYSINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, fq_init, NULL);
629 SYSUNINIT(fq_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, fq_uninit, NULL);
630
631 SYSINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, fq_earlyinit, NULL);
632 SYSUNINIT(fq_early, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, fq_earlyuninit, NULL);