sys/kern/dsched/fq/fq_core.c

   1 /*
   2  * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Alex Hornung <ahornung@gmail.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34 #include <sys/param.h>
  35 #include <sys/systm.h>
  36 #include <sys/kernel.h>
  37 #include <sys/proc.h>
  38 #include <sys/sysctl.h>
  39 #include <sys/buf.h>
  40 #include <sys/conf.h>
  41 #include <sys/diskslice.h>
  42 #include <sys/disk.h>
  43 #include <machine/atomic.h>
  44 #include <sys/thread.h>
  45 #include <sys/thread2.h>
  46 #include <sys/ctype.h>
  47 #include <sys/buf2.h>
  48 #include <sys/syslog.h>
  49 #include <sys/dsched.h>
  50 #include <machine/param.h>
  51
  52 #include <kern/dsched/fq/fq.h>
  53
  54 static int      dsched_fq_version_maj = 1;
  55 static int      dsched_fq_version_min = 0;
  56
  57 /* Make sure our structs fit */
  58 CTASSERT(sizeof(struct fq_thread_io) <= DSCHED_THREAD_IO_MAX_SZ);
  59 CTASSERT(sizeof(struct fq_disk_ctx) <= DSCHED_DISK_CTX_MAX_SZ);
  60
  61 struct dsched_fq_stats  fq_stats;
  62
  63 extern struct dsched_policy dsched_fq_policy;
  64
  65 void
  66 fq_dispatcher(struct fq_disk_ctx *diskctx)
  67 {
  68         struct dsched_thread_io *ds_tdio, *ds_tdio2;
  69         struct fq_thread_io     *tdio;
  70         struct bio *bio, *bio2;
  71         int idle;
  72
  73         /*
  74          * We need to manually assign an tdio to the tdctx of this thread
  75          * since it isn't assigned one during fq_prepare, as the disk
  76          * is not set up yet.
  77          */
  78         tdio = (struct fq_thread_io *)dsched_new_policy_thread_tdio(&diskctx->head,
  79             &dsched_fq_policy);
  80
  81         DSCHED_DISK_CTX_LOCK(&diskctx->head);
  82         for(;;) {
  83                 idle = 0;
  84                 /* sleep ~60 ms */
  85                 if ((lksleep(diskctx, &diskctx->head.lock, 0, "fq_dispatcher", hz/15) == 0)) {
  86                         /*
  87                          * We've been woken up; this either means that we are
  88                          * supposed to die away nicely or that the disk is idle.
  89                          */
  90
  91                         if (__predict_false(diskctx->die == 1)) {
  92                                 /* If we are supposed to die, drain all queues */
  93                                 fq_drain(diskctx, FQ_DRAIN_FLUSH);
  94
  95                                 /* Now we can safely unlock and exit */
  96                                 DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
  97                                 kprintf("fq_dispatcher is peacefully dying\n");
  98                                 lwkt_exit();
  99                                 /* NOTREACHED */
 100                         }
 101
 102                         /*
 103                          * We have been awakened because the disk is idle.
 104                          * So let's get ready to dispatch some extra bios.
 105                          */
 106                         idle = 1;
 107                 }
 108
 109                 /* Maybe the disk is idle and we just didn't get the wakeup */
 110                 if (idle == 0)
 111                         idle = diskctx->idle;
 112
 113                 /*
 114                  * XXX: further room for improvements here. It would be better
 115                  *      to dispatch a few requests from each tdio as to ensure
 116                  *      real fairness.
 117                  */
 118                 TAILQ_FOREACH_MUTABLE(ds_tdio, &diskctx->head.tdio_list, dlink, ds_tdio2) {
 119                         tdio = (struct fq_thread_io *)ds_tdio;
 120                         if (tdio->head.qlength == 0)
 121                                 continue;
 122
 123                         DSCHED_THREAD_IO_LOCK(&tdio->head);
 124                         if (atomic_cmpset_int(&tdio->rebalance, 1, 0))
 125                                 fq_balance_self(tdio);
 126                         /*
 127                          * XXX: why 5 extra? should probably be dynamic,
 128                          *      relying on information on latency.
 129                          */
 130                         if ((tdio->max_tp > 0) && idle &&
 131                             (tdio->issued >= tdio->max_tp)) {
 132                                 tdio->max_tp += 5;
 133                         }
 134
 135                         TAILQ_FOREACH_MUTABLE(bio, &tdio->head.queue, link, bio2) {
 136                                 if (atomic_cmpset_int(&tdio->rebalance, 1, 0))
 137                                         fq_balance_self(tdio);
 138                                 if ((tdio->max_tp > 0) &&
 139                                     ((tdio->issued >= tdio->max_tp)))
 140                                         break;
 141
 142                                 TAILQ_REMOVE(&tdio->head.queue, bio, link);
 143                                 --tdio->head.qlength;
 144
 145                                 /*
 146                                  * beware that we do have an tdio reference
 147                                  * from the queueing
 148                                  */
 149                                 fq_dispatch(diskctx, bio, tdio);
 150                         }
 151                         DSCHED_THREAD_IO_UNLOCK(&tdio->head);
 152
 153                 }
 154         }
 155 }
 156
 157 void
 158 fq_balance_thread(struct fq_disk_ctx *diskctx)
 159 {
 160         struct dsched_thread_io *ds_tdio;
 161         struct  fq_thread_io    *tdio;
 162         struct timeval tv, old_tv;
 163         int64_t total_budget, product;
 164         int64_t budget[FQ_PRIO_MAX+1];
 165         int     n, i, sum, total_disk_time;
 166         int     lost_bits;
 167
 168         DSCHED_DISK_CTX_LOCK(&diskctx->head);
 169
 170         getmicrotime(&diskctx->start_interval);
 171
 172         for (;;) {
 173                 /* sleep ~1s */
 174                 if ((lksleep(curthread, &diskctx->head.lock, 0, "fq_balancer", hz/2) == 0)) {
 175                         if (__predict_false(diskctx->die)) {
 176                                 DSCHED_DISK_CTX_UNLOCK(&diskctx->head);
 177                                 lwkt_exit();
 178                         }
 179                 }
 180
 181                 bzero(budget, sizeof(budget));
 182                 total_budget = 0;
 183                 n = 0;
 184
 185                 old_tv = diskctx->start_interval;
 186                 getmicrotime(&tv);
 187
 188                 total_disk_time = (int)(1000000*((tv.tv_sec - old_tv.tv_sec)) +
 189                     (tv.tv_usec - old_tv.tv_usec));
 190
 191                 if (total_disk_time == 0)
 192                         total_disk_time = 1;
 193
 194                 dsched_debug(LOG_INFO, "total_disk_time = %d\n", total_disk_time);
 195
 196                 diskctx->start_interval = tv;
 197
 198                 diskctx->disk_busy = (100*(total_disk_time - diskctx->idle_time)) / total_disk_time;
 199                 if (diskctx->disk_busy < 0)
 200                         diskctx->disk_busy = 0;
 201
 202                 diskctx->idle_time = 0;
 203                 lost_bits = 0;
 204
 205                 TAILQ_FOREACH(ds_tdio, &diskctx->head.tdio_list, dlink) {
 206                         tdio = (struct fq_thread_io *)ds_tdio;
 207                         tdio->interval_avg_latency = tdio->avg_latency;
 208                         tdio->interval_transactions = tdio->transactions;
 209                         if (tdio->interval_transactions > 0) {
 210                                 product = (int64_t)tdio->interval_avg_latency *
 211                                     tdio->interval_transactions;
 212                                 product >>= lost_bits;
 213                                 while(total_budget >= INT64_MAX - product) {
 214                                         ++lost_bits;
 215                                         product >>= 1;
 216                                         total_budget >>= 1;
 217                                 }
 218                                 total_budget += product;
 219                                 ++budget[(tdio->head.p) ? tdio->head.p->p_ionice : 0];
 220                                 KKASSERT(total_budget >= 0);
 221                                 dsched_debug(LOG_INFO,
 222                                     "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
 223                                     n, tdio->interval_avg_latency, tdio->interval_transactions,
 224                                     (tdio->head.p) ? tdio->head.p->p_ionice : 0);
 225                                 ++n;
 226                         } else {
 227                                 tdio->max_tp = 0;
 228                         }
 229                         tdio->rebalance = 0;
 230                         tdio->transactions = 0;
 231                         tdio->avg_latency = 0;
 232                         tdio->issued = 0;
 233                 }
 234
 235                 dsched_debug(LOG_INFO, "%d procs competing for disk\n"
 236                     "total_budget = %jd (lost bits = %d)\n"
 237                     "incomplete tp = %d\n", n, (intmax_t)total_budget,
 238                     lost_bits, diskctx->incomplete_tp);
 239
 240                 if (n == 0)
 241                         continue;
 242
 243                 sum = 0;
 244
 245                 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
 246                         if (budget[i] == 0)
 247                                 continue;
 248                         sum += (FQ_PRIO_BIAS+i)*budget[i];
 249                 }
 250
 251                 if (sum == 0)
 252                         sum = 1;
 253
 254                 dsched_debug(LOG_INFO, "sum = %d\n", sum);
 255
 256                 for (i = 0; i < FQ_PRIO_MAX+1; i++) {
 257                         if (budget[i] == 0)
 258                                 continue;
 259
 260                         /*
 261                          * XXX: if we still overflow here, we really need to switch to
 262                          *      some more advanced mechanism such as compound int128 or
 263                          *      storing the lost bits so they can be used in the
 264                          *      fq_balance_self.
 265                          */
 266                         diskctx->budgetpb[i] = ((FQ_PRIO_BIAS+i)*total_budget/sum) << lost_bits;
 267                         KKASSERT(diskctx->budgetpb[i] >= 0);
 268                 }
 269
 270                 dsched_debug(4, "disk is %d%% busy\n", diskctx->disk_busy);
 271                 TAILQ_FOREACH(ds_tdio, &diskctx->head.tdio_list, dlink) {
 272                         tdio = (struct fq_thread_io *)ds_tdio;
 273                         tdio->rebalance = 1;
 274                 }
 275
 276                 diskctx->prev_full = diskctx->last_full;
 277                 diskctx->last_full = (diskctx->disk_busy >= 90)?1:0;
 278         }
 279 }
 280
 281
 282 /*
 283  * fq_balance_self should be called from all sorts of dispatchers. It basically
 284  * offloads some of the heavier calculations on throttling onto the process that
 285  * wants to do I/O instead of doing it in the fq_balance thread.
 286  * - should be called with diskctx lock held
 287  */
 288 void
 289 fq_balance_self(struct fq_thread_io *tdio) {
 290         struct fq_disk_ctx *diskctx;
 291
 292         int64_t budget, used_budget;
 293         int64_t avg_latency;
 294         int64_t transactions;
 295
 296         transactions = (int64_t)tdio->interval_transactions;
 297         avg_latency = (int64_t)tdio->interval_avg_latency;
 298         diskctx = (struct fq_disk_ctx *)tdio->head.diskctx;
 299
 300 #if 0
 301         /* XXX: do we really require the lock? */
 302         DSCHED_DISK_CTX_LOCK_ASSERT(diskctx);
 303 #endif
 304
 305         used_budget = ((int64_t)avg_latency * transactions);
 306         budget = diskctx->budgetpb[(tdio->head.p) ? tdio->head.p->p_ionice : 0];
 307
 308         if (used_budget > 0) {
 309                 dsched_debug(LOG_INFO,
 310                     "info: used_budget = %jd, budget = %jd\n",
 311                     (intmax_t)used_budget, budget);
 312         }
 313
 314         if ((used_budget > budget) && (diskctx->disk_busy >= 90)) {
 315                 KKASSERT(avg_latency != 0);
 316
 317                 tdio->max_tp = budget/(avg_latency);
 318                 atomic_add_int(&fq_stats.procs_limited, 1);
 319
 320                 dsched_debug(LOG_INFO,
 321                     "rate limited to %d transactions\n", tdio->max_tp);
 322
 323         } else if (((used_budget*2 < budget) || (diskctx->disk_busy < 80)) &&
 324             (!diskctx->prev_full && !diskctx->last_full)) {
 325                 tdio->max_tp = 0;
 326         }
 327 }
 328
 329
 330 static int
 331 do_fqstats(SYSCTL_HANDLER_ARGS)
 332 {
 333         return (sysctl_handle_opaque(oidp, &fq_stats, sizeof(struct dsched_fq_stats), req));
 334 }
 335
 336 static int
 337 fq_mod_handler(module_t mod, int type, void *unused)
 338 {
 339         static struct sysctl_ctx_list sysctl_ctx;
 340         static struct sysctl_oid *oid;
 341         static char version[16];
 342         int error;
 343
 344         ksnprintf(version, sizeof(version), "%d.%d",
 345             dsched_fq_version_maj, dsched_fq_version_min);
 346
 347         switch (type) {
 348         case MOD_LOAD:
 349                 bzero(&fq_stats, sizeof(struct dsched_fq_stats));
 350                 if ((error = dsched_register(&dsched_fq_policy)))
 351                         return (error);
 352
 353                 sysctl_ctx_init(&sysctl_ctx);
 354                 oid = SYSCTL_ADD_NODE(&sysctl_ctx,
 355                     SYSCTL_STATIC_CHILDREN(_dsched),
 356                     OID_AUTO,
 357                     "fq",
 358                     CTLFLAG_RD, 0, "");
 359
 360                 SYSCTL_ADD_PROC(&sysctl_ctx, SYSCTL_CHILDREN(oid),
 361                     OID_AUTO, "stats", CTLTYPE_OPAQUE|CTLFLAG_RD,
 362                     0, 0, do_fqstats, "S,dsched_fq_stats", "fq statistics");
 363
 364                 SYSCTL_ADD_STRING(&sysctl_ctx, SYSCTL_CHILDREN(oid),
 365                     OID_AUTO, "version", CTLFLAG_RD, version, 0, "fq version");
 366
 367                 kprintf("FQ scheduler policy version %d.%d loaded\n",
 368                     dsched_fq_version_maj, dsched_fq_version_min);
 369                 break;
 370
 371         case MOD_UNLOAD:
 372                 if ((error = dsched_unregister(&dsched_fq_policy)))
 373                         return (error);
 374                 sysctl_ctx_free(&sysctl_ctx);
 375                 kprintf("FQ scheduler policy unloaded\n");
 376                 break;
 377
 378         default:
 379                 break;
 380         }
 381
 382         return 0;
 383 }
 384
 385 DSCHED_POLICY_MODULE(dsched_fq, fq_mod_handler);