dsched_fq - Refactor fq_balance_thread
authorAlex Hornung <ahornung@gmail.com>
Thu, 1 Apr 2010 08:40:50 +0000 (08:40 +0000)
committerAlex Hornung <ahornung@gmail.com>
Thu, 15 Apr 2010 20:24:49 +0000 (20:24 +0000)
* Refactor fq_balance_thread to be an LWKT instead of callout.

* Also make sure fqp->transactions, fqp->avg_latency don't change while
  we use them.

sys/dsched/fq/dsched_fq.h
sys/dsched/fq/dsched_fq_core.c
sys/dsched/fq/dsched_fq_diskops.c

index ba10f74..25d7b32 100644 (file)
@@ -113,12 +113,15 @@ struct dsched_fq_priv {
        int     refcount;
        int32_t transactions;
        int32_t avg_latency;
+       int32_t s_transactions;
+       int32_t s_avg_latency;
        int32_t max_tp;
        int32_t issued;
 };
 
 struct dsched_fq_dpriv {
        struct thread   *td;
+       struct thread   *td_balance;
        struct disk     *dp;
        struct spinlock lock;
        int     refcount;
index 3a95915..c2d57b1 100644 (file)
@@ -424,138 +424,139 @@ fq_balance_thread(struct dsched_fq_dpriv *dpriv)
        struct  dsched_fq_priv  *fqp, *fqp2;
        static struct timeval old_tv;
        struct timeval tv;
-       int     n = 0;
-       static int last_full = 0, prev_full = 0;
-       static int limited_procs = 0;
-       static int first_run = 1;
+       int     n;
+       int last_full = 0, prev_full = 0;
        int     disk_busy;
        int     total_disk_time;
        int64_t budget, total_budget, used_budget;
        int64_t budgetpb[FQ_PRIO_MAX+1];
        int sum, i;
 
-       bzero(budgetpb, sizeof(budgetpb));
-       total_budget = 0;
+       getmicrotime(&old_tv);
 
-       getmicrotime(&tv);
+       FQ_DPRIV_LOCK(dpriv);
+       for (;;) {
+               /* sleep ~1s */
+               if ((ssleep(curthread, &dpriv->lock, 0, "fq_balancer", hz) == 0)) {
+                       if (__predict_false(dpriv->die)) {
+                               FQ_DPRIV_UNLOCK(dpriv);
+                               lwkt_exit();
+                       }
+               }
+
+               bzero(budgetpb, sizeof(budgetpb));
+               total_budget = 0;
+               n = 0;
+
+               getmicrotime(&tv);
 
-       if (__predict_false(first_run)) {
-               total_disk_time = FQ_TOTAL_DISK_TIME;
-               first_run = 0;
-       } else {
                total_disk_time = (int)(1000000*((tv.tv_sec - old_tv.tv_sec)) +
                    (tv.tv_usec - old_tv.tv_usec));
                dsched_debug(LOG_INFO, "total_disk_time = %d\n", total_disk_time);
-       }
-       old_tv = tv;
-       FQ_DPRIV_LOCK(dpriv);
 
-       disk_busy = (100*(total_disk_time - dpriv->idle_time)) / total_disk_time;
-       if (disk_busy < 0)
-               disk_busy = 0;
+               old_tv = tv;
 
-       dpriv->idle_time = 0;
+               disk_busy = (100*(total_disk_time - dpriv->idle_time)) / total_disk_time;
+               if (disk_busy < 0)
+                       disk_busy = 0;
 
-       TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
-               if (fqp->transactions > 0 /* 30 */) {
-                       total_budget += (fqp->avg_latency * fqp->transactions);
-                       ++budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
+               dpriv->idle_time = 0;
 
-                       dsched_debug(LOG_INFO,
-                           "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
-                           n, fqp->avg_latency, fqp->transactions,
-                           (fqp->p) ? fqp->p->p_ionice : 0);
-                       ++n;
-               } else {
-                       fqp->max_tp = 0;
-                       fqp->avg_latency = 0;
+               TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
+                       fqp->s_avg_latency = fqp->avg_latency;
+                       fqp->s_transactions = fqp->transactions;
+                       if (fqp->transactions > 0 /* 30 */) {
+                               total_budget += (fqp->s_avg_latency * fqp->s_transactions);
+                               ++budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
+
+                               dsched_debug(LOG_INFO,
+                                   "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
+                                   n, fqp->s_avg_latency, fqp->s_transactions,
+                                   (fqp->p) ? fqp->p->p_ionice : 0);
+                               ++n;
+                       } else {
+                               fqp->max_tp = 0;
+                               fqp->avg_latency = 0;
+                       }
                }
-       }
-
-       dsched_debug(LOG_INFO, "%d procs competing for disk\n"
-           "total_budget = %lld\n"
-           "incomplete tp = %d\n", n, total_budget, dpriv->incomplete_tp);
 
-       if (n == 0)
-               goto done;
+               dsched_debug(LOG_INFO, "%d procs competing for disk\n"
+                   "total_budget = %lld\n"
+                   "incomplete tp = %d\n", n, total_budget, dpriv->incomplete_tp);
 
-       sum = 0;
-
-       for (i = 0; i < FQ_PRIO_MAX+1; i++) {
-               if (budgetpb[i] == 0)
+               if (n == 0)
                        continue;
-               sum += (FQ_PRIO_BIAS+i)*budgetpb[i];
-       }
 
-       if (sum == 0)
-               sum = 1;
+               sum = 0;
 
-       dsched_debug(LOG_INFO, "sum = %d\n", sum);
+               for (i = 0; i < FQ_PRIO_MAX+1; i++) {
+                       if (budgetpb[i] == 0)
+                               continue;
+                       sum += (FQ_PRIO_BIAS+i)*budgetpb[i];
+               }
 
-       for (i = 0; i < FQ_PRIO_MAX+1; i++) {
-               if (budgetpb[i] == 0)
-                       continue;
+               if (sum == 0)
+                       sum = 1;
 
-               budgetpb[i] = ((FQ_PRIO_BIAS+i)*10)*total_budget/sum;
-       }
+               dsched_debug(LOG_INFO, "sum = %d\n", sum);
 
-       if (total_budget > dpriv->max_budget)
-               dpriv->max_budget = total_budget;
+               for (i = 0; i < FQ_PRIO_MAX+1; i++) {
+                       if (budgetpb[i] == 0)
+                               continue;
 
-       limited_procs = 0;
+                       budgetpb[i] = ((FQ_PRIO_BIAS+i)*10)*total_budget/sum;
+               }
 
-       dsched_debug(4, "disk is %d\% busy\n", disk_busy);
+               if (total_budget > dpriv->max_budget)
+                       dpriv->max_budget = total_budget;
 
-       /*
-        * XXX: eventually remove all the silly *10...
-        */
-       TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
-               budget = budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
-
-               used_budget = ((int64_t)10*(int64_t)fqp->avg_latency *
-                   (int64_t)fqp->transactions);
-               if (used_budget > 0) {
-                       dsched_debug(LOG_INFO,
-                           "info: used_budget = %lld, budget = %lld\n", used_budget,
-                           budget);
-               }
+               dsched_debug(4, "disk is %d\% busy\n", disk_busy);
 
                /*
-                * process is exceeding its fair share; rate-limit it, but only
-                * if the disk is being used at 90+% of capacity
+                * XXX: eventually remove all the silly *10...
                 */
-               if ((used_budget > budget) && (disk_busy >= 90)) {
-                       KKASSERT(fqp->avg_latency != 0);
-
-                       fqp->max_tp = budget/(10*fqp->avg_latency);
-                       ++limited_procs;
-                       dsched_debug(LOG_INFO,
-                           "rate limited to %d transactions\n", fqp->max_tp);
-                       atomic_add_int(&fq_stats.procs_limited, 1);
-               } else if (((used_budget*2 < budget) || (disk_busy < 90)) &&
-                   (!prev_full && !last_full)) {
+               TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
+                       budget = budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
+
+                       used_budget = ((int64_t)10*(int64_t)fqp->s_avg_latency *
+                           (int64_t)fqp->s_transactions);
+                       if (used_budget > 0) {
+                               dsched_debug(LOG_INFO,
+                                   "info: used_budget = %lld, budget = %lld\n", used_budget,
+                                   budget);
+                       }
+
                        /*
-                        * process is really using little of its timeslice, or the
-                        * disk is not busy, so let's reset the rate-limit.
-                        * Without this, exceeding processes will get an unlimited
-                        * slice every other slice.
-                        * XXX: this still doesn't quite fix the issue, but maybe
-                        * it's good that way, so that heavy writes are interleaved.
+                        * process is exceeding its fair share; rate-limit it, but only
+                        * if the disk is being used at 90+% of capacity
                         */
-                       fqp->max_tp = 0;
+                       if ((used_budget > budget) && (disk_busy >= 90)) {
+                               KKASSERT(fqp->s_avg_latency != 0);
+
+                               fqp->max_tp = budget/(10*fqp->s_avg_latency);
+                               dsched_debug(LOG_INFO,
+                                   "rate limited to %d transactions\n", fqp->max_tp);
+                               atomic_add_int(&fq_stats.procs_limited, 1);
+                       } else if (((used_budget*2 < budget) || (disk_busy < 80)) &&
+                           (!prev_full && !last_full)) {
+                               /*
+                                * process is really using little of its timeslice, or the
+                                * disk is not busy, so let's reset the rate-limit.
+                                * Without this, exceeding processes will get an unlimited
+                                * slice every other slice.
+                                * XXX: this still doesn't quite fix the issue, but maybe
+                                * it's good that way, so that heavy writes are interleaved.
+                                */
+                               fqp->max_tp = 0;
+                       }
+                       fqp->transactions = 0;
+                       fqp->avg_latency = 0;
+                       fqp->issued = 0;
                }
-               fqp->transactions = 0;
-               fqp->avg_latency = 0;
-               fqp->issued = 0;
-       }
 
-       prev_full = last_full;
-       last_full = (disk_busy >= 90)?1:0;
-
-done:
-       FQ_DPRIV_UNLOCK(dpriv);
-       callout_reset(&fq_callout, hz * FQ_REBALANCE_TIMEOUT,
-           (void (*)(void *))fq_balance_thread, dpriv);
+               prev_full = last_full;
+               last_full = (disk_busy >= 90)?1:0;
+       }
 }
 
 
index 886a999..9e6b25a 100644 (file)
@@ -107,7 +107,7 @@ fq_prepare(struct disk *dp)
        struct  dsched_fq_dpriv *dpriv;
        struct dsched_fq_mpriv  *fqmp;
        struct dsched_fq_priv   *fqp;
-       struct thread *td_core;
+       struct thread *td_core, *td_balance;
 
        dpriv = fq_alloc_dpriv(dp);
        fq_reference_dpriv(dpriv);
@@ -127,8 +127,10 @@ fq_prepare(struct disk *dp)
 
        FQ_GLOBAL_FQMP_UNLOCK();
        lwkt_create((void (*)(void *))fq_dispatcher, dpriv, &td_core, NULL,
-           0, 0, "fq_dispatcher_%s", dp->d_cdev->si_name);
-       fq_balance_thread(dpriv);
+           0, 0, "fq_dispatch_%s", dp->d_cdev->si_name);
+       lwkt_create((void (*)(void *))fq_balance_thread, dpriv, &td_balance,
+           NULL, 0, 0, "fq_balance_%s", dp->d_cdev->si_name);
+       dpriv->td_balance = td_balance;
 
        return 0;
 }
@@ -144,16 +146,15 @@ fq_teardown(struct disk *dp)
        KKASSERT(dpriv != NULL);
 
        /* Basically kill the dispatcher thread */
-       callout_stop(&fq_callout);
        dpriv->die = 1;
+       wakeup(dpriv->td_balance);
        wakeup(dpriv);
        tsleep(dpriv, 0, "fq_dispatcher", hz/5); /* wait 200 ms */
-       callout_stop(&fq_callout);
+       wakeup(dpriv->td_balance);
+       wakeup(dpriv);
+       tsleep(dpriv, 0, "fq_dispatcher", hz/10); /* wait 100 ms */
+       wakeup(dpriv->td_balance);
        wakeup(dpriv);
-       tsleep(dpriv, 0, "fq_dispatcher", hz/5); /* wait 200 ms */
-       callout_stop(&fq_callout);
-       /* XXX: we really need callout_drain, this REALLY sucks */
-
 
        fq_dereference_dpriv(dpriv); /* from prepare */
        fq_dereference_dpriv(dpriv); /* from alloc */