dsched - Implement priorities and other improvements
authorAlex Hornung <ahornung@gmail.com>
Tue, 30 Mar 2010 02:18:32 +0000 (02:18 +0000)
committerAlex Hornung <ahornung@gmail.com>
Thu, 15 Apr 2010 20:24:47 +0000 (20:24 +0000)
* Implemented fq priority support. This also includes the userland tool
  ionice.

* Implemented an exit_buf hook so that per-buf scheduler state can be
  cleaned up before bufs are reused.

* Some minor performance improvements in fq.

24 files changed:
sys/dsched/fq/dsched_fq.h
sys/dsched/fq/dsched_fq_core.c
sys/dsched/fq/dsched_fq_diskops.c
sys/dsched/fq/dsched_fq_procops.c
sys/kern/init_sysent.c
sys/kern/kern_resource.c
sys/kern/lwkt_thread.c
sys/kern/subr_dsched.c
sys/kern/syscalls.c
sys/kern/syscalls.master
sys/kern/vfs_bio.c
sys/sys/dsched.h
sys/sys/proc.h
sys/sys/resource.h
sys/sys/syscall-hide.h
sys/sys/syscall.h
sys/sys/syscall.mk
sys/sys/sysproto.h
sys/sys/sysunion.h
sys/vm/vm_pager.c
test/dsched_fq/fqstats.c
usr.bin/Makefile
usr.bin/ionice/Makefile [new file with mode: 0644]
usr.bin/ionice/ionice.c [new file with mode: 0644]

index c7ad728..5d56fb4 100644 (file)
@@ -109,6 +109,7 @@ struct dsched_fq_priv {
        struct  disk            *dp;
        struct dsched_fq_dpriv  *dpriv;
        struct dsched_fq_mpriv  *fqmp;
+       struct proc             *p;
 
        int32_t qlength;
        int32_t flags;
@@ -117,6 +118,7 @@ struct dsched_fq_priv {
        int32_t transactions;
        int32_t avg_latency;
        int32_t max_tp;
+       int32_t issued;
 };
 
 struct dsched_fq_dpriv {
@@ -127,6 +129,7 @@ struct dsched_fq_dpriv {
 
        int     avg_rq_time;    /* XXX: unused */
        int32_t incomplete_tp;
+       int64_t max_budget;
 
        /* list contains all fq_priv for this disk */
        TAILQ_HEAD(, dsched_fq_priv)    fq_priv_list;
@@ -134,6 +137,9 @@ struct dsched_fq_dpriv {
 };
 
 struct dsched_fq_mpriv {
+       struct proc *p;
+       struct thread *td;
+       int dead;
        struct spinlock lock;
        int     refcount;
        TAILQ_HEAD(, dsched_fq_priv)    fq_priv_list;
@@ -156,14 +162,17 @@ struct dsched_fq_bucket {
 
 };
 
-
+#define FQ_PRIO_BIAS           5
+#define FQ_PRIO_MAX            10
+#define FQ_PRIO_MIN            1
+#define FQ_PRIO_IDLE           -1
 #define        FQ_BUCKET_ACTIVE        0x01
 
 
 
 struct dsched_fq_priv  *fq_alloc_priv(struct disk *dp);
 struct dsched_fq_dpriv *fq_alloc_dpriv(struct disk *dp);
-struct dsched_fq_mpriv *fq_alloc_mpriv(void);
+struct dsched_fq_mpriv *fq_alloc_mpriv(struct proc *p);
 void   fq_balance_thread(struct dsched_fq_dpriv *dpriv);
 void   fq_dispatcher(struct dsched_fq_dpriv *dpriv);
 biodone_t              fq_completed;
@@ -190,6 +199,11 @@ struct dsched_fq_stats {
        int32_t cancelled;
 
        int32_t no_fqmp;
+
+       int32_t nthreads;
+       int32_t nprocs;
+
+       int32_t nbufs;
 };
 
 #endif /* _DSCHED_FQ_H_ */
index 5b7e5e9..d7d5a8b 100644 (file)
@@ -296,7 +296,7 @@ fq_alloc_dpriv(struct disk *dp)
 
 
 struct dsched_fq_mpriv *
-fq_alloc_mpriv()
+fq_alloc_mpriv(struct proc *p)
 {
        struct dsched_fq_mpriv  *fqmp;
        struct dsched_fq_priv   *fqp;
@@ -314,7 +314,7 @@ fq_alloc_mpriv()
 
        while ((dp = dsched_disk_enumerate(dp, &dsched_fq_ops))) {
                fqp = fq_alloc_priv(dp);
-
+               fqp->p = p;
 #if 0
                fq_reference_priv(fqp);
 #endif
@@ -336,10 +336,28 @@ fq_alloc_mpriv()
 void
 fq_dispatcher(struct dsched_fq_dpriv *dpriv)
 {
+       struct dsched_fq_mpriv  *fqmp;
        struct dsched_fq_priv   *fqp, *fqp2;
        struct bio *bio, *bio2;
        int count;
 
+       /*
+        * We need to manually assign an fqp to the fqmp of this thread
+        * since it isn't assigned one during fq_prepare, as the disk
+        * is not set up yet.
+        */
+       fqmp = dsched_get_thread_priv(curthread);
+       /* If fqmp is NULL, something went seriously wrong */
+       KKASSERT(fqmp != NULL);
+       fqp = fq_alloc_priv(dpriv->dp);
+       FQ_FQMP_LOCK(fqmp);
+#if 0
+       fq_reference_priv(fqp);
+#endif
+       TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link);
+       FQ_FQMP_UNLOCK(fqmp);
+
+
        FQ_DPRIV_LOCK(dpriv);
        for(;;) {
                /* sleep ~60 ms */
@@ -356,8 +374,7 @@ fq_dispatcher(struct dsched_fq_dpriv *dpriv)
 
                                TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) {
                                        if ((fqp->max_tp > 0) &&
-                                           ((count >= fqp->max_tp) ||
-                                           (fqp->transactions >= fqp->max_tp)))
+                                           ((fqp->issued >= fqp->max_tp)))
                                                break;
                                        TAILQ_REMOVE(&fqp->queue, bio, link);
 
@@ -369,9 +386,9 @@ fq_dispatcher(struct dsched_fq_dpriv *dpriv)
                                         */
                                        dsched_strategy_async(dpriv->dp, bio,
                                            fq_completed, fqp);
+                                       atomic_add_int(&fqp->issued, 1);
                                        atomic_add_int(&dpriv->incomplete_tp, 1);
                                        atomic_add_int(&fq_stats.transactions, 1);
-                                       ++count;
                                }
                                FQ_FQP_UNLOCK(fqp);
                        }
@@ -379,26 +396,52 @@ fq_dispatcher(struct dsched_fq_dpriv *dpriv)
        }
 }
 
-
 void
 fq_balance_thread(struct dsched_fq_dpriv *dpriv)
 {
        struct  dsched_fq_priv  *fqp, *fqp2;
        int     n = 0;
        static int last_full = 0, prev_full = 0;
+       static int limited_procs = 0;
        int     incomplete_tp;
-       int64_t total_budget, use_pct, avail_pct;
-       total_budget = 0;
+       int64_t budget, total_budget, used_budget;
+       int64_t budgetpb[FQ_PRIO_MAX+1];
+       int sum, i;
 
+       bzero(budgetpb, sizeof(budgetpb));
+       total_budget = 0;
+       
        FQ_DPRIV_LOCK(dpriv);
        incomplete_tp = dpriv->incomplete_tp;
 
        TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
                if (fqp->transactions > 0 /* 30 */) {
+
                        total_budget += (fqp->avg_latency * fqp->transactions);
+                       /*
+                        * XXX: while the code below really sucked, the problem needs to
+                        *      be addressed eventually. Some processes take up their "fair"
+                        *      slice, but don't really need even a 10th of it.
+                        *      This kills performance for those that do need the
+                        *      performance.
+                        */
+#if 0
+                       /*
+                        * This is *very* hackish. It basically tries to avoid that
+                        * processes that do only very few tps take away more bandwidth
+                        * than they should.
+                        */
+                       if ((limited_procs >= 1) && (fqp->transactions < 25) &&
+                           (budgetpb[(fqp->p) ? fqp->p->p_ionice : 0] >= 1))
+                               continue;
+#endif
+
+                       ++budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
+                       
                        dsched_debug(LOG_INFO,
-                           "%d) avg_latency = %d, transactions = %d\n",
-                           n, fqp->avg_latency, fqp->transactions);
+                           "%d) avg_latency = %d, transactions = %d, ioprio = %d\n",
+                           n, fqp->avg_latency, fqp->transactions,
+                           (fqp->p) ? fqp->p->p_ionice : 0);
                        ++n;
                } else {
                        fqp->max_tp = 0;
@@ -411,49 +454,81 @@ fq_balance_thread(struct dsched_fq_dpriv *dpriv)
            "incomplete tp = %d\n", n, total_budget, incomplete_tp);
 
        if (n == 0)
-               goto done;
+               goto done;      
+       
+       sum = 0;
+       for (i = 0; i < FQ_PRIO_MAX+1; i++) {
+               if (budgetpb[i] == 0)
+                       continue;
+               sum += (FQ_PRIO_BIAS+i)*budgetpb[i];
+       }
+       if (sum == 0)
+               sum = 1;
+       dsched_debug(LOG_INFO, "sum = %d\n", sum);
 
-#if 0
+       for (i = 0; i < FQ_PRIO_MAX+1; i++) {
+               if (budgetpb[i] == 0)
+                       continue;
+
+               budgetpb[i] = ((FQ_PRIO_BIAS+i)*10)*total_budget/sum;
+       }
+
+       if (total_budget > dpriv->max_budget)
+               dpriv->max_budget = total_budget;
+
+       limited_procs = 0;
        /*
-        * XXX: hack. don't know why total_budget can be zero here
-        * -> this doesn't apply anymore. total_budget is never 0 now
+        * XXX: eventually remove all the silly *10...
         */
-       if (total_budget == 0)
-               total_budget = 1;
-#endif
-
        TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) {
-               /* XXX: proportional to scheduler class! */
-               avail_pct = (int64_t)1000/(int64_t)n;
-
-               /* XXX: 100/(sum of scheduler priorities) * scheduler priority */
-               /* XXX: but need to process queues of fqp on buckets or so...*/
+               budget = budgetpb[(fqp->p) ? fqp->p->p_ionice : 0];
 
-               use_pct = ((int64_t)1000* (int64_t)fqp->avg_latency *
-                   (int64_t)fqp->transactions)/(int64_t)total_budget;
+               used_budget = ((int64_t)10*(int64_t)fqp->avg_latency *
+                   (int64_t)fqp->transactions);
+               if (used_budget > 0) {
+                       dsched_debug(LOG_INFO,
+                           "info: used_budget = %lld, budget = %lld\n", used_budget,
+                           budget);
+               }
 
-               /* process is exceeding its fair share; rate-limit it */
-               if ((use_pct > avail_pct) && (incomplete_tp > n*2)) {
+               /*
+                * process is exceeding its fair share; rate-limit it, but only
+                * if the disk is actually fully used.
+                */
+               if ((used_budget > budget) && (incomplete_tp > n*2)) {
                        /* kprintf("here we are, use_pct > avail_pct\n"); */
                        /* fqp->max_tp = avail_pct * fqp->avg_latency; */
-                       fqp->max_tp = total_budget/(n * fqp->avg_latency);
+                       KKASSERT(fqp->avg_latency != 0);
+
+                       /*
+                        * If the disk has not been fully used lately, augment the
+                        * budget.
+                        */
+                       if (total_budget*3 < dpriv->max_budget*2) {
+                               budget *= 2;
+                               budget /= 3;
+                       }
+
+                       fqp->max_tp = budget/(10*fqp->avg_latency);
+                       ++limited_procs;
                        dsched_debug(LOG_INFO,
                            "rate limited to %d transactions\n", fqp->max_tp);
                        atomic_add_int(&fq_stats.procs_limited, 1);
-               } else if (((use_pct < avail_pct/2) || (incomplete_tp < n*2)) &&
+               } else if (((used_budget*2 < budget) || (incomplete_tp < n*2)) &&
                    (!prev_full && !last_full)) {
                        /*
                         * process is really using little of its timeslice, or the
                         * disk is not busy, so let's reset the rate-limit.
                         * Without this, exceeding processes will get an unlimited
                         * slice every other slice.
-                        * XXX: this still doesn't quite fix the issue, but maybe,
-                        * it's good that way so that heavy writes are interleaved.
+                        * XXX: this still doesn't quite fix the issue, but maybe
+                        * it's good that way, so that heavy writes are interleaved.
                         */
                        fqp->max_tp = 0;
                }
                fqp->transactions = 0;
                fqp->avg_latency = 0;
+               fqp->issued = 0;
        }
 
        prev_full = last_full;
index 7382b88..ce607b5 100644 (file)
@@ -72,6 +72,7 @@ static dsched_queue_t         fq_queue;
 dsched_new_buf_t       fq_new_buf;
 dsched_new_proc_t      fq_new_proc;
 dsched_new_thread_t    fq_new_thread;
+dsched_exit_buf_t      fq_exit_buf;
 dsched_exit_proc_t     fq_exit_proc;
 dsched_exit_thread_t   fq_exit_thread;
 
@@ -93,6 +94,7 @@ struct dsched_ops dsched_fq_ops = {
        .new_buf = fq_new_buf,
        .new_proc = fq_new_proc,
        .new_thread = fq_new_thread,
+       .exit_buf = fq_exit_buf,
        .exit_proc = fq_exit_proc,
        .exit_thread = fq_exit_thread,
 };
@@ -246,14 +248,19 @@ fq_queue(struct disk *dp, struct bio *obio)
                }
        }
        FQ_FQMP_UNLOCK(fqmp);
+       dsched_clr_buf_priv(obio->bio_buf);
        fq_dereference_mpriv(fqmp); /* acquired on new_buf */
+       atomic_subtract_int(&fq_stats.nbufs, 1);
 
        KKASSERT(found == 1);
        dpriv = dsched_get_disk_priv(dp);
 
        /* XXX: probably rather pointless doing this atomically */
        max_tp = atomic_fetchadd_int(&fqp->max_tp, 0);
+#if 0
        transactions = atomic_fetchadd_int(&fqp->transactions, 0);
+#endif
+       transactions = atomic_fetchadd_int(&fqp->issued, 0);
 
        /* | No rate limiting || Hasn't reached limit rate |     */
        if ((max_tp == 0) || (transactions < max_tp)) {
@@ -268,7 +275,7 @@ fq_queue(struct disk *dp, struct bio *obio)
                        count = 0;
 
                        TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) {
-                               if ((fqp->max_tp > 0) && (count >= fqp->max_tp))
+                               if ((fqp->max_tp > 0) && (fqp->issued >= fqp->max_tp))
                                        break;
                                TAILQ_REMOVE(&fqp->queue, bio, link);
                                --fqp->qlength;
@@ -278,6 +285,7 @@ fq_queue(struct disk *dp, struct bio *obio)
                                 * queueing
                                 */
                                dsched_strategy_async(dp, bio, fq_completed, fqp);
+                               atomic_add_int(&fqp->issued, 1);
                                atomic_add_int(&dpriv->incomplete_tp, 1);
                                atomic_add_int(&fq_stats.transactions, 1);
                        }
@@ -288,6 +296,7 @@ fq_queue(struct disk *dp, struct bio *obio)
                fq_reference_priv(fqp);
 
                dsched_strategy_async(dp, obio, fq_completed, fqp);
+               atomic_add_int(&fqp->issued, 1);
                atomic_add_int(&dpriv->incomplete_tp, 1);
                atomic_add_int(&fq_stats.transactions, 1);
        } else {
@@ -300,7 +309,20 @@ fq_queue(struct disk *dp, struct bio *obio)
                 */
                FQ_FQP_LOCK(fqp);
                fq_reference_priv(fqp);
-               TAILQ_INSERT_TAIL(&fqp->queue, obio, link);
+
+               /*
+                * Prioritize reads by inserting them at the front of the
+                * queue.
+                *
+                * XXX: this might cause issues with data that should
+                *      have been written and is being read, but hasn't
+                *      actually been written yet.
+                */
+               if (obio->bio_buf->b_cmd == BUF_CMD_READ)
+                       TAILQ_INSERT_HEAD(&fqp->queue, obio, link);
+               else
+                       TAILQ_INSERT_TAIL(&fqp->queue, obio, link);
+
                ++fqp->qlength;
                FQ_FQP_UNLOCK(fqp);
        }
index 8fb83d3..840f2de 100644 (file)
@@ -64,9 +64,12 @@ MALLOC_DECLARE(M_DSCHEDFQ);
 dsched_new_buf_t       fq_new_buf;
 dsched_new_proc_t      fq_new_proc;
 dsched_new_thread_t    fq_new_thread;
+dsched_exit_buf_t      fq_exit_buf;
 dsched_exit_proc_t     fq_exit_proc;
 dsched_exit_thread_t   fq_exit_thread;
 
+extern struct dsched_fq_stats  fq_stats;
+
 void
 fq_new_buf(struct buf *bp)
 {
@@ -89,9 +92,25 @@ fq_new_buf(struct buf *bp)
        KKASSERT(fqmp != NULL);
 #endif
 
-       if (fqmp)
+       if (fqmp) {
+               atomic_add_int(&fq_stats.nbufs, 1);
                fq_reference_mpriv(fqmp);
+       }
        dsched_set_buf_priv(bp, fqmp);
+       
+}
+
+/*
+ * exit_buf hook for the fq policy.
+ *
+ * If the buf still carries an fqmp in its private slot, clear the slot,
+ * drop the reference on that fqmp and decrement the fq_stats.nbufs
+ * counter.  Bufs without an fqmp are left alone.
+ */
+void
+fq_exit_buf(struct buf *bp)
+{
+       struct dsched_fq_mpriv  *mp;
+
+       mp = dsched_get_buf_priv(bp);
+       if (mp == NULL)
+               return;
+
+       dsched_clr_buf_priv(bp);
+       fq_dereference_mpriv(mp);
+       atomic_subtract_int(&fq_stats.nbufs, 1);
+}
 
 void
@@ -101,9 +120,11 @@ fq_new_proc(struct proc *p)
 
        KKASSERT(p != NULL);
 
-       fqmp = fq_alloc_mpriv();
+       fqmp = fq_alloc_mpriv(p);
        fq_reference_mpriv(fqmp);
        dsched_set_proc_priv(p, fqmp);
+       atomic_add_int(&fq_stats.nprocs, 1);
+       fqmp->p = p;
 }
 
 void
@@ -113,9 +134,11 @@ fq_new_thread(struct thread *td)
 
        KKASSERT(td != NULL);
 
-       fqmp = fq_alloc_mpriv();
+       fqmp = fq_alloc_mpriv(NULL);
        fq_reference_mpriv(fqmp);
        dsched_set_thread_priv(td, fqmp);
+       atomic_add_int(&fq_stats.nthreads, 1);
+       fqmp->td = td;
 }
 
 void
@@ -130,9 +153,11 @@ fq_exit_proc(struct proc *p)
 #if 0
        kprintf("exit_proc: fqmp = %p\n", fqmp);
 #endif
+       fqmp->dead = 0x1337;
        dsched_set_proc_priv(p, 0);
        fq_dereference_mpriv(fqmp); /* one for alloc, */
        fq_dereference_mpriv(fqmp); /* one for ref */
+       atomic_subtract_int(&fq_stats.nprocs, 1);
 }
 
 void
@@ -147,7 +172,9 @@ fq_exit_thread(struct thread *td)
 #if 0
        kprintf("exit_thread: fqmp = %p\n", fqmp);
 #endif
+       fqmp->dead = 0x1337;
        dsched_set_thread_priv(td, 0);
        fq_dereference_mpriv(fqmp); /* one for alloc, */
        fq_dereference_mpriv(fqmp); /* one for ref */
+       atomic_subtract_int(&fq_stats.nthreads, 1);
 }
index a205b57..089c558 100644 (file)
@@ -553,4 +553,6 @@ struct sysent sysent[] = {
        { AS(mq_receive_args), (sy_call_t *)sys_mq_receive },   /* 517 = mq_receive */
        { AS(mq_timedsend_args), (sy_call_t *)sys_mq_timedsend },       /* 518 = mq_timedsend */
        { AS(mq_timedreceive_args), (sy_call_t *)sys_mq_timedreceive }, /* 519 = mq_timedreceive */
+       { AS(ioprio_set_args), (sy_call_t *)sys_ioprio_set },   /* 520 = ioprio_set */
+       { AS(ioprio_get_args), (sy_call_t *)sys_ioprio_get },   /* 521 = ioprio_get */
 };
index 1c923b0..10914be 100644 (file)
@@ -66,6 +66,7 @@
 #include <sys/mplock2.h>
 
 static int donice (struct proc *chgp, int n);
+static int doionice (struct proc *chgp, int n);
 
 static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
 #define        UIHASH(uid)     (&uihashtbl[(uid) & uihash])
@@ -286,6 +287,213 @@ donice(struct proc *chgp, int n)
        return (0);
 }
 
+
+struct ioprio_get_info {
+       int high;       /* highest p_ionice seen so far */
+       int who;        /* uid whose processes are examined */
+};
+
+static int ioprio_get_callback(struct proc *p, void *data);
+
+/*
+ * ioprio_get(which, who) system call: report the I/O priority (p_ionice)
+ * of one process, or the highest p_ionice found in a process group or
+ * among the processes owned by one uid.
+ *
+ * uap->which selects the scope (PRIO_PROCESS, PRIO_PGRP or PRIO_USER) and
+ * uap->who the target id; who == 0 selects the caller's own process,
+ * process group or uid.  Processes that fail PRISON_CHECK against the
+ * caller are ignored.
+ *
+ * On success the value is stored in uap->sysmsg_result and 0 is returned.
+ * Returns EINVAL for an unknown 'which' and ESRCH when nothing raised
+ * 'high' above its initial sentinel (i.e. no matching process was found).
+ *
+ * MPALMOSTSAFE
+ */
+int
+sys_ioprio_get(struct ioprio_get_args *uap)
+{
+       struct ioprio_get_info info;
+       struct proc *curp = curproc;
+       struct proc *p;
+       int high = IOPRIO_MIN-2;        /* sentinel: nothing found yet */
+       int error;
+
+       get_mplock();
+
+       switch (uap->which) {
+       case PRIO_PROCESS:
+               if (uap->who == 0)
+                       p = curp;
+               else
+                       p = pfind(uap->who);
+               if (p == NULL)
+                       break;
+               if (!PRISON_CHECK(curp->p_ucred, p->p_ucred))
+                       break;
+               high = p->p_ionice;
+               break;
+
+       case PRIO_PGRP:
+       {
+               struct pgrp *pg;
+
+               if (uap->who == 0)
+                       pg = curp->p_pgrp;
+               else if ((pg = pgfind(uap->who)) == NULL)
+                       break;
+               /*
+                * Track the highest I/O priority in the group.  The
+                * comparison has to use p_ionice (not the CPU nice value
+                * p_nice), since that is the value being reported.
+                */
+               LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+                       if (PRISON_CHECK(curp->p_ucred, p->p_ucred) &&
+                           p->p_ionice > high)
+                               high = p->p_ionice;
+               }
+               break;
+       }
+       case PRIO_USER:
+               if (uap->who == 0)
+                       uap->who = curp->p_ucred->cr_uid;
+               info.high = high;
+               info.who = uap->who;
+               allproc_scan(ioprio_get_callback, &info);
+               high = info.high;
+               break;
+
+       default:
+               error = EINVAL;
+               goto done;
+       }
+       /* Sentinel untouched: no process matched the request. */
+       if (high == IOPRIO_MIN-2) {
+               error = ESRCH;
+               goto done;
+       }
+       uap->sysmsg_result = high;
+       error = 0;
+done:
+       rel_mplock();
+       return (error);
+}
+
+/*
+ * allproc_scan() callback used by sys_ioprio_get() for PRIO_USER.
+ *
+ * Raises info->high to p's p_ionice when p is owned by uid info->who,
+ * passes PRISON_CHECK against the calling process, and has a larger
+ * p_ionice than any process seen so far.  Always returns 0.
+ */
+static
+int
+ioprio_get_callback(struct proc *p, void *data)
+{
+       struct ioprio_get_info *info = data;
+
+       if (PRISON_CHECK(curproc->p_ucred, p->p_ucred) &&
+           p->p_ucred->cr_uid == info->who &&
+           p->p_ionice > info->high) {
+               info->high = p->p_ionice;
+       }
+       return(0);
+}
+
+
+struct ioprio_set_info {
+       int prio;       /* priority handed to doionice() for each match */
+       int who;        /* uid whose processes are changed */
+       int error;      /* last non-zero doionice() result, else 0 */
+       int found;      /* number of processes matched */
+};
+
+static int ioprio_set_callback(struct proc *p, void *data);
+
+/*
+ * ioprio_set(which, who, prio) system call: apply I/O priority uap->prio,
+ * via doionice(), to one process (PRIO_PROCESS), every member of a process
+ * group (PRIO_PGRP) or every process owned by a uid (PRIO_USER).
+ * uap->who == 0 selects the caller's own process, process group or uid.
+ * Processes that fail PRISON_CHECK against the caller are skipped.
+ *
+ * Returns EINVAL for an unknown 'which', ESRCH if no process was matched,
+ * otherwise the doionice() result: for PRIO_PGRP that of the last member
+ * visited, for PRIO_USER the last non-zero result recorded by
+ * ioprio_set_callback().
+ *
+ * MPALMOSTSAFE
+ */
+int
+sys_ioprio_set(struct ioprio_set_args *uap)
+{
+       struct ioprio_set_info info;
+       struct proc *curp = curproc;
+       struct proc *p;
+       int found = 0, error = 0;
+
+       get_mplock();
+
+       switch (uap->which) {
+       case PRIO_PROCESS:
+               if (uap->who == 0)
+                       p = curp;
+               else
+                       p = pfind(uap->who);
+               if (p == 0)
+                       break;
+               if (!PRISON_CHECK(curp->p_ucred, p->p_ucred))
+                       break;
+               error = doionice(p, uap->prio);
+               found++;
+               break;
+
+       case PRIO_PGRP:
+       {
+               struct pgrp *pg;
+
+               if (uap->who == 0)
+                       pg = curp->p_pgrp;
+               else if ((pg = pgfind(uap->who)) == NULL)
+                       break;
+               LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+                       if (PRISON_CHECK(curp->p_ucred, p->p_ucred)) {
+                               error = doionice(p, uap->prio);
+                               found++;
+                       }
+               }
+               break;
+       }
+       case PRIO_USER:
+               if (uap->who == 0)
+                       uap->who = curp->p_ucred->cr_uid;
+               info.prio = uap->prio;
+               info.who = uap->who;
+               info.error = 0;
+               info.found = 0;
+               allproc_scan(ioprio_set_callback, &info);
+               error = info.error;
+               found = info.found;
+               break;
+
+       default:
+               error = EINVAL;
+               found = 1;      /* keep EINVAL from being replaced by ESRCH */
+               break;
+       }
+
+       rel_mplock();
+       if (found == 0)
+               error = ESRCH;
+       return (error);
+}
+
+/*
+ * allproc_scan() callback used by sys_ioprio_set() for PRIO_USER.
+ *
+ * For every process owned by uid req->who that passes PRISON_CHECK against
+ * the calling process, apply req->prio through doionice(), remember a
+ * non-zero result in req->error and count the process in req->found.
+ * Always returns 0.
+ */
+static int
+ioprio_set_callback(struct proc *p, void *data)
+{
+       struct ioprio_set_info *req = data;
+       int rc;
+
+       /* Not one of the requested user's processes. */
+       if (p->p_ucred->cr_uid != req->who)
+               return (0);
+       if (!PRISON_CHECK(curproc->p_ucred, p->p_ucred))
+               return (0);
+
+       rc = doionice(p, req->prio);
+       if (rc != 0)
+               req->error = rc;
+       req->found++;
+       return (0);
+}
+
+/*
+ * Set the I/O priority (p_ionice) of process chgp to n on behalf of the
+ * current process.
+ *
+ * n is clamped to [IOPRIO_MIN, IOPRIO_MAX] before it is used.
+ *
+ * Returns 0 on success; EPERM when the caller's cr_uid and cr_ruid are
+ * both non-zero and neither equals the target's cr_uid; EACCES when the
+ * clamped value is lower than the target's current p_ionice and the
+ * PRIV_SCHED_SETPRIORITY check fails.
+ *
+ * NOTE(review): privilege is required only to *lower* the value.  The fq
+ * balancer weights each level by (FQ_PRIO_BIAS + level), so larger values
+ * appear to receive more disk budget -- confirm that allowing unprivileged
+ * callers to raise the value is intended.
+ *
+ * Defined static to match the forward declaration near the top of the
+ * file.
+ */
+static int
+doionice(struct proc *chgp, int n)
+{
+       struct proc *curp = curproc;
+       struct ucred *cr = curp->p_ucred;
+
+       if (cr->cr_uid && cr->cr_ruid &&
+           cr->cr_uid != chgp->p_ucred->cr_uid &&
+           cr->cr_ruid != chgp->p_ucred->cr_uid)
+               return (EPERM);
+       if (n > IOPRIO_MAX)
+               n = IOPRIO_MAX;
+       if (n < IOPRIO_MIN)
+               n = IOPRIO_MIN;
+       if (n < chgp->p_ionice && priv_check_cred(cr, PRIV_SCHED_SETPRIORITY, 0))
+               return (EACCES);
+       chgp->p_ionice = n;
+
+       return (0);
+}
+
 /*
  * MPALMOSTSAFE
  */
index edd1507..8eabc98 100644 (file)
@@ -423,8 +423,6 @@ lwkt_free_thread(thread_t td)
     KASSERT((td->td_flags & TDF_RUNNING) == 0,
        ("lwkt_free_thread: did not exit! %p", td));
 
-    dsched_exit_thread(td);
-
     if (td->td_flags & TDF_ALLOCATED_THREAD) {
        objcache_put(thread_cache, td);
     } else if (td->td_flags & TDF_ALLOCATED_STACK) {
@@ -1490,6 +1488,7 @@ lwkt_remove_tdallq(thread_t td)
 {
     KKASSERT(td->td_gd == mycpu);
     TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
+    dsched_exit_thread(td);
 }
 
 void
index ea59974..48bfcaa 100644 (file)
@@ -67,6 +67,7 @@ static biodone_t      default_completed;
 dsched_new_buf_t       *default_new_buf;
 dsched_new_proc_t      *default_new_proc;
 dsched_new_thread_t    *default_new_thread;
+dsched_exit_buf_t      *default_exit_buf;
 dsched_exit_proc_t     *default_exit_proc;
 dsched_exit_thread_t   *default_exit_thread;
 
@@ -90,7 +91,7 @@ static struct dsched_policy_head dsched_policy_list =
 
 static struct dsched_ops dsched_default_ops = {
        .head = {
-               .name = "default"
+               .name = "noop"
        },
        .prepare = default_prepare,
        .teardown = default_teardown,
@@ -243,6 +244,7 @@ dsched_register(struct dsched_ops *d_ops)
                        default_new_buf = d_ops->new_buf;
                        default_new_proc = d_ops->new_proc;
                        default_new_thread = d_ops->new_thread;
+                       default_exit_buf = d_ops->exit_buf;
                        default_exit_proc = d_ops->exit_proc;
                        default_exit_thread = d_ops->exit_thread;
                }
@@ -491,6 +493,12 @@ dsched_new_buf(struct buf *bp)
                default_new_buf(bp);
 }
 
+/*
+ * Forward a buf teardown to the exit_buf hook stored in default_exit_buf
+ * (assigned in dsched_register()); does nothing when no hook is set.
+ */
+void
+dsched_exit_buf(struct buf *bp)
+{
+       if (default_exit_buf == NULL)
+               return;
+
+       default_exit_buf(bp);
+}
 
 void
 dsched_new_proc(struct proc *p)
index bf958e9..a2bbb29 100644 (file)
@@ -527,4 +527,6 @@ const char *syscallnames[] = {
        "mq_receive",                   /* 517 = mq_receive */
        "mq_timedsend",                 /* 518 = mq_timedsend */
        "mq_timedreceive",                      /* 519 = mq_timedreceive */
+       "ioprio_set",                   /* 520 = ioprio_set */
+       "ioprio_get",                   /* 521 = ioprio_get */
 };
index ed70fb6..2cb5e02 100644 (file)
 519    STD     POSIX   { ssize_t mq_timedreceive(mqd_t mqdes, \
                                  char *msg_ptr, size_t msg_len, unsigned *msg_prio, \
                                  const struct timespec *abs_timeout); }
+520    STD     BSD     { int ioprio_set(int which, int who, int prio); }
+521    STD     BSD     { int ioprio_get(int which, int who); }
index ddeaf0f..20969f9 100644 (file)
@@ -1324,7 +1324,7 @@ brelse(struct buf *bp)
         * or B_RELBUF flags.
         */
        bp->b_cmd = BUF_CMD_DONE;
-       dsched_clr_buf_priv(bp);
+       dsched_exit_buf(bp);
 
        /*
         * VMIO buffer rundown.  Make sure the VM page array is restored
@@ -1657,7 +1657,7 @@ bqrelse(struct buf *bp)
         * buffer is actively locked.
         */
        bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF);
-       dsched_clr_buf_priv(bp);
+       dsched_exit_buf(bp);
        BUF_UNLOCK(bp);
 }
 
index 56c55c7..2446391 100644 (file)
@@ -87,6 +87,7 @@ typedef int   dsched_queue_t(struct disk *dp, struct bio *bio);
 typedef        void    dsched_new_buf_t(struct buf *bp);
 typedef        void    dsched_new_proc_t(struct proc *p);
 typedef        void    dsched_new_thread_t(struct thread *td);
+typedef        void    dsched_exit_buf_t(struct buf *bp);
 typedef        void    dsched_exit_proc_t(struct proc *p);
 typedef        void    dsched_exit_thread_t(struct thread *td);
 
@@ -106,6 +107,7 @@ struct dsched_ops {
        dsched_new_buf_t        *new_buf;
        dsched_new_proc_t       *new_proc;
        dsched_new_thread_t     *new_thread;
+       dsched_exit_buf_t       *exit_buf;
        dsched_exit_proc_t      *exit_proc;
        dsched_exit_thread_t    *exit_thread;
 };
@@ -147,7 +149,7 @@ int dsched_debug(int level, char *fmt, ...);
 dsched_new_buf_t       dsched_new_buf;
 dsched_new_proc_t      dsched_new_proc;
 dsched_new_thread_t    dsched_new_thread;
-
+dsched_exit_buf_t      dsched_exit_buf;
 dsched_exit_proc_t     dsched_exit_proc;
 dsched_exit_thread_t   dsched_exit_thread;
 
index b4d8046..8d50470 100644 (file)
@@ -299,6 +299,7 @@ struct      proc {
        struct pargs    *p_args;
        u_short         p_xstat;        /* Exit status or last stop signal */
 
+       int             p_ionice;
        void            *p_dsched_priv2;
 /* End area that is copied on creation. */
 #define        p_endcopy       p_dsched_priv2
index 22fe842..f1bc46c 100644 (file)
@@ -48,6 +48,8 @@
  */
 #define        PRIO_MIN        -20
 #define        PRIO_MAX        20
+#define IOPRIO_MIN     1
+#define IOPRIO_MAX     10
 
 #define        PRIO_PROCESS    0
 #define        PRIO_PGRP       1
@@ -150,9 +152,11 @@ int        dosetrlimit (u_int which, struct rlimit *limp);
 
 __BEGIN_DECLS
 int    getpriority (int, int);
+int    ioprio_get (int, int);
 int    getrlimit (int, struct rlimit *);
 int    getrusage (int, struct rusage *);
 int    setpriority (int, int, int);
+int    ioprio_set (int, int, int);
 int    setrlimit (int, const struct rlimit *);
 __END_DECLS
 
index d85c943..a0ec77d 100644 (file)
@@ -350,3 +350,5 @@ HIDE_POSIX(mq_send)
 HIDE_POSIX(mq_receive)
 HIDE_POSIX(mq_timedsend)
 HIDE_POSIX(mq_timedreceive)
+HIDE_BSD(ioprio_set)
+HIDE_BSD(ioprio_get)
index dba8aea..ce21144 100644 (file)
 #define        SYS_mq_receive  517
 #define        SYS_mq_timedsend        518
 #define        SYS_mq_timedreceive     519
-#define        SYS_MAXSYSCALL  520
+#define        SYS_ioprio_set  520
+#define        SYS_ioprio_get  521
+#define        SYS_MAXSYSCALL  522
index 45bb7cd..a06739b 100644 (file)
@@ -298,4 +298,6 @@ MIASM =  \
        mq_send.o \
        mq_receive.o \
        mq_timedsend.o \
-       mq_timedreceive.o
+       mq_timedreceive.o \
+       ioprio_set.o \
+       ioprio_get.o
index b4df93c..09b4d14 100644 (file)
@@ -2276,6 +2276,21 @@ struct   mq_timedreceive_args {
        unsigned *      msg_prio;       char msg_prio_[PAD_(unsigned *)];
        const struct timespec * abs_timeout;    char abs_timeout_[PAD_(const struct timespec *)];
 };
+struct ioprio_set_args {
+#ifdef _KERNEL
+       struct sysmsg sysmsg;
+#endif
+       int     which;  char which_[PAD_(int)];
+       int     who;    char who_[PAD_(int)];
+       int     prio;   char prio_[PAD_(int)];
+};
+struct ioprio_get_args {
+#ifdef _KERNEL
+       struct sysmsg sysmsg;
+#endif
+       int     which;  char which_[PAD_(int)];
+       int     who;    char who_[PAD_(int)];
+};
 
 #ifdef COMPAT_43
 
@@ -2882,6 +2897,8 @@ int       sys_mq_send (struct mq_send_args *);
 int    sys_mq_receive (struct mq_receive_args *);
 int    sys_mq_timedsend (struct mq_timedsend_args *);
 int    sys_mq_timedreceive (struct mq_timedreceive_args *);
+int    sys_ioprio_set (struct ioprio_set_args *);
+int    sys_ioprio_get (struct ioprio_get_args *);
 
 #endif /* !_SYS_SYSPROTO_H_ */
 #undef PAD_
index a0df514..af303c1 100644 (file)
@@ -406,4 +406,6 @@ union sysunion {
        struct  mq_receive_args mq_receive;
        struct  mq_timedsend_args mq_timedsend;
        struct  mq_timedreceive_args mq_timedreceive;
+       struct  ioprio_set_args ioprio_set;
+       struct  ioprio_get_args ioprio_get;
 };
index a323a83..889b848 100644 (file)
@@ -356,7 +356,7 @@ relpbuf(struct buf *bp, int *pfreecnt)
        int wake_freecnt = 0;
 
        KKASSERT(bp->b_flags & B_PAGING);
-       dsched_clr_buf_priv(bp);
+       dsched_exit_buf(bp);
 
        spin_lock_wr(&bswspin);
 
index f81ad58..a395a14 100644 (file)
@@ -26,23 +26,34 @@ int main(void)
                "FQP:\t%d\n"
                "DPRIV:\t%d\n"
                "---------------------------------------------\n"
+               "Procs/Threads tracked\n"
+               "procs:\t\t%d\n"
+               "threads:\t%d\n"
+               "---------------------------------------------\n"
                "Proccesses\n"
                "Rate limited:\t%d\n"
                "---------------------------------------------\n"
                "Transactions\n"
                "Issued:\t\t%d\n"
                "Completed:\t%d\n"
-               "without FQMP:\t%d\n",
+               "without FQMP:\t%d\n"
+               "---------------------------------------------\n"
+               "Misc\n"
+               "FQMP refs for buf:\t%d\n",
 
                fq_stats.fqmp_allocations,
                fq_stats.fqp_allocations,
                fq_stats.dpriv_allocations,
 
+               fq_stats.nprocs,
+               fq_stats.nthreads,
+
                fq_stats.procs_limited,
 
                fq_stats.transactions,
                fq_stats.transactions_completed,
-               fq_stats.no_fqmp
+               fq_stats.no_fqmp,
+               fq_stats.nbufs
                );
 
 
index 6e0f131..c6be0db 100644 (file)
@@ -74,6 +74,7 @@ SUBDIR=       alias \
        iconv \
        id \
        indent \
+       ionice \
        ipcrm \
        ipcs \
        join \
diff --git a/usr.bin/ionice/Makefile b/usr.bin/ionice/Makefile
new file mode 100644 (file)
index 0000000..e7b8322
--- /dev/null
@@ -0,0 +1,7 @@
+#      @(#)Makefile    8.1 (Berkeley) 6/6/93
+# $DragonFly: src/usr.bin/nice/Makefile,v 1.3 2007/08/27 16:50:57 pavalos Exp $
+
+PROG=  ionice
+NOMAN=
+
+.include <bsd.prog.mk>
diff --git a/usr.bin/ionice/ionice.c b/usr.bin/ionice/ionice.c
new file mode 100644 (file)
index 0000000..838ae6d
--- /dev/null
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 1989, 1993, 1994
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *     This product includes software developed by the University of
+ *     California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#) Copyright (c) 1989, 1993, 1994 The Regents of the University of California.  All rights reserved.
+ * @(#)nice.c  8.2 (Berkeley) 4/16/94
+ * $FreeBSD: src/usr.bin/nice/nice.c,v 1.4.2.1 2002/06/18 08:40:28 tjr Exp $
+ * $DragonFly: src/usr.bin/nice/nice.c,v 1.6 2005/10/09 15:09:02 liamfoy Exp $
+ */
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define        DEFNICE 4
+
+static void    usage(void);
+
+/* Execute a utility with the I/O priority adjusted by -n incr (default 4). */
+int
+main(int argc, char **argv)
+{
+       long niceness = DEFNICE;
+       int ch;
+       char *ep;
+
+       /* Obsolescent syntax: -number, --number; cast keeps isdigit() defined */
+       if (argc >= 2 && argv[1][0] == '-' && (argv[1][1] == '-' ||
+           isdigit((unsigned char)argv[1][1])) && strcmp(argv[1], "--") != 0)
+               if (asprintf(&argv[1], "-n%s", argv[1] + 1) < 0)
+                       err(1, "asprintf");
+       /* -n takes a base-10 increment that must lie within the range of int. */
+       while ((ch = getopt(argc, argv, "n:")) != -1) {
+               switch (ch) {
+               case 'n':
+                       errno = 0;
+                       niceness = strtol(optarg, &ep, 10);
+                       if (ep == optarg || *ep != '\0' || errno ||
+                           niceness < INT_MIN || niceness > INT_MAX)
+                               errx(1, "%s: invalid ionice value", optarg);
+                       break;
+               default:
+                       usage();
+               }
+       }
+       argc -= optind;
+       argv += optind;
+       if (argc == 0)
+               usage();
+       /* Failure of ioprio_get() is detected through errno, not its result. */
+       errno = 0;
+       niceness += ioprio_get(PRIO_PROCESS, 0);
+       if (errno)
+               warn("ioprio_get");
+       else if (ioprio_set(PRIO_PROCESS, 0, (int)niceness))
+               warn("ioprio_set");
+       execvp(*argv, argv);
+       err(errno == ENOENT || errno == ENOTDIR ? 127 : 126, "%s", *argv);
+}
+
+static void
+usage(void)
+{
+       /* Write the synopsis to stderr, then terminate with status 1. */
+       fputs("usage: ionice [-n incr] utility [arguments]\n", stderr);
+       exit(1);
+}