From aa166ad14bc0671c727d08eef84faaf3789a4b30 Mon Sep 17 00:00:00 2001 From: Alex Hornung Date: Tue, 30 Mar 2010 02:18:32 +0000 Subject: [PATCH] dsched - Implement priorities and other improvements * Implemented fq priority support. This also includes the userland tool ionice. * Implement an exit_buffer hook to be able to clean up bufs before they are reused. * Some minor performance improvements in fq. --- sys/dsched/fq/dsched_fq.h | 18 ++- sys/dsched/fq/dsched_fq_core.c | 137 +++++++++++++++----- sys/dsched/fq/dsched_fq_diskops.c | 26 +++- sys/dsched/fq/dsched_fq_procops.c | 33 ++++- sys/kern/init_sysent.c | 2 + sys/kern/kern_resource.c | 208 ++++++++++++++++++++++++++++++ sys/kern/lwkt_thread.c | 3 +- sys/kern/subr_dsched.c | 10 +- sys/kern/syscalls.c | 2 + sys/kern/syscalls.master | 2 + sys/kern/vfs_bio.c | 4 +- sys/sys/dsched.h | 4 +- sys/sys/proc.h | 1 + sys/sys/resource.h | 4 + sys/sys/syscall-hide.h | 2 + sys/sys/syscall.h | 4 +- sys/sys/syscall.mk | 4 +- sys/sys/sysproto.h | 17 +++ sys/sys/sysunion.h | 2 + sys/vm/vm_pager.c | 2 +- test/dsched_fq/fqstats.c | 15 ++- usr.bin/Makefile | 1 + usr.bin/ionice/Makefile | 7 + usr.bin/ionice/ionice.c | 103 +++++++++++++++ 24 files changed, 562 insertions(+), 49 deletions(-) create mode 100644 usr.bin/ionice/Makefile create mode 100644 usr.bin/ionice/ionice.c diff --git a/sys/dsched/fq/dsched_fq.h b/sys/dsched/fq/dsched_fq.h index c7ad7288d5..5d56fb46ae 100644 --- a/sys/dsched/fq/dsched_fq.h +++ b/sys/dsched/fq/dsched_fq.h @@ -109,6 +109,7 @@ struct dsched_fq_priv { struct disk *dp; struct dsched_fq_dpriv *dpriv; struct dsched_fq_mpriv *fqmp; + struct proc *p; int32_t qlength; int32_t flags; @@ -117,6 +118,7 @@ struct dsched_fq_priv { int32_t transactions; int32_t avg_latency; int32_t max_tp; + int32_t issued; }; struct dsched_fq_dpriv { @@ -127,6 +129,7 @@ struct dsched_fq_dpriv { int avg_rq_time; /* XXX: unused */ int32_t incomplete_tp; + int64_t max_budget; /* list contains all fq_priv for this disk */ TAILQ_HEAD(, 
dsched_fq_priv) fq_priv_list; @@ -134,6 +137,9 @@ struct dsched_fq_dpriv { }; struct dsched_fq_mpriv { + struct proc *p; + struct thread *td; + int dead; struct spinlock lock; int refcount; TAILQ_HEAD(, dsched_fq_priv) fq_priv_list; @@ -156,14 +162,17 @@ struct dsched_fq_bucket { }; - +#define FQ_PRIO_BIAS 5 +#define FQ_PRIO_MAX 10 +#define FQ_PRIO_MIN 1 +#define FQ_PRIO_IDLE -1 #define FQ_BUCKET_ACTIVE 0x01 struct dsched_fq_priv *fq_alloc_priv(struct disk *dp); struct dsched_fq_dpriv *fq_alloc_dpriv(struct disk *dp); -struct dsched_fq_mpriv *fq_alloc_mpriv(void); +struct dsched_fq_mpriv *fq_alloc_mpriv(struct proc *p); void fq_balance_thread(struct dsched_fq_dpriv *dpriv); void fq_dispatcher(struct dsched_fq_dpriv *dpriv); biodone_t fq_completed; @@ -190,6 +199,11 @@ struct dsched_fq_stats { int32_t cancelled; int32_t no_fqmp; + + int32_t nthreads; + int32_t nprocs; + + int32_t nbufs; }; #endif /* _DSCHED_FQ_H_ */ diff --git a/sys/dsched/fq/dsched_fq_core.c b/sys/dsched/fq/dsched_fq_core.c index 5b7e5e996d..d7d5a8bf75 100644 --- a/sys/dsched/fq/dsched_fq_core.c +++ b/sys/dsched/fq/dsched_fq_core.c @@ -296,7 +296,7 @@ fq_alloc_dpriv(struct disk *dp) struct dsched_fq_mpriv * -fq_alloc_mpriv() +fq_alloc_mpriv(struct proc *p) { struct dsched_fq_mpriv *fqmp; struct dsched_fq_priv *fqp; @@ -314,7 +314,7 @@ fq_alloc_mpriv() while ((dp = dsched_disk_enumerate(dp, &dsched_fq_ops))) { fqp = fq_alloc_priv(dp); - + fqp->p = p; #if 0 fq_reference_priv(fqp); #endif @@ -336,10 +336,28 @@ fq_alloc_mpriv() void fq_dispatcher(struct dsched_fq_dpriv *dpriv) { + struct dsched_fq_mpriv *fqmp; struct dsched_fq_priv *fqp, *fqp2; struct bio *bio, *bio2; int count; + /* + * We need to manually assign an fqp to the fqmp of this thread + * since it isn't assigned one during fq_prepare, as the disk + * is not set up yet. 
+ */ + fqmp = dsched_get_thread_priv(curthread); + /* If fqmp is NULL, something went seriously wrong */ + KKASSERT(fqmp != NULL); + fqp = fq_alloc_priv(dpriv->dp); + FQ_FQMP_LOCK(fqmp); +#if 0 + fq_reference_priv(fqp); +#endif + TAILQ_INSERT_TAIL(&fqmp->fq_priv_list, fqp, link); + FQ_FQMP_UNLOCK(fqmp); + + FQ_DPRIV_LOCK(dpriv); for(;;) { /* sleep ~60 ms */ @@ -356,8 +374,7 @@ fq_dispatcher(struct dsched_fq_dpriv *dpriv) TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) { if ((fqp->max_tp > 0) && - ((count >= fqp->max_tp) || - (fqp->transactions >= fqp->max_tp))) + ((fqp->issued >= fqp->max_tp))) break; TAILQ_REMOVE(&fqp->queue, bio, link); @@ -369,9 +386,9 @@ fq_dispatcher(struct dsched_fq_dpriv *dpriv) */ dsched_strategy_async(dpriv->dp, bio, fq_completed, fqp); + atomic_add_int(&fqp->issued, 1); atomic_add_int(&dpriv->incomplete_tp, 1); atomic_add_int(&fq_stats.transactions, 1); - ++count; } FQ_FQP_UNLOCK(fqp); } @@ -379,26 +396,52 @@ fq_dispatcher(struct dsched_fq_dpriv *dpriv) } } - void fq_balance_thread(struct dsched_fq_dpriv *dpriv) { struct dsched_fq_priv *fqp, *fqp2; int n = 0; static int last_full = 0, prev_full = 0; + static int limited_procs = 0; int incomplete_tp; - int64_t total_budget, use_pct, avail_pct; - total_budget = 0; + int64_t budget, total_budget, used_budget; + int64_t budgetpb[FQ_PRIO_MAX+1]; + int sum, i; + bzero(budgetpb, sizeof(budgetpb)); + total_budget = 0; + FQ_DPRIV_LOCK(dpriv); incomplete_tp = dpriv->incomplete_tp; TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) { if (fqp->transactions > 0 /* 30 */) { + total_budget += (fqp->avg_latency * fqp->transactions); + /* + * XXX: while the code below really sucked, the problem needs to + * be addressed eventually. Some processes take up their "fair" + * slice, but don't really need even a 10th of it. + * This kills performance for those that do need the + * performance. + */ +#if 0 + /* + * This is *very* hackish. 
It basically tries to avoid that + * processes that do only very few tps take away more bandwidth + * than they should. + */ + if ((limited_procs >= 1) && (fqp->transactions < 25) && + (budgetpb[(fqp->p) ? fqp->p->p_ionice : 0] >= 1)) + continue; +#endif + + ++budgetpb[(fqp->p) ? fqp->p->p_ionice : 0]; + dsched_debug(LOG_INFO, - "%d) avg_latency = %d, transactions = %d\n", - n, fqp->avg_latency, fqp->transactions); + "%d) avg_latency = %d, transactions = %d, ioprio = %d\n", + n, fqp->avg_latency, fqp->transactions, + (fqp->p) ? fqp->p->p_ionice : 0); ++n; } else { fqp->max_tp = 0; @@ -411,49 +454,81 @@ fq_balance_thread(struct dsched_fq_dpriv *dpriv) "incomplete tp = %d\n", n, total_budget, incomplete_tp); if (n == 0) - goto done; + goto done; + + sum = 0; + for (i = 0; i < FQ_PRIO_MAX+1; i++) { + if (budgetpb[i] == 0) + continue; + sum += (FQ_PRIO_BIAS+i)*budgetpb[i]; + } + if (sum == 0) + sum = 1; + dsched_debug(LOG_INFO, "sum = %d\n", sum); -#if 0 + for (i = 0; i < FQ_PRIO_MAX+1; i++) { + if (budgetpb[i] == 0) + continue; + + budgetpb[i] = ((FQ_PRIO_BIAS+i)*10)*total_budget/sum; + } + + if (total_budget > dpriv->max_budget) + dpriv->max_budget = total_budget; + + limited_procs = 0; /* - * XXX: hack. don't know why total_budget can be zero here - * -> this doesn't apply anymore. total_budget is never 0 now + * XXX: eventually remove all the silly *10... */ - if (total_budget == 0) - total_budget = 1; -#endif - TAILQ_FOREACH_MUTABLE(fqp, &dpriv->fq_priv_list, dlink, fqp2) { - /* XXX: proportional to scheduler class! */ - avail_pct = (int64_t)1000/(int64_t)n; - - /* XXX: 100/(sum of scheduler priorities) * scheduler priority */ - /* XXX: but need to process queues of fqp on buckets or so...*/ + budget = budgetpb[(fqp->p) ? 
fqp->p->p_ionice : 0]; - use_pct = ((int64_t)1000* (int64_t)fqp->avg_latency * - (int64_t)fqp->transactions)/(int64_t)total_budget; + used_budget = ((int64_t)10*(int64_t)fqp->avg_latency * + (int64_t)fqp->transactions); + if (used_budget > 0) { + dsched_debug(LOG_INFO, + "info: used_budget = %lld, budget = %lld\n", used_budget, + budget); + } - /* process is exceeding its fair share; rate-limit it */ - if ((use_pct > avail_pct) && (incomplete_tp > n*2)) { + /* + * process is exceeding its fair share; rate-limit it, but only + * if the disk is actually fully used. + */ + if ((used_budget > budget) && (incomplete_tp > n*2)) { /* kprintf("here we are, use_pct > avail_pct\n"); */ /* fqp->max_tp = avail_pct * fqp->avg_latency; */ - fqp->max_tp = total_budget/(n * fqp->avg_latency); + KKASSERT(fqp->avg_latency != 0); + + /* + * If the disk has not been fully used lately, augment the + * budget. + */ + if (total_budget*3 < dpriv->max_budget*2) { + budget *= 2; + budget /= 3; + } + + fqp->max_tp = budget/(10*fqp->avg_latency); + ++limited_procs; dsched_debug(LOG_INFO, "rate limited to %d transactions\n", fqp->max_tp); atomic_add_int(&fq_stats.procs_limited, 1); - } else if (((use_pct < avail_pct/2) || (incomplete_tp < n*2)) && + } else if (((used_budget*2 < budget) || (incomplete_tp < n*2)) && (!prev_full && !last_full)) { /* * process is really using little of its timeslice, or the * disk is not busy, so let's reset the rate-limit. * Without this, exceeding processes will get an unlimited * slice every other slice. - * XXX: this still doesn't quite fix the issue, but maybe, - * it's good that way so that heavy writes are interleaved. + * XXX: this still doesn't quite fix the issue, but maybe + * it's good that way, so that heavy writes are interleaved. 
*/ fqp->max_tp = 0; } fqp->transactions = 0; fqp->avg_latency = 0; + fqp->issued = 0; } prev_full = last_full; diff --git a/sys/dsched/fq/dsched_fq_diskops.c b/sys/dsched/fq/dsched_fq_diskops.c index 7382b88110..ce607b5fb5 100644 --- a/sys/dsched/fq/dsched_fq_diskops.c +++ b/sys/dsched/fq/dsched_fq_diskops.c @@ -72,6 +72,7 @@ static dsched_queue_t fq_queue; dsched_new_buf_t fq_new_buf; dsched_new_proc_t fq_new_proc; dsched_new_thread_t fq_new_thread; +dsched_exit_buf_t fq_exit_buf; dsched_exit_proc_t fq_exit_proc; dsched_exit_thread_t fq_exit_thread; @@ -93,6 +94,7 @@ struct dsched_ops dsched_fq_ops = { .new_buf = fq_new_buf, .new_proc = fq_new_proc, .new_thread = fq_new_thread, + .exit_buf = fq_exit_buf, .exit_proc = fq_exit_proc, .exit_thread = fq_exit_thread, }; @@ -246,14 +248,19 @@ fq_queue(struct disk *dp, struct bio *obio) } } FQ_FQMP_UNLOCK(fqmp); + dsched_clr_buf_priv(obio->bio_buf); fq_dereference_mpriv(fqmp); /* acquired on new_buf */ + atomic_subtract_int(&fq_stats.nbufs, 1); KKASSERT(found == 1); dpriv = dsched_get_disk_priv(dp); /* XXX: probably rather pointless doing this atomically */ max_tp = atomic_fetchadd_int(&fqp->max_tp, 0); +#if 0 transactions = atomic_fetchadd_int(&fqp->transactions, 0); +#endif + transactions = atomic_fetchadd_int(&fqp->issued, 0); /* | No rate limiting || Hasn't reached limit rate | */ if ((max_tp == 0) || (transactions < max_tp)) { @@ -268,7 +275,7 @@ fq_queue(struct disk *dp, struct bio *obio) count = 0; TAILQ_FOREACH_MUTABLE(bio, &fqp->queue, link, bio2) { - if ((fqp->max_tp > 0) && (count >= fqp->max_tp)) + if ((fqp->max_tp > 0) && (fqp->issued >= fqp->max_tp)) break; TAILQ_REMOVE(&fqp->queue, bio, link); --fqp->qlength; @@ -278,6 +285,7 @@ fq_queue(struct disk *dp, struct bio *obio) * queueing */ dsched_strategy_async(dp, bio, fq_completed, fqp); + atomic_add_int(&fqp->issued, 1); atomic_add_int(&dpriv->incomplete_tp, 1); atomic_add_int(&fq_stats.transactions, 1); } @@ -288,6 +296,7 @@ fq_queue(struct disk *dp, struct 
bio *obio) fq_reference_priv(fqp); dsched_strategy_async(dp, obio, fq_completed, fqp); + atomic_add_int(&fqp->issued, 1); atomic_add_int(&dpriv->incomplete_tp, 1); atomic_add_int(&fq_stats.transactions, 1); } else { @@ -300,7 +309,20 @@ fq_queue(struct disk *dp, struct bio *obio) */ FQ_FQP_LOCK(fqp); fq_reference_priv(fqp); - TAILQ_INSERT_TAIL(&fqp->queue, obio, link); + + /* + * Prioritize reads by inserting them at the front of the + * queue. + * + * XXX: this might cause issues with data that should + * have been written and is being read, but hasn't + * actually been written yet. + */ + if (obio->bio_buf->b_cmd == BUF_CMD_READ) + TAILQ_INSERT_HEAD(&fqp->queue, obio, link); + else + TAILQ_INSERT_TAIL(&fqp->queue, obio, link); + ++fqp->qlength; FQ_FQP_UNLOCK(fqp); } diff --git a/sys/dsched/fq/dsched_fq_procops.c b/sys/dsched/fq/dsched_fq_procops.c index 8fb83d32d2..840f2de385 100644 --- a/sys/dsched/fq/dsched_fq_procops.c +++ b/sys/dsched/fq/dsched_fq_procops.c @@ -64,9 +64,12 @@ MALLOC_DECLARE(M_DSCHEDFQ); dsched_new_buf_t fq_new_buf; dsched_new_proc_t fq_new_proc; dsched_new_thread_t fq_new_thread; +dsched_exit_buf_t fq_exit_buf; dsched_exit_proc_t fq_exit_proc; dsched_exit_thread_t fq_exit_thread; +extern struct dsched_fq_stats fq_stats; + void fq_new_buf(struct buf *bp) { @@ -89,9 +92,25 @@ fq_new_buf(struct buf *bp) KKASSERT(fqmp != NULL); #endif - if (fqmp) + if (fqmp) { + atomic_add_int(&fq_stats.nbufs, 1); fq_reference_mpriv(fqmp); + } dsched_set_buf_priv(bp, fqmp); + +} + +void +fq_exit_buf(struct buf *bp) +{ + struct dsched_fq_mpriv *fqmp; + + fqmp = dsched_get_buf_priv(bp); + if (fqmp != NULL) { + dsched_clr_buf_priv(bp); + fq_dereference_mpriv(fqmp); + atomic_subtract_int(&fq_stats.nbufs, 1); + } } void @@ -101,9 +120,11 @@ fq_new_proc(struct proc *p) KKASSERT(p != NULL); - fqmp = fq_alloc_mpriv(); + fqmp = fq_alloc_mpriv(p); fq_reference_mpriv(fqmp); dsched_set_proc_priv(p, fqmp); + atomic_add_int(&fq_stats.nprocs, 1); + fqmp->p = p; } void @@ -113,9 
+134,11 @@ fq_new_thread(struct thread *td) KKASSERT(td != NULL); - fqmp = fq_alloc_mpriv(); + fqmp = fq_alloc_mpriv(NULL); fq_reference_mpriv(fqmp); dsched_set_thread_priv(td, fqmp); + atomic_add_int(&fq_stats.nthreads, 1); + fqmp->td = td; } void @@ -130,9 +153,11 @@ fq_exit_proc(struct proc *p) #if 0 kprintf("exit_proc: fqmp = %p\n", fqmp); #endif + fqmp->dead = 0x1337; dsched_set_proc_priv(p, 0); fq_dereference_mpriv(fqmp); /* one for alloc, */ fq_dereference_mpriv(fqmp); /* one for ref */ + atomic_subtract_int(&fq_stats.nprocs, 1); } void @@ -147,7 +172,9 @@ fq_exit_thread(struct thread *td) #if 0 kprintf("exit_thread: fqmp = %p\n", fqmp); #endif + fqmp->dead = 0x1337; dsched_set_thread_priv(td, 0); fq_dereference_mpriv(fqmp); /* one for alloc, */ fq_dereference_mpriv(fqmp); /* one for ref */ + atomic_subtract_int(&fq_stats.nthreads, 1); } diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index a205b575a3..089c558d79 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -553,4 +553,6 @@ struct sysent sysent[] = { { AS(mq_receive_args), (sy_call_t *)sys_mq_receive }, /* 517 = mq_receive */ { AS(mq_timedsend_args), (sy_call_t *)sys_mq_timedsend }, /* 518 = mq_timedsend */ { AS(mq_timedreceive_args), (sy_call_t *)sys_mq_timedreceive }, /* 519 = mq_timedreceive */ + { AS(ioprio_set_args), (sy_call_t *)sys_ioprio_set }, /* 520 = ioprio_set */ + { AS(ioprio_get_args), (sy_call_t *)sys_ioprio_get }, /* 521 = ioprio_get */ }; diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index 1c923b013f..10914be682 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -66,6 +66,7 @@ #include static int donice (struct proc *chgp, int n); +static int doionice (struct proc *chgp, int n); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) @@ -286,6 +287,213 @@ donice(struct proc *chgp, int n) return (0); } + +struct ioprio_get_info { + int high; + int who; +}; + 
+static int ioprio_get_callback(struct proc *p, void *data); + +/* + * MPALMOSTSAFE + */ +int +sys_ioprio_get(struct ioprio_get_args *uap) +{ + struct ioprio_get_info info; + struct proc *curp = curproc; + struct proc *p; + int high = IOPRIO_MIN-2; + int error; + + get_mplock(); + + switch (uap->which) { + case PRIO_PROCESS: + if (uap->who == 0) + p = curp; + else + p = pfind(uap->who); + if (p == 0) + break; + if (!PRISON_CHECK(curp->p_ucred, p->p_ucred)) + break; + high = p->p_ionice; + break; + + case PRIO_PGRP: + { + struct pgrp *pg; + + if (uap->who == 0) + pg = curp->p_pgrp; + else if ((pg = pgfind(uap->who)) == NULL) + break; + LIST_FOREACH(p, &pg->pg_members, p_pglist) { + if ((PRISON_CHECK(curp->p_ucred, p->p_ucred) && p->p_ionice > high)) + high = p->p_ionice; + } + break; + } + case PRIO_USER: + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; + info.high = high; + info.who = uap->who; + allproc_scan(ioprio_get_callback, &info); + high = info.high; + break; + + default: + error = EINVAL; + goto done; + } + if (high == IOPRIO_MIN-2) { + error = ESRCH; + goto done; + } + uap->sysmsg_result = high; + error = 0; +done: + rel_mplock(); + return (error); +} + +/* + * Figure out the current highest I/O priority for processes owned + * by the specified user.
+ */ +static +int +ioprio_get_callback(struct proc *p, void *data) +{ + struct ioprio_get_info *info = data; + + if (PRISON_CHECK(curproc->p_ucred, p->p_ucred) && + p->p_ucred->cr_uid == info->who && + p->p_ionice > info->high) { + info->high = p->p_ionice; + } + return(0); +} + + +struct ioprio_set_info { + int prio; + int who; + int error; + int found; +}; + +static int ioprio_set_callback(struct proc *p, void *data); + +/* + * MPALMOSTSAFE + */ +int +sys_ioprio_set(struct ioprio_set_args *uap) +{ + struct ioprio_set_info info; + struct proc *curp = curproc; + struct proc *p; + int found = 0, error = 0; + + get_mplock(); + + switch (uap->which) { + case PRIO_PROCESS: + if (uap->who == 0) + p = curp; + else + p = pfind(uap->who); + if (p == 0) + break; + if (!PRISON_CHECK(curp->p_ucred, p->p_ucred)) + break; + error = doionice(p, uap->prio); + found++; + break; + + case PRIO_PGRP: + { + struct pgrp *pg; + + if (uap->who == 0) + pg = curp->p_pgrp; + else if ((pg = pgfind(uap->who)) == NULL) + break; + LIST_FOREACH(p, &pg->pg_members, p_pglist) { + if (PRISON_CHECK(curp->p_ucred, p->p_ucred)) { + error = doionice(p, uap->prio); + found++; + } + } + break; + } + case PRIO_USER: + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; + info.prio = uap->prio; + info.who = uap->who; + info.error = 0; + info.found = 0; + allproc_scan(ioprio_set_callback, &info); + error = info.error; + found = info.found; + break; + + default: + error = EINVAL; + found = 1; + break; + } + + rel_mplock(); + if (found == 0) + error = ESRCH; + return (error); +} + +static +int +ioprio_set_callback(struct proc *p, void *data) +{ + struct ioprio_set_info *info = data; + int error; + + if (p->p_ucred->cr_uid == info->who && + PRISON_CHECK(curproc->p_ucred, p->p_ucred)) { + error = doionice(p, info->prio); + if (error) + info->error = error; + ++info->found; + } + return(0); +} + +int +doionice(struct proc *chgp, int n) +{ + struct proc *curp = curproc; + struct ucred *cr = curp->p_ucred; + + 
if (cr->cr_uid && cr->cr_ruid && + cr->cr_uid != chgp->p_ucred->cr_uid && + cr->cr_ruid != chgp->p_ucred->cr_uid) + return (EPERM); + if (n > IOPRIO_MAX) + n = IOPRIO_MAX; + if (n < IOPRIO_MIN) + n = IOPRIO_MIN; + if (n < chgp->p_ionice && priv_check_cred(cr, PRIV_SCHED_SETPRIORITY, 0)) + return (EACCES); + chgp->p_ionice = n; + + return (0); + +} + /* * MPALMOSTSAFE */ diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index edd1507740..8eabc98d05 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -423,8 +423,6 @@ lwkt_free_thread(thread_t td) KASSERT((td->td_flags & TDF_RUNNING) == 0, ("lwkt_free_thread: did not exit! %p", td)); - dsched_exit_thread(td); - if (td->td_flags & TDF_ALLOCATED_THREAD) { objcache_put(thread_cache, td); } else if (td->td_flags & TDF_ALLOCATED_STACK) { @@ -1490,6 +1488,7 @@ lwkt_remove_tdallq(thread_t td) { KKASSERT(td->td_gd == mycpu); TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq); + dsched_exit_thread(td); } void diff --git a/sys/kern/subr_dsched.c b/sys/kern/subr_dsched.c index ea59974648..48bfcaa041 100644 --- a/sys/kern/subr_dsched.c +++ b/sys/kern/subr_dsched.c @@ -67,6 +67,7 @@ static biodone_t default_completed; dsched_new_buf_t *default_new_buf; dsched_new_proc_t *default_new_proc; dsched_new_thread_t *default_new_thread; +dsched_exit_buf_t *default_exit_buf; dsched_exit_proc_t *default_exit_proc; dsched_exit_thread_t *default_exit_thread; @@ -90,7 +91,7 @@ static struct dsched_policy_head dsched_policy_list = static struct dsched_ops dsched_default_ops = { .head = { - .name = "default" + .name = "noop" }, .prepare = default_prepare, .teardown = default_teardown, @@ -243,6 +244,7 @@ dsched_register(struct dsched_ops *d_ops) default_new_buf = d_ops->new_buf; default_new_proc = d_ops->new_proc; default_new_thread = d_ops->new_thread; + default_exit_buf = d_ops->exit_buf; default_exit_proc = d_ops->exit_proc; default_exit_thread = d_ops->exit_thread; } @@ -491,6 +493,12 @@ dsched_new_buf(struct buf 
*bp) default_new_buf(bp); } +void +dsched_exit_buf(struct buf *bp) +{ + if (default_exit_buf != NULL) + default_exit_buf(bp); +} void dsched_new_proc(struct proc *p) diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index bf958e925c..a2bbb29df6 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -527,4 +527,6 @@ const char *syscallnames[] = { "mq_receive", /* 517 = mq_receive */ "mq_timedsend", /* 518 = mq_timedsend */ "mq_timedreceive", /* 519 = mq_timedreceive */ + "ioprio_set", /* 520 = ioprio_set */ + "ioprio_get", /* 521 = ioprio_get */ }; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index ed70fb67b9..2cb5e02edb 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -728,3 +728,5 @@ 519 STD POSIX { ssize_t mq_timedreceive(mqd_t mqdes, \ char *msg_ptr, size_t msg_len, unsigned *msg_prio, \ const struct timespec *abs_timeout); } +520 STD BSD { int ioprio_set(int which, int who, int prio); } +521 STD BSD { int ioprio_get(int which, int who); } diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index ddeaf0f294..20969f90ed 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1324,7 +1324,7 @@ brelse(struct buf *bp) * or B_RELBUF flags. */ bp->b_cmd = BUF_CMD_DONE; - dsched_clr_buf_priv(bp); + dsched_exit_buf(bp); /* * VMIO buffer rundown. Make sure the VM page array is restored @@ -1657,7 +1657,7 @@ bqrelse(struct buf *bp) * buffer is actively locked. 
*/ bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF); - dsched_clr_buf_priv(bp); + dsched_exit_buf(bp); BUF_UNLOCK(bp); } diff --git a/sys/sys/dsched.h b/sys/sys/dsched.h index 56c55c7253..2446391e06 100644 --- a/sys/sys/dsched.h +++ b/sys/sys/dsched.h @@ -87,6 +87,7 @@ typedef int dsched_queue_t(struct disk *dp, struct bio *bio); typedef void dsched_new_buf_t(struct buf *bp); typedef void dsched_new_proc_t(struct proc *p); typedef void dsched_new_thread_t(struct thread *td); +typedef void dsched_exit_buf_t(struct buf *bp); typedef void dsched_exit_proc_t(struct proc *p); typedef void dsched_exit_thread_t(struct thread *td); @@ -106,6 +107,7 @@ struct dsched_ops { dsched_new_buf_t *new_buf; dsched_new_proc_t *new_proc; dsched_new_thread_t *new_thread; + dsched_exit_buf_t *exit_buf; dsched_exit_proc_t *exit_proc; dsched_exit_thread_t *exit_thread; }; @@ -147,7 +149,7 @@ int dsched_debug(int level, char *fmt, ...); dsched_new_buf_t dsched_new_buf; dsched_new_proc_t dsched_new_proc; dsched_new_thread_t dsched_new_thread; - +dsched_exit_buf_t dsched_exit_buf; dsched_exit_proc_t dsched_exit_proc; dsched_exit_thread_t dsched_exit_thread; diff --git a/sys/sys/proc.h b/sys/sys/proc.h index b4d8046695..8d50470065 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -299,6 +299,7 @@ struct proc { struct pargs *p_args; u_short p_xstat; /* Exit status or last stop signal */ + int p_ionice; void *p_dsched_priv2; /* End area that is copied on creation. 
*/ #define p_endcopy p_dsched_priv2 diff --git a/sys/sys/resource.h b/sys/sys/resource.h index 22fe84273c..f1bc46c2a7 100644 --- a/sys/sys/resource.h +++ b/sys/sys/resource.h @@ -48,6 +48,8 @@ */ #define PRIO_MIN -20 #define PRIO_MAX 20 +#define IOPRIO_MIN 1 +#define IOPRIO_MAX 10 #define PRIO_PROCESS 0 #define PRIO_PGRP 1 @@ -150,9 +152,11 @@ int dosetrlimit (u_int which, struct rlimit *limp); __BEGIN_DECLS int getpriority (int, int); +int ioprio_get (int, int); int getrlimit (int, struct rlimit *); int getrusage (int, struct rusage *); int setpriority (int, int, int); +int ioprio_set (int, int, int); int setrlimit (int, const struct rlimit *); __END_DECLS diff --git a/sys/sys/syscall-hide.h b/sys/sys/syscall-hide.h index d85c943bf7..a0ec77dbdd 100644 --- a/sys/sys/syscall-hide.h +++ b/sys/sys/syscall-hide.h @@ -350,3 +350,5 @@ HIDE_POSIX(mq_send) HIDE_POSIX(mq_receive) HIDE_POSIX(mq_timedsend) HIDE_POSIX(mq_timedreceive) +HIDE_BSD(ioprio_set) +HIDE_BSD(ioprio_get) diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h index dba8aeaf18..ce21144f4d 100644 --- a/sys/sys/syscall.h +++ b/sys/sys/syscall.h @@ -359,4 +359,6 @@ #define SYS_mq_receive 517 #define SYS_mq_timedsend 518 #define SYS_mq_timedreceive 519 -#define SYS_MAXSYSCALL 520 +#define SYS_ioprio_set 520 +#define SYS_ioprio_get 521 +#define SYS_MAXSYSCALL 522 diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk index 45bb7cd7bf..a06739b1d0 100644 --- a/sys/sys/syscall.mk +++ b/sys/sys/syscall.mk @@ -298,4 +298,6 @@ MIASM = \ mq_send.o \ mq_receive.o \ mq_timedsend.o \ - mq_timedreceive.o + mq_timedreceive.o \ + ioprio_set.o \ + ioprio_get.o diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h index b4df93cadf..09b4d14256 100644 --- a/sys/sys/sysproto.h +++ b/sys/sys/sysproto.h @@ -2276,6 +2276,21 @@ struct mq_timedreceive_args { unsigned * msg_prio; char msg_prio_[PAD_(unsigned *)]; const struct timespec * abs_timeout; char abs_timeout_[PAD_(const struct timespec *)]; }; +struct ioprio_set_args { +#ifdef 
_KERNEL + struct sysmsg sysmsg; +#endif + int which; char which_[PAD_(int)]; + int who; char who_[PAD_(int)]; + int prio; char prio_[PAD_(int)]; +}; +struct ioprio_get_args { +#ifdef _KERNEL + struct sysmsg sysmsg; +#endif + int which; char which_[PAD_(int)]; + int who; char who_[PAD_(int)]; +}; #ifdef COMPAT_43 @@ -2882,6 +2897,8 @@ int sys_mq_send (struct mq_send_args *); int sys_mq_receive (struct mq_receive_args *); int sys_mq_timedsend (struct mq_timedsend_args *); int sys_mq_timedreceive (struct mq_timedreceive_args *); +int sys_ioprio_set (struct ioprio_set_args *); +int sys_ioprio_get (struct ioprio_get_args *); #endif /* !_SYS_SYSPROTO_H_ */ #undef PAD_ diff --git a/sys/sys/sysunion.h b/sys/sys/sysunion.h index a0df514de0..af303c182e 100644 --- a/sys/sys/sysunion.h +++ b/sys/sys/sysunion.h @@ -406,4 +406,6 @@ union sysunion { struct mq_receive_args mq_receive; struct mq_timedsend_args mq_timedsend; struct mq_timedreceive_args mq_timedreceive; + struct ioprio_set_args ioprio_set; + struct ioprio_get_args ioprio_get; }; diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c index a323a832a9..889b848bd1 100644 --- a/sys/vm/vm_pager.c +++ b/sys/vm/vm_pager.c @@ -356,7 +356,7 @@ relpbuf(struct buf *bp, int *pfreecnt) int wake_freecnt = 0; KKASSERT(bp->b_flags & B_PAGING); - dsched_clr_buf_priv(bp); + dsched_exit_buf(bp); spin_lock_wr(&bswspin); diff --git a/test/dsched_fq/fqstats.c b/test/dsched_fq/fqstats.c index f81ad58a0a..a395a14b14 100644 --- a/test/dsched_fq/fqstats.c +++ b/test/dsched_fq/fqstats.c @@ -26,23 +26,34 @@ int main(void) "FQP:\t%d\n" "DPRIV:\t%d\n" "---------------------------------------------\n" + "Procs/Threads tracked\n" + "procs:\t\t%d\n" + "threads:\t%d\n" + "---------------------------------------------\n" "Proccesses\n" "Rate limited:\t%d\n" "---------------------------------------------\n" "Transactions\n" "Issued:\t\t%d\n" "Completed:\t%d\n" - "without FQMP:\t%d\n", + "without FQMP:\t%d\n" + 
"---------------------------------------------\n" + "Misc\n" + "FQMP refs for buf:\t%d\n", fq_stats.fqmp_allocations, fq_stats.fqp_allocations, fq_stats.dpriv_allocations, + fq_stats.nprocs, + fq_stats.nthreads, + fq_stats.procs_limited, fq_stats.transactions, fq_stats.transactions_completed, - fq_stats.no_fqmp + fq_stats.no_fqmp, + fq_stats.nbufs ); diff --git a/usr.bin/Makefile b/usr.bin/Makefile index 6e0f13150a..c6be0dbd83 100644 --- a/usr.bin/Makefile +++ b/usr.bin/Makefile @@ -74,6 +74,7 @@ SUBDIR= alias \ iconv \ id \ indent \ + ionice \ ipcrm \ ipcs \ join \ diff --git a/usr.bin/ionice/Makefile b/usr.bin/ionice/Makefile new file mode 100644 index 0000000000..e7b8322d76 --- /dev/null +++ b/usr.bin/ionice/Makefile @@ -0,0 +1,7 @@ +# @(#)Makefile 8.1 (Berkeley) 6/6/93 +# $DragonFly: src/usr.bin/nice/Makefile,v 1.3 2007/08/27 16:50:57 pavalos Exp $ + +PROG= ionice +NOMAN= + +.include diff --git a/usr.bin/ionice/ionice.c b/usr.bin/ionice/ionice.c new file mode 100644 index 0000000000..838ae6da91 --- /dev/null +++ b/usr.bin/ionice/ionice.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 1989, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#) Copyright (c) 1989, 1993, 1994 The Regents of the University of California. All rights reserved. 
+ * @(#)nice.c 8.2 (Berkeley) 4/16/94 + * $FreeBSD: src/usr.bin/nice/nice.c,v 1.4.2.1 2002/06/18 08:40:28 tjr Exp $ + * $DragonFly: src/usr.bin/nice/nice.c,v 1.6 2005/10/09 15:09:02 liamfoy Exp $ + */ + +#include <sys/time.h> +#include <sys/resource.h> + +#include <ctype.h> +#include <err.h> +#include <errno.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#define DEFNICE 4 + +static void usage(void); + +int +main(int argc, char **argv) +{ + long niceness = DEFNICE; + int ch; + char *ep; + + /* Obsolescent syntax: -number, --number */ + if (argc >= 2 && argv[1][0] == '-' && (argv[1][1] == '-' || + isdigit((unsigned char)argv[1][1])) && strcmp(argv[1], "--") != 0) + if (asprintf(&argv[1], "-n%s", argv[1] + 1) < 0) + err(1, "asprintf"); + + while ((ch = getopt(argc, argv, "n:")) != -1) { + switch (ch) { + case 'n': + errno = 0; + niceness = strtol(optarg, &ep, 10); + if (ep == optarg || *ep != '\0' || errno || + niceness < INT_MIN || niceness > INT_MAX) + errx(1, "%s: invalid ionice value", optarg); + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + + if (argc == 0) + usage(); + + errno = 0; + niceness += ioprio_get(PRIO_PROCESS, 0); + if (errno) + warn("ioprio_get"); + else if (ioprio_set(PRIO_PROCESS, 0, (int)niceness)) + warn("ioprio_set"); + execvp(*argv, argv); + err(errno == ENOENT || errno == ENOTDIR ? 127 : 126, "%s", *argv); +} + +static void +usage(void) +{ + + fprintf(stderr, "usage: ionice [-n incr] utility [arguments]\n"); + exit(1); +} -- 2.41.0