2 * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Alex Hornung <ahornung@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
41 #include <sys/diskslice.h>
43 #include <sys/malloc.h>
44 #include <machine/md_var.h>
45 #include <sys/ctype.h>
46 #include <sys/syslog.h>
47 #include <sys/device.h>
48 #include <sys/msgport.h>
49 #include <sys/msgport2.h>
51 #include <sys/dsched.h>
52 #include <sys/fcntl.h>
53 #include <machine/varargs.h>
/*
 * NOTE(review): the stray integers leading each line below (and throughout
 * this file) are line-number residue from an extraction pass, not C tokens.
 * Several original source lines are also missing from this view.
 *
 * Module-global state for the dsched framework: malloc tag, forward
 * declarations for the built-in noop policy, object-cache argument blocks
 * for the three context structures, the global policy list and the locks
 * that protect framework state.
 */
55 MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");
/* Callbacks implementing the built-in fallback ("noop") policy */
57 static dsched_prepare_t noop_prepare;
58 static dsched_teardown_t noop_teardown;
59 static dsched_cancel_t noop_cancel;
60 static dsched_queue_t noop_queue;
/* Internal helpers, defined later in this file */
62 static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
63 static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
64 static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
65 static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);
/* Set once dsched_init() has run; gates the new/exit proc/thread/buf hooks */
67 static int dsched_inited = 0;
/* Nonzero once a default policy has been explicitly configured */
68 static int default_set = 0;
/* Global lock serializing policy/disk attach, detach and switch */
70 struct lock dsched_lock;
/* Debug verbosity threshold consumed by dsched_debug() (sysctl/tunable) */
71 static int dsched_debug_enable = 0;
73 struct dsched_stats dsched_stats;
/* objcache backing-store descriptors: fixed max sizes, M_DSCHED tag */
75 struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
76 DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
77 struct objcache_malloc_args dsched_thread_io_malloc_args = {
78 DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
79 struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
80 DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };
82 static struct objcache *dsched_diskctx_cache;
83 static struct objcache *dsched_tdctx_cache;
84 static struct objcache *dsched_tdio_cache;
/* Global list of all live thread contexts, protected by dsched_tdctx_lock */
86 TAILQ_HEAD(, dsched_thread_ctx) dsched_tdctx_list =
87 TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);
89 struct lock dsched_tdctx_lock;
/* All registered policies; manipulated under dsched_lock */
91 static struct dsched_policy_head dsched_policy_list =
92 TAILQ_HEAD_INITIALIZER(dsched_policy_list);
/* Built-in pass-through policy; always available as a safe fallback */
94 static struct dsched_policy dsched_noop_policy = {
97 .prepare = noop_prepare,
98 .teardown = noop_teardown,
99 .cancel_all = noop_cancel,
100 .bio_queue = noop_queue
/* Policy attached to new disks when nothing else is configured */
103 static struct dsched_policy *default_policy = &dsched_noop_policy;
/*
 * dsched_debug() is a SYSCTL and TUNABLE controlled debug output function
 * Emits the formatted message only when `level` does not exceed the
 * dsched_debug_enable threshold.  (Body partially missing from this view;
 * presumably forwards fmt/args to kprintf — TODO confirm against full source.)
 */
106 * dsched_debug() is a SYSCTL and TUNABLE controlled debug output function
110 dsched_debug(int level, char *fmt, ...)
115 if (level <= dsched_debug_enable)
/*
 * Called on disk_create().  Selects a scheduling policy for the new disk:
 * first tries the per-unit tunable "dsched.policy.<head_name><unit>", then
 * the per-driver tunable "dsched.policy.<head_name>", then the global
 * "dsched.policy.default".  Falls back to default_policy when nothing
 * matches, and finally registers a per-disk policy sysctl node.
 */
123 * Called on disk_create()
124 * tries to read which policy to use from loader.conf, if there's
125 * none specified, the default policy is used.
128 dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
130 char tunable_key[SPECNAMELEN + 48];
131 char sched_policy[DSCHED_POLICY_NAME_LENGTH];
133 struct dsched_policy *policy = NULL;
135 /* Also look for serno stuff? */
136 /* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
137 lockmgr(&dsched_lock, LK_EXCLUSIVE);
/* 1) most specific: per-unit tunable, e.g. dsched.policy.da0 */
139 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
141 if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
142 sizeof(sched_policy)) != 0) {
143 policy = dsched_find_policy(sched_policy);
/* 2) per-driver tunable, e.g. dsched.policy.da */
146 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
148 for (ptr = tunable_key; *ptr; ptr++) {
152 if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
153 sizeof(sched_policy)) != 0)) {
154 policy = dsched_find_policy(sched_policy);
/* 3) global default tunable, only if no default was set yet */
157 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
158 if (!policy && !default_set && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
159 sizeof(sched_policy)) != 0)) {
160 policy = dsched_find_policy(sched_policy);
165 dsched_debug(0, "No policy for %s%d specified, "
166 "or policy not found\n", head_name, unit);
168 dsched_set_policy(dp, default_policy);
170 dsched_set_policy(dp, policy);
/* mapper/* devices keep their full name; others get name+unit appended */
173 if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
174 ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
176 ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
177 for (ptr = tunable_key; *ptr; ptr++) {
181 dsched_sysctl_add_disk(
182 (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
185 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Called from disk_setdiskinfo (or rather _setdiskinfo).  Checks for a
 * policy tunable keyed on the device serial number and, if one names a
 * registered policy, switches the disk to it and (re)adds its sysctl node.
 * No-op when the disk has no serial number.
 */
189 * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
190 * there's any policy associated with the serial number of the device.
193 dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
195 char tunable_key[SPECNAMELEN + 48];
196 char sched_policy[DSCHED_POLICY_NAME_LENGTH];
197 struct dsched_policy *policy = NULL;
199 if (info->d_serialno == NULL)
202 lockmgr(&dsched_lock, LK_EXCLUSIVE);
/* tunable key is dsched.policy.<serialno> */
204 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
207 if((TUNABLE_STR_FETCH(tunable_key, sched_policy,
208 sizeof(sched_policy)) != 0)) {
209 policy = dsched_find_policy(sched_policy);
213 dsched_switch(dp, policy);
216 dsched_sysctl_add_disk(
217 (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
220 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Called on disk_destroy().  Swaps the disk to the noop policy first so new
 * I/O passes straight through, then cancels all bios still queued in the old
 * policy and tears it down.  Frees the disk's sysctl context and drops the
 * old policy's reference.
 */
224 * Called on disk_destroy()
225 * shuts down the scheduler core and cancels all remaining bios
228 dsched_disk_destroy_callback(struct disk *dp)
230 struct dsched_policy *old_policy;
231 struct dsched_disk_ctx *diskctx;
233 lockmgr(&dsched_lock, LK_EXCLUSIVE);
235 diskctx = dsched_get_disk_priv(dp);
/* redirect new I/O to noop before dismantling the old policy */
237 old_policy = dp->d_sched_policy;
238 dp->d_sched_policy = &dsched_noop_policy;
239 old_policy->cancel_all(dsched_get_disk_priv(dp));
240 old_policy->teardown(dsched_get_disk_priv(dp));
242 if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
243 sysctl_ctx_free(&diskctx->sysctl_ctx);
246 atomic_subtract_int(&old_policy->ref_count, 1);
247 KKASSERT(old_policy->ref_count >= 0);
249 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Main entry point for bio dispatch.  Looks up the issuing thread's context
 * (attached to the buf in dsched_new_buf); without one the bio bypasses the
 * scheduler via dsched_strategy_raw.  Otherwise finds the per-(thread,disk)
 * tdio and hands the bio to the disk's policy bio_queue; a nonzero return
 * from the policy means "not consumed" and the bio is dispatched raw.
 * Temporary refs on tdio/diskctx are taken around the call and dropped after.
 */
254 dsched_queue(struct disk *dp, struct bio *bio)
256 struct dsched_thread_ctx *tdctx;
257 struct dsched_thread_io *tdio;
258 struct dsched_disk_ctx *diskctx;
260 int found = 0, error = 0;
262 tdctx = dsched_get_buf_priv(bio->bio_buf);
264 /* We don't handle this case, let dsched dispatch */
265 atomic_add_int(&dsched_stats.no_tdctx, 1);
266 dsched_strategy_raw(dp, bio);
270 DSCHED_THREAD_CTX_LOCK(tdctx);
/* every tdctx is expected to carry a tdio for each disk */
272 KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
273 TAILQ_FOREACH(tdio, &tdctx->tdio_list, link) {
274 if (tdio->dp == dp) {
275 dsched_thread_io_ref(tdio);
281 DSCHED_THREAD_CTX_UNLOCK(tdctx);
/* buf no longer references the tdctx after this point */
282 dsched_clr_buf_priv(bio->bio_buf);
283 dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */
285 KKASSERT(found == 1);
286 diskctx = dsched_get_disk_priv(dp);
287 dsched_disk_ctx_ref(diskctx);
288 error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);
/* policy declined the bio: fall back to direct dispatch */
291 dsched_strategy_raw(dp, bio);
293 dsched_disk_ctx_unref(diskctx);
294 dsched_thread_io_unref(tdio);
/*
 * Called from each module_init or module_attach of each policy.
 * Registers the policy in the global policy list under dsched_lock,
 * taking an initial reference.  Duplicate names are rejected with a
 * log message (error path lines not fully visible here).
 */
299 * Called from each module_init or module_attach of each policy
300 * registers the policy in the local policy list.
303 dsched_register(struct dsched_policy *d_policy)
305 struct dsched_policy *policy;
308 lockmgr(&dsched_lock, LK_EXCLUSIVE);
310 policy = dsched_find_policy(d_policy->name);
313 TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
314 atomic_add_int(&d_policy->ref_count, 1);
316 dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
321 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Called from each module_detach of each policy.  Removes the policy from
 * the global list and drops the registration reference.  Refuses (returns
 * busy) while the policy is still attached to any disk, i.e. ref_count > 1.
 */
326 * Called from each module_detach of each policy
327 * unregisters the policy
330 dsched_unregister(struct dsched_policy *d_policy)
332 struct dsched_policy *policy;
334 lockmgr(&dsched_lock, LK_EXCLUSIVE);
335 policy = dsched_find_policy(d_policy->name);
/* still referenced by at least one disk: cannot unregister */
338 if (policy->ref_count > 1) {
339 lockmgr(&dsched_lock, LK_RELEASE);
342 TAILQ_REMOVE(&dsched_policy_list, policy, link);
343 atomic_subtract_int(&policy->ref_count, 1);
344 KKASSERT(policy->ref_count == 0);
346 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Switches a disk's policy: tears down the old policy (temporarily parking
 * the disk on the noop policy so I/O keeps flowing) and then installs the
 * new one via dsched_set_policy.  A same-policy switch is a no-op.
 */
352 * switches the policy by first removing the old one and then
353 * enabling the new one.
356 dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
358 struct dsched_policy *old_policy;
360 /* If we are asked to set the same policy, do nothing */
361 if (dp->d_sched_policy == new_policy)
364 /* lock everything down, diskwise */
365 lockmgr(&dsched_lock, LK_EXCLUSIVE);
366 old_policy = dp->d_sched_policy;
368 atomic_subtract_int(&old_policy->ref_count, 1);
369 KKASSERT(old_policy->ref_count >= 0);
/* park on noop while the old policy is dismantled */
371 dp->d_sched_policy = &dsched_noop_policy;
372 old_policy->teardown(dsched_get_disk_priv(dp));
375 /* Bring everything back to life */
376 dsched_set_policy(dp, new_policy);
377 lockmgr(&dsched_lock, LK_RELEASE);
383 * Loads a given policy and attaches it to the specified disk.
384 * Also initializes the core for the policy
387 dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
391 /* Check if it is locked already. if not, we acquire the devfs lock */
392 if (!(lockstatus(&dsched_lock, curthread)) == LK_EXCLUSIVE) {
393 lockmgr(&dsched_lock, LK_EXCLUSIVE);
397 policy_new(dp, new_policy);
398 new_policy->prepare(dsched_get_disk_priv(dp));
399 dp->d_sched_policy = new_policy;
400 atomic_add_int(&new_policy->ref_count, 1);
401 kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
404 /* If we acquired the lock, we also get rid of it */
406 lockmgr(&dsched_lock, LK_RELEASE);
409 struct dsched_policy*
410 dsched_find_policy(char *search)
412 struct dsched_policy *policy;
413 struct dsched_policy *policy_found = NULL;
416 /* Check if it is locked already. if not, we acquire the devfs lock */
417 if (!(lockstatus(&dsched_lock, curthread)) == LK_EXCLUSIVE) {
418 lockmgr(&dsched_lock, LK_EXCLUSIVE);
422 TAILQ_FOREACH(policy, &dsched_policy_list, link) {
423 if (!strcmp(policy->name, search)) {
424 policy_found = policy;
429 /* If we acquired the lock, we also get rid of it */
431 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Finds a disk by its cdev name by walking the global disk enumeration.
 * (Return statements not visible in this view; presumably returns the
 * matching disk or NULL — TODO confirm against full source.)
 */
437 dsched_find_disk(char *search)
439 struct disk *dp_found = NULL;
440 struct disk *dp = NULL;
442 while((dp = disk_enumerate(dp))) {
443 if (!strcmp(dp->d_cdev->si_name, search)) {
/*
 * Enumerates disks currently attached to the given policy, continuing the
 * system disk enumeration from `dp` (NULL starts a fresh pass).
 */
453 dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
455 while ((dp = disk_enumerate(dp))) {
456 if (dp->d_sched_policy == policy)
/*
 * Iterator over the registered policy list: NULL argument yields the first
 * policy, otherwise the successor of `pol`.  Caller is expected to hold
 * dsched_lock across the enumeration.
 */
463 struct dsched_policy *
464 dsched_policy_enumerate(struct dsched_policy *pol)
467 return (TAILQ_FIRST(&dsched_policy_list));
469 return (TAILQ_NEXT(pol, link));
/*
 * Cancels a bio by completing it with ENXIO/B_ERROR and full residual.
 * (The completing biodone call is not visible in this view.)
 */
473 dsched_cancel_bio(struct bio *bp)
475 bp->bio_buf->b_error = ENXIO;
476 bp->bio_buf->b_flags |= B_ERROR;
477 bp->bio_buf->b_resid = bp->bio_buf->b_bcount;
/*
 * Dispatches a bio directly to the raw device, bypassing any scheduling.
 * Defensively clears a stale bio_track (which should not be set here)
 * before handing the bio to dev_dstrategy.
 */
483 dsched_strategy_raw(struct disk *dp, struct bio *bp)
486 * Ideally, this stuff shouldn't be needed... but just in case, we leave it in
489 KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
490 if(bp->bio_track != NULL) {
491 dsched_debug(LOG_INFO,
492 "dsched_strategy_raw sees non-NULL bio_track!! "
494 bp->bio_track = NULL;
496 dev_dstrategy(dp->d_rawdev, bp);
/*
 * Dispatches a bio synchronously: clones the caller's buf into a private
 * buf/bio pair, marks it BIO_SYNC with biodone_sync as completion, issues
 * it to the raw device and biowait()s for completion, then copies resid
 * and error back into the original buf.
 */
500 dsched_strategy_sync(struct disk *dp, struct bio *bio)
502 struct buf *bp, *nbp;
/* mirror the relevant fields of the original buf into the clone */
510 nbp->b_cmd = bp->b_cmd;
511 nbp->b_bufsize = bp->b_bufsize;
512 nbp->b_runningbufspace = bp->b_runningbufspace;
513 nbp->b_bcount = bp->b_bcount;
514 nbp->b_resid = bp->b_resid;
515 nbp->b_data = bp->b_data;
518 * Buffers undergoing device I/O do not need a kvabase/size.
520 nbp->b_kvabase = bp->b_kvabase;
521 nbp->b_kvasize = bp->b_kvasize;
523 nbp->b_dirtyend = bp->b_dirtyend;
/* synchronous completion: biodone_sync + BIO_SYNC + biowait below */
525 nbio->bio_done = biodone_sync;
526 nbio->bio_flags |= BIO_SYNC;
527 nbio->bio_track = NULL;
529 nbio->bio_caller_info1.ptr = dp;
530 nbio->bio_offset = bio->bio_offset;
532 dev_dstrategy(dp->d_rawdev, nbio);
533 biowait(nbio, "dschedsync");
/* propagate the result back to the caller's buf */
534 bp->b_resid = nbp->b_resid;
535 bp->b_error = nbp->b_error;
/* clone never owned the kva; detach before release */
538 nbp->b_kvabase = NULL;
/*
 * Dispatches a bio asynchronously with a caller-supplied completion
 * callback and private data.  Pushes a new bio layer, stamps it with the
 * disk, priv pointer and a submission timestamp, and issues it.
 */
545 dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
549 nbio = push_bio(bio);
550 nbio->bio_done = done;
551 nbio->bio_offset = bio->bio_offset;
553 dsched_set_bio_dp(nbio, dp);
554 dsched_set_bio_priv(nbio, priv);
/* record submission time for policies that measure latency */
556 getmicrotime(&nbio->bio_caller_info3.tv);
557 dev_dstrategy(dp->d_rawdev, nbio);
/*
 * Ref and deref various structures.  The 1->0 transition of the reference
 * count actually transitions 1->0x80000000 and causes the object to be
 * destroyed.  Transitory references may occur while destruction is in
 * progress; bit 31 flags in-progress destruction and prevents nesting.
 * The three *_ref() helpers below simply bump the respective refcount.
 */
561 * Ref and deref various structures. The 1->0 transition of the reference
562 * count actually transitions 1->0x80000000 and causes the object to be
563 * destroyed. It is possible for transitory references to occur on the
564 * object while it is being destroyed. We use bit 31 to indicate that
565 * destruction is in progress and to prevent nested destructions.
568 dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
572 refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
576 dsched_thread_io_ref(struct dsched_thread_io *tdio)
580 refcount = atomic_fetchadd_int(&tdio->refcount, 1);
584 dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
588 refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
/*
 * Drops a disk-context reference using a cmpset loop.  A plain decrement
 * is used unless this is the 1->0 transition with bit 31 clear, in which
 * case the count is atomically moved to 0x80000000 and the context is
 * destroyed exactly once.
 */
592 dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
598 * Handle 1->0 transitions for diskctx and nested destruction
599 * recursions. If the refs are already in destruction mode (bit 31
600 * set) on the 1->0 transition we don't try to destruct it again.
602 * 0x80000001->0x80000000 transitions are handled normally and
603 * thus avoid nested dstruction.
606 refs = diskctx->refcount;
/* a decrement must never flip the destruction bit */
610 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
612 if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
/* 1->0x80000000: we won the race to destroy */
617 if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
618 dsched_disk_ctx_destroy(diskctx);
/*
 * Final destruction of a disk context (refcount reached the 0x80000000
 * destruction state).  Unlinks and unrefs every tdio still attached to
 * the disk, gives the policy a chance to clean up via destroy_diskctx,
 * and returns the context to its objcache.
 */
626 dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
628 struct dsched_thread_io *tdio;
631 kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
634 lockmgr(&diskctx->lock, LK_EXCLUSIVE);
635 while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
636 KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
637 TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
638 atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
639 tdio->diskctx = NULL;
640 /* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
641 dsched_thread_io_unref(tdio);
643 lockmgr(&diskctx->lock, LK_RELEASE);
644 if (diskctx->dp->d_sched_policy->destroy_diskctx)
645 diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
/* nobody but us may hold a reference at this point */
646 KKASSERT(diskctx->refcount == 0x80000000);
647 objcache_put(dsched_diskctx_cache, diskctx);
648 atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
/*
 * Drops a tdio reference; same cmpset scheme as dsched_disk_ctx_unref:
 * 1->0x80000000 on the final drop triggers destruction exactly once,
 * transitory refs during destruction decrement normally.
 */
652 dsched_thread_io_unref(struct dsched_thread_io *tdio)
658 * Handle 1->0 transitions for tdio and nested destruction
659 * recursions. If the refs are already in destruction mode (bit 31
660 * set) on the 1->0 transition we don't try to destruct it again.
662 * 0x80000001->0x80000000 transitions are handled normally and
663 * thus avoid nested dstruction.
666 refs = tdio->refcount;
/* decrement must not change the destruction bit */
670 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
672 if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
/* final drop: enter destruction state and tear down */
677 if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
678 dsched_thread_io_destroy(tdio);
/*
 * Final destruction of a tdio.  Carefully detaches it from both its disk
 * context and its thread context: each side is re-checked after taking the
 * respective lock (retry loop) because the link can be broken concurrently.
 * Requires the tdio queue to be empty, then returns it to the objcache.
 */
685 dsched_thread_io_destroy(struct dsched_thread_io *tdio)
687 struct dsched_thread_ctx *tdctx;
688 struct dsched_disk_ctx *diskctx;
691 kprintf("tdio (%p) destruction started, trace:\n", tdio);
694 KKASSERT(tdio->qlength == 0);
/* detach from the disk context, re-validating after locking */
696 while ((diskctx = tdio->diskctx) != NULL) {
697 dsched_disk_ctx_ref(diskctx);
698 lockmgr(&diskctx->lock, LK_EXCLUSIVE);
699 if (diskctx != tdio->diskctx) {
700 lockmgr(&diskctx->lock, LK_RELEASE);
701 dsched_disk_ctx_unref(diskctx);
704 KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
705 if (diskctx->dp->d_sched_policy->destroy_tdio)
706 diskctx->dp->d_sched_policy->destroy_tdio(tdio);
707 TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
708 atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
709 tdio->diskctx = NULL;
710 lockmgr(&diskctx->lock, LK_RELEASE);
711 dsched_disk_ctx_unref(diskctx);
/* detach from the thread context, same re-validation dance */
713 while ((tdctx = tdio->tdctx) != NULL) {
714 dsched_thread_ctx_ref(tdctx);
715 lockmgr(&tdctx->lock, LK_EXCLUSIVE);
716 if (tdctx != tdio->tdctx) {
717 lockmgr(&tdctx->lock, LK_RELEASE);
718 dsched_thread_ctx_unref(tdctx);
721 KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
722 TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
723 atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
725 lockmgr(&tdctx->lock, LK_RELEASE);
726 dsched_thread_ctx_unref(tdctx);
/* must be in the destruction state with no outstanding refs */
728 KKASSERT(tdio->refcount == 0x80000000);
729 objcache_put(dsched_tdio_cache, tdio);
730 atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
732 dsched_disk_ctx_unref(diskctx);
/*
 * Drops a tdctx reference; same cmpset scheme as the other unref helpers:
 * the final 1->0 drop transitions to 0x80000000 and destroys the context,
 * transitory refs during destruction decrement normally.
 */
737 dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
743 * Handle 1->0 transitions for tdctx and nested destruction
744 * recursions. If the refs are already in destruction mode (bit 31
745 * set) on the 1->0 transition we don't try to destruct it again.
747 * 0x80000001->0x80000000 transitions are handled normally and
748 * thus avoid nested dstruction.
751 refs = tdctx->refcount;
/* decrement must not change the destruction bit */
755 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
757 if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
/* final drop: enter destruction state and tear down */
762 if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
763 dsched_thread_ctx_destroy(tdctx);
/*
 * Final destruction of a thread context.  Under the global tdctx lock,
 * unlinks and unrefs all attached tdios, removes the context from the
 * global tdctx list and returns it to its objcache.
 */
770 dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
772 struct dsched_thread_io *tdio;
775 kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
778 DSCHED_GLOBAL_THREAD_CTX_LOCK();
780 while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
781 KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
782 TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
783 atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
785 dsched_thread_io_unref(tdio);
/* only the destruction reference may remain */
787 KKASSERT(tdctx->refcount == 0x80000000);
788 TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);
790 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
792 objcache_put(dsched_tdctx_cache, tdctx);
793 atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
/*
 * Allocates and initializes a tdio binding (thread-context, disk) for the
 * given policy, linking it into both the disk context's tdio list and (when
 * a tdctx is supplied) the thread context's tdio list.  Holds a reference
 * on the disk context for the duration of setup.
 */
796 struct dsched_thread_io *
797 dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
798 struct dsched_policy *pol)
800 struct dsched_thread_io *tdio;
802 dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
804 tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
805 bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);
807 /* XXX: maybe we do need another ref for the disk list for tdio */
808 dsched_thread_io_ref(tdio);
810 DSCHED_THREAD_IO_LOCKINIT(tdio);
813 tdio->diskctx = dsched_get_disk_priv(dp);
814 TAILQ_INIT(&tdio->queue);
/* link into the per-disk tdio list */
819 lockmgr(&tdio->diskctx->lock, LK_EXCLUSIVE);
820 TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
821 atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
822 lockmgr(&tdio->diskctx->lock, LK_RELEASE);
828 /* Put the tdio in the tdctx list */
829 DSCHED_THREAD_CTX_LOCK(tdctx);
830 TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
831 DSCHED_THREAD_CTX_UNLOCK(tdctx);
832 atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
835 atomic_add_int(&dsched_stats.tdio_allocations, 1);
/*
 * Allocates and initializes a per-disk context for the given policy from
 * the diskctx objcache, then gives the policy its new_diskctx hook.
 * Returns the context holding one reference for the caller.
 */
840 struct dsched_disk_ctx *
841 dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
843 struct dsched_disk_ctx *diskctx;
845 diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
846 bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
847 dsched_disk_ctx_ref(diskctx);
849 DSCHED_DISK_CTX_LOCKINIT(diskctx);
850 TAILQ_INIT(&diskctx->tdio_list);
852 atomic_add_int(&dsched_stats.diskctx_allocations, 1);
/* optional policy hook for per-disk private setup */
853 if (pol->new_diskctx)
854 pol->new_diskctx(diskctx);
/*
 * Allocates a thread context (for proc `p`, which may be NULL for pure
 * kernel threads), creates a tdio for every currently-known disk under its
 * active policy, and links the tdctx into the global list.
 */
859 struct dsched_thread_ctx *
860 dsched_thread_ctx_alloc(struct proc *p)
862 struct dsched_thread_ctx *tdctx;
863 struct dsched_thread_io *tdio;
864 struct disk *dp = NULL;
866 tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
867 bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
868 dsched_thread_ctx_ref(tdctx);
870 kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
872 DSCHED_THREAD_CTX_LOCKINIT(tdctx);
873 TAILQ_INIT(&tdctx->tdio_list);
/* one tdio per disk, under the global tdctx lock */
876 DSCHED_GLOBAL_THREAD_CTX_LOCK();
877 while ((dp = disk_enumerate(dp))) {
878 tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
881 TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
882 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
884 atomic_add_int(&dsched_stats.tdctx_allocations, 1);
885 /* XXX: no callback here */
/*
 * Attaches a freshly-allocated disk context for `pol` to disk `dp` and
 * creates a tdio for every existing thread context so all threads can
 * immediately issue I/O through the new policy.
 */
890 policy_new(struct disk *dp, struct dsched_policy *pol) {
891 struct dsched_thread_ctx *tdctx;
892 struct dsched_disk_ctx *diskctx;
893 struct dsched_thread_io *tdio;
895 diskctx = dsched_disk_ctx_alloc(dp, pol);
/* extra ref held for the disk's d_sched_priv pointer */
896 dsched_disk_ctx_ref(diskctx);
897 dsched_set_disk_priv(dp, diskctx);
899 DSCHED_GLOBAL_THREAD_CTX_LOCK();
900 TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
901 tdio = dsched_thread_io_alloc(dp, tdctx, pol);
903 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
/*
 * Counterpart of policy_new(): drops the two diskctx references taken at
 * attach time (policy prepare and allocation) and clears the disk's
 * private pointer.  The final unref triggers diskctx destruction.
 */
908 policy_destroy(struct disk *dp) {
909 struct dsched_disk_ctx *diskctx;
911 diskctx = dsched_get_disk_priv(dp);
912 KKASSERT(diskctx != NULL);
914 dsched_disk_ctx_unref(diskctx); /* from prepare */
915 dsched_disk_ctx_unref(diskctx); /* from alloc */
917 dsched_set_disk_priv(dp, NULL);
/*
 * Hook called when a new buf is created.  Attaches the issuing thread's
 * tdctx (from curproc, or curthread for kernel threads) to the buf and
 * takes a reference on it; dsched_queue() later consumes that reference.
 * No-op before dsched_init() has run.
 */
921 dsched_new_buf(struct buf *bp)
923 struct dsched_thread_ctx *tdctx = NULL;
925 if (dsched_inited == 0)
928 if (curproc != NULL) {
929 tdctx = dsched_get_proc_priv(curproc);
931 /* This is a kernel thread, so no proc info is available */
932 tdctx = dsched_get_thread_priv(curthread);
937 * XXX: hack. we don't want this assert because we aren't catching all
938 * threads. mi_startup() is still getting away without an tdctx.
941 /* by now we should have an tdctx. if not, something bad is going on */
942 KKASSERT(tdctx != NULL);
946 dsched_thread_ctx_ref(tdctx);
948 dsched_set_buf_priv(bp, tdctx);
/*
 * Hook called when a buf is released.  Drops the tdctx reference taken in
 * dsched_new_buf, if the buf still carries one (dsched_queue may already
 * have consumed it).
 */
952 dsched_exit_buf(struct buf *bp)
954 struct dsched_thread_ctx *tdctx;
956 tdctx = dsched_get_buf_priv(bp);
958 dsched_clr_buf_priv(bp);
959 dsched_thread_ctx_unref(tdctx);
/*
 * Hook called on process creation: allocates a tdctx for the process,
 * takes an additional reference for the proc's private pointer, and bumps
 * the process counter.  No-op before dsched_init().
 */
964 dsched_new_proc(struct proc *p)
966 struct dsched_thread_ctx *tdctx;
968 if (dsched_inited == 0)
973 tdctx = dsched_thread_ctx_alloc(p);
975 dsched_thread_ctx_ref(tdctx);
977 dsched_set_proc_priv(p, tdctx);
978 atomic_add_int(&dsched_stats.nprocs, 1);
/*
 * Hook called on kernel-thread creation: like dsched_new_proc but with no
 * proc (tdctx allocated with NULL proc) and stored in the thread's private
 * pointer.  No-op before dsched_init().
 */
983 dsched_new_thread(struct thread *td)
985 struct dsched_thread_ctx *tdctx;
987 if (dsched_inited == 0)
990 KKASSERT(td != NULL);
992 tdctx = dsched_thread_ctx_alloc(NULL);
994 dsched_thread_ctx_ref(tdctx);
996 dsched_set_thread_priv(td, tdctx);
997 atomic_add_int(&dsched_stats.nthreads, 1);
/*
 * Hook called on process exit: poisons the tdctx (0xDEAD marker aids
 * use-after-free debugging), clears the proc's private pointer and drops
 * both references (allocation + proc pointer).  No-op before dsched_init().
 */
1001 dsched_exit_proc(struct proc *p)
1003 struct dsched_thread_ctx *tdctx;
1005 if (dsched_inited == 0)
1008 KKASSERT(p != NULL);
1010 tdctx = dsched_get_proc_priv(p);
1011 KKASSERT(tdctx != NULL);
1013 tdctx->dead = 0xDEAD;
1014 dsched_set_proc_priv(p, NULL);
1016 dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1017 dsched_thread_ctx_unref(tdctx); /* one for ref */
1018 atomic_subtract_int(&dsched_stats.nprocs, 1);
/*
 * Hook called on kernel-thread exit: mirror of dsched_exit_proc for the
 * thread-private tdctx — poison, clear, drop both references, decrement
 * the thread counter.  No-op before dsched_init().
 */
1023 dsched_exit_thread(struct thread *td)
1025 struct dsched_thread_ctx *tdctx;
1027 if (dsched_inited == 0)
1030 KKASSERT(td != NULL);
1032 tdctx = dsched_get_thread_priv(td);
1033 KKASSERT(tdctx != NULL);
1035 tdctx->dead = 0xDEAD;
1036 dsched_set_thread_priv(td, 0);
1038 dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1039 dsched_thread_ctx_unref(tdctx); /* one for ref */
1040 atomic_subtract_int(&dsched_stats.nthreads, 1);
/*
 * Convenience for policies: allocates a tdio binding the current thread's
 * tdctx to the given disk context's disk, under the global tdctx lock.
 * The current thread must already own a tdctx.
 */
1043 struct dsched_thread_io *
1044 dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
1045 struct dsched_policy *pol) {
1046 struct dsched_thread_ctx *tdctx;
1047 struct dsched_thread_io *tdio;
1049 DSCHED_GLOBAL_THREAD_CTX_LOCK();
1051 tdctx = dsched_get_thread_priv(curthread);
1052 KKASSERT(tdctx != NULL);
1053 tdio = dsched_thread_io_alloc(diskctx->dp, tdctx, pol);
1055 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
/*
 * Default noop policy: prepare/teardown/cancel do nothing; queue passes
 * every bio straight through to the raw device (one path visible uses
 * dsched_strategy_raw, the other dsched_strategy_async with a completion
 * callback — the selecting condition is not visible in this view).
 */
1060 /* DEFAULT NOOP POLICY */
1063 noop_prepare(struct dsched_disk_ctx *diskctx)
1069 noop_teardown(struct dsched_disk_ctx *diskctx)
1075 noop_cancel(struct dsched_disk_ctx *diskctx)
1081 noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
1084 dsched_strategy_raw(diskctx->dp, bio);
1086 dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
/*
 * Framework initialization (run via SYSINIT just before SI_SUB_CREATE_INIT):
 * creates the three objcaches backed by the malloc-args blocks above, zeroes
 * the statistics, initializes the global locks and registers the built-in
 * noop policy.
 */
1097 dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
1099 objcache_malloc_alloc,
1100 objcache_malloc_free,
1101 &dsched_thread_io_malloc_args );
1103 dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
1105 objcache_malloc_alloc,
1106 objcache_malloc_free,
1107 &dsched_thread_ctx_malloc_args );
1109 dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
1111 objcache_malloc_alloc,
1112 objcache_malloc_free,
1113 &dsched_disk_ctx_malloc_args );
1115 bzero(&dsched_stats, sizeof(struct dsched_stats));
/* dsched_lock may be taken recursively (see dsched_set_policy) */
1117 lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
1118 DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();
1120 dsched_register(&dsched_noop_policy);
1130 SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
1131 SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
/* Sysctl handler exporting the dsched_stats structure as opaque data. */
1137 sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
1139 return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
/*
 * Sysctl handler listing all registered policy names, space-separated,
 * terminated by a NUL.  Enumerates under dsched_lock; `first` suppresses
 * the separator before the first name.
 */
1143 sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
1145 struct dsched_policy *pol = NULL;
1146 int error, first = 1;
1148 lockmgr(&dsched_lock, LK_EXCLUSIVE);
1150 while ((pol = dsched_policy_enumerate(pol))) {
1152 error = SYSCTL_OUT(req, " ", 1);
1158 error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
1164 lockmgr(&dsched_lock, LK_RELEASE);
/* trailing NUL terminator for the string */
1166 error = SYSCTL_OUT(req, "", 1);
/*
 * Per-disk sysctl handler (arg1 = diskctx): reads report the disk's current
 * policy name; writes look the name up and switch the disk to that policy.
 * All under dsched_lock.
 */
1172 sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
1174 char buf[DSCHED_POLICY_NAME_LENGTH];
1175 struct dsched_disk_ctx *diskctx = arg1;
1176 struct dsched_policy *pol = NULL;
1179 if (diskctx == NULL) {
1183 lockmgr(&dsched_lock, LK_EXCLUSIVE);
1185 pol = diskctx->dp->d_sched_policy;
1186 memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1188 error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
/* read-only access or error: nothing to change */
1189 if (error || req->newptr == NULL) {
1190 lockmgr(&dsched_lock, LK_RELEASE);
1194 pol = dsched_find_policy(buf);
/* unknown policy name: bail out (error path lines not visible) */
1196 lockmgr(&dsched_lock, LK_RELEASE);
1200 dsched_switch(diskctx->dp, pol);
1202 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Sysctl handler for dsched.policy.default: reads report the current
 * default policy's name; writes look the name up and install it as the
 * default for newly-created disks.  All under dsched_lock.
 */
1208 sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
1210 char buf[DSCHED_POLICY_NAME_LENGTH];
1211 struct dsched_policy *pol = NULL;
1214 lockmgr(&dsched_lock, LK_EXCLUSIVE);
1216 pol = default_policy;
1217 memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1219 error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
/* read-only access or error: nothing to change */
1220 if (error || req->newptr == NULL) {
1221 lockmgr(&dsched_lock, LK_RELEASE);
1225 pol = dsched_find_policy(buf);
/* unknown policy name: bail out (error path lines not visible) */
1227 lockmgr(&dsched_lock, LK_RELEASE);
1232 default_policy = pol;
1234 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Static sysctl tree: the dsched root node, the per-disk policy node,
 * the debug knob, and the stats / policy-list / default-policy handlers
 * defined above.
 */
1239 SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
1240 "Disk Scheduler Framework (dsched) magic");
1241 SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
1242 "List of disks and their policies");
1243 SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
1244 0, "Enable dsched debugging");
1245 SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
1246 0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
1247 "dsched statistics");
1248 SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
1249 NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
1250 SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
1251 NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
/*
 * Creates (once per diskctx) a dynamic sysctl context and adds a
 * dsched.policy.<name> string proc node, wired to sysctl_dsched_policy
 * with the diskctx as arg1, so the disk's policy can be read/changed.
 */
1254 dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
1256 if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
1257 diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
1258 sysctl_ctx_init(&diskctx->sysctl_ctx);
1261 SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
1262 OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
1263 diskctx, 0, sysctl_dsched_policy, "A", "policy");