2 * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Alex Hornung <ahornung@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
41 #include <sys/diskslice.h>
43 #include <sys/malloc.h>
44 #include <machine/md_var.h>
45 #include <sys/ctype.h>
46 #include <sys/syslog.h>
47 #include <sys/device.h>
48 #include <sys/msgport.h>
49 #include <sys/msgport2.h>
51 #include <sys/dsched.h>
52 #include <sys/fcntl.h>
53 #include <machine/varargs.h>
/* Named list-head type; used by the reverse walk over a thread context's tdio list in dsched_queue(). */
55 TAILQ_HEAD(tdio_list_head, dsched_thread_io);
/* Malloc type referenced by the three objcache_malloc_args initialisers below. */
57 MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");
/* Entry points of the built-in noop policy; wired into dsched_noop_policy below. */
59 static dsched_prepare_t noop_prepare;
60 static dsched_teardown_t noop_teardown;
61 static dsched_cancel_t noop_cancel;
62 static dsched_queue_t noop_queue;
/* Forward declarations of file-local helpers. */
64 static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
65 static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
66 static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
67 static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);
/* dsched_inited is compared against 0 at the top of the buf/proc/thread hooks; default_set is read by the disk-create callback. */
69 static int dsched_inited = 0;
70 static int default_set = 0;
/* Framework-wide lock; the init code sets it up with LK_CANRECURSE. */
72 struct lock dsched_lock;
/* Threshold that dsched_debug() compares its level argument against; exported as the "debug" sysctl. */
73 static int dsched_debug_enable = 0;
/* Counters updated with atomic add/subtract throughout this file; exported via the "stats" sysctl. */
75 struct dsched_stats dsched_stats;
/* Object size and malloc type passed to objcache_create() for each of the three caches. */
77 struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
78 DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
79 struct objcache_malloc_args dsched_thread_io_malloc_args = {
80 DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
81 struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
82 DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };
/* Object caches backing disk contexts, thread contexts and thread-io contexts. */
84 static struct objcache *dsched_diskctx_cache;
85 static struct objcache *dsched_tdctx_cache;
86 static struct objcache *dsched_tdio_cache;
/* List of every live thread context; walked by policy_new() and appended to by dsched_thread_ctx_alloc(). */
88 TAILQ_HEAD(, dsched_thread_ctx) dsched_tdctx_list =
89 TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);
/* NOTE(review): presumably the lock behind DSCHED_GLOBAL_THREAD_CTX_LOCK() - confirm in the header. */
91 struct lock dsched_tdctx_lock;
/* Registered policies; dsched_register() appends, dsched_unregister() removes. */
93 static struct dsched_policy_head dsched_policy_list =
94 TAILQ_HEAD_INITIALIZER(dsched_policy_list);
/* Built-in noop policy (not every initialiser line is visible in this listing). */
96 static struct dsched_policy dsched_noop_policy = {
99 .prepare = noop_prepare,
100 .teardown = noop_teardown,
101 .cancel_all = noop_cancel,
102 .bio_queue = noop_queue
/* Policy applied by the disk-create callback when no tunable selects one; replaceable through the "default" sysctl. */
105 static struct dsched_policy *default_policy = &dsched_noop_policy;
108 * dsched_debug() is a SYSCTL and TUNABLE controlled debug output function
/*
 * Variadic debug helper. The visible gate compares `level` with
 * dsched_debug_enable and proceeds when level does not exceed it.
 * NOTE(review): the statements under the test are not visible in this
 * listing; presumably they print fmt with the variadic arguments.
 */
112 dsched_debug(int level, char *fmt, ...)
117 if (level <= dsched_debug_enable)
125 * Called on disk_create()
126 * tries to read which policy to use from loader.conf, if there's
127 * none specified, the default policy is used.
/*
 * Choose and attach a scheduling policy for a new disk, then register a
 * sysctl entry for it. Everything between the two lockmgr() calls runs
 * with dsched_lock held exclusively. Tunables are tried in the order
 * shown; each hit is resolved through dsched_find_policy().
 *
 * dp        - the disk being created.
 * head_name - device name prefix, used for the tunable key and sysctl name.
 * unit      - unit number appended to head_name.
 */
130 dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
132 char tunable_key[SPECNAMELEN + 48];
133 char sched_policy[DSCHED_POLICY_NAME_LENGTH];
135 struct dsched_policy *policy = NULL;
137 /* Also look for serno stuff? */
138 /* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
139 lockmgr(&dsched_lock, LK_EXCLUSIVE);
/* 1st lookup: "dsched.policy.<name><number>" (arguments not visible; presumably head_name and unit). */
141 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
143 if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
144 sizeof(sched_policy)) != 0) {
145 policy = dsched_find_policy(sched_policy);
/* 2nd lookup: "dsched.policy.<name>", only consulted while no policy has been found. */
148 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
/* NOTE(review): the loop walks the key character by character; its body is not visible here. */
150 for (ptr = tunable_key; *ptr; ptr++) {
154 if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
155 sizeof(sched_policy)) != 0)) {
156 policy = dsched_find_policy(sched_policy);
/* 3rd lookup: the global default tunable, skipped once default_set is non-zero. */
159 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
160 if (!policy && !default_set && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
161 sizeof(sched_policy)) != 0)) {
162 policy = dsched_find_policy(sched_policy);
/* Diagnostic emitted only on verbose boots while default_set is still 0. */
166 if (!default_set && bootverbose) {
168 "No policy for %s%d specified, "
169 "or policy not found\n",
/* NOTE(review): the branch choosing between the next two calls is not visible; presumably default_policy is used only when policy is NULL. */
172 dsched_set_policy(dp, default_policy);
174 dsched_set_policy(dp, policy);
/* tunable_key is reused for the sysctl name: "mapper/" devices use head_name alone, all others get the unit appended. */
177 if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
178 ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
180 ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
/* NOTE(review): loop body not visible; presumably rewrites characters unsuitable for a sysctl name. */
181 for (ptr = tunable_key; *ptr; ptr++) {
185 dsched_sysctl_add_disk(
186 (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
189 lockmgr(&dsched_lock, LK_RELEASE);
193 * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
194 * there's any policy associated with the serial number of the device.
/*
 * Re-check the policy of a disk once its disk_info is available.
 * With dsched_lock held, a "dsched.policy.<...>" tunable is fetched and
 * a match is passed to dsched_switch(); a sysctl entry is then added.
 *
 * dp   - disk being updated.
 * info - disk information; d_serialno is tested for NULL first.
 */
197 dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
199 char tunable_key[SPECNAMELEN + 48];
200 char sched_policy[DSCHED_POLICY_NAME_LENGTH];
201 struct dsched_policy *policy = NULL;
/* NOTE(review): the statement under this test is not visible; presumably an early return, since the lock is only taken afterwards. */
203 if (info->d_serialno == NULL)
206 lockmgr(&dsched_lock, LK_EXCLUSIVE);
/* NOTE(review): the key argument is not visible; presumably the serial number. */
208 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
211 if((TUNABLE_STR_FETCH(tunable_key, sched_policy,
212 sizeof(sched_policy)) != 0)) {
213 policy = dsched_find_policy(sched_policy);
/* NOTE(review): any guard on policy != NULL around this call is not visible here. */
217 dsched_switch(dp, policy);
220 dsched_sysctl_add_disk(
221 (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
224 lockmgr(&dsched_lock, LK_RELEASE);
228 * Called on disk_destroy()
229 * shuts down the scheduler core and cancels all remaining bios
/*
 * Detach the scheduling policy from a disk that is going away.
 * All work is done with dsched_lock held exclusively.
 *
 * dp - disk being destroyed.
 */
232 dsched_disk_destroy_callback(struct disk *dp)
234 struct dsched_policy *old_policy;
235 struct dsched_disk_ctx *diskctx;
237 lockmgr(&dsched_lock, LK_EXCLUSIVE);
239 diskctx = dsched_get_disk_priv(dp);
/* Point the disk at the noop policy first, then let the old policy cancel its bios and tear down. */
241 old_policy = dp->d_sched_policy;
242 dp->d_sched_policy = &dsched_noop_policy;
243 old_policy->cancel_all(dsched_get_disk_priv(dp));
244 old_policy->teardown(dsched_get_disk_priv(dp));
/* The sysctl context is freed only if dsched_sysctl_add_disk() initialised it. */
246 if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
247 sysctl_ctx_free(&diskctx->sysctl_ctx);
/* Drop the reference this disk held on the policy. */
250 atomic_subtract_int(&old_policy->ref_count, 1);
251 KKASSERT(old_policy->ref_count >= 0);
253 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Hand a bio to the disk's current policy. The thread context is taken
 * from the buf's private pointer, the matching thread-io context for
 * this disk is found, and the policy's bio_queue hook is called with
 * references held on both the diskctx and the tdio.
 *
 * dp  - target disk.
 * bio - request to queue.
 */
258 dsched_queue(struct disk *dp, struct bio *bio)
260 struct dsched_thread_ctx *tdctx;
261 struct dsched_thread_io *tdio;
262 struct dsched_disk_ctx *diskctx;
264 int found = 0, error = 0;
266 tdctx = dsched_get_buf_priv(bio->bio_buf);
/* NOTE(review): the test guarding the next lines is not visible; per the comment and counter name it is the "no tdctx" case. */
268 /* We don't handle this case, let dsched dispatch */
269 atomic_add_int(&dsched_stats.no_tdctx, 1);
270 dsched_strategy_raw(dp, bio);
274 DSCHED_THREAD_CTX_LOCK(tdctx);
276 KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
279 * iterate in reverse to make sure we find the most up-to-date
280 * tdio for a given disk. After a switch it may take some time
281 * for everything to clean up.
283 TAILQ_FOREACH_REVERSE(tdio, &tdctx->tdio_list, tdio_list_head, link) {
284 if (tdio->dp == dp) {
/* Reference taken here is dropped at the end of this function. */
285 dsched_thread_io_ref(tdio);
291 DSCHED_THREAD_CTX_UNLOCK(tdctx);
/* The buf no longer carries the tdctx pointer, so its reference is released. */
292 dsched_clr_buf_priv(bio->bio_buf);
293 dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */
295 KKASSERT(found == 1);
296 diskctx = dsched_get_disk_priv(dp);
297 dsched_disk_ctx_ref(diskctx);
/* Sanity checks on the debug fields stamped by dsched_thread_io_alloc(). */
299 if (dp->d_sched_policy != &dsched_noop_policy)
300 KKASSERT(tdio->debug_policy == dp->d_sched_policy);
302 KKASSERT(tdio->debug_inited == 0xF00F1234);
304 error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);
/* NOTE(review): the condition on this call is not visible; presumably taken when bio_queue reports an error. */
307 dsched_strategy_raw(dp, bio);
309 dsched_disk_ctx_unref(diskctx);
310 dsched_thread_io_unref(tdio);
315 * Called from each module_init or module_attach of each policy
316 * registers the policy in the local policy list.
/*
 * Add a policy to dsched_policy_list under dsched_lock. The list is
 * first searched for a policy of the same name.
 * NOTE(review): the branch structure is not visible; judging by the
 * log message, insertion happens only when no such policy exists.
 *
 * d_policy - policy to register; gains one reference when inserted.
 */
319 dsched_register(struct dsched_policy *d_policy)
321 struct dsched_policy *policy;
324 lockmgr(&dsched_lock, LK_EXCLUSIVE);
326 policy = dsched_find_policy(d_policy->name);
329 TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
330 atomic_add_int(&d_policy->ref_count, 1);
332 dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
337 lockmgr(&dsched_lock, LK_RELEASE);
342 * Called from each module_detach of each policy
343 * unregisters the policy
/*
 * Remove a policy from dsched_policy_list under dsched_lock.
 * The policy is looked up by name. When its ref_count is above 1 the
 * lock is released without unlinking (the rest of that branch is not
 * visible in this listing). Otherwise the policy is unlinked and its
 * remaining reference dropped; the assertion requires the count to be
 * exactly 0 afterwards.
 *
 * d_policy - policy to unregister; only its name is used for the lookup.
 */
346 dsched_unregister(struct dsched_policy *d_policy)
348 struct dsched_policy *policy;
350 lockmgr(&dsched_lock, LK_EXCLUSIVE);
351 policy = dsched_find_policy(d_policy->name);
354 if (policy->ref_count > 1) {
355 lockmgr(&dsched_lock, LK_RELEASE);
358 TAILQ_REMOVE(&dsched_policy_list, policy, link);
359 atomic_subtract_int(&policy->ref_count, 1);
360 KKASSERT(policy->ref_count == 0);
362 lockmgr(&dsched_lock, LK_RELEASE);
369 * switches the policy by first removing the old one and then
370 * enabling the new one.
/*
 * Replace the policy of a disk. The old policy loses the reference held
 * for this disk and is torn down while the disk temporarily points at
 * the noop policy; the new policy is then installed through
 * dsched_set_policy().
 *
 * dp         - disk to switch.
 * new_policy - policy to install.
 */
373 dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
375 struct dsched_policy *old_policy;
377 /* If we are asked to set the same policy, do nothing */
/* This comparison is made before dsched_lock is taken. */
378 if (dp->d_sched_policy == new_policy)
381 /* lock everything down, diskwise */
382 lockmgr(&dsched_lock, LK_EXCLUSIVE);
383 old_policy = dp->d_sched_policy;
385 atomic_subtract_int(&old_policy->ref_count, 1);
386 KKASSERT(old_policy->ref_count >= 0);
/* The disk is pointed at the noop policy before the old policy's teardown hook runs. */
388 dp->d_sched_policy = &dsched_noop_policy;
389 old_policy->teardown(dsched_get_disk_priv(dp));
392 /* Bring everything back to life */
393 dsched_set_policy(dp, new_policy);
394 lockmgr(&dsched_lock, LK_RELEASE);
401 * Loads a given policy and attaches it to the specified disk.
402 * Also initializes the core for the policy
405 dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
409 /* Check if it is locked already. if not, we acquire the devfs lock */
410 if (!(lockstatus(&dsched_lock, curthread)) == LK_EXCLUSIVE) {
411 lockmgr(&dsched_lock, LK_EXCLUSIVE);
415 DSCHED_GLOBAL_THREAD_CTX_LOCK();
417 policy_new(dp, new_policy);
418 new_policy->prepare(dsched_get_disk_priv(dp));
419 dp->d_sched_policy = new_policy;
421 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
423 atomic_add_int(&new_policy->ref_count, 1);
424 kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
427 /* If we acquired the lock, we also get rid of it */
429 lockmgr(&dsched_lock, LK_RELEASE);
432 struct dsched_policy*
433 dsched_find_policy(char *search)
435 struct dsched_policy *policy;
436 struct dsched_policy *policy_found = NULL;
439 /* Check if it is locked already. if not, we acquire the devfs lock */
440 if (!(lockstatus(&dsched_lock, curthread)) == LK_EXCLUSIVE) {
441 lockmgr(&dsched_lock, LK_EXCLUSIVE);
445 TAILQ_FOREACH(policy, &dsched_policy_list, link) {
446 if (!strcmp(policy->name, search)) {
447 policy_found = policy;
452 /* If we acquired the lock, we also get rid of it */
454 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Search the disks returned by disk_enumerate() for one whose device
 * name (dp->d_cdev->si_name) equals `search`.
 * NOTE(review): the statements recording and returning the match are
 * not visible here; presumably dp_found is returned.
 */
460 dsched_find_disk(char *search)
462 struct disk *dp_found = NULL;
463 struct disk *dp = NULL;
465 while((dp = disk_enumerate(dp))) {
466 if (!strcmp(dp->d_cdev->si_name, search)) {
/*
 * Step through disks after `dp` (via disk_enumerate()) and test each
 * one's d_sched_policy against `policy`.
 * NOTE(review): the return statements are not visible; presumably the
 * first matching disk is returned, or NULL when enumeration ends.
 */
476 dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
478 while ((dp = disk_enumerate(dp))) {
479 if (dp->d_sched_policy == policy)
/*
 * Iterator over dsched_policy_list: yields either the first entry or
 * the entry following `pol`.
 * NOTE(review): the selecting condition is not visible; presumably a
 * NULL `pol` starts the iteration.
 */
486 struct dsched_policy *
487 dsched_policy_enumerate(struct dsched_policy *pol)
490 return (TAILQ_FIRST(&dsched_policy_list));
492 return (TAILQ_NEXT(pol, link));
/*
 * Mark the buffer behind `bp` as failed: error ENXIO, B_ERROR flag set,
 * and the residual set to the full byte count.
 * NOTE(review): any completion call that follows is not visible here.
 */
496 dsched_cancel_bio(struct bio *bp)
498 bp->bio_buf->b_error = ENXIO;
499 bp->bio_buf->b_flags |= B_ERROR;
500 bp->bio_buf->b_resid = bp->bio_buf->b_bcount;
/*
 * Dispatch a bio straight to the disk's raw device.
 *
 * dp - disk; d_rawdev must be non-NULL (asserted).
 * bp - request to dispatch; a non-NULL bio_track is logged and cleared first.
 */
506 dsched_strategy_raw(struct disk *dp, struct bio *bp)
509 * Ideally, this stuff shouldn't be needed... but just in case, we leave it in
512 KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
513 if(bp->bio_track != NULL) {
514 dsched_debug(LOG_INFO,
515 "dsched_strategy_raw sees non-NULL bio_track!! "
517 bp->bio_track = NULL;
519 dev_dstrategy(dp->d_rawdev, bp);
/*
 * Issue a request synchronously through a second buffer: fields of the
 * original buf are copied into nbp, the new bio is dispatched to the
 * raw device and waited on, and the result is copied back.
 * NOTE(review): how bp, nbp and nbio are obtained and released is not
 * visible in this listing.
 *
 * dp  - target disk.
 * bio - original request; its bio_offset is reused for the new bio.
 */
523 dsched_strategy_sync(struct disk *dp, struct bio *bio)
525 struct buf *bp, *nbp;
/* Mirror the command, sizes and data pointer of the original buffer. */
533 nbp->b_cmd = bp->b_cmd;
534 nbp->b_bufsize = bp->b_bufsize;
535 nbp->b_runningbufspace = bp->b_runningbufspace;
536 nbp->b_bcount = bp->b_bcount;
537 nbp->b_resid = bp->b_resid;
538 nbp->b_data = bp->b_data;
541 * Buffers undergoing device I/O do not need a kvabase/size.
543 nbp->b_kvabase = bp->b_kvabase;
544 nbp->b_kvasize = bp->b_kvasize;
546 nbp->b_dirtyend = bp->b_dirtyend;
/* Completion goes through biodone_sync with BIO_SYNC set, so that biowait() below can wait on it. */
548 nbio->bio_done = biodone_sync;
549 nbio->bio_flags |= BIO_SYNC;
550 nbio->bio_track = NULL;
552 nbio->bio_caller_info1.ptr = dp;
553 nbio->bio_offset = bio->bio_offset;
555 dev_dstrategy(dp->d_rawdev, nbio);
556 biowait(nbio, "dschedsync");
/* Propagate the outcome back to the original buffer. */
557 bp->b_resid = nbp->b_resid;
558 bp->b_error = nbp->b_error;
561 nbp->b_kvabase = NULL;
/*
 * Dispatch a request asynchronously with a caller-supplied completion
 * callback.
 *
 * dp   - target disk; recorded on the pushed bio via dsched_set_bio_dp().
 * bio  - request; a new bio layer is pushed on top of it.
 * done - completion callback installed on the pushed bio.
 * priv - opaque pointer recorded via dsched_set_bio_priv().
 */
568 dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
572 nbio = push_bio(bio);
573 nbio->bio_done = done;
574 nbio->bio_offset = bio->bio_offset;
576 dsched_set_bio_dp(nbio, dp);
577 dsched_set_bio_priv(nbio, priv);
/* Timestamp taken just before dispatch and stored in the bio. */
579 getmicrotime(&nbio->bio_caller_info3.tv);
580 dev_dstrategy(dp->d_rawdev, nbio);
584 * A special bio done call back function
585 * used by policies that implement request polling.
/*
 * Completion callback installed by dsched_strategy_request_polling().
 * Holds a reference on the disk context while it runs the policy's
 * optional bio_done hook, decrements current_tag_queue_depth and calls
 * the policy's polling function.
 *
 * bp - completed bio; the disk is recovered with dsched_get_bio_dp().
 */
588 request_polling_biodone(struct bio *bp)
590 struct dsched_disk_ctx *diskctx = NULL;
591 struct disk *dp = NULL;
593 struct dsched_policy *policy;
595 dp = dsched_get_bio_dp(bp);
596 policy = dp->d_sched_policy;
597 diskctx = dsched_get_disk_priv(dp);
598 KKASSERT(diskctx && policy);
599 dsched_disk_ctx_ref(diskctx);
603 * the bio_done function should not be blocked !
/* The hook is re-read through diskctx->dp here rather than taken from the `policy` local. */
605 if (diskctx->dp->d_sched_policy->bio_done)
606 diskctx->dp->d_sched_policy->bio_done(bp);
/* Balances the increment made in dsched_strategy_request_polling(). */
611 atomic_subtract_int(&diskctx->current_tag_queue_depth, 1);
613 /* call the polling function,
615 * the polling function should not be blocked!
617 if (policy->polling_func)
618 policy->polling_func(diskctx);
/* NOTE(review): presumably the else-branch of the test above (the `else` line is not visible). */
620 dsched_debug(0, "dsched: the policy uses request polling without a polling function!\n");
621 dsched_disk_ctx_unref(diskctx);
625 * A special dsched strategy used by policy having request polling
626 * (polling function) implemented.
628 * The strategy is just like dsched_strategy_async(), but
629 * the biodone call back is set to a preset one.
631 * If the policy needs its own biodone callback, it should
632 * register it in the policy structure. (bio_done field)
634 * The current_tag_queue_depth is maintained by this function
635 * and the request_polling_biodone() function
/*
 * Asynchronous dispatch for policies that use request polling: bumps
 * diskctx->current_tag_queue_depth and dispatches with
 * request_polling_biodone as the completion callback, passing along the
 * bio's existing private pointer.
 */
639 dsched_strategy_request_polling(struct disk *dp, struct bio *bio, struct dsched_disk_ctx *diskctx)
641 atomic_add_int(&diskctx->current_tag_queue_depth, 1);
642 dsched_strategy_async(dp, bio, request_polling_biodone, dsched_get_bio_priv(bio));
646 * Ref and deref various structures. The 1->0 transition of the reference
647 * count actually transitions 1->0x80000000 and causes the object to be
648 * destroyed. It is possible for transitory references to occur on the
649 * object while it is being destroyed. We use bit 31 to indicate that
650 * destruction is in progress and to prevent nested destructions.
/*
 * Take one reference on a disk context. The add is atomic and the
 * previous count is captured in `refcount`.
 * NOTE(review): what is done with the captured value is not visible here.
 */
653 dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
657 refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
/*
 * Take one reference on a thread-io context. The add is atomic and the
 * previous count is captured in `refcount`.
 * NOTE(review): what is done with the captured value is not visible here.
 */
661 dsched_thread_io_ref(struct dsched_thread_io *tdio)
665 refcount = atomic_fetchadd_int(&tdio->refcount, 1);
/*
 * Take one reference on a thread context. The add is atomic and the
 * previous count is captured in `refcount`.
 * NOTE(review): what is done with the captured value is not visible here.
 */
669 dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
673 refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
/*
 * Drop one reference on a disk context using compare-and-set on a
 * snapshot of the count. dsched_disk_ctx_destroy() is called only from
 * the path whose compare-and-set succeeds.
 * NOTE(review): the loop and the computation of nrefs are not visible
 * in this listing.
 */
677 dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
683 * Handle 1->0 transitions for diskctx and nested destruction
684 * recursions. If the refs are already in destruction mode (bit 31
685 * set) on the 1->0 transition we don't try to destruct it again.
687 * 0x80000001->0x80000000 transitions are handled normally and
688 * thus avoid nested dstruction.
/* Snapshot; the compare-and-set below only succeeds if the count is still this value. */
691 refs = diskctx->refcount;
/* Bit 31 must be the same in the old and the new value on this path. */
695 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
697 if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
702 if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
703 dsched_disk_ctx_destroy(diskctx);
/*
 * Tear down a disk context: unlink every thread-io context still on its
 * list (dropping a reference on each), call the policy's optional
 * destroy_diskctx hook and return the object to its cache.
 */
711 dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
713 struct dsched_thread_io *tdio;
/* NOTE(review): presumably debug-only; any conditional compilation around it is not visible here. */
716 kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
719 lockmgr(&diskctx->lock, LK_EXCLUSIVE);
/* Drain the list from the head; each tdio loses its disk linkage before its reference is dropped. */
720 while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
721 KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
722 TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
723 atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
724 tdio->diskctx = NULL;
725 /* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
726 dsched_thread_io_unref(tdio);
728 lockmgr(&diskctx->lock, LK_RELEASE);
729 if (diskctx->dp->d_sched_policy->destroy_diskctx)
730 diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
/* Only the destruction bit may remain in the count at this point. */
731 KKASSERT(diskctx->refcount == 0x80000000);
732 objcache_put(dsched_diskctx_cache, diskctx);
733 atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
/*
 * Drop one reference on a thread-io context using compare-and-set on a
 * snapshot of the count. dsched_thread_io_destroy() is called only from
 * the path whose compare-and-set succeeds.
 * NOTE(review): the loop and the computation of nrefs are not visible
 * in this listing.
 */
737 dsched_thread_io_unref(struct dsched_thread_io *tdio)
743 * Handle 1->0 transitions for tdio and nested destruction
744 * recursions. If the refs are already in destruction mode (bit 31
745 * set) on the 1->0 transition we don't try to destruct it again.
747 * 0x80000001->0x80000000 transitions are handled normally and
748 * thus avoid nested dstruction.
/* Snapshot; the compare-and-set below only succeeds if the count is still this value. */
751 refs = tdio->refcount;
/* Bit 31 must be the same in the old and the new value on this path. */
755 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
757 if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
762 if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
763 dsched_thread_io_destroy(tdio);
/*
 * Tear down a thread-io context: unlink it from its disk context and
 * from its thread context (each under that object's lock, with a
 * temporary reference held), then return it to its cache.
 * The queue must already be empty (asserted).
 */
770 dsched_thread_io_destroy(struct dsched_thread_io *tdio)
772 struct dsched_thread_ctx *tdctx;
773 struct dsched_disk_ctx *diskctx;
/* NOTE(review): presumably debug-only; any conditional compilation around it is not visible here. */
776 kprintf("tdio (%p) destruction started, trace:\n", tdio);
779 KKASSERT(tdio->qlength == 0);
/* Unlink from the disk context, re-checking the pointer once the lock is held. */
781 while ((diskctx = tdio->diskctx) != NULL) {
782 dsched_disk_ctx_ref(diskctx);
783 lockmgr(&diskctx->lock, LK_EXCLUSIVE);
/* tdio->diskctx changed while the lock was being taken: undo and look again. */
784 if (diskctx != tdio->diskctx) {
785 lockmgr(&diskctx->lock, LK_RELEASE);
786 dsched_disk_ctx_unref(diskctx);
789 KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
790 if (diskctx->dp->d_sched_policy->destroy_tdio)
791 diskctx->dp->d_sched_policy->destroy_tdio(tdio);
792 TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
793 atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
794 tdio->diskctx = NULL;
795 lockmgr(&diskctx->lock, LK_RELEASE);
796 dsched_disk_ctx_unref(diskctx);
/* Same pattern for the thread-context linkage. */
798 while ((tdctx = tdio->tdctx) != NULL) {
799 dsched_thread_ctx_ref(tdctx);
800 lockmgr(&tdctx->lock, LK_EXCLUSIVE);
801 if (tdctx != tdio->tdctx) {
802 lockmgr(&tdctx->lock, LK_RELEASE);
803 dsched_thread_ctx_unref(tdctx);
806 KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
807 TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
808 atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
810 lockmgr(&tdctx->lock, LK_RELEASE);
811 dsched_thread_ctx_unref(tdctx);
/* Only the destruction bit may remain in the count at this point. */
813 KKASSERT(tdio->refcount == 0x80000000);
814 objcache_put(dsched_tdio_cache, tdio);
815 atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
/* NOTE(review): the first loop's condition leaves diskctx NULL on normal exit, so this call is presumably compiled out in the full file - confirm. */
817 dsched_disk_ctx_unref(diskctx);
/*
 * Drop one reference on a thread context using compare-and-set on a
 * snapshot of the count. dsched_thread_ctx_destroy() is called only
 * from the path whose compare-and-set succeeds.
 * NOTE(review): the loop and the computation of nrefs are not visible
 * in this listing.
 */
822 dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
828 * Handle 1->0 transitions for tdctx and nested destruction
829 * recursions. If the refs are already in destruction mode (bit 31
830 * set) on the 1->0 transition we don't try to destruct it again.
832 * 0x80000001->0x80000000 transitions are handled normally and
833 * thus avoid nested dstruction.
/* Snapshot; the compare-and-set below only succeeds if the count is still this value. */
836 refs = tdctx->refcount;
/* Bit 31 must be the same in the old and the new value on this path. */
840 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
842 if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
847 if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
848 dsched_thread_ctx_destroy(tdctx);
/*
 * Tear down a thread context: with the global thread-context lock and
 * the context's own lock held, unlink every thread-io context on its
 * list (dropping a reference on each) and remove the context from
 * dsched_tdctx_list; then return it to its cache.
 */
855 dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
857 struct dsched_thread_io *tdio;
/* NOTE(review): presumably debug-only; any conditional compilation around it is not visible here. */
860 kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
/* Global lock is taken before the per-context lock. */
863 DSCHED_GLOBAL_THREAD_CTX_LOCK();
865 lockmgr(&tdctx->lock, LK_EXCLUSIVE);
867 while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
868 KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
869 TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
870 atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
872 dsched_thread_io_unref(tdio);
/* Only the destruction bit may remain in the count at this point. */
874 KKASSERT(tdctx->refcount == 0x80000000);
875 TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);
877 lockmgr(&tdctx->lock, LK_RELEASE);
879 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
881 objcache_put(dsched_tdctx_cache, tdctx);
882 atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
/*
 * Allocate a thread-io context tying a thread context to a disk, and
 * link it into both the disk context's and the thread context's lists.
 * NOTE(review): several assignments and the guards around the two
 * linking sections are not visible in this listing.
 *
 * dp    - disk the new context belongs to.
 * tdctx - thread context to link the new object into.
 * pol   - policy recorded in the debug_policy field.
 */
885 struct dsched_thread_io *
886 dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
887 struct dsched_policy *pol)
889 struct dsched_thread_io *tdio;
/* The new tdio will point at the disk context, so a reference is taken on it first. */
891 dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
/* The whole maximum-size object is zeroed, so the count starts at 0 before the reference below. */
893 tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
894 bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);
896 /* XXX: maybe we do need another ref for the disk list for tdio */
897 dsched_thread_io_ref(tdio);
899 DSCHED_THREAD_IO_LOCKINIT(tdio);
902 tdio->diskctx = dsched_get_disk_priv(dp);
903 TAILQ_INIT(&tdio->queue);
/* Disk-side link: list insertion and flag update both happen under the diskctx lock. */
908 lockmgr(&tdio->diskctx->lock, LK_EXCLUSIVE);
909 TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
910 atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
911 lockmgr(&tdio->diskctx->lock, LK_RELEASE);
917 /* Put the tdio in the tdctx list */
918 DSCHED_THREAD_CTX_LOCK(tdctx);
919 TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
920 DSCHED_THREAD_CTX_UNLOCK(tdctx);
/* NOTE(review): unlike the disk-side flag, this one is set after the lock is dropped - confirm this is intended. */
921 atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
/* Debug stamps checked by the assertions in dsched_queue(). */
924 tdio->debug_policy = pol;
925 tdio->debug_inited = 0xF00F1234;
927 atomic_add_int(&dsched_stats.tdio_allocations, 1);
/*
 * Allocate and initialise a disk context for `dp` under policy `pol`.
 * The object is zeroed, given one reference, and handed to the policy's
 * optional new_diskctx hook.
 * NOTE(review): the assignment of dp into the context and the return
 * statement are not visible in this listing.
 */
932 struct dsched_disk_ctx *
933 dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
935 struct dsched_disk_ctx *diskctx;
937 diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
938 bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
/* First reference on the zeroed object; policy_destroy() drops it (its "from alloc" unref). */
939 dsched_disk_ctx_ref(diskctx);
941 DSCHED_DISK_CTX_LOCKINIT(diskctx);
942 TAILQ_INIT(&diskctx->tdio_list);
944 * XXX: magic number 32: most device has a tag queue
946 * Better to retrive more precise value from the driver
948 diskctx->max_tag_queue_depth = 32;
949 diskctx->current_tag_queue_depth = 0;
951 atomic_add_int(&dsched_stats.diskctx_allocations, 1);
952 if (pol->new_diskctx)
953 pol->new_diskctx(diskctx);
/*
 * Allocate a thread context, create one thread-io context per existing
 * disk for it, and append it to the global dsched_tdctx_list.
 * NOTE(review): any use of `p` and the return statement are not visible
 * in this listing.
 *
 * p - owning process; callers in this file pass NULL for kernel threads.
 */
958 struct dsched_thread_ctx *
959 dsched_thread_ctx_alloc(struct proc *p)
961 struct dsched_thread_ctx *tdctx;
962 struct dsched_thread_io *tdio;
963 struct disk *dp = NULL;
965 tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
966 bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
/* First reference on the zeroed object; the exit hooks drop it (their "one for alloc" unref). */
967 dsched_thread_ctx_ref(tdctx);
/* NOTE(review): presumably debug-only; any conditional compilation around it is not visible here. */
969 kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
971 DSCHED_THREAD_CTX_LOCKINIT(tdctx);
972 TAILQ_INIT(&tdctx->tdio_list);
/* Disk enumeration and the list insertion both happen under the global thread-context lock. */
975 DSCHED_GLOBAL_THREAD_CTX_LOCK();
976 while ((dp = disk_enumerate(dp))) {
977 tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
980 TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
981 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
983 atomic_add_int(&dsched_stats.tdctx_allocations, 1);
984 /* XXX: no callback here */
/*
 * Create the per-disk state for policy `pol` on disk `dp`: allocate a
 * disk context, publish it as the disk's private pointer, and allocate
 * a thread-io context for every thread context on dsched_tdctx_list.
 * The one visible caller, dsched_set_policy(), holds the global
 * thread-context lock around this call.
 */
989 policy_new(struct disk *dp, struct dsched_policy *pol) {
990 struct dsched_thread_ctx *tdctx;
991 struct dsched_disk_ctx *diskctx;
992 struct dsched_thread_io *tdio;
/* The context carries two references after these calls; policy_destroy() drops two. */
994 diskctx = dsched_disk_ctx_alloc(dp, pol);
995 dsched_disk_ctx_ref(diskctx);
996 dsched_set_disk_priv(dp, diskctx);
998 TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
999 tdio = dsched_thread_io_alloc(dp, tdctx, pol);
/*
 * Release the per-disk policy state of `dp`: drops two references on
 * the disk context (asserted non-NULL) and clears the disk's private
 * pointer.
 */
1004 policy_destroy(struct disk *dp) {
1005 struct dsched_disk_ctx *diskctx;
1007 diskctx = dsched_get_disk_priv(dp);
1008 KKASSERT(diskctx != NULL);
1010 dsched_disk_ctx_unref(diskctx); /* from prepare */
1011 dsched_disk_ctx_unref(diskctx); /* from alloc */
1013 dsched_set_disk_priv(dp, NULL);
/*
 * Attach the current thread context to a buf: the context is taken from
 * the current process when there is one, otherwise from the current
 * thread; a reference is taken and the pointer stored in the buf.
 * NOTE(review): the statement under the dsched_inited test and the
 * guards around the assert and the ref are not visible here.
 */
1017 dsched_new_buf(struct buf *bp)
1019 struct dsched_thread_ctx *tdctx = NULL;
1021 if (dsched_inited == 0)
1024 if (curproc != NULL) {
1025 tdctx = dsched_get_proc_priv(curproc);
1027 /* This is a kernel thread, so no proc info is available */
1028 tdctx = dsched_get_thread_priv(curthread);
1033 * XXX: hack. we don't want this assert because we aren't catching all
1034 * threads. mi_startup() is still getting away without an tdctx.
1037 /* by now we should have an tdctx. if not, something bad is going on */
1038 KKASSERT(tdctx != NULL);
/* This reference is released by dsched_queue() or dsched_exit_buf(). */
1042 dsched_thread_ctx_ref(tdctx);
1044 dsched_set_buf_priv(bp, tdctx);
/*
 * Detach the thread context from a buf: if the buf still carries one,
 * the pointer is cleared and the context's reference dropped.
 */
1048 dsched_exit_buf(struct buf *bp)
1050 struct dsched_thread_ctx *tdctx;
1052 tdctx = dsched_get_buf_priv(bp);
1053 if (tdctx != NULL) {
1054 dsched_clr_buf_priv(bp);
1055 dsched_thread_ctx_unref(tdctx);
/*
 * Create a thread context for a new process and store it as the
 * process's private pointer. The context ends up with two references:
 * one from the allocation and one taken here.
 * NOTE(review): the statement under the dsched_inited test is not
 * visible; presumably an early return.
 */
1060 dsched_new_proc(struct proc *p)
1062 struct dsched_thread_ctx *tdctx;
1064 if (dsched_inited == 0)
1067 KKASSERT(p != NULL);
1069 tdctx = dsched_thread_ctx_alloc(p);
1071 dsched_thread_ctx_ref(tdctx);
1073 dsched_set_proc_priv(p, tdctx);
1074 atomic_add_int(&dsched_stats.nprocs, 1);
/*
 * Create a thread context for a new thread (allocated with a NULL proc)
 * and store it as the thread's private pointer. The context ends up
 * with two references: one from the allocation and one taken here.
 * NOTE(review): the statement under the dsched_inited test is not
 * visible; presumably an early return.
 */
1079 dsched_new_thread(struct thread *td)
1081 struct dsched_thread_ctx *tdctx;
1083 if (dsched_inited == 0)
1086 KKASSERT(td != NULL);
1088 tdctx = dsched_thread_ctx_alloc(NULL);
1090 dsched_thread_ctx_ref(tdctx);
1092 dsched_set_thread_priv(td, tdctx);
1093 atomic_add_int(&dsched_stats.nthreads, 1);
/*
 * Release the thread context of an exiting process: the context is
 * marked dead, detached from the process, and both references taken at
 * creation time are dropped.
 * NOTE(review): the statement under the dsched_inited test is not
 * visible; presumably an early return.
 */
1097 dsched_exit_proc(struct proc *p)
1099 struct dsched_thread_ctx *tdctx;
1101 if (dsched_inited == 0)
1104 KKASSERT(p != NULL);
1106 tdctx = dsched_get_proc_priv(p);
1107 KKASSERT(tdctx != NULL);
/* Marker value written before the process's private pointer is cleared. */
1109 tdctx->dead = 0xDEAD;
1110 dsched_set_proc_priv(p, NULL);
1112 dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1113 dsched_thread_ctx_unref(tdctx); /* one for ref */
1114 atomic_subtract_int(&dsched_stats.nprocs, 1);
/*
 * Release the thread context of an exiting thread: the context is
 * marked dead, detached from the thread, and both references taken at
 * creation time are dropped.
 * NOTE(review): the statement under the dsched_inited test is not
 * visible; presumably an early return.
 */
1119 dsched_exit_thread(struct thread *td)
1121 struct dsched_thread_ctx *tdctx;
1123 if (dsched_inited == 0)
1126 KKASSERT(td != NULL);
1128 tdctx = dsched_get_thread_priv(td);
1129 KKASSERT(tdctx != NULL);
/* Marker value written before the thread's private pointer is cleared. */
1131 tdctx->dead = 0xDEAD;
1132 dsched_set_thread_priv(td, 0);
1134 dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1135 dsched_thread_ctx_unref(tdctx); /* one for ref */
1136 atomic_subtract_int(&dsched_stats.nthreads, 1);
/*
 * Allocate a thread-io context for the calling thread on the disk that
 * `diskctx` belongs to, under the global thread-context lock. The
 * calling thread must already have a thread context (asserted).
 * NOTE(review): the return statement is not visible; presumably the new
 * tdio is returned.
 */
1139 struct dsched_thread_io *
1140 dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
1141 struct dsched_policy *pol) {
1142 struct dsched_thread_ctx *tdctx;
1143 struct dsched_thread_io *tdio;
1145 DSCHED_GLOBAL_THREAD_CTX_LOCK();
1147 tdctx = dsched_get_thread_priv(curthread);
1148 KKASSERT(tdctx != NULL);
1149 tdio = dsched_thread_io_alloc(diskctx->dp, tdctx, pol);
1151 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
1156 /* DEFAULT NOOP POLICY */
/* prepare hook of the noop policy. NOTE(review): the body is not visible in this listing. */
1159 noop_prepare(struct dsched_disk_ctx *diskctx)
/* teardown hook of the noop policy. NOTE(review): the body is not visible in this listing. */
1165 noop_teardown(struct dsched_disk_ctx *diskctx)
/* cancel_all hook of the noop policy. NOTE(review): the body is not visible in this listing. */
1171 noop_cancel(struct dsched_disk_ctx *diskctx)
/*
 * bio_queue hook of the noop policy: passes the request on to the disk
 * without any scheduling.
 * NOTE(review): two dispatch calls appear back to back below;
 * presumably only one is compiled in the full file - confirm.
 */
1177 noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
1180 dsched_strategy_raw(diskctx->dp, bio);
1182 dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
/*
 * Framework initialisation body: creates the three object caches,
 * zeroes the statistics, initialises the locks and registers the
 * built-in noop policy.
 * NOTE(review): the function header is not visible in this listing;
 * presumably this is dsched_init(), which the SYSINIT entry references.
 */
1193 dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
1195 objcache_malloc_alloc,
1196 objcache_malloc_free,
1197 &dsched_thread_io_malloc_args );
1199 dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
1201 objcache_malloc_alloc,
1202 objcache_malloc_free,
1203 &dsched_thread_ctx_malloc_args );
1205 dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
1207 objcache_malloc_alloc,
1208 objcache_malloc_free,
1209 &dsched_disk_ctx_malloc_args );
1211 bzero(&dsched_stats, sizeof(struct dsched_stats));
/* LK_CANRECURSE: the callbacks in this file take dsched_lock and then call helpers that take it again. */
1213 lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
1214 DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();
1216 dsched_register(&dsched_noop_policy);
/* Hook dsched_init/dsched_uninit into kernel startup and shutdown, one step before SI_SUB_CREATE_INIT. */
1226 SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
1227 SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
/* Sysctl handler that exports the whole dsched_stats structure as an opaque blob. */
1233 sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
1235 return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
/*
 * Sysctl handler that writes the names of all registered policies,
 * separated by single spaces, followed by a terminating NUL byte.
 * The policy list is walked with dsched_lock held.
 * NOTE(review): the error checks and the handling of `first` are not
 * visible in this listing.
 */
1239 sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
1241 struct dsched_policy *pol = NULL;
1242 int error, first = 1;
1244 lockmgr(&dsched_lock, LK_EXCLUSIVE);
1246 while ((pol = dsched_policy_enumerate(pol))) {
/* Separator between names. */
1248 error = SYSCTL_OUT(req, " ", 1);
1254 error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
1260 lockmgr(&dsched_lock, LK_RELEASE);
/* Length 1 on an empty string literal emits just the NUL terminator. */
1262 error = SYSCTL_OUT(req, "", 1);
/*
 * Per-disk sysctl handler: reports the name of the disk's current
 * policy and, on write, switches the disk to the named policy.
 * arg1 is the disk context registered by dsched_sysctl_add_disk().
 * Each visible exit path releases dsched_lock.
 * NOTE(review): the return statements and the test on the lookup result
 * are not visible in this listing.
 */
1268 sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
1270 char buf[DSCHED_POLICY_NAME_LENGTH];
1271 struct dsched_disk_ctx *diskctx = arg1;
1272 struct dsched_policy *pol = NULL;
1275 if (diskctx == NULL) {
1279 lockmgr(&dsched_lock, LK_EXCLUSIVE);
/* buf starts out holding the current policy name and receives the new one on write. */
1281 pol = diskctx->dp->d_sched_policy;
1282 memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1284 error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
/* Read-only request or failure: nothing to switch. */
1285 if (error || req->newptr == NULL) {
1286 lockmgr(&dsched_lock, LK_RELEASE);
1290 pol = dsched_find_policy(buf);
/* NOTE(review): presumably the "policy not found" exit. */
1292 lockmgr(&dsched_lock, LK_RELEASE);
1296 dsched_switch(diskctx->dp, pol);
1298 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Sysctl handler for the default policy: reports the name of
 * default_policy and, on write, replaces default_policy with the named
 * policy. Each visible exit path releases dsched_lock.
 * NOTE(review): the return statements and the test on the lookup result
 * are not visible in this listing.
 */
1304 sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
1306 char buf[DSCHED_POLICY_NAME_LENGTH];
1307 struct dsched_policy *pol = NULL;
1310 lockmgr(&dsched_lock, LK_EXCLUSIVE);
/* buf starts out holding the current default's name and receives the new one on write. */
1312 pol = default_policy;
1313 memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1315 error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
/* Read-only request or failure: leave the default untouched. */
1316 if (error || req->newptr == NULL) {
1317 lockmgr(&dsched_lock, LK_RELEASE);
1321 pol = dsched_find_policy(buf);
/* NOTE(review): presumably the "policy not found" exit. */
1323 lockmgr(&dsched_lock, LK_RELEASE);
1328 default_policy = pol;
1330 lockmgr(&dsched_lock, LK_RELEASE);
/* Root "dsched" sysctl node. */
1335 SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
1336 "Disk Scheduler Framework (dsched) magic");
/* "dsched.policy" node; dsched_sysctl_add_disk() adds one child per disk under it. */
1337 SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
1338 "List of disks and their policies");
/* Read-write integer bound to dsched_debug_enable. */
1339 SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
1340 0, "Enable dsched debugging");
/* Read-only opaque export of dsched_stats. */
1341 SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
1342 0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
1343 "dsched statistics");
/* Read-only string listing the registered policy names. */
1344 SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
1345 NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
/* Read-write string naming the default policy. */
1346 SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
1347 NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
/*
 * Register a read-write string sysctl named `name` under the
 * "dsched.policy" node, handled by sysctl_dsched_policy with `diskctx`
 * as its argument. The sysctl context is initialised the first time
 * only, tracked by the DSCHED_SYSCTL_CTX_INITED flag.
 * NOTE(review): closing braces are not visible, so it cannot be told
 * from here whether SYSCTL_ADD_PROC runs on every call or only on the
 * first one.
 */
1350 dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
1352 if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
1353 diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
1354 sysctl_ctx_init(&diskctx->sysctl_ctx);
1357 SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
1358 OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
1359 diskctx, 0, sysctl_dsched_policy, "A", "policy");