/*
 * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Alex Hornung <ahornung@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/diskslice.h>
#include <sys/disk.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/msgport.h>
#include <sys/msgport2.h>
#include <sys/buf2.h>
#include <sys/dsched.h>
#include <sys/fcntl.h>
#include <machine/varargs.h>
TAILQ_HEAD(tdio_list_head, dsched_thread_io);

MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");

static dsched_prepare_t		noop_prepare;
static dsched_teardown_t	noop_teardown;
static dsched_cancel_t		noop_cancel;
static dsched_queue_t		noop_queue;

static void dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio);
static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);
static void policy_new(struct disk *dp, struct dsched_policy *pol);
static void policy_destroy(struct disk *dp);

static struct dsched_thread_io *dsched_thread_io_alloc(
		struct disk *dp, struct dsched_thread_ctx *tdctx,
		struct dsched_policy *pol, int tdctx_locked);

static int	dsched_inited = 0;
static int	default_set = 0;

struct lock	dsched_lock;
static int	dsched_debug_enable = 0;

struct dsched_stats	dsched_stats;

struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_io_malloc_args = {
	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };

static struct objcache	*dsched_diskctx_cache;
static struct objcache	*dsched_tdctx_cache;
static struct objcache	*dsched_tdio_cache;

struct lock	dsched_tdctx_lock;

static struct dsched_policy_head dsched_policy_list =
	TAILQ_HEAD_INITIALIZER(dsched_policy_list);
static struct dsched_policy dsched_noop_policy = {
	.name = "noop",

	.prepare = noop_prepare,
	.teardown = noop_teardown,
	.cancel_all = noop_cancel,
	.bio_queue = noop_queue
};

static struct dsched_policy *default_policy = &dsched_noop_policy;
/*
 * dsched_debug() is a SYSCTL and TUNABLE controlled debug output function
 * using kvprintf
 */
int
dsched_debug(int level, char *fmt, ...)
{
	__va_list ap;

	__va_start(ap, fmt);
	if (level <= dsched_debug_enable)
		kvprintf(fmt, ap);
	__va_end(ap);

	return 0;
}
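
/*
 * Illustrative note (not in the original source): with the default
 * dsched.debug level of 0, only dsched_debug(0, ...) messages are printed.
 * Raising the sysctl or loader tunable, e.g.
 *
 *	sysctl dsched.debug=7
 *
 * also lets higher levels such as LOG_INFO (6) through, since a message
 * is emitted whenever level <= dsched.debug.
 */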
/*
 * Called on disk_create().
 * Tries to read which policy to use from loader.conf; if none is
 * specified, the default policy is used.
 */
void
dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	char *ptr;
	struct dsched_policy *policy = NULL;

	/* Also look for serno stuff? */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key),
	    "dsched.policy.%s%d", head_name, unit);
	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key),
	    "dsched.policy.%s", head_name);
	for (ptr = tunable_key; *ptr; ptr++) {
		if (*ptr == '/')
			*ptr = '-';
	}
	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
	if (!policy && !default_set &&
	    (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (!policy) {
		if (!default_set && bootverbose) {
			dsched_debug(0,
			    "No policy for %s%d specified, "
			    "or policy not found\n",
			    head_name, unit);
		}
		dsched_set_policy(dp, default_policy);
	} else {
		dsched_set_policy(dp, policy);
	}

	if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
		ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
	else
		ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
	for (ptr = tunable_key; *ptr; ptr++) {
		if (*ptr == '/')
			*ptr = '-';
	}
	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    tunable_key);

	lockmgr(&dsched_lock, LK_RELEASE);
}
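
/*
 * Example (illustrative; "da0" and the "fq" policy module are assumptions,
 * not part of this file): the tunables probed above map to loader.conf
 * entries, most specific first:
 *
 *	dsched.policy.da0="fq"		(per unit)
 *	dsched.policy.da="fq"		(per driver/head name)
 *	dsched.policy.default="fq"	(global default)
 */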
/*
 * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
 * there's any policy associated with the serial number of the device.
 */
void
dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	if (info->d_serialno == NULL)
		return;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    info->d_serialno);

	if ((TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (policy != NULL)
		dsched_switch(dp, policy);

	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    info->d_serialno);

	lockmgr(&dsched_lock, LK_RELEASE);
}
/*
 * Called on disk_destroy().
 * Shuts down the scheduler core and cancels all remaining bios.
 */
void
dsched_disk_destroy_callback(struct disk *dp)
{
	struct dsched_policy *old_policy;
	struct dsched_disk_ctx *diskctx;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	diskctx = dsched_get_disk_priv(dp);

	old_policy = dp->d_sched_policy;
	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->cancel_all(dsched_get_disk_priv(dp));
	old_policy->teardown(dsched_get_disk_priv(dp));

	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
		sysctl_ctx_free(&diskctx->sysctl_ctx);

	policy_destroy(dp);
	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	lockmgr(&dsched_lock, LK_RELEASE);
}
/*
 * Caller must have dp->diskctx locked
 */
void
dsched_queue(struct disk *dp, struct bio *bio)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_thread_io *tdio;
	struct dsched_disk_ctx *diskctx;
	int error;

	if (dp->d_sched_policy == &dsched_noop_policy) {
		dsched_clr_buf_priv(bio->bio_buf);
		atomic_add_int(&dsched_stats.no_tdctx, 1);
		dsched_strategy_raw(dp, bio);
		return;
	}

	tdctx = dsched_get_buf_priv(bio->bio_buf);
	if (tdctx == NULL) {
		/* We don't handle this case, let dsched dispatch */
		atomic_add_int(&dsched_stats.no_tdctx, 1);
		dsched_strategy_raw(dp, bio);
		return;
	}

	DSCHED_THREAD_CTX_LOCK(tdctx);

	/*
	 * XXX:
	 * iterate in reverse to make sure we find the most up-to-date
	 * tdio for a given disk. After a switch it may take some time
	 * for everything to clean up.
	 */
	TAILQ_FOREACH_REVERSE(tdio, &tdctx->tdio_list, tdio_list_head, link) {
		if (tdio->dp == dp) {
			dsched_thread_io_ref(tdio);
			break;
		}
	}
	if (tdio == NULL) {
		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy, 1);
		dsched_thread_io_ref(tdio);
	}

	DSCHED_THREAD_CTX_UNLOCK(tdctx);
	dsched_clr_buf_priv(bio->bio_buf);
	dsched_thread_ctx_unref(tdctx);	/* acquired on new_buf */

	diskctx = dsched_get_disk_priv(dp);
	dsched_disk_ctx_ref(diskctx);

	if (dp->d_sched_policy != &dsched_noop_policy)
		KKASSERT(tdio->debug_policy == dp->d_sched_policy);

	KKASSERT(tdio->debug_inited == 0xF00F1234);

	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);

	if (error) {
		dsched_strategy_raw(dp, bio);
	}
	dsched_disk_ctx_unref(diskctx);
	dsched_thread_io_unref(tdio);
}
/*
 * Called from each module_init or module_attach of each policy;
 * registers the policy in the local policy list.
 */
int
dsched_register(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;
	int error = 0;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	policy = dsched_find_policy(d_policy->name);

	if (!policy) {
		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
		atomic_add_int(&d_policy->ref_count, 1);
	} else {
		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
		    d_policy->name);
		error = EEXIST;
	}

	lockmgr(&dsched_lock, LK_RELEASE);
	return error;
}
/*
 * Called from each module_detach of each policy;
 * unregisters the policy.
 */
int
dsched_unregister(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	policy = dsched_find_policy(d_policy->name);

	if (policy) {
		if (policy->ref_count > 1) {
			lockmgr(&dsched_lock, LK_RELEASE);
			return 1;
		}
		TAILQ_REMOVE(&dsched_policy_list, policy, link);
		atomic_subtract_int(&policy->ref_count, 1);
		KKASSERT(policy->ref_count == 0);
	}
	lockmgr(&dsched_lock, LK_RELEASE);

	return 0;
}
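
/*
 * Illustrative sketch (hypothetical "mypol" policy; the field set mirrors
 * dsched_noop_policy above): a policy module pairs dsched_register() in
 * its attach path with dsched_unregister() in its detach path:
 *
 *	static struct dsched_policy mypol_policy = {
 *		.name = "mypol",
 *		.prepare = mypol_prepare,
 *		.teardown = mypol_teardown,
 *		.cancel_all = mypol_cancel,
 *		.bio_queue = mypol_queue
 *	};
 *
 *	error = dsched_register(&mypol_policy);
 *	...
 *	error = dsched_unregister(&mypol_policy);
 *
 * dsched_unregister() refuses (returns 1) while the policy is still in
 * use, i.e. while its ref_count is above 1.
 */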
/*
 * Switches the policy by first removing the old one and then
 * enabling the new one.
 */
void
dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
{
	struct dsched_policy *old_policy;

	/* If we are asked to set the same policy, do nothing */
	if (dp->d_sched_policy == new_policy)
		return;

	/* lock everything down, diskwise */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	old_policy = dp->d_sched_policy;

	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->teardown(dsched_get_disk_priv(dp));
	policy_destroy(dp);

	/* Bring everything back to life */
	dsched_set_policy(dp, new_policy);
	lockmgr(&dsched_lock, LK_RELEASE);
}
/*
 * Loads a given policy and attaches it to the specified disk.
 * Also initializes the core for the policy.
 */
void
dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
{
	int locked = 0;

	/* Check if it is locked already. if not, we acquire the devfs lock */
	if ((lockstatus(&dsched_lock, curthread)) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	DSCHED_GLOBAL_THREAD_CTX_LOCK();

	policy_new(dp, new_policy);
	new_policy->prepare(dsched_get_disk_priv(dp));
	dp->d_sched_policy = new_policy;
	atomic_add_int(&new_policy->ref_count, 1);

	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
	    new_policy->name);

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);
}
struct dsched_policy *
dsched_find_policy(char *search)
{
	struct dsched_policy *policy;
	struct dsched_policy *policy_found = NULL;
	int locked = 0;

	/* Check if it is locked already. if not, we acquire the devfs lock */
	if ((lockstatus(&dsched_lock, curthread)) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
		if (!strcmp(policy->name, search)) {
			policy_found = policy;
			break;
		}
	}

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);

	return policy_found;
}
struct disk *
dsched_find_disk(char *search)
{
	struct disk marker;
	struct disk *dp = NULL;

	while ((dp = disk_enumerate(&marker, dp)) != NULL) {
		if (strcmp(dp->d_cdev->si_name, search) == 0) {
			disk_enumerate_stop(&marker, NULL);
			/* leave ref on dp */
			break;
		}
	}
	return dp;
}
struct disk *
dsched_disk_enumerate(struct disk *marker, struct disk *dp,
		      struct dsched_policy *policy)
{
	while ((dp = disk_enumerate(marker, dp)) != NULL) {
		if (dp->d_sched_policy == policy)
			break;
	}
	return dp;
}
struct dsched_policy *
dsched_policy_enumerate(struct dsched_policy *pol)
{
	if (!pol)
		return (TAILQ_FIRST(&dsched_policy_list));
	else
		return (TAILQ_NEXT(pol, link));
}
void
dsched_cancel_bio(struct bio *bp)
{
	bp->bio_buf->b_error = ENXIO;
	bp->bio_buf->b_flags |= B_ERROR;
	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;

	biodone(bp);
}
void
dsched_strategy_raw(struct disk *dp, struct bio *bp)
{
	/*
	 * Ideally, this stuff shouldn't be needed... but just in case,
	 * we leave it in for now.
	 */
	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
	if (bp->bio_track != NULL) {
		dsched_debug(LOG_INFO,
		    "dsched_strategy_raw sees non-NULL bio_track!! "
		    "bio: %p\n", bp);
		bp->bio_track = NULL;
	}
	dev_dstrategy(dp->d_rawdev, bp);
}
void
dsched_strategy_sync(struct disk *dp, struct bio *bio)
{
	struct buf *bp, *nbp;
	struct bio *nbio;

	bp = bio->bio_buf;

	nbp = getpbuf(NULL);
	nbio = &nbp->b_bio1;

	nbp->b_cmd = bp->b_cmd;
	nbp->b_bufsize = bp->b_bufsize;
	nbp->b_runningbufspace = bp->b_runningbufspace;
	nbp->b_bcount = bp->b_bcount;
	nbp->b_resid = bp->b_resid;
	nbp->b_data = bp->b_data;
#if 0
	/*
	 * Buffers undergoing device I/O do not need a kvabase/size.
	 */
	nbp->b_kvabase = bp->b_kvabase;
	nbp->b_kvasize = bp->b_kvasize;
#endif
	nbp->b_dirtyend = bp->b_dirtyend;

	nbio->bio_done = biodone_sync;
	nbio->bio_flags |= BIO_SYNC;
	nbio->bio_track = NULL;

	nbio->bio_caller_info1.ptr = dp;
	nbio->bio_offset = bio->bio_offset;

	dev_dstrategy(dp->d_rawdev, nbio);
	biowait(nbio, "dschedsync");
	bp->b_resid = nbp->b_resid;
	bp->b_error = nbp->b_error;
	biodone(bio);
#if 0
	nbp->b_kvabase = NULL;
	nbp->b_kvasize = 0;
#endif
	relpbuf(nbp, NULL);
}
void
dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
{
	struct bio *nbio;

	nbio = push_bio(bio);
	nbio->bio_done = done;
	nbio->bio_offset = bio->bio_offset;

	dsched_set_bio_dp(nbio, dp);
	dsched_set_bio_priv(nbio, priv);

	getmicrotime(&nbio->bio_caller_info3.tv);
	dev_dstrategy(dp->d_rawdev, nbio);
}
/*
 * A special bio done callback function,
 * used by policies that implement request polling.
 */
static void
request_polling_biodone(struct bio *bp)
{
	struct dsched_disk_ctx *diskctx = NULL;
	struct disk *dp = NULL;
	struct bio *obio;
	struct dsched_policy *policy;

	dp = dsched_get_bio_dp(bp);
	policy = dp->d_sched_policy;
	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx && policy);
	dsched_disk_ctx_ref(diskctx);

	/*
	 * XXX:
	 * the bio_done function should not be blocked !
	 */
	if (diskctx->dp->d_sched_policy->bio_done) {
		diskctx->dp->d_sched_policy->bio_done(bp);
	} else {
		obio = pop_bio(bp);
		biodone(obio);
	}

	atomic_subtract_int(&diskctx->current_tag_queue_depth, 1);

	/* call the polling function,
	 * XXX:
	 * the polling function should not be blocked!
	 */
	if (policy->polling_func)
		policy->polling_func(diskctx);
	else
		dsched_debug(0, "dsched: the policy uses request polling without a polling function!\n");
	dsched_disk_ctx_unref(diskctx);
}
/*
 * A special dsched strategy, used by policies that implement request
 * polling (a polling function).
 *
 * The strategy is just like dsched_strategy_async(), but
 * the biodone callback is set to a preset one.
 *
 * If the policy needs its own biodone callback, it should
 * register it in the policy structure (bio_done field).
 *
 * The current_tag_queue_depth is maintained by this function
 * and the request_polling_biodone() function.
 */
void
dsched_strategy_request_polling(struct disk *dp, struct bio *bio, struct dsched_disk_ctx *diskctx)
{
	atomic_add_int(&diskctx->current_tag_queue_depth, 1);
	dsched_strategy_async(dp, bio, request_polling_biodone, dsched_get_bio_priv(bio));
}
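
/*
 * Illustrative sketch (hypothetical policy code, not part of this file):
 * a polling policy would dispatch from its polling_func while staying
 * within the disk's tag queue depth.  mypol_next_bio() is an assumed
 * helper that dequeues the policy's next pending bio.
 *
 *	static void
 *	mypol_polling_func(struct dsched_disk_ctx *diskctx)
 *	{
 *		struct bio *bio;
 *
 *		while (diskctx->current_tag_queue_depth <
 *		       diskctx->max_tag_queue_depth &&
 *		       (bio = mypol_next_bio(diskctx)) != NULL) {
 *			dsched_strategy_request_polling(diskctx->dp,
 *							bio, diskctx);
 *		}
 *	}
 */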
663 * Ref and deref various structures. The 1->0 transition of the reference
664 * count actually transitions 1->0x80000000 and causes the object to be
665 * destroyed. It is possible for transitory references to occur on the
666 * object while it is being destroyed. We use bit 31 to indicate that
667 * destruction is in progress and to prevent nested destructions.
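
/*
 * A sketch of the transitions implemented below (illustrative, derived
 * from the unref/destroy functions in this file):
 *
 *	refs > 1:	refs -> refs - 1		plain unref
 *	refs == 1:	1 -> 0x80000000			destruction begins
 *	during destroy:	0x80000000 -> 0x80000001	transitory ref
 *	during destroy:	0x80000001 -> 0x80000000	unref, no nested destroy
 *
 * If the destroy function finds leftover transitory refs it clears bit 31
 * again, so the last holder re-runs the destruction on its final unref.
 */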
void
dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
{
	int refcount __unused;

	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
}

void
dsched_thread_io_ref(struct dsched_thread_io *tdio)
{
	int refcount __unused;

	refcount = atomic_fetchadd_int(&tdio->refcount, 1);
}

void
dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
{
	int refcount __unused;

	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
}
void
dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for diskctx and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = diskctx->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
			dsched_disk_ctx_destroy(diskctx);
			break;
		}
	}
}
static void
dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
{
	struct dsched_thread_io *tdio;
	int refs;
	int nrefs;

#if 0
	kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
	print_backtrace(4);
#endif
	lockmgr(&diskctx->lock, LK_EXCLUSIVE);
	while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
		tdio->diskctx = NULL;
		/* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
		lockmgr(&diskctx->lock, LK_RELEASE);
		dsched_thread_io_unref_destroy(tdio);
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
	}
	lockmgr(&diskctx->lock, LK_RELEASE);

	/*
	 * Expect diskctx->refcount to be 0x80000000.  If it isn't someone
	 * else still has a temporary ref on the diskctx and we have to
	 * transition it back to an undestroyed-state (albeit without any
	 * associations), so the other user destroys it properly when the
	 * ref is released.
	 */
	while ((refs = diskctx->refcount) != 0x80000000) {
		kprintf("dsched_thread_io: destroy race diskctx=%p\n", diskctx);
		cpu_ccfence();
		KKASSERT(refs & 0x80000000);
		nrefs = refs & 0x7FFFFFFF;
		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
			return;
	}

	/*
	 * Really for sure now.
	 */
	if (diskctx->dp->d_sched_policy->destroy_diskctx)
		diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
	objcache_put(dsched_diskctx_cache, diskctx);
	atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
}
void
dsched_thread_io_unref(struct dsched_thread_io *tdio)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for tdio and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = tdio->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
			dsched_thread_io_destroy(tdio);
			break;
		}
	}
}
/*
 * Unref and destroy the tdio even if additional refs are present.
 */
static void
dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio)
{
	int refs;
	int nrefs;

	/*
	 * If not already transitioned to destroy-in-progress we transition
	 * to destroy-in-progress, cleanup our ref, and destroy the tdio.
	 */
	for (;;) {
		refs = tdio->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs & 0x80000000) {
			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs |= 0x80000000;
		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
			dsched_thread_io_destroy(tdio);
			break;
		}
	}
}
static void
dsched_thread_io_destroy(struct dsched_thread_io *tdio)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_disk_ctx *diskctx;
	int refs;
	int nrefs;

#if 0
	kprintf("tdio (%p) destruction started, trace:\n", tdio);
	print_backtrace(8);
#endif
	KKASSERT(tdio->qlength == 0);

	while ((diskctx = tdio->diskctx) != NULL) {
		dsched_disk_ctx_ref(diskctx);
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
		if (diskctx != tdio->diskctx) {
			lockmgr(&diskctx->lock, LK_RELEASE);
			dsched_disk_ctx_unref(diskctx);
			continue;
		}
		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
		if (diskctx->dp->d_sched_policy->destroy_tdio)
			diskctx->dp->d_sched_policy->destroy_tdio(tdio);
		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
		tdio->diskctx = NULL;
		dsched_thread_io_unref(tdio);
		lockmgr(&diskctx->lock, LK_RELEASE);
		dsched_disk_ctx_unref(diskctx);
	}
	while ((tdctx = tdio->tdctx) != NULL) {
		dsched_thread_ctx_ref(tdctx);
		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
		if (tdctx != tdio->tdctx) {
			lockmgr(&tdctx->lock, LK_RELEASE);
			dsched_thread_ctx_unref(tdctx);
			continue;
		}
		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
		tdio->tdctx = NULL;
		dsched_thread_io_unref(tdio);
		lockmgr(&tdctx->lock, LK_RELEASE);
		dsched_thread_ctx_unref(tdctx);
	}

	/*
	 * Expect tdio->refcount to be 0x80000000.  If it isn't someone else
	 * still has a temporary ref on the tdio and we have to transition
	 * it back to an undestroyed-state (albeit without any associations)
	 * so the other user destroys it properly when the ref is released.
	 */
	while ((refs = tdio->refcount) != 0x80000000) {
		kprintf("dsched_thread_io: destroy race tdio=%p\n", tdio);
		cpu_ccfence();
		KKASSERT(refs & 0x80000000);
		nrefs = refs & 0x7FFFFFFF;
		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
			return;
	}

	/*
	 * Really for sure now.
	 */
	objcache_put(dsched_tdio_cache, tdio);
	atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
}
void
dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for tdctx and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = tdctx->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
			dsched_thread_ctx_destroy(tdctx);
			break;
		}
	}
}
static void
dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
{
	struct dsched_thread_io *tdio;

	lockmgr(&tdctx->lock, LK_EXCLUSIVE);

	while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
		tdio->tdctx = NULL;
		lockmgr(&tdctx->lock, LK_RELEASE);	/* avoid deadlock */
		dsched_thread_io_unref_destroy(tdio);
		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
	}
	KKASSERT(tdctx->refcount == 0x80000000);

	lockmgr(&tdctx->lock, LK_RELEASE);

	objcache_put(dsched_tdctx_cache, tdctx);
	atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
}
/*
 * Ensures that a tdio is assigned to tdctx and disk.
 */
static
struct dsched_thread_io *
dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
		       struct dsched_policy *pol, int tdctx_locked)
{
	struct dsched_thread_io *tdio;
#if 0
	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
#endif
	tdio = objcache_get(dsched_tdio_cache, M_INTWAIT);
	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);

	dsched_thread_io_ref(tdio);	/* prevent ripout */
	dsched_thread_io_ref(tdio);	/* for diskctx ref */

	DSCHED_THREAD_IO_LOCKINIT(tdio);
	tdio->dp = dp;

	tdio->diskctx = dsched_get_disk_priv(dp);
	TAILQ_INIT(&tdio->queue);

	if (pol->new_tdio)
		pol->new_tdio(tdio);

	DSCHED_DISK_CTX_LOCK(tdio->diskctx);
	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
	atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
	DSCHED_DISK_CTX_UNLOCK(tdio->diskctx);

	if (tdctx) {
		/*
		 * Put the tdio in the tdctx list.  Inherit the temporary
		 * ref (one ref for each list).
		 */
		if (tdctx_locked == 0)
			DSCHED_THREAD_CTX_LOCK(tdctx);
		tdio->tdctx = tdctx;
		tdio->p = tdctx->p;
		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
		atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
		if (tdctx_locked == 0)
			DSCHED_THREAD_CTX_UNLOCK(tdctx);
	} else {
		dsched_thread_io_unref(tdio);
	}

	tdio->debug_policy = pol;
	tdio->debug_inited = 0xF00F1234;

	atomic_add_int(&dsched_stats.tdio_allocations, 1);

	return (tdio);
}
struct dsched_disk_ctx *
dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
	dsched_disk_ctx_ref(diskctx);
	diskctx->dp = dp;
	DSCHED_DISK_CTX_LOCKINIT(diskctx);
	TAILQ_INIT(&diskctx->tdio_list);
	/*
	 * XXX: magic number 32: most devices have a tag queue
	 * of depth 32.
	 * Better to retrieve a more precise value from the driver.
	 */
	diskctx->max_tag_queue_depth = 32;
	diskctx->current_tag_queue_depth = 0;

	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
	if (pol->new_diskctx)
		pol->new_diskctx(diskctx);
	return diskctx;
}
struct dsched_thread_ctx *
dsched_thread_ctx_alloc(struct proc *p)
{
	struct dsched_thread_ctx *tdctx;

	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
	dsched_thread_ctx_ref(tdctx);
#if 0
	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
#endif
	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
	TAILQ_INIT(&tdctx->tdio_list);
	tdctx->p = p;

	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
	/* XXX: no callback here */

	return tdctx;
}
static void
policy_new(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_disk_ctx_alloc(dp, pol);
	dsched_disk_ctx_ref(diskctx);
	dsched_set_disk_priv(dp, diskctx);
}
static void
policy_destroy(struct disk *dp) {
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx != NULL);

	dsched_disk_ctx_unref(diskctx); /* from prepare */
	dsched_disk_ctx_unref(diskctx); /* from alloc */

	dsched_set_disk_priv(dp, NULL);
}
void
dsched_new_buf(struct buf *bp)
{
	struct dsched_thread_ctx *tdctx = NULL;

	if (dsched_inited == 0)
		return;

	if (curproc != NULL) {
		tdctx = dsched_get_proc_priv(curproc);
	} else {
		/* This is a kernel thread, so no proc info is available */
		tdctx = dsched_get_thread_priv(curthread);
	}

#if 0
	/*
	 * XXX: hack. we don't want this assert because we aren't catching all
	 *	threads. mi_startup() is still getting away without a tdctx.
	 */

	/* by now we should have a tdctx. if not, something bad is going on */
	KKASSERT(tdctx != NULL);
#endif

	if (tdctx) {
		dsched_thread_ctx_ref(tdctx);
	}
	dsched_set_buf_priv(bp, tdctx);
}
void
dsched_exit_buf(struct buf *bp)
{
	struct dsched_thread_ctx *tdctx;

	tdctx = dsched_get_buf_priv(bp);
	if (tdctx != NULL) {
		dsched_clr_buf_priv(bp);
		dsched_thread_ctx_unref(tdctx);
	}
}
void
dsched_new_proc(struct proc *p)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_thread_ctx_alloc(p);
	tdctx->p = p;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_proc_priv(p, tdctx);
	atomic_add_int(&dsched_stats.nprocs, 1);
}
void
dsched_new_thread(struct thread *td)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_thread_ctx_alloc(NULL);
	tdctx->td = td;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_thread_priv(td, tdctx);
	atomic_add_int(&dsched_stats.nthreads, 1);
}
void
dsched_exit_proc(struct proc *p)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_get_proc_priv(p);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_proc_priv(p, NULL);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nprocs, 1);
}
void
dsched_exit_thread(struct thread *td)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_get_thread_priv(td);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_thread_priv(td, 0);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nthreads, 1);
}
/*
 * Creates a ref'd tdio for the current thread on the given disk, on
 * behalf of the specified policy.
 *
 * The tdio may have additional refs for the diskctx and tdctx it resides on.
 */
void
dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
			      struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;

	tdctx = dsched_get_thread_priv(curthread);
	KKASSERT(tdctx != NULL);
	dsched_thread_io_alloc(diskctx->dp, tdctx, pol, 0);
}
/* DEFAULT NOOP POLICY */

static int
noop_prepare(struct dsched_disk_ctx *diskctx)
{
	return 0;
}

static void
noop_teardown(struct dsched_disk_ctx *diskctx)
{

}

static void
noop_cancel(struct dsched_disk_ctx *diskctx)
{

}

static int
noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
	   struct bio *bio)
{
	dsched_strategy_raw(diskctx->dp, bio);
#if 0
	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
#endif
	return 0;
}
/*
 * SYSINIT stuff
 */
static void
dsched_init(void)
{
	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
	    NULL, NULL, NULL,
	    objcache_malloc_alloc,
	    objcache_malloc_free,
	    &dsched_thread_io_malloc_args);

	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
	    NULL, NULL, NULL,
	    objcache_malloc_alloc,
	    objcache_malloc_free,
	    &dsched_thread_ctx_malloc_args);

	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
	    NULL, NULL, NULL,
	    objcache_malloc_alloc,
	    objcache_malloc_free,
	    &dsched_disk_ctx_malloc_args);

	bzero(&dsched_stats, sizeof(struct dsched_stats));

	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();

	dsched_register(&dsched_noop_policy);

	dsched_inited = 1;
}

static void
dsched_uninit(void)
{
}

SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
/*
 * SYSCTL stuff
 */
static int
sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
{
	return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
}

static int
sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
{
	struct dsched_policy *pol = NULL;
	int error, first = 1;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	while ((pol = dsched_policy_enumerate(pol))) {
		if (!first) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		} else {
			first = 0;
		}
		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
		if (error)
			break;
	}

	lockmgr(&dsched_lock, LK_RELEASE);

	error = SYSCTL_OUT(req, "", 1);

	return error;
}
static int
sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_disk_ctx *diskctx = arg1;
	struct dsched_policy *pol = NULL;
	int error;

	if (diskctx == NULL) {
		return 0;
	}

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = diskctx->dp->d_sched_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	dsched_switch(diskctx->dp, pol);

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}
static int
sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *pol = NULL;
	int error;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = default_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	default_set = 1;
	default_policy = pol;

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}
SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
    "Disk Scheduler Framework (dsched) magic");
SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
    "List of disks and their policies");
SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
    0, "Enable dsched debugging");
SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
    "dsched statistics");
SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
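
/*
 * Example usage from userland (illustrative; "da0" is a hypothetical disk
 * and "fq" an assumed separately loaded policy module):
 *
 *	sysctl dsched.policies			list available policies
 *	sysctl dsched.policy.default=fq		set the default policy
 *	sysctl dsched.policy.da0=fq		switch one disk's policy
 */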
static void
dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
{
	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
		sysctl_ctx_init(&diskctx->sysctl_ctx);
	}

	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
}