2 * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Alex Hornung <ahornung@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
41 #include <sys/diskslice.h>
43 #include <sys/malloc.h>
44 #include <machine/md_var.h>
45 #include <sys/ctype.h>
46 #include <sys/syslog.h>
47 #include <sys/device.h>
48 #include <sys/msgport.h>
49 #include <sys/msgport2.h>
51 #include <sys/dsched.h>
52 #include <sys/fcntl.h>
53 #include <machine/varargs.h>
/*
 * File-scope state for the dsched framework: malloc tag, forward
 * declarations for the built-in noop policy methods and internal
 * destructors, global locks, statistics, objcache configuration,
 * and the global thread-context and policy lists.
 */
55 MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");
/* Methods of the built-in noop (pass-through) policy, defined below. */
57 static dsched_prepare_t noop_prepare;
58 static dsched_teardown_t noop_teardown;
59 static dsched_cancel_t noop_cancel;
60 static dsched_queue_t noop_queue;
/* Internal helpers; destruction entry points for the refcounted objects. */
62 static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
63 static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
64 static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
65 static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);
/* dsched_inited: set once initialization ran; gates the buf/proc hooks. */
67 static int dsched_inited = 0;
/* default_set: nonzero once a non-noop default policy has been chosen. */
68 static int default_set = 0;
70 struct lock dsched_lock;
/* Debug verbosity threshold, exported as a sysctl/tunable (see below). */
71 static int dsched_debug_enable = 0;
73 struct dsched_stats dsched_stats;
/* Fixed-size allocation descriptors for the three objcaches below. */
75 struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
76 DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
77 struct objcache_malloc_args dsched_thread_io_malloc_args = {
78 DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
79 struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
80 DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };
82 static struct objcache *dsched_diskctx_cache;
83 static struct objcache *dsched_tdctx_cache;
84 static struct objcache *dsched_tdio_cache;
/* Global list of all live thread contexts, guarded by dsched_tdctx_lock. */
86 TAILQ_HEAD(, dsched_thread_ctx) dsched_tdctx_list =
87 TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);
89 struct lock dsched_tdctx_lock;
/* All registered policies; guarded by dsched_lock. */
91 static struct dsched_policy_head dsched_policy_list =
92 TAILQ_HEAD_INITIALIZER(dsched_policy_list);
/*
 * Built-in pass-through policy: its methods (noop_*) dispatch bios
 * directly to the raw device.  It is the initial default policy and
 * is also installed temporarily while a disk's policy is switched or
 * torn down.
 */
94 static struct dsched_policy dsched_noop_policy = {
97 .prepare = noop_prepare,
98 .teardown = noop_teardown,
99 .cancel_all = noop_cancel,
100 .bio_queue = noop_queue
/* Default policy applied to newly created disks; settable via sysctl. */
103 static struct dsched_policy *default_policy = &dsched_noop_policy;
106 * dsched_debug() is a SYSCTL and TUNABLE controlled debug output function
/*
 * Emit a printf-style message only when 'level' does not exceed the
 * current dsched_debug_enable threshold.  (Varargs forwarding body not
 * visible in this extraction.)
 */
110 dsched_debug(int level, char *fmt, ...)
115 if (level <= dsched_debug_enable)
123 * Called on disk_create()
124 * tries to read which policy to use from loader.conf, if there's
125 * none specified, the default policy is used.
/*
 * Policy lookup order: "dsched.policy.<name><unit>", then
 * "dsched.policy.<name>" (with '/' characters sanitized), then
 * "dsched.policy.default".  Falls back to default_policy if no
 * tunable matched.  Finally registers a per-disk policy sysctl node.
 * Runs with dsched_lock held exclusively throughout.
 */
128 dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
130 char tunable_key[SPECNAMELEN + 48];
131 char sched_policy[DSCHED_POLICY_NAME_LENGTH];
133 struct dsched_policy *policy = NULL;
135 /* Also look for serno stuff? */
136 /* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
137 lockmgr(&dsched_lock, LK_EXCLUSIVE);
/* 1st try: fully qualified "dsched.policy.<head_name><unit>". */
139 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
141 if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
142 sizeof(sched_policy)) != 0) {
143 policy = dsched_find_policy(sched_policy);
/* 2nd try: per-head-name key; loop below sanitizes the key string. */
146 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
148 for (ptr = tunable_key; *ptr; ptr++) {
152 if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
153 sizeof(sched_policy)) != 0)) {
154 policy = dsched_find_policy(sched_policy);
/* 3rd try: global default tunable, only if no default was set yet. */
157 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
158 if (!policy && !default_set && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
159 sizeof(sched_policy)) != 0)) {
160 policy = dsched_find_policy(sched_policy);
164 if (!default_set && bootverbose) {
166 "No policy for %s%d specified, "
167 "or policy not found\n",
170 dsched_set_policy(dp, default_policy);
172 dsched_set_policy(dp, policy);
/* Build the sysctl node name; device-mapper disks keep "mapper/" path. */
175 if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
176 ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
178 ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
179 for (ptr = tunable_key; *ptr; ptr++) {
183 dsched_sysctl_add_disk(
184 (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
187 lockmgr(&dsched_lock, LK_RELEASE);
191 * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
192 * there's any policy associated with the serial number of the device.
/*
 * Looks up tunable "dsched.policy.<serialno>"; if it names a registered
 * policy, switches the disk to it and re-registers the sysctl node under
 * the serial number.  No-op when the disk has no serial number.
 */
195 dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
197 char tunable_key[SPECNAMELEN + 48];
198 char sched_policy[DSCHED_POLICY_NAME_LENGTH];
199 struct dsched_policy *policy = NULL;
201 if (info->d_serialno == NULL)
204 lockmgr(&dsched_lock, LK_EXCLUSIVE);
206 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
209 if((TUNABLE_STR_FETCH(tunable_key, sched_policy,
210 sizeof(sched_policy)) != 0)) {
211 policy = dsched_find_policy(sched_policy);
215 dsched_switch(dp, policy);
218 dsched_sysctl_add_disk(
219 (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
222 lockmgr(&dsched_lock, LK_RELEASE);
226 * Called on disk_destroy()
227 * shuts down the scheduler core and cancels all remaining bios
/*
 * Swaps the noop policy in first so new bios bypass the old policy,
 * then cancels outstanding bios and tears the old policy core down.
 * Also frees the disk's sysctl context and drops the policy reference
 * taken by dsched_set_policy().
 */
230 dsched_disk_destroy_callback(struct disk *dp)
232 struct dsched_policy *old_policy;
233 struct dsched_disk_ctx *diskctx;
235 lockmgr(&dsched_lock, LK_EXCLUSIVE);
237 diskctx = dsched_get_disk_priv(dp);
/* Redirect new I/O to noop before cancel/teardown of the old policy. */
239 old_policy = dp->d_sched_policy;
240 dp->d_sched_policy = &dsched_noop_policy;
241 old_policy->cancel_all(dsched_get_disk_priv(dp));
242 old_policy->teardown(dsched_get_disk_priv(dp));
244 if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
245 sysctl_ctx_free(&diskctx->sysctl_ctx);
248 atomic_subtract_int(&old_policy->ref_count, 1);
249 KKASSERT(old_policy->ref_count >= 0);
251 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Main I/O dispatch entry: routes a bio to the disk's policy.
 * If the buf has no thread context attached (e.g. kernel path that
 * bypassed dsched_new_buf), the bio is dispatched raw.  Otherwise the
 * per-disk tdio is looked up in the thread context's tdio list and
 * handed to the policy's bio_queue(); a nonzero return from bio_queue
 * falls back to raw dispatch.
 */
256 dsched_queue(struct disk *dp, struct bio *bio)
258 struct dsched_thread_ctx *tdctx;
259 struct dsched_thread_io *tdio;
260 struct dsched_disk_ctx *diskctx;
262 int found = 0, error = 0;
264 tdctx = dsched_get_buf_priv(bio->bio_buf);
266 /* We don't handle this case, let dsched dispatch */
267 atomic_add_int(&dsched_stats.no_tdctx, 1);
268 dsched_strategy_raw(dp, bio);
272 DSCHED_THREAD_CTX_LOCK(tdctx);
274 KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
/* Find (and ref) the tdio that belongs to this disk. */
275 TAILQ_FOREACH(tdio, &tdctx->tdio_list, link) {
276 if (tdio->dp == dp) {
277 dsched_thread_io_ref(tdio);
283 DSCHED_THREAD_CTX_UNLOCK(tdctx);
/* The buf's tdctx reference (taken in dsched_new_buf) is dropped here. */
284 dsched_clr_buf_priv(bio->bio_buf);
285 dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */
287 KKASSERT(found == 1);
288 diskctx = dsched_get_disk_priv(dp);
289 dsched_disk_ctx_ref(diskctx);
290 error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);
/* Policy refused the bio: dispatch it raw instead. */
293 dsched_strategy_raw(dp, bio);
295 dsched_disk_ctx_unref(diskctx);
296 dsched_thread_io_unref(tdio);
301 * Called from each module_init or module_attach of each policy
302 * registers the policy in the local policy list.
/*
 * Inserts the policy into dsched_policy_list (under dsched_lock) and
 * takes the list's reference.  Duplicate names are rejected with a
 * log message.
 */
305 dsched_register(struct dsched_policy *d_policy)
307 struct dsched_policy *policy;
310 lockmgr(&dsched_lock, LK_EXCLUSIVE);
312 policy = dsched_find_policy(d_policy->name);
315 TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
316 atomic_add_int(&d_policy->ref_count, 1);
318 dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
323 lockmgr(&dsched_lock, LK_RELEASE);
328 * Called from each module_detach of each policy
329 * unregisters the policy
/*
 * Removes the policy from the list and drops the list reference.
 * Refuses (returns early) while the policy is still in use by any
 * disk, i.e. ref_count > 1.
 */
332 dsched_unregister(struct dsched_policy *d_policy)
334 struct dsched_policy *policy;
336 lockmgr(&dsched_lock, LK_EXCLUSIVE);
337 policy = dsched_find_policy(d_policy->name);
/* Still referenced by a disk: bail out without unregistering. */
340 if (policy->ref_count > 1) {
341 lockmgr(&dsched_lock, LK_RELEASE);
344 TAILQ_REMOVE(&dsched_policy_list, policy, link);
345 atomic_subtract_int(&policy->ref_count, 1);
346 KKASSERT(policy->ref_count == 0);
348 lockmgr(&dsched_lock, LK_RELEASE);
354 * switches the policy by first removing the old one and then
355 * enabling the new one.
/*
 * Atomically (under dsched_lock) swaps a disk's policy: the noop
 * policy is installed while the old policy is torn down, then
 * dsched_set_policy() brings the new one up.  No-op when the disk
 * already runs new_policy.
 */
358 dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
360 struct dsched_policy *old_policy;
362 /* If we are asked to set the same policy, do nothing */
363 if (dp->d_sched_policy == new_policy)
366 /* lock everything down, diskwise */
367 lockmgr(&dsched_lock, LK_EXCLUSIVE);
368 old_policy = dp->d_sched_policy;
370 atomic_subtract_int(&old_policy->ref_count, 1);
371 KKASSERT(old_policy->ref_count >= 0);
/* Park the disk on noop while the old core is dismantled. */
373 dp->d_sched_policy = &dsched_noop_policy;
374 old_policy->teardown(dsched_get_disk_priv(dp));
377 /* Bring everything back to life */
378 dsched_set_policy(dp, new_policy);
379 lockmgr(&dsched_lock, LK_RELEASE);
385 * Loads a given policy and attaches it to the specified disk.
386 * Also initializes the core for the policy
/*
 * Allocates the per-disk policy core (policy_new), calls the policy's
 * prepare() hook, installs the policy on the disk and takes a policy
 * reference.  Acquires dsched_lock itself only if the caller does not
 * already hold it exclusively.
 */
389 dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
393 /* Check if it is locked already. if not, we acquire the devfs lock */
/*
 * NOTE(review): '!' binds tighter than '==', so this compares
 * (!lockstatus(...)) against LK_EXCLUSIVE and is almost always true.
 * Likely intended: lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE.
 * Confirm against DragonFly lockstatus(9) before changing — the lock
 * is recursive (LK_CANRECURSE), which may mask the bug.
 */
394 if (!(lockstatus(&dsched_lock, curthread)) == LK_EXCLUSIVE) {
395 lockmgr(&dsched_lock, LK_EXCLUSIVE);
399 policy_new(dp, new_policy);
400 new_policy->prepare(dsched_get_disk_priv(dp));
401 dp->d_sched_policy = new_policy;
402 atomic_add_int(&new_policy->ref_count, 1);
403 kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
406 /* If we acquired the lock, we also get rid of it */
408 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Linear search of dsched_policy_list by name; returns the matching
 * policy or NULL.  Acquires dsched_lock itself only if the caller
 * does not already hold it exclusively.
 */
411 struct dsched_policy*
412 dsched_find_policy(char *search)
414 struct dsched_policy *policy;
415 struct dsched_policy *policy_found = NULL;
418 /* Check if it is locked already. if not, we acquire the devfs lock */
/*
 * NOTE(review): same precedence bug as in dsched_set_policy() —
 * '!' applies before '=='; likely intended lockstatus(...) != LK_EXCLUSIVE.
 */
419 if (!(lockstatus(&dsched_lock, curthread)) == LK_EXCLUSIVE) {
420 lockmgr(&dsched_lock, LK_EXCLUSIVE);
424 TAILQ_FOREACH(policy, &dsched_policy_list, link) {
425 if (!strcmp(policy->name, search)) {
426 policy_found = policy;
431 /* If we acquired the lock, we also get rid of it */
433 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Enumerate all disks and return the one whose device name matches
 * 'search' (NULL if none).
 */
439 dsched_find_disk(char *search)
441 struct disk *dp_found = NULL;
442 struct disk *dp = NULL;
444 while((dp = disk_enumerate(dp))) {
445 if (!strcmp(dp->d_cdev->si_name, search)) {
/*
 * Continue a disk enumeration starting after 'dp', returning the next
 * disk that currently runs 'policy' (NULL when exhausted).
 */
455 dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
457 while ((dp = disk_enumerate(dp))) {
458 if (dp->d_sched_policy == policy)
/*
 * Iterator over the registered policy list: pass NULL to get the first
 * policy, a previous result to get its successor.
 */
465 struct dsched_policy *
466 dsched_policy_enumerate(struct dsched_policy *pol)
469 return (TAILQ_FIRST(&dsched_policy_list));
471 return (TAILQ_NEXT(pol, link));
/*
 * Fail a bio with ENXIO: mark the buf errored, report nothing
 * transferred (resid = full bcount), and complete it.
 */
475 dsched_cancel_bio(struct bio *bp)
477 bp->bio_buf->b_error = ENXIO;
478 bp->bio_buf->b_flags |= B_ERROR;
479 bp->bio_buf->b_resid = bp->bio_buf->b_bcount;
/*
 * Dispatch a bio straight to the raw device, bypassing any policy.
 * Defensively clears a non-NULL bio_track (logged), since the bio is
 * leaving the tracked path.
 */
485 dsched_strategy_raw(struct disk *dp, struct bio *bp)
488 * Ideally, this stuff shouldn't be needed... but just in case, we leave it in
491 KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
492 if(bp->bio_track != NULL) {
493 dsched_debug(LOG_INFO,
494 "dsched_strategy_raw sees non-NULL bio_track!! "
496 bp->bio_track = NULL;
498 dev_dstrategy(dp->d_rawdev, bp);
/*
 * Issue a bio synchronously: clone the buf/bio into a temporary pair,
 * dispatch it to the raw device with biodone_sync/BIO_SYNC, biowait()
 * for completion, then copy resid/error back into the caller's buf.
 * (Allocation of nbp/nbio is in lines not visible in this extraction.)
 */
502 dsched_strategy_sync(struct disk *dp, struct bio *bio)
504 struct buf *bp, *nbp;
/* Mirror the relevant fields of the original buf into the clone. */
512 nbp->b_cmd = bp->b_cmd;
513 nbp->b_bufsize = bp->b_bufsize;
514 nbp->b_runningbufspace = bp->b_runningbufspace;
515 nbp->b_bcount = bp->b_bcount;
516 nbp->b_resid = bp->b_resid;
517 nbp->b_data = bp->b_data;
520 * Buffers undergoing device I/O do not need a kvabase/size.
522 nbp->b_kvabase = bp->b_kvabase;
523 nbp->b_kvasize = bp->b_kvasize;
525 nbp->b_dirtyend = bp->b_dirtyend;
/* Synchronous completion: biodone_sync + BIO_SYNC lets biowait() work. */
527 nbio->bio_done = biodone_sync;
528 nbio->bio_flags |= BIO_SYNC;
529 nbio->bio_track = NULL;
531 nbio->bio_caller_info1.ptr = dp;
532 nbio->bio_offset = bio->bio_offset;
534 dev_dstrategy(dp->d_rawdev, nbio);
535 biowait(nbio, "dschedsync");
/* Propagate the result back to the caller's buf. */
536 bp->b_resid = nbp->b_resid;
537 bp->b_error = nbp->b_error;
/* Detach borrowed kva before the clone is released. */
540 nbp->b_kvabase = NULL;
/*
 * Issue a bio asynchronously with a caller-supplied completion callback
 * 'done' and private data 'priv'.  Pushes a new bio layer, stamps the
 * submit time in bio_caller_info3.tv, and dispatches to the raw device.
 */
547 dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
551 nbio = push_bio(bio);
552 nbio->bio_done = done;
553 nbio->bio_offset = bio->bio_offset;
555 dsched_set_bio_dp(nbio, dp);
556 dsched_set_bio_priv(nbio, priv);
558 getmicrotime(&nbio->bio_caller_info3.tv);
559 dev_dstrategy(dp->d_rawdev, nbio);
563 * Ref and deref various structures. The 1->0 transition of the reference
564 * count actually transitions 1->0x80000000 and causes the object to be
565 * destroyed. It is possible for transitory references to occur on the
566 * object while it is being destroyed. We use bit 31 to indicate that
567 * destruction is in progress and to prevent nested destructions.
/* Take a reference on a disk context.  See refcount scheme comment above. */
570 dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
574 refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
/* Take a reference on a per-thread/per-disk I/O context. */
578 dsched_thread_io_ref(struct dsched_thread_io *tdio)
582 refcount = atomic_fetchadd_int(&tdio->refcount, 1);
/* Take a reference on a thread context. */
586 dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
590 refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
/*
 * Drop a disk context reference.  On the 1->0 transition the refcount
 * is CAS'd to 0x80000000 (destruction-in-progress marker) and
 * dsched_disk_ctx_destroy() is invoked exactly once; transitory refs
 * during destruction decrement normally.
 */
594 dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
600 * Handle 1->0 transitions for diskctx and nested destruction
601 * recursions. If the refs are already in destruction mode (bit 31
602 * set) on the 1->0 transition we don't try to destruct it again.
604 * 0x80000001->0x80000000 transitions are handled normally and
605 * thus avoid nested dstruction.
608 refs = diskctx->refcount;
/* Destruction bit must never flip as a side effect of an unref. */
612 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
614 if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
619 if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
620 dsched_disk_ctx_destroy(diskctx);
/*
 * Final destruction of a disk context (refcount reached the
 * 0x80000000 marker): unlink and unref every tdio still on the
 * disk's list, run the policy's destroy_diskctx hook, and return the
 * object to its objcache.
 */
628 dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
630 struct dsched_thread_io *tdio;
633 kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
636 lockmgr(&diskctx->lock, LK_EXCLUSIVE);
637 while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
638 KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
639 TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
640 atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
641 tdio->diskctx = NULL;
642 /* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
643 dsched_thread_io_unref(tdio);
645 lockmgr(&diskctx->lock, LK_RELEASE);
646 if (diskctx->dp->d_sched_policy->destroy_diskctx)
647 diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
/* Only the destruction marker may remain at this point. */
648 KKASSERT(diskctx->refcount == 0x80000000);
649 objcache_put(dsched_diskctx_cache, diskctx);
650 atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
/*
 * Drop a tdio reference.  Same 1->0 -> 0x80000000 destruction scheme
 * as dsched_disk_ctx_unref(); dsched_thread_io_destroy() runs once.
 */
654 dsched_thread_io_unref(struct dsched_thread_io *tdio)
660 * Handle 1->0 transitions for tdio and nested destruction
661 * recursions. If the refs are already in destruction mode (bit 31
662 * set) on the 1->0 transition we don't try to destruct it again.
664 * 0x80000001->0x80000000 transitions are handled normally and
665 * thus avoid nested dstruction.
668 refs = tdio->refcount;
672 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
674 if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
679 if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
680 dsched_thread_io_destroy(tdio);
/*
 * Final destruction of a tdio: detach it from its diskctx and tdctx
 * lists.  Each detach uses a ref + lock + recheck dance because the
 * owning pointer can change while we sleep on the lock; on a race we
 * release and retry (hence the while loops).
 */
687 dsched_thread_io_destroy(struct dsched_thread_io *tdio)
689 struct dsched_thread_ctx *tdctx;
690 struct dsched_disk_ctx *diskctx;
693 kprintf("tdio (%p) destruction started, trace:\n", tdio);
696 KKASSERT(tdio->qlength == 0);
698 while ((diskctx = tdio->diskctx) != NULL) {
699 dsched_disk_ctx_ref(diskctx);
700 lockmgr(&diskctx->lock, LK_EXCLUSIVE);
/* Raced with someone re-pointing tdio->diskctx; drop and retry. */
701 if (diskctx != tdio->diskctx) {
702 lockmgr(&diskctx->lock, LK_RELEASE);
703 dsched_disk_ctx_unref(diskctx);
706 KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
707 if (diskctx->dp->d_sched_policy->destroy_tdio)
708 diskctx->dp->d_sched_policy->destroy_tdio(tdio);
709 TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
710 atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
711 tdio->diskctx = NULL;
712 lockmgr(&diskctx->lock, LK_RELEASE);
713 dsched_disk_ctx_unref(diskctx);
715 while ((tdctx = tdio->tdctx) != NULL) {
716 dsched_thread_ctx_ref(tdctx);
717 lockmgr(&tdctx->lock, LK_EXCLUSIVE);
/* Same recheck-under-lock pattern for the thread-context side. */
718 if (tdctx != tdio->tdctx) {
719 lockmgr(&tdctx->lock, LK_RELEASE);
720 dsched_thread_ctx_unref(tdctx);
723 KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
724 TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
725 atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
727 lockmgr(&tdctx->lock, LK_RELEASE);
728 dsched_thread_ctx_unref(tdctx);
730 KKASSERT(tdio->refcount == 0x80000000);
731 objcache_put(dsched_tdio_cache, tdio);
732 atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
734 dsched_disk_ctx_unref(diskctx);
/*
 * Drop a tdctx reference.  Same 1->0 -> 0x80000000 destruction scheme
 * as the diskctx/tdio variants; dsched_thread_ctx_destroy() runs once.
 */
739 dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
745 * Handle 1->0 transitions for tdctx and nested destruction
746 * recursions. If the refs are already in destruction mode (bit 31
747 * set) on the 1->0 transition we don't try to destruct it again.
749 * 0x80000001->0x80000000 transitions are handled normally and
750 * thus avoid nested dstruction.
753 refs = tdctx->refcount;
757 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
759 if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
764 if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
765 dsched_thread_ctx_destroy(tdctx);
/*
 * Final destruction of a thread context: under the global tdctx lock,
 * unlink and unref every tdio on its list, remove the tdctx from the
 * global list, and return it to the objcache.
 */
772 dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
774 struct dsched_thread_io *tdio;
777 kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
780 DSCHED_GLOBAL_THREAD_CTX_LOCK();
782 while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
783 KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
784 TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
785 atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
787 dsched_thread_io_unref(tdio);
789 KKASSERT(tdctx->refcount == 0x80000000);
790 TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);
792 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
794 objcache_put(dsched_tdctx_cache, tdctx);
795 atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
/*
 * Allocate and initialize a tdio binding thread context 'tdctx' to
 * disk 'dp' under policy 'pol'.  The tdio is zeroed, ref'd, linked
 * onto the diskctx's tdio list and (if tdctx != NULL) onto the
 * tdctx's list.  Also takes a diskctx reference for the association.
 */
799 dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
800 struct dsched_policy *pol)
802 struct dsched_thread_io *tdio;
804 dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
806 tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
807 bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);
809 /* XXX: maybe we do need another ref for the disk list for tdio */
810 dsched_thread_io_ref(tdio);
812 DSCHED_THREAD_IO_LOCKINIT(tdio);
815 tdio->diskctx = dsched_get_disk_priv(dp);
816 TAILQ_INIT(&tdio->queue);
/* Link into the disk's tdio list under the diskctx lock. */
821 lockmgr(&tdio->diskctx->lock, LK_EXCLUSIVE);
822 TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
823 atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
824 lockmgr(&tdio->diskctx->lock, LK_RELEASE);
830 /* Put the tdio in the tdctx list */
831 DSCHED_THREAD_CTX_LOCK(tdctx);
832 TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
833 DSCHED_THREAD_CTX_UNLOCK(tdctx);
834 atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
837 atomic_add_int(&dsched_stats.tdio_allocations, 1);
/*
 * Allocate and initialize a per-disk policy context: zeroed, one
 * reference taken, lock and tdio list initialized, then the policy's
 * new_diskctx hook is invoked (if provided).
 */
842 struct dsched_disk_ctx *
843 dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
845 struct dsched_disk_ctx *diskctx;
847 diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
848 bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
849 dsched_disk_ctx_ref(diskctx);
851 DSCHED_DISK_CTX_LOCKINIT(diskctx);
852 TAILQ_INIT(&diskctx->tdio_list);
854 atomic_add_int(&dsched_stats.diskctx_allocations, 1);
855 if (pol->new_diskctx)
856 pol->new_diskctx(diskctx);
/*
 * Allocate a thread context for process 'p' (NULL for a pure kernel
 * thread): zeroed, ref'd, lock/list initialized, then — under the
 * global tdctx lock — a tdio is allocated for every existing disk and
 * the tdctx is appended to the global list.
 */
861 struct dsched_thread_ctx *
862 dsched_thread_ctx_alloc(struct proc *p)
864 struct dsched_thread_ctx *tdctx;
865 struct dsched_thread_io *tdio;
866 struct disk *dp = NULL;
868 tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
869 bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
870 dsched_thread_ctx_ref(tdctx);
872 kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
874 DSCHED_THREAD_CTX_LOCKINIT(tdctx);
875 TAILQ_INIT(&tdctx->tdio_list);
/* Pre-create a tdio per disk so dsched_queue() always finds one. */
878 DSCHED_GLOBAL_THREAD_CTX_LOCK();
879 while ((dp = disk_enumerate(dp))) {
880 tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
883 TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
884 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
886 atomic_add_int(&dsched_stats.tdctx_allocations, 1);
887 /* XXX: no callback here */
/*
 * Instantiate policy 'pol' on disk 'dp': allocate and attach a
 * diskctx (plus an extra ref for the disk's priv pointer), then —
 * under the global tdctx lock — create a tdio for every existing
 * thread context so running threads can immediately use the policy.
 */
892 policy_new(struct disk *dp, struct dsched_policy *pol) {
893 struct dsched_thread_ctx *tdctx;
894 struct dsched_disk_ctx *diskctx;
895 struct dsched_thread_io *tdio;
897 diskctx = dsched_disk_ctx_alloc(dp, pol);
898 dsched_disk_ctx_ref(diskctx);
899 dsched_set_disk_priv(dp, diskctx);
901 DSCHED_GLOBAL_THREAD_CTX_LOCK();
902 TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
903 tdio = dsched_thread_io_alloc(dp, tdctx, pol);
905 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
/*
 * Tear down the per-disk policy core: drop both diskctx references
 * (the one from policy_new's explicit ref and the allocation ref) and
 * clear the disk's priv pointer.
 */
910 policy_destroy(struct disk *dp) {
911 struct dsched_disk_ctx *diskctx;
913 diskctx = dsched_get_disk_priv(dp);
914 KKASSERT(diskctx != NULL);
916 dsched_disk_ctx_unref(diskctx); /* from prepare */
917 dsched_disk_ctx_unref(diskctx); /* from alloc */
919 dsched_set_disk_priv(dp, NULL);
/*
 * Buf-creation hook: attach the current thread's (or process's)
 * dsched context to the buf, taking a tdctx reference that
 * dsched_queue()/dsched_exit_buf() later releases.  No-op before
 * dsched initialization.
 */
923 dsched_new_buf(struct buf *bp)
925 struct dsched_thread_ctx *tdctx = NULL;
927 if (dsched_inited == 0)
930 if (curproc != NULL) {
931 tdctx = dsched_get_proc_priv(curproc);
933 /* This is a kernel thread, so no proc info is available */
934 tdctx = dsched_get_thread_priv(curthread);
939 * XXX: hack. we don't want this assert because we aren't catching all
940 * threads. mi_startup() is still getting away without an tdctx.
943 /* by now we should have an tdctx. if not, something bad is going on */
944 KKASSERT(tdctx != NULL);
948 dsched_thread_ctx_ref(tdctx);
950 dsched_set_buf_priv(bp, tdctx);
/*
 * Buf-release hook: detach the tdctx from the buf and drop the
 * reference taken in dsched_new_buf().
 */
954 dsched_exit_buf(struct buf *bp)
956 struct dsched_thread_ctx *tdctx;
958 tdctx = dsched_get_buf_priv(bp);
960 dsched_clr_buf_priv(bp);
961 dsched_thread_ctx_unref(tdctx);
/*
 * Process-creation hook: allocate a tdctx for the new process, take
 * an extra reference, and stash it in the proc's dsched priv slot.
 */
966 dsched_new_proc(struct proc *p)
968 struct dsched_thread_ctx *tdctx;
970 if (dsched_inited == 0)
975 tdctx = dsched_thread_ctx_alloc(p);
977 dsched_thread_ctx_ref(tdctx);
979 dsched_set_proc_priv(p, tdctx);
980 atomic_add_int(&dsched_stats.nprocs, 1);
/*
 * Kernel-thread-creation hook: like dsched_new_proc() but keyed off
 * the thread; the tdctx is allocated with a NULL proc.
 */
985 dsched_new_thread(struct thread *td)
987 struct dsched_thread_ctx *tdctx;
989 if (dsched_inited == 0)
992 KKASSERT(td != NULL);
994 tdctx = dsched_thread_ctx_alloc(NULL);
996 dsched_thread_ctx_ref(tdctx);
998 dsched_set_thread_priv(td, tdctx);
999 atomic_add_int(&dsched_stats.nthreads, 1);
/*
 * Process-exit hook: mark the tdctx dead, detach it from the proc,
 * and drop both references (allocation + the extra ref taken in
 * dsched_new_proc), which normally triggers destruction.
 */
1003 dsched_exit_proc(struct proc *p)
1005 struct dsched_thread_ctx *tdctx;
1007 if (dsched_inited == 0)
1010 KKASSERT(p != NULL);
1012 tdctx = dsched_get_proc_priv(p);
1013 KKASSERT(tdctx != NULL);
/* Debug marker; 'dead' is checked nowhere visible in this file. */
1015 tdctx->dead = 0xDEAD;
1016 dsched_set_proc_priv(p, NULL);
1018 dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1019 dsched_thread_ctx_unref(tdctx); /* one for ref */
1020 atomic_subtract_int(&dsched_stats.nprocs, 1);
/*
 * Kernel-thread-exit hook: mirror of dsched_exit_proc() for the
 * thread-private tdctx.
 */
1025 dsched_exit_thread(struct thread *td)
1027 struct dsched_thread_ctx *tdctx;
1029 if (dsched_inited == 0)
1032 KKASSERT(td != NULL);
1034 tdctx = dsched_get_thread_priv(td);
1035 KKASSERT(tdctx != NULL);
1037 tdctx->dead = 0xDEAD;
1038 dsched_set_thread_priv(td, 0);
1040 dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1041 dsched_thread_ctx_unref(tdctx); /* one for ref */
1042 atomic_subtract_int(&dsched_stats.nthreads, 1);
/*
 * Helper for policies: create a tdio binding the CURRENT thread's
 * tdctx to diskctx's disk under policy 'pol', serialized by the
 * global tdctx lock.
 */
1045 struct dsched_thread_io *
1046 dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
1047 struct dsched_policy *pol) {
1048 struct dsched_thread_ctx *tdctx;
1049 struct dsched_thread_io *tdio;
1051 DSCHED_GLOBAL_THREAD_CTX_LOCK();
1053 tdctx = dsched_get_thread_priv(curthread);
1054 KKASSERT(tdctx != NULL);
1055 tdio = dsched_thread_io_alloc(diskctx->dp, tdctx, pol);
1057 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
1062 /* DEFAULT NOOP POLICY */
/* noop prepare hook: nothing to set up for the pass-through policy. */
1065 noop_prepare(struct dsched_disk_ctx *diskctx)
/* noop teardown hook: nothing to dismantle. */
1071 noop_teardown(struct dsched_disk_ctx *diskctx)
/* noop cancel_all hook: no queued bios exist to cancel. */
1077 noop_cancel(struct dsched_disk_ctx *diskctx)
/*
 * noop bio_queue hook: dispatch directly to the raw device (one of
 * the two paths below is taken; the selecting condition is not
 * visible in this extraction).
 */
1083 noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
1086 dsched_strategy_raw(diskctx->dp, bio);
1088 dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
/*
 * Framework initialization (body of dsched_init; the function header
 * is not visible in this extraction): create the three objcaches,
 * zero the statistics, initialize the global locks, and register the
 * built-in noop policy.  Hooked in via SYSINIT below.
 */
1099 dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
1101 objcache_malloc_alloc,
1102 objcache_malloc_free,
1103 &dsched_thread_io_malloc_args );
1105 dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
1107 objcache_malloc_alloc,
1108 objcache_malloc_free,
1109 &dsched_thread_ctx_malloc_args );
1111 dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
1113 objcache_malloc_alloc,
1114 objcache_malloc_free,
1115 &dsched_disk_ctx_malloc_args );
1117 bzero(&dsched_stats, sizeof(struct dsched_stats));
/* dsched_lock is recursive; see the lockstatus checks above. */
1119 lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
1120 DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();
1122 dsched_register(&dsched_noop_policy);
/* Run init/uninit just before the disk subsystem is created. */
1132 SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
1133 SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
/* sysctl handler: export dsched_stats as an opaque blob (read-only). */
1139 sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
1141 return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
/*
 * sysctl handler: emit the names of all registered policies as a
 * single space-separated, NUL-terminated string.
 */
1145 sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
1147 struct dsched_policy *pol = NULL;
1148 int error, first = 1;
1150 lockmgr(&dsched_lock, LK_EXCLUSIVE);
1152 while ((pol = dsched_policy_enumerate(pol))) {
/* Separator before every name except the first. */
1154 error = SYSCTL_OUT(req, " ", 1);
1160 error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
1166 lockmgr(&dsched_lock, LK_RELEASE);
/* Trailing NUL terminates the exported string. */
1168 error = SYSCTL_OUT(req, "", 1);
/*
 * Per-disk policy sysctl handler (arg1 = diskctx): reading returns
 * the disk's current policy name; writing a registered policy name
 * switches the disk via dsched_switch().
 */
1174 sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
1176 char buf[DSCHED_POLICY_NAME_LENGTH];
1177 struct dsched_disk_ctx *diskctx = arg1;
1178 struct dsched_policy *pol = NULL;
1181 if (diskctx == NULL) {
1185 lockmgr(&dsched_lock, LK_EXCLUSIVE);
1187 pol = diskctx->dp->d_sched_policy;
1188 memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1190 error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
/* Read-only access or copy error: nothing further to do. */
1191 if (error || req->newptr == NULL) {
1192 lockmgr(&dsched_lock, LK_RELEASE);
1196 pol = dsched_find_policy(buf);
1198 lockmgr(&dsched_lock, LK_RELEASE);
1202 dsched_switch(diskctx->dp, pol);
1204 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * sysctl handler for dsched.policy.default: reading returns the
 * current default policy name; writing a registered name replaces
 * default_policy (applied to disks created afterwards).
 */
1210 sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
1212 char buf[DSCHED_POLICY_NAME_LENGTH];
1213 struct dsched_policy *pol = NULL;
1216 lockmgr(&dsched_lock, LK_EXCLUSIVE);
1218 pol = default_policy;
1219 memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1221 error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1222 if (error || req->newptr == NULL) {
1223 lockmgr(&dsched_lock, LK_RELEASE);
1227 pol = dsched_find_policy(buf);
/* Unknown policy name: leave the default untouched. */
1229 lockmgr(&dsched_lock, LK_RELEASE);
1234 default_policy = pol;
1236 lockmgr(&dsched_lock, LK_RELEASE);
/*
 * Static sysctl tree: dsched (root node), dsched.policy (per-disk
 * policy nodes get attached here), plus debug, stats, policies and
 * policy.default handlers declared above.
 */
1241 SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
1242 "Disk Scheduler Framework (dsched) magic");
1243 SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
1244 "List of disks and their policies");
1245 SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
1246 0, "Enable dsched debugging");
1247 SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
1248 0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
1249 "dsched statistics");
1250 SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
1251 NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
1252 SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
1253 NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
/*
 * Register a read/write string sysctl dsched.policy.<name> for one
 * disk, backed by sysctl_dsched_policy() with the diskctx as arg1.
 * Lazily initializes the diskctx's sysctl context on first use.
 */
1256 dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
1258 if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
1259 diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
1260 sysctl_ctx_init(&diskctx->sysctl_ctx);
1263 SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
1264 OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
1265 diskctx, 0, sysctl_dsched_policy, "A", "policy");