kernel - Fix races in disk iteration and diskctx handling
[dragonfly.git] / sys / kern / kern_dsched.c
b80a9543
AH
1/*
2 * Copyright (c) 2009, 2010 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Alex Hornung <ahornung@gmail.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34#include <sys/param.h>
35#include <sys/systm.h>
36#include <sys/kernel.h>
37#include <sys/proc.h>
38#include <sys/sysctl.h>
39#include <sys/buf.h>
40#include <sys/conf.h>
41#include <sys/diskslice.h>
42#include <sys/disk.h>
43#include <sys/malloc.h>
b80a9543
AH
44#include <machine/md_var.h>
45#include <sys/ctype.h>
46#include <sys/syslog.h>
47#include <sys/device.h>
48#include <sys/msgport.h>
49#include <sys/msgport2.h>
50#include <sys/buf2.h>
51#include <sys/dsched.h>
52#include <sys/fcntl.h>
53#include <machine/varargs.h>
54
c7a0a046
AH
55TAILQ_HEAD(tdio_list_head, dsched_thread_io);
56
e02e815e
AH
57MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");
58
9495e99b
AH
59static dsched_prepare_t noop_prepare;
60static dsched_teardown_t noop_teardown;
61static dsched_cancel_t noop_cancel;
62static dsched_queue_t noop_queue;
b80a9543 63
5374d04f 64static void dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio);
bc3c9325 65static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
265b0d4a
MD
66static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
67static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
68static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);
bc3c9325 69
e02e815e 70static int dsched_inited = 0;
9495e99b 71static int default_set = 0;
b80a9543
AH
72
73struct lock dsched_lock;
74static int dsched_debug_enable = 0;
b80a9543 75
e02e815e
AH
76struct dsched_stats dsched_stats;
77
78struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
79 DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
80struct objcache_malloc_args dsched_thread_io_malloc_args = {
81 DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
82struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
83 DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };
84
85static struct objcache *dsched_diskctx_cache;
86static struct objcache *dsched_tdctx_cache;
87static struct objcache *dsched_tdio_cache;
88
89TAILQ_HEAD(, dsched_thread_ctx) dsched_tdctx_list =
90 TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);
91
92struct lock dsched_tdctx_lock;
93
b80a9543
AH
94static struct dsched_policy_head dsched_policy_list =
95 TAILQ_HEAD_INITIALIZER(dsched_policy_list);
96
9495e99b 97static struct dsched_policy dsched_noop_policy = {
0160356d
AH
98 .name = "noop",
99
9495e99b
AH
100 .prepare = noop_prepare,
101 .teardown = noop_teardown,
102 .cancel_all = noop_cancel,
103 .bio_queue = noop_queue
b80a9543
AH
104};
105
9495e99b 106static struct dsched_policy *default_policy = &dsched_noop_policy;
b80a9543
AH
107
108/*
109 * dsched_debug() is a SYSCTL and TUNABLE controlled debug output function
110 * using kvprintf
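 * Output is emitted only when 'level' is <= the dsched.debug sysctl
 * (dsched_debug_enable).  A typical (hypothetical) call would be:
 *   dsched_debug(LOG_INFO, "tdio %p has no diskctx\n", tdio);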
111 */
112int
113dsched_debug(int level, char *fmt, ...)
114{
115 __va_list ap;
116
117 __va_start(ap, fmt);
118 if (level <= dsched_debug_enable)
119 kvprintf(fmt, ap);
120 __va_end(ap);
121
122 return 0;
123}
124
125/*
 126 * Called on disk_create().
 127 * Tries to read which policy to use from loader.conf; if there is
 128 * none specified, the default policy is used.
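 * Hypothetical loader.conf examples (assuming a policy named "fq" has
 * been registered), matching the tunables probed below:
 *   dsched.policy.da0="fq"      (specific device)
 *   dsched.policy.da="fq"       (whole driver head)
 *   dsched.policy.default="fq"  (global default)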
129 */
130void
0160356d 131dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
b80a9543 132{
08db538f 133 char tunable_key[SPECNAMELEN + 48];
b80a9543 134 char sched_policy[DSCHED_POLICY_NAME_LENGTH];
8fd0ee3c 135 char *ptr;
b80a9543
AH
136 struct dsched_policy *policy = NULL;
137
138 /* Also look for serno stuff? */
b80a9543
AH
139 lockmgr(&dsched_lock, LK_EXCLUSIVE);
140
5374d04f
MD
141 ksnprintf(tunable_key, sizeof(tunable_key),
142 "dsched.policy.%s%d", head_name, unit);
b80a9543
AH
143 if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
144 sizeof(sched_policy)) != 0) {
145 policy = dsched_find_policy(sched_policy);
146 }
147
5374d04f
MD
148 ksnprintf(tunable_key, sizeof(tunable_key),
149 "dsched.policy.%s", head_name);
150
8fd0ee3c
AH
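	/*
	 * Replace '/' with '-' (e.g. for dm "mapper/foo" style names,
	 * hypothetical), presumably so the key forms a valid
	 * tunable/sysctl name component.
	 */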
151 for (ptr = tunable_key; *ptr; ptr++) {
152 if (*ptr == '/')
153 *ptr = '-';
154 }
b80a9543
AH
155 if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
156 sizeof(sched_policy)) != 0)) {
157 policy = dsched_find_policy(sched_policy);
158 }
159
279e9fd5 160 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
5374d04f
MD
161 if (!policy && !default_set &&
162 (TUNABLE_STR_FETCH(tunable_key, sched_policy,
163 sizeof(sched_policy)) != 0)) {
b80a9543
AH
164 policy = dsched_find_policy(sched_policy);
165 }
166
167 if (!policy) {
027623d4
MD
168 if (!default_set && bootverbose) {
169 dsched_debug(0,
170 "No policy for %s%d specified, "
171 "or policy not found\n",
172 head_name, unit);
9495e99b
AH
173 }
174 dsched_set_policy(dp, default_policy);
b80a9543 175 } else {
0160356d 176 dsched_set_policy(dp, policy);
b80a9543
AH
177 }
178
8fd0ee3c
AH
179 if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
180 ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
181 else
182 ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
183 for (ptr = tunable_key; *ptr; ptr++) {
184 if (*ptr == '/')
185 *ptr = '-';
186 }
bc3c9325
AH
187 dsched_sysctl_add_disk(
188 (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
189 tunable_key);
190
b80a9543
AH
191 lockmgr(&dsched_lock, LK_RELEASE);
192}
193
279e9fd5
AH
194/*
195 * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
196 * there's any policy associated with the serial number of the device.
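 * The tunable probed is "dsched.policy.<serialno>", e.g. a hypothetical
 * loader.conf line: dsched.policy.WD-WCAV51234567="fq" (assuming an
 * "fq" policy is registered).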
197 */
198void
199dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
200{
201 char tunable_key[SPECNAMELEN + 48];
202 char sched_policy[DSCHED_POLICY_NAME_LENGTH];
203 struct dsched_policy *policy = NULL;
204
205 if (info->d_serialno == NULL)
206 return;
207
208 lockmgr(&dsched_lock, LK_EXCLUSIVE);
209
210 ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
211 info->d_serialno);
212
213 if((TUNABLE_STR_FETCH(tunable_key, sched_policy,
214 sizeof(sched_policy)) != 0)) {
215 policy = dsched_find_policy(sched_policy);
216 }
217
218 if (policy) {
219 dsched_switch(dp, policy);
220 }
221
bc3c9325
AH
222 dsched_sysctl_add_disk(
223 (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
224 info->d_serialno);
225
279e9fd5
AH
226 lockmgr(&dsched_lock, LK_RELEASE);
227}
228
b80a9543
AH
229/*
 230 * Called on disk_destroy().
 231 * Shuts down the scheduler core and cancels all remaining bios.
232 */
233void
0160356d 234dsched_disk_destroy_callback(struct disk *dp)
b80a9543 235{
0160356d 236 struct dsched_policy *old_policy;
bc3c9325 237 struct dsched_disk_ctx *diskctx;
b80a9543
AH
238
239 lockmgr(&dsched_lock, LK_EXCLUSIVE);
240
bc3c9325
AH
241 diskctx = dsched_get_disk_priv(dp);
242
0160356d 243 old_policy = dp->d_sched_policy;
9495e99b 244 dp->d_sched_policy = &dsched_noop_policy;
e02e815e
AH
245 old_policy->cancel_all(dsched_get_disk_priv(dp));
246 old_policy->teardown(dsched_get_disk_priv(dp));
bc3c9325
AH
247
248 if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
249 sysctl_ctx_free(&diskctx->sysctl_ctx);
250
e02e815e 251 policy_destroy(dp);
0160356d
AH
252 atomic_subtract_int(&old_policy->ref_count, 1);
253 KKASSERT(old_policy->ref_count >= 0);
b80a9543
AH
254
255 lockmgr(&dsched_lock, LK_RELEASE);
256}
257
258
259void
260dsched_queue(struct disk *dp, struct bio *bio)
261{
e02e815e
AH
262 struct dsched_thread_ctx *tdctx;
263 struct dsched_thread_io *tdio;
264 struct dsched_disk_ctx *diskctx;
b80a9543 265
e02e815e
AH
266 int found = 0, error = 0;
267
268 tdctx = dsched_get_buf_priv(bio->bio_buf);
269 if (tdctx == NULL) {
270 /* We don't handle this case, let dsched dispatch */
271 atomic_add_int(&dsched_stats.no_tdctx, 1);
272 dsched_strategy_raw(dp, bio);
273 return;
274 }
275
276 DSCHED_THREAD_CTX_LOCK(tdctx);
277
278 KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
c7a0a046
AH
279 /*
280 * XXX:
281 * iterate in reverse to make sure we find the most up-to-date
282 * tdio for a given disk. After a switch it may take some time
283 * for everything to clean up.
284 */
285 TAILQ_FOREACH_REVERSE(tdio, &tdctx->tdio_list, tdio_list_head, link) {
e02e815e
AH
286 if (tdio->dp == dp) {
287 dsched_thread_io_ref(tdio);
288 found = 1;
289 break;
b80a9543 290 }
e02e815e
AH
291 }
292
293 DSCHED_THREAD_CTX_UNLOCK(tdctx);
294 dsched_clr_buf_priv(bio->bio_buf);
295 dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */
296
297 KKASSERT(found == 1);
298 diskctx = dsched_get_disk_priv(dp);
299 dsched_disk_ctx_ref(diskctx);
c7a0a046
AH
300
301 if (dp->d_sched_policy != &dsched_noop_policy)
302 KKASSERT(tdio->debug_policy == dp->d_sched_policy);
303
304 KKASSERT(tdio->debug_inited == 0xF00F1234);
305
e02e815e
AH
306 error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);
307
308 if (error) {
b80a9543
AH
309 dsched_strategy_raw(dp, bio);
310 }
e02e815e
AH
311 dsched_disk_ctx_unref(diskctx);
312 dsched_thread_io_unref(tdio);
b80a9543
AH
313}
314
315
316/*
 317 * Called from each policy's module_init or module_attach;
 318 * registers the policy in the local policy list.
319 */
320int
0160356d 321dsched_register(struct dsched_policy *d_policy)
b80a9543
AH
322{
323 struct dsched_policy *policy;
324 int error = 0;
325
326 lockmgr(&dsched_lock, LK_EXCLUSIVE);
327
0160356d 328 policy = dsched_find_policy(d_policy->name);
b80a9543
AH
329
330 if (!policy) {
0160356d
AH
331 TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
332 atomic_add_int(&d_policy->ref_count, 1);
b80a9543
AH
333 } else {
334 dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
0160356d 335 d_policy->name);
e02e815e 336 error = EEXIST;
b80a9543
AH
337 }
338
b80a9543
AH
339 lockmgr(&dsched_lock, LK_RELEASE);
340 return error;
341}
342
343/*
 344 * Called from each policy's module_detach;
 345 * unregisters the policy.
346 */
347int
0160356d 348dsched_unregister(struct dsched_policy *d_policy)
b80a9543
AH
349{
350 struct dsched_policy *policy;
351
352 lockmgr(&dsched_lock, LK_EXCLUSIVE);
0160356d 353 policy = dsched_find_policy(d_policy->name);
b80a9543
AH
354
355 if (policy) {
e02e815e
AH
356 if (policy->ref_count > 1) {
357 lockmgr(&dsched_lock, LK_RELEASE);
358 return EBUSY;
359 }
b80a9543 360 TAILQ_REMOVE(&dsched_policy_list, policy, link);
0160356d 361 atomic_subtract_int(&policy->ref_count, 1);
e02e815e 362 KKASSERT(policy->ref_count == 0);
b80a9543
AH
363 }
364 lockmgr(&dsched_lock, LK_RELEASE);
c7a0a046 365
b80a9543
AH
366 return 0;
367}
368
369
370/*
371 * switches the policy by first removing the old one and then
372 * enabling the new one.
373 */
374int
0160356d 375dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
b80a9543 376{
0160356d 377 struct dsched_policy *old_policy;
b80a9543
AH
378
379 /* If we are asked to set the same policy, do nothing */
0160356d 380 if (dp->d_sched_policy == new_policy)
b80a9543
AH
381 return 0;
382
383 /* lock everything down, diskwise */
384 lockmgr(&dsched_lock, LK_EXCLUSIVE);
0160356d 385 old_policy = dp->d_sched_policy;
b80a9543 386
e02e815e
AH
387 atomic_subtract_int(&old_policy->ref_count, 1);
388 KKASSERT(old_policy->ref_count >= 0);
b80a9543 389
9495e99b 390 dp->d_sched_policy = &dsched_noop_policy;
e02e815e
AH
391 old_policy->teardown(dsched_get_disk_priv(dp));
392 policy_destroy(dp);
b80a9543
AH
393
394 /* Bring everything back to life */
0160356d 395 dsched_set_policy(dp, new_policy);
e02e815e 396 lockmgr(&dsched_lock, LK_RELEASE);
c7a0a046 397
b80a9543
AH
398 return 0;
399}
400
401
402/*
403 * Loads a given policy and attaches it to the specified disk.
404 * Also initializes the core for the policy
405 */
406void
0160356d 407dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
b80a9543
AH
408{
409 int locked = 0;
410
 411 /* Check if it is locked already. If not, we acquire the dsched lock */
 412 if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
413 lockmgr(&dsched_lock, LK_EXCLUSIVE);
414 locked = 1;
415 }
416
c7a0a046
AH
417 DSCHED_GLOBAL_THREAD_CTX_LOCK();
418
e02e815e
AH
419 policy_new(dp, new_policy);
420 new_policy->prepare(dsched_get_disk_priv(dp));
0160356d 421 dp->d_sched_policy = new_policy;
c7a0a046
AH
422
423 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
424
0160356d 425 atomic_add_int(&new_policy->ref_count, 1);
b80a9543 426 kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
0160356d 427 new_policy->name);
b80a9543
AH
428
429 /* If we acquired the lock, we also get rid of it */
430 if (locked)
431 lockmgr(&dsched_lock, LK_RELEASE);
432}
433
434struct dsched_policy*
435dsched_find_policy(char *search)
436{
437 struct dsched_policy *policy;
438 struct dsched_policy *policy_found = NULL;
439 int locked = 0;
440
 441 /* Check if it is locked already. If not, we acquire the dsched lock */
 442 if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
443 lockmgr(&dsched_lock, LK_EXCLUSIVE);
444 locked = 1;
445 }
446
447 TAILQ_FOREACH(policy, &dsched_policy_list, link) {
0160356d 448 if (!strcmp(policy->name, search)) {
b80a9543
AH
449 policy_found = policy;
450 break;
451 }
452 }
453
454 /* If we acquired the lock, we also get rid of it */
455 if (locked)
456 lockmgr(&dsched_lock, LK_RELEASE);
457
458 return policy_found;
459}
460
5374d04f
MD
461/*
462 * Returns ref'd disk
463 */
464struct disk *
b80a9543
AH
465dsched_find_disk(char *search)
466{
5374d04f 467 struct disk marker;
b80a9543
AH
468 struct disk *dp = NULL;
469
5374d04f
MD
470 while ((dp = disk_enumerate(&marker, dp)) != NULL) {
471 if (strcmp(dp->d_cdev->si_name, search) == 0) {
472 disk_enumerate_stop(&marker, NULL);
473 /* leave ref on dp */
b80a9543
AH
474 break;
475 }
476 }
5374d04f 477 return dp;
b80a9543
AH
478}
479
5374d04f
MD
480struct disk *
481dsched_disk_enumerate(struct disk *marker, struct disk *dp,
482 struct dsched_policy *policy)
b80a9543 483{
5374d04f 484 while ((dp = disk_enumerate(marker, dp)) != NULL) {
0160356d 485 if (dp->d_sched_policy == policy)
5374d04f 486 break;
b80a9543 487 }
b80a9543
AH
 488 return dp;
489}
490
491struct dsched_policy *
492dsched_policy_enumerate(struct dsched_policy *pol)
493{
494 if (!pol)
495 return (TAILQ_FIRST(&dsched_policy_list));
496 else
497 return (TAILQ_NEXT(pol, link));
498}
499
500void
501dsched_cancel_bio(struct bio *bp)
502{
503 bp->bio_buf->b_error = ENXIO;
504 bp->bio_buf->b_flags |= B_ERROR;
505 bp->bio_buf->b_resid = bp->bio_buf->b_bcount;
506
507 biodone(bp);
508}
509
510void
511dsched_strategy_raw(struct disk *dp, struct bio *bp)
512{
513 /*
514 * Ideally, this stuff shouldn't be needed... but just in case, we leave it in
515 * to avoid panics
516 */
517 KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
518 if(bp->bio_track != NULL) {
519 dsched_debug(LOG_INFO,
520 "dsched_strategy_raw sees non-NULL bio_track!! "
8dad49a8 521 "bio: %p\n", bp);
b80a9543
AH
522 bp->bio_track = NULL;
523 }
524 dev_dstrategy(dp->d_rawdev, bp);
525}
526
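/*
 * Dispatch a bio synchronously: clone the caller's buf into a pbuf,
 * issue it against the raw device with a BIO_SYNC/biodone_sync setup,
 * biowait() for completion, copy back b_resid and b_error, and then
 * biodone() the original bio.
 */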
527void
528dsched_strategy_sync(struct disk *dp, struct bio *bio)
529{
530 struct buf *bp, *nbp;
531 struct bio *nbio;
532
533 bp = bio->bio_buf;
534
535 nbp = getpbuf(NULL);
536 nbio = &nbp->b_bio1;
537
538 nbp->b_cmd = bp->b_cmd;
539 nbp->b_bufsize = bp->b_bufsize;
540 nbp->b_runningbufspace = bp->b_runningbufspace;
541 nbp->b_bcount = bp->b_bcount;
542 nbp->b_resid = bp->b_resid;
543 nbp->b_data = bp->b_data;
9a82e536
MD
544#if 0
545 /*
546 * Buffers undergoing device I/O do not need a kvabase/size.
547 */
b80a9543
AH
548 nbp->b_kvabase = bp->b_kvabase;
549 nbp->b_kvasize = bp->b_kvasize;
9a82e536 550#endif
b80a9543
AH
551 nbp->b_dirtyend = bp->b_dirtyend;
552
553 nbio->bio_done = biodone_sync;
554 nbio->bio_flags |= BIO_SYNC;
555 nbio->bio_track = NULL;
556
557 nbio->bio_caller_info1.ptr = dp;
558 nbio->bio_offset = bio->bio_offset;
559
560 dev_dstrategy(dp->d_rawdev, nbio);
561 biowait(nbio, "dschedsync");
562 bp->b_resid = nbp->b_resid;
563 bp->b_error = nbp->b_error;
564 biodone(bio);
9a82e536
MD
565#if 0
566 nbp->b_kvabase = NULL;
567 nbp->b_kvasize = 0;
568#endif
e02e815e 569 relpbuf(nbp, NULL);
b80a9543
AH
570}
571
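/*
 * Dispatch a bio asynchronously: push a new bio, set its completion
 * callback to 'done', stash 'priv' and the disk for the policy to pick
 * up later (dsched_get_bio_priv()/dsched_get_bio_dp()), timestamp the
 * dispatch and hand it to the raw device.
 */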
572void
573dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
574{
575 struct bio *nbio;
576
577 nbio = push_bio(bio);
578 nbio->bio_done = done;
579 nbio->bio_offset = bio->bio_offset;
580
581 dsched_set_bio_dp(nbio, dp);
582 dsched_set_bio_priv(nbio, priv);
583
584 getmicrotime(&nbio->bio_caller_info3.tv);
585 dev_dstrategy(dp->d_rawdev, nbio);
586}
587
09f2bfe9
BP
588/*
 589 * A special biodone callback function
 590 * used by policies that have request polling implemented.
591 */
592static void
593request_polling_biodone(struct bio *bp)
594{
595 struct dsched_disk_ctx *diskctx = NULL;
596 struct disk *dp = NULL;
597 struct bio *obio;
598 struct dsched_policy *policy;
599
600 dp = dsched_get_bio_dp(bp);
601 policy = dp->d_sched_policy;
602 diskctx = dsched_get_disk_priv(dp);
603 KKASSERT(diskctx && policy);
604 dsched_disk_ctx_ref(diskctx);
605
606 /*
607 * XXX:
 608 * the bio_done function must not block!
609 */
610 if (diskctx->dp->d_sched_policy->bio_done)
611 diskctx->dp->d_sched_policy->bio_done(bp);
612
613 obio = pop_bio(bp);
614 biodone(obio);
615
616 atomic_subtract_int(&diskctx->current_tag_queue_depth, 1);
617
 618 /*
 619 * Call the polling function.
 620 * XXX: the polling function must not block!
 621 */
622 if (policy->polling_func)
623 policy->polling_func(diskctx);
624 else
625 dsched_debug(0, "dsched: the policy uses request polling without a polling function!\n");
626 dsched_disk_ctx_unref(diskctx);
627}
628
629/*
 630 * A special dsched strategy used by policies that have request polling
 631 * (a polling function) implemented.
 632 *
 633 * The strategy is just like dsched_strategy_async(), but
 634 * the biodone callback is set to a preset one.
635 *
636 * If the policy needs its own biodone callback, it should
637 * register it in the policy structure. (bio_done field)
638 *
639 * The current_tag_queue_depth is maintained by this function
640 * and the request_polling_biodone() function
641 */
642
643void
644dsched_strategy_request_polling(struct disk *dp, struct bio *bio, struct dsched_disk_ctx *diskctx)
645{
646 atomic_add_int(&diskctx->current_tag_queue_depth, 1);
647 dsched_strategy_async(dp, bio, request_polling_biodone, dsched_get_bio_priv(bio));
648}
649
265b0d4a
MD
650/*
651 * Ref and deref various structures. The 1->0 transition of the reference
652 * count actually transitions 1->0x80000000 and causes the object to be
653 * destroyed. It is possible for transitory references to occur on the
654 * object while it is being destroyed. We use bit 31 to indicate that
655 * destruction is in progress and to prevent nested destructions.
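 *
 * Sketch of the intended transitions (illustrative):
 *   unref: 1          -> 0x80000000  (last real ref gone, destroy runs)
 *   unref: 0x80000001 -> 0x80000000  (normal decrement during destroy)
 *   destroy sees refcount != 0x80000000: a transitory ref remains, so
 *   bit 31 is cleared again and the remaining holder re-runs destroy
 *   on its final unref.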
656 */
e02e815e
AH
657void
658dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
659{
660 int refcount;
661
662 refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
e02e815e
AH
663}
664
665void
666dsched_thread_io_ref(struct dsched_thread_io *tdio)
667{
668 int refcount;
669
670 refcount = atomic_fetchadd_int(&tdio->refcount, 1);
e02e815e
AH
671}
672
673void
674dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
675{
676 int refcount;
677
678 refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
e02e815e
AH
679}
680
681void
682dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
683{
265b0d4a
MD
684 int refs;
685 int nrefs;
e02e815e 686
265b0d4a
MD
687 /*
688 * Handle 1->0 transitions for diskctx and nested destruction
689 * recursions. If the refs are already in destruction mode (bit 31
690 * set) on the 1->0 transition we don't try to destruct it again.
691 *
692 * 0x80000001->0x80000000 transitions are handled normally and
 693 * thus avoid nested destruction.
694 */
695 for (;;) {
696 refs = diskctx->refcount;
697 cpu_ccfence();
698 nrefs = refs - 1;
699
700 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
701 if (nrefs) {
702 if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
703 break;
704 continue;
705 }
706 nrefs = 0x80000000;
707 if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
708 dsched_disk_ctx_destroy(diskctx);
709 break;
710 }
711 }
712}
e02e815e 713
265b0d4a
MD
714static
715void
716dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
717{
718 struct dsched_thread_io *tdio;
5374d04f
MD
719 int refs;
720 int nrefs;
e02e815e 721
e02e815e 722#if 0
265b0d4a
MD
723 kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
724 print_backtrace(4);
e02e815e 725#endif
265b0d4a
MD
726 lockmgr(&diskctx->lock, LK_EXCLUSIVE);
727 while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
728 KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
729 TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
730 atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
731 tdio->diskctx = NULL;
732 /* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
5374d04f
MD
733 lockmgr(&diskctx->lock, LK_RELEASE);
734 dsched_thread_io_unref_destroy(tdio);
735 lockmgr(&diskctx->lock, LK_EXCLUSIVE);
e02e815e 736 }
265b0d4a 737 lockmgr(&diskctx->lock, LK_RELEASE);
5374d04f
MD
738
739 /*
740 * Expect diskctx->refcount to be 0x80000000. If it isn't someone
741 * else still has a temporary ref on the diskctx and we have to
742 * transition it back to an undestroyed-state (albeit without any
743 * associations), so the other user destroys it properly when the
744 * ref is released.
745 */
746 while ((refs = diskctx->refcount) != 0x80000000) {
 747 kprintf("dsched_disk_ctx: destroy race diskctx=%p\n", diskctx);
748 cpu_ccfence();
749 KKASSERT(refs & 0x80000000);
750 nrefs = refs & 0x7FFFFFFF;
751 if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
752 return;
753 }
754
755 /*
756 * Really for sure now.
757 */
265b0d4a
MD
758 if (diskctx->dp->d_sched_policy->destroy_diskctx)
759 diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
265b0d4a
MD
760 objcache_put(dsched_diskctx_cache, diskctx);
761 atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
e02e815e
AH
762}
763
764void
765dsched_thread_io_unref(struct dsched_thread_io *tdio)
766{
265b0d4a
MD
767 int refs;
768 int nrefs;
e02e815e 769
265b0d4a
MD
770 /*
771 * Handle 1->0 transitions for tdio and nested destruction
772 * recursions. If the refs are already in destruction mode (bit 31
773 * set) on the 1->0 transition we don't try to destruct it again.
774 *
775 * 0x80000001->0x80000000 transitions are handled normally and
 776 * thus avoid nested destruction.
777 */
778 for (;;) {
779 refs = tdio->refcount;
780 cpu_ccfence();
781 nrefs = refs - 1;
782
783 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
784 if (nrefs) {
785 if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
786 break;
787 continue;
788 }
789 nrefs = 0x80000000;
790 if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
791 dsched_thread_io_destroy(tdio);
792 break;
793 }
794 }
795}
e02e815e 796
5374d04f
MD
797/*
798 * Unref and destroy the tdio even if additional refs are present.
799 */
800static
801void
802dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio)
803{
804 int refs;
805 int nrefs;
806
807 /*
808 * If not already transitioned to destroy-in-progress we transition
809 * to destroy-in-progress, cleanup our ref, and destroy the tdio.
810 */
811 for (;;) {
812 refs = tdio->refcount;
813 cpu_ccfence();
814 nrefs = refs - 1;
815
816 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
817 if (nrefs & 0x80000000) {
818 if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
819 break;
820 continue;
821 }
822 nrefs |= 0x80000000;
823 if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
824 dsched_thread_io_destroy(tdio);
825 break;
826 }
827 }
828}
829
265b0d4a
MD
830static void
831dsched_thread_io_destroy(struct dsched_thread_io *tdio)
832{
833 struct dsched_thread_ctx *tdctx;
834 struct dsched_disk_ctx *diskctx;
5374d04f
MD
835 int refs;
836 int nrefs;
e02e815e 837
e02e815e 838#if 0
265b0d4a
MD
839 kprintf("tdio (%p) destruction started, trace:\n", tdio);
840 print_backtrace(8);
e02e815e 841#endif
265b0d4a 842 KKASSERT(tdio->qlength == 0);
e02e815e 843
265b0d4a
MD
844 while ((diskctx = tdio->diskctx) != NULL) {
845 dsched_disk_ctx_ref(diskctx);
846 lockmgr(&diskctx->lock, LK_EXCLUSIVE);
847 if (diskctx != tdio->diskctx) {
e02e815e 848 lockmgr(&diskctx->lock, LK_RELEASE);
265b0d4a
MD
849 dsched_disk_ctx_unref(diskctx);
850 continue;
e02e815e 851 }
265b0d4a
MD
852 KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
853 if (diskctx->dp->d_sched_policy->destroy_tdio)
854 diskctx->dp->d_sched_policy->destroy_tdio(tdio);
855 TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
856 atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
857 tdio->diskctx = NULL;
5374d04f 858 dsched_thread_io_unref(tdio);
265b0d4a
MD
859 lockmgr(&diskctx->lock, LK_RELEASE);
860 dsched_disk_ctx_unref(diskctx);
861 }
862 while ((tdctx = tdio->tdctx) != NULL) {
863 dsched_thread_ctx_ref(tdctx);
864 lockmgr(&tdctx->lock, LK_EXCLUSIVE);
865 if (tdctx != tdio->tdctx) {
e02e815e 866 lockmgr(&tdctx->lock, LK_RELEASE);
265b0d4a
MD
867 dsched_thread_ctx_unref(tdctx);
868 continue;
e02e815e 869 }
265b0d4a
MD
870 KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
871 TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
872 atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
873 tdio->tdctx = NULL;
5374d04f 874 dsched_thread_io_unref(tdio);
265b0d4a
MD
875 lockmgr(&tdctx->lock, LK_RELEASE);
876 dsched_thread_ctx_unref(tdctx);
877 }
5374d04f
MD
878
879 /*
880 * Expect tdio->refcount to be 0x80000000. If it isn't someone else
881 * still has a temporary ref on the tdio and we have to transition
882 * it back to an undestroyed-state (albeit without any associations)
883 * so the other user destroys it properly when the ref is released.
884 */
885 while ((refs = tdio->refcount) != 0x80000000) {
886 kprintf("dsched_thread_io: destroy race tdio=%p\n", tdio);
887 cpu_ccfence();
888 KKASSERT(refs & 0x80000000);
889 nrefs = refs & 0x7FFFFFFF;
890 if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
891 return;
892 }
893
894 /*
895 * Really for sure now.
896 */
265b0d4a
MD
897 objcache_put(dsched_tdio_cache, tdio);
898 atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
e02e815e
AH
899}
900
901void
902dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
903{
265b0d4a
MD
904 int refs;
905 int nrefs;
e02e815e 906
265b0d4a
MD
907 /*
908 * Handle 1->0 transitions for tdctx and nested destruction
909 * recursions. If the refs are already in destruction mode (bit 31
910 * set) on the 1->0 transition we don't try to destruct it again.
911 *
912 * 0x80000001->0x80000000 transitions are handled normally and
 913 * thus avoid nested destruction.
914 */
915 for (;;) {
916 refs = tdctx->refcount;
917 cpu_ccfence();
918 nrefs = refs - 1;
919
920 KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
921 if (nrefs) {
922 if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
923 break;
924 continue;
925 }
926 nrefs = 0x80000000;
927 if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
928 dsched_thread_ctx_destroy(tdctx);
929 break;
930 }
931 }
932}
e02e815e 933
265b0d4a
MD
934static void
935dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
936{
937 struct dsched_thread_io *tdio;
e02e815e 938
e02e815e 939#if 0
265b0d4a
MD
940 kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
941 print_backtrace(8);
e02e815e 942#endif
265b0d4a
MD
943 DSCHED_GLOBAL_THREAD_CTX_LOCK();
944
c7a0a046
AH
945 lockmgr(&tdctx->lock, LK_EXCLUSIVE);
946
265b0d4a
MD
947 while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
948 KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
949 TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
950 atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
951 tdio->tdctx = NULL;
5374d04f
MD
952 lockmgr(&tdctx->lock, LK_RELEASE); /* avoid deadlock */
953 dsched_thread_io_unref_destroy(tdio);
954 lockmgr(&tdctx->lock, LK_EXCLUSIVE);
265b0d4a
MD
955 }
956 KKASSERT(tdctx->refcount == 0x80000000);
957 TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);
e02e815e 958
c7a0a046
AH
959 lockmgr(&tdctx->lock, LK_RELEASE);
960
265b0d4a 961 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
e02e815e 962
265b0d4a
MD
963 objcache_put(dsched_tdctx_cache, tdctx);
964 atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
e02e815e
AH
965}
966
5374d04f
MD
967/*
968 * Ensures that a tdio is assigned to tdctx and disk.
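 * Two refs are taken up front: one to keep the tdio from being ripped
 * out while it is being linked, one for the diskctx list.  If a tdctx
 * is supplied the first ref is inherited by the tdctx list, otherwise
 * it is dropped again before returning.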
969 */
970void
e02e815e 971dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
5374d04f 972 struct dsched_policy *pol)
e02e815e
AH
973{
974 struct dsched_thread_io *tdio;
975#if 0
976 dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
977#endif
978 tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
979 bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);
980
5374d04f
MD
981 dsched_thread_io_ref(tdio); /* prevent ripout */
982 dsched_thread_io_ref(tdio); /* for diskctx ref */
e02e815e
AH
983
984 DSCHED_THREAD_IO_LOCKINIT(tdio);
985 tdio->dp = dp;
986
987 tdio->diskctx = dsched_get_disk_priv(dp);
988 TAILQ_INIT(&tdio->queue);
989
990 if (pol->new_tdio)
991 pol->new_tdio(tdio);
992
076dc4bb 993 lockmgr(&tdio->diskctx->lock, LK_EXCLUSIVE);
e02e815e 994 TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
265b0d4a 995 atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
076dc4bb 996 lockmgr(&tdio->diskctx->lock, LK_RELEASE);
e02e815e
AH
997
998 if (tdctx) {
5374d04f
MD
999 /*
1000 * Put the tdio in the tdctx list. Inherit the temporary
1001 * ref (one ref for each list).
1002 */
1003 DSCHED_THREAD_CTX_LOCK(tdctx);
e02e815e
AH
1004 tdio->tdctx = tdctx;
1005 tdio->p = tdctx->p;
e02e815e 1006 TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
265b0d4a 1007 atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
5374d04f
MD
1008 DSCHED_THREAD_CTX_UNLOCK(tdctx);
1009 } else {
1010 dsched_thread_io_unref(tdio);
e02e815e
AH
1011 }
1012
c7a0a046
AH
1013 tdio->debug_policy = pol;
1014 tdio->debug_inited = 0xF00F1234;
1015
e02e815e 1016 atomic_add_int(&dsched_stats.tdio_allocations, 1);
e02e815e
AH
1017}
1018
1019
1020struct dsched_disk_ctx *
1021dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
1022{
1023 struct dsched_disk_ctx *diskctx;
1024
1025 diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
1026 bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
1027 dsched_disk_ctx_ref(diskctx);
1028 diskctx->dp = dp;
1029 DSCHED_DISK_CTX_LOCKINIT(diskctx);
1030 TAILQ_INIT(&diskctx->tdio_list);
09f2bfe9
BP
1031 /*
 1032 * XXX: magic number 32: most devices have a tag queue
 1033 * of depth 32.
 1034 * Better to retrieve a more precise value from the driver.
1035 */
1036 diskctx->max_tag_queue_depth = 32;
1037 diskctx->current_tag_queue_depth = 0;
e02e815e
AH
1038
1039 atomic_add_int(&dsched_stats.diskctx_allocations, 1);
1040 if (pol->new_diskctx)
1041 pol->new_diskctx(diskctx);
1042 return diskctx;
1043}
1044
1045
1046struct dsched_thread_ctx *
1047dsched_thread_ctx_alloc(struct proc *p)
1048{
1049 struct dsched_thread_ctx *tdctx;
5374d04f
MD
1050 struct disk marker;
1051 struct disk *dp;
e02e815e
AH
1052
1053 tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
1054 bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
1055 dsched_thread_ctx_ref(tdctx);
1056#if 0
1057 kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
1058#endif
1059 DSCHED_THREAD_CTX_LOCKINIT(tdctx);
1060 TAILQ_INIT(&tdctx->tdio_list);
1061 tdctx->p = p;
1062
0b81692c 1063 DSCHED_GLOBAL_THREAD_CTX_LOCK();
5374d04f
MD
1064 dp = NULL;
1065 while ((dp = disk_enumerate(&marker, dp)) != NULL)
1066 dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
e02e815e 1067
e02e815e
AH
1068 TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
1069 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
1070
1071 atomic_add_int(&dsched_stats.tdctx_allocations, 1);
1072 /* XXX: no callback here */
1073 return tdctx;
1074}
1075
1076void
1077policy_new(struct disk *dp, struct dsched_policy *pol) {
1078 struct dsched_thread_ctx *tdctx;
1079 struct dsched_disk_ctx *diskctx;
e02e815e
AH
1080
1081 diskctx = dsched_disk_ctx_alloc(dp, pol);
1082 dsched_disk_ctx_ref(diskctx);
1083 dsched_set_disk_priv(dp, diskctx);
1084
5374d04f
MD
1085 /*
1086 * XXX this is really really expensive!
1087 */
1088 TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link)
1089 dsched_thread_io_alloc(dp, tdctx, pol);
e02e815e
AH
1090}
1091
1092void
1093policy_destroy(struct disk *dp) {
1094 struct dsched_disk_ctx *diskctx;
1095
1096 diskctx = dsched_get_disk_priv(dp);
1097 KKASSERT(diskctx != NULL);
1098
1099 dsched_disk_ctx_unref(diskctx); /* from prepare */
1100 dsched_disk_ctx_unref(diskctx); /* from alloc */
1101
1102 dsched_set_disk_priv(dp, NULL);
1103}
1104
b80a9543
AH
1105void
1106dsched_new_buf(struct buf *bp)
1107{
e02e815e
AH
1108 struct dsched_thread_ctx *tdctx = NULL;
1109
1110 if (dsched_inited == 0)
1111 return;
1112
1113 if (curproc != NULL) {
1114 tdctx = dsched_get_proc_priv(curproc);
1115 } else {
1116 /* This is a kernel thread, so no proc info is available */
1117 tdctx = dsched_get_thread_priv(curthread);
1118 }
1119
1120#if 0
1121 /*
1122 * XXX: hack. we don't want this assert because we aren't catching all
 1123 * threads. mi_startup() is still getting away without a tdctx.
1124 */
1125
 1126 /* by now we should have a tdctx. if not, something bad is going on */
1127 KKASSERT(tdctx != NULL);
1128#endif
1129
1130 if (tdctx) {
1131 dsched_thread_ctx_ref(tdctx);
1132 }
1133 dsched_set_buf_priv(bp, tdctx);
b80a9543
AH
1134}
1135
aa166ad1
AH
1136void
1137dsched_exit_buf(struct buf *bp)
1138{
e02e815e
AH
1139 struct dsched_thread_ctx *tdctx;
1140
1141 tdctx = dsched_get_buf_priv(bp);
1142 if (tdctx != NULL) {
1143 dsched_clr_buf_priv(bp);
1144 dsched_thread_ctx_unref(tdctx);
1145 }
aa166ad1 1146}
b80a9543
AH
1147
1148void
1149dsched_new_proc(struct proc *p)
1150{
e02e815e
AH
1151 struct dsched_thread_ctx *tdctx;
1152
1153 if (dsched_inited == 0)
1154 return;
1155
1156 KKASSERT(p != NULL);
1157
1158 tdctx = dsched_thread_ctx_alloc(p);
1159 tdctx->p = p;
1160 dsched_thread_ctx_ref(tdctx);
1161
1162 dsched_set_proc_priv(p, tdctx);
1163 atomic_add_int(&dsched_stats.nprocs, 1);
b80a9543
AH
1164}
1165
1166
1167void
1168dsched_new_thread(struct thread *td)
1169{
e02e815e
AH
1170 struct dsched_thread_ctx *tdctx;
1171
1172 if (dsched_inited == 0)
1173 return;
1174
1175 KKASSERT(td != NULL);
1176
1177 tdctx = dsched_thread_ctx_alloc(NULL);
1178 tdctx->td = td;
1179 dsched_thread_ctx_ref(tdctx);
1180
1181 dsched_set_thread_priv(td, tdctx);
1182 atomic_add_int(&dsched_stats.nthreads, 1);
b80a9543
AH
1183}
1184
1185void
1186dsched_exit_proc(struct proc *p)
1187{
e02e815e
AH
1188 struct dsched_thread_ctx *tdctx;
1189
1190 if (dsched_inited == 0)
1191 return;
1192
1193 KKASSERT(p != NULL);
1194
1195 tdctx = dsched_get_proc_priv(p);
1196 KKASSERT(tdctx != NULL);
1197
1198 tdctx->dead = 0xDEAD;
b5d7061d 1199 dsched_set_proc_priv(p, NULL);
e02e815e
AH
1200
1201 dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1202 dsched_thread_ctx_unref(tdctx); /* one for ref */
1203 atomic_subtract_int(&dsched_stats.nprocs, 1);
b80a9543
AH
1204}
1205
1206
1207void
1208dsched_exit_thread(struct thread *td)
1209{
e02e815e
AH
1210 struct dsched_thread_ctx *tdctx;
1211
1212 if (dsched_inited == 0)
1213 return;
1214
1215 KKASSERT(td != NULL);
1216
1217 tdctx = dsched_get_thread_priv(td);
1218 KKASSERT(tdctx != NULL);
1219
1220 tdctx->dead = 0xDEAD;
1221 dsched_set_thread_priv(td, 0);
1222
1223 dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1224 dsched_thread_ctx_unref(tdctx); /* one for ref */
1225 atomic_subtract_int(&dsched_stats.nthreads, 1);
b80a9543
AH
1226}
1227
5374d04f
MD
1228/*
 1229 * Sets up a ref'd tdio for the current thread on the given disk.
1230 *
1231 * tdio may have additional refs for the diskctx and tdctx it resides on.
1232 */
1233void
89dabacd 1234dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
5374d04f
MD
1235 struct dsched_policy *pol)
1236{
89dabacd 1237 struct dsched_thread_ctx *tdctx;
89dabacd 1238
0b81692c
MD
1239 DSCHED_GLOBAL_THREAD_CTX_LOCK();
1240
89dabacd
AH
1241 tdctx = dsched_get_thread_priv(curthread);
1242 KKASSERT(tdctx != NULL);
5374d04f 1243 dsched_thread_io_alloc(diskctx->dp, tdctx, pol);
0b81692c
MD
1244
1245 DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
89dabacd
AH
1246}
1247
e02e815e
AH
1248/* DEFAULT NOOP POLICY */
1249
0160356d 1250static int
9495e99b 1251noop_prepare(struct dsched_disk_ctx *diskctx)
b80a9543
AH
1252{
1253 return 0;
1254}
1255
0160356d 1256static void
9495e99b 1257noop_teardown(struct dsched_disk_ctx *diskctx)
b80a9543
AH
1258{
1259
1260}
1261
0160356d 1262static void
9495e99b 1263noop_cancel(struct dsched_disk_ctx *diskctx)
b80a9543
AH
1264{
1265
1266}
1267
0160356d 1268static int
9495e99b 1269noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
e02e815e 1270 struct bio *bio)
b80a9543 1271{
e02e815e 1272 dsched_strategy_raw(diskctx->dp, bio);
b80a9543 1273#if 0
9495e99b 1274 dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
b80a9543
AH
1275#endif
1276 return 0;
1277}
1278
b80a9543
AH
1279/*
1280 * SYSINIT stuff
1281 */
b80a9543
AH
1282static void
1283dsched_init(void)
1284{
e02e815e
AH
1285 dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
1286 NULL, NULL, NULL,
1287 objcache_malloc_alloc,
1288 objcache_malloc_free,
1289 &dsched_thread_io_malloc_args );
1290
1291 dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
1292 NULL, NULL, NULL,
1293 objcache_malloc_alloc,
1294 objcache_malloc_free,
1295 &dsched_thread_ctx_malloc_args );
1296
1297 dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
1298 NULL, NULL, NULL,
1299 objcache_malloc_alloc,
1300 objcache_malloc_free,
1301 &dsched_disk_ctx_malloc_args );
1302
1303 bzero(&dsched_stats, sizeof(struct dsched_stats));
1304
279e9fd5 1305 lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
e02e815e
AH
1306 DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();
1307
9495e99b 1308 dsched_register(&dsched_noop_policy);
e02e815e
AH
1309
1310 dsched_inited = 1;
b80a9543
AH
1311}
1312
1313static void
1314dsched_uninit(void)
1315{
1316}
1317
e02e815e
AH
1318SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
1319SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
b80a9543
AH
1320
1321/*
1322 * SYSCTL stuff
1323 */
e02e815e 1324static int
279e9fd5 1325sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
e02e815e
AH
1326{
1327 return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
1328}
1329
279e9fd5
AH
1330static int
1331sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
1332{
1333 struct dsched_policy *pol = NULL;
1334 int error, first = 1;
1335
1336 lockmgr(&dsched_lock, LK_EXCLUSIVE);
1337
1338 while ((pol = dsched_policy_enumerate(pol))) {
1339 if (!first) {
1340 error = SYSCTL_OUT(req, " ", 1);
1341 if (error)
1342 break;
1343 } else {
1344 first = 0;
1345 }
1346 error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
1347 if (error)
1348 break;
1349
1350 }
1351
1352 lockmgr(&dsched_lock, LK_RELEASE);
1353
1354 error = SYSCTL_OUT(req, "", 1);
9495e99b 1355
279e9fd5
AH
1356 return error;
1357}
1358
bc3c9325
AH
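/*
 * Per-disk "dsched.policy.<disk>" sysctl handler: a read returns the
 * name of the disk's current policy, a write looks up the given policy
 * name and switches the disk to it via dsched_switch().  Unknown names
 * are silently ignored.
 */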
1359static int
1360sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
1361{
1362 char buf[DSCHED_POLICY_NAME_LENGTH];
1363 struct dsched_disk_ctx *diskctx = arg1;
1364 struct dsched_policy *pol = NULL;
1365 int error;
1366
1367 if (diskctx == NULL) {
1368 return 0;
1369 }
1370
1371 lockmgr(&dsched_lock, LK_EXCLUSIVE);
1372
1373 pol = diskctx->dp->d_sched_policy;
1374 memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1375
1376 error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1377 if (error || req->newptr == NULL) {
1378 lockmgr(&dsched_lock, LK_RELEASE);
1379 return (error);
1380 }
1381
1382 pol = dsched_find_policy(buf);
1383 if (pol == NULL) {
1384 lockmgr(&dsched_lock, LK_RELEASE);
1385 return 0;
1386 }
1387
1388 dsched_switch(diskctx->dp, pol);
1389
1390 lockmgr(&dsched_lock, LK_RELEASE);
1391
1392 return error;
1393}
1394
9495e99b
AH
1395static int
1396sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
1397{
1398 char buf[DSCHED_POLICY_NAME_LENGTH];
1399 struct dsched_policy *pol = NULL;
1400 int error;
1401
1402 lockmgr(&dsched_lock, LK_EXCLUSIVE);
1403
1404 pol = default_policy;
1405 memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1406
1407 error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1408 if (error || req->newptr == NULL) {
1409 lockmgr(&dsched_lock, LK_RELEASE);
1410 return (error);
1411 }
1412
1413 pol = dsched_find_policy(buf);
1414 if (pol == NULL) {
1415 lockmgr(&dsched_lock, LK_RELEASE);
1416 return 0;
1417 }
1418
1419 default_set = 1;
1420 default_policy = pol;
1421
1422 lockmgr(&dsched_lock, LK_RELEASE);
1423
1424 return error;
1425}
1426
e02e815e
AH
1427SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
1428 "Disk Scheduler Framework (dsched) magic");
bc3c9325
AH
1429SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
1430 "List of disks and their policies");
e02e815e
AH
1431SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
1432 0, "Enable dsched debugging");
1433SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
279e9fd5 1434 0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
e02e815e 1435 "dsched statistics");
279e9fd5
AH
1436SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
1437 NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
9495e99b
AH
1438SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
1439 NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
279e9fd5 1440
bc3c9325
AH
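/*
 * Register the per-disk policy node under dsched.policy (e.g. a
 * hypothetical "dsched.policy.da0"), backed by sysctl_dsched_policy so
 * the active scheduler can be inspected and changed at runtime.
 */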
1441static void
1442dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
1443{
1444 if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
1445 diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
1446 sysctl_ctx_init(&diskctx->sysctl_ctx);
1447 }
1448
1449 SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
1450 OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
1451 diskctx, 0, sysctl_dsched_policy, "A", "policy");
1452}