/*
 * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  Can be used for OS-abstraction but the main
 * purpose is to allow larger buffers to be used against hammer2_chain's
 * using smaller allocations, without causing deadlocks.
 */
#include "hammer2.h"

static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
	if (io1->pbase < io2->pbase)
		return(-1);
	if (io1->pbase > io2->pbase)
		return(1);
	return(0);
}

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
		off_t, pbase);
struct hammer2_cleanupcb_info {
	struct hammer2_io_tree tmptree;
	int	count;
};

#define HAMMER2_GETBLK_GOOD	0
#define HAMMER2_GETBLK_QUEUED	1
#define HAMMER2_GETBLK_OWNED	2
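
/*
 * For reference: dio->refs packs the reference count and the state
 * flags tested throughout this file into a single 32 bit field so both
 * can be updated with one atomic op.  The layout below mirrors the
 * definitions in hammer2.h (shown only as an illustration, the header
 * is authoritative):
 *
 *	HAMMER2_DIO_INPROG  0x80000000	I/O or buffer setup in progress
 *	HAMMER2_DIO_GOOD    0x40000000	dio->bp is stable and valid
 *	HAMMER2_DIO_WAITING 0x20000000	iocb(s) queued on dio->iocbq
 *	HAMMER2_DIO_DIRTY   0x10000000	flush on last release
 *	HAMMER2_DIO_MASK    0x0FFFFFFF	reference count
 */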
/*
 * Allocate/Locate the requested dio, reference it, issue or queue iocb.
 */
void
hammer2_io_getblk(hammer2_mount_t *hmp, off_t lbase, int lsize,
		  hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio;
	hammer2_io_t *xio;
	off_t pbase;
	off_t pmask;
	/*
	 * XXX after free, buffer reuse case w/ different size can clash
	 * with the dio cache.  Let's avoid it for now.  Ultimately we need
	 * to invalidate the dio cache when freeing blocks to allow a mix
	 * of 16KB and 64KB block sizes.
	 */
	/*int psize = hammer2_devblksize(lsize);*/
	int psize = HAMMER2_PBUFSIZE;
	uint32_t refs;

	pmask = ~(hammer2_off_t)(psize - 1);

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
	lbase &= ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 */
	spin_lock_shared(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio) {
		if ((atomic_fetchadd_int(&dio->refs, 1) &
		     HAMMER2_DIO_MASK) == 0) {
			atomic_add_int(&dio->hmp->iofree_count, -1);
		}
		spin_unlock_shared(&hmp->io_spin);
	} else {
		spin_unlock_shared(&hmp->io_spin);
		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
		dio->hmp = hmp;
		dio->pbase = pbase;
		dio->psize = psize;
		dio->refs = 1;
		spin_init(&dio->spin, "h2dio");
		TAILQ_INIT(&dio->iocbq);
		spin_lock(&hmp->io_spin);
		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
		if (xio == NULL) {
			atomic_add_int(&hammer2_dio_count, 1);
			spin_unlock(&hmp->io_spin);
		} else {
			if ((atomic_fetchadd_int(&xio->refs, 1) &
			     HAMMER2_DIO_MASK) == 0) {
				atomic_add_int(&xio->hmp->iofree_count, -1);
			}
			spin_unlock(&hmp->io_spin);
			kfree(dio, M_HAMMER2);
			dio = xio;
		}
	}
	/*
	 * Obtain/Validate the buffer.
	 */
	iocb->dio = dio;

	if (dio->act < 5)		/* SMP race ok */
		++dio->act;

	for (;;) {
		refs = dio->refs;
		cpu_ccfence();

		/*
		 * Issue the iocb immediately if the buffer is already good.
		 * Once set GOOD cannot be cleared until refs drops to 0.
		 */
		if (refs & HAMMER2_DIO_GOOD) {
			iocb->callback(iocb);
			break;
		}

		/*
		 * Try to own the DIO by setting INPROG so we can issue
		 * I/O on it.
		 */
		if (refs & HAMMER2_DIO_INPROG) {
			/*
			 * If DIO_INPROG is already set then set WAITING and
			 * queue the iocb.
			 */
			spin_lock(&dio->spin);
			if (atomic_cmpset_int(&dio->refs, refs,
					      refs | HAMMER2_DIO_WAITING)) {
				iocb->flags |= HAMMER2_IOCB_ONQ |
					       HAMMER2_IOCB_INPROG;
				TAILQ_INSERT_TAIL(&dio->iocbq, iocb, entry);
				spin_unlock(&dio->spin);
				break;
			}
			spin_unlock(&dio->spin);
			/* retry */
		} else {
			/*
			 * If DIO_INPROG is not set then set it and issue the
			 * callback immediately to start I/O.
			 */
			if (atomic_cmpset_int(&dio->refs, refs,
					      refs | HAMMER2_DIO_INPROG)) {
				iocb->flags |= HAMMER2_IOCB_INPROG;
				iocb->callback(iocb);
				break;
			}
			/* retry */
		}
	}
}
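
/*
 * Illustrative sketch (not part of the driver): the lock-free update
 * pattern used throughout this file.  The current value of dio->refs
 * is sampled, a new value is computed, and atomic_cmpset_int() either
 * commits the transition or fails because another cpu raced us, in
 * which case we simply re-sample and retry.  The helper name below is
 * hypothetical.
 */
#if 0
static void
example_dio_set_flag(hammer2_io_t *dio, uint32_t flag)
{
	uint32_t orefs;

	for (;;) {
		orefs = dio->refs;	/* sample current value */
		cpu_ccfence();		/* prevent compiler re-load */
		if (atomic_cmpset_int(&dio->refs, orefs, orefs | flag))
			break;		/* transition committed */
		/* lost the race, retry */
	}
}
#endif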
/*
 * The originator of the iocb is finished with it.
 */
void
hammer2_io_complete(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	hammer2_iocb_t *cbtmp;
	uint32_t orefs;
	uint32_t nrefs;
	uint32_t oflags;
	uint32_t nflags;

	/*
	 * If IOCB_INPROG was not set completion is synchronous due to the
	 * buffer already being good.  We can simply set IOCB_DONE and return.
	 * In this situation DIO_INPROG is not set and we have no visibility
	 * on dio->bp.
	 */
	if ((iocb->flags & HAMMER2_IOCB_INPROG) == 0) {
		atomic_set_int(&iocb->flags, HAMMER2_IOCB_DONE);
		return;
	}

	/*
	 * The iocb was queued, obtained DIO_INPROG, and its callback was
	 * made.  The callback is now complete.  We still own DIO_INPROG.
	 *
	 * We can set DIO_GOOD if no error occurred, which gives certain
	 * stability guarantees to dio->bp and allows other accessors to
	 * short-cut access.  DIO_GOOD cannot be cleared until the last
	 * reference is dropped.
	 */
	KKASSERT(dio->refs & HAMMER2_DIO_INPROG);
	if (dio->bp) {
		BUF_KERNPROC(dio->bp);
		if ((dio->bp->b_flags & B_ERROR) == 0) {
			KKASSERT(dio->bp->b_flags & B_CACHE);
			atomic_set_int(&dio->refs, HAMMER2_DIO_GOOD);
		}
	}

	/*
	 * Clean up the dio before marking the iocb as being done.  If another
	 * iocb is pending we chain to it while leaving DIO_INPROG set (it
	 * will call io completion and presumably clear DIO_INPROG).
	 *
	 * Otherwise if no other iocbs are pending we clear DIO_INPROG before
	 * finishing up the cbio.  This means that DIO_INPROG is cleared at
	 * the end of the chain before ANY of the cbios are marked done.
	 *
	 * NOTE: The TAILQ is not stable until the spin-lock is held.
	 */
	for (;;) {
		orefs = dio->refs;
		nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);

		if (orefs & HAMMER2_DIO_WAITING) {
			spin_lock(&dio->spin);
			cbtmp = TAILQ_FIRST(&dio->iocbq);
			if (cbtmp) {
				/*
				 * NOTE: flags not adjusted in this case.
				 *	 Flags will be adjusted by the last
				 *	 iocb in the chain.
				 */
				TAILQ_REMOVE(&dio->iocbq, cbtmp, entry);
				spin_unlock(&dio->spin);
				cbtmp->callback(cbtmp);	/* chained */
				break;
			} else if (atomic_cmpset_int(&dio->refs,
						     orefs, nrefs)) {
				spin_unlock(&dio->spin);
				break;
			}
			spin_unlock(&dio->spin);
			/* retry */
		} else if (atomic_cmpset_int(&dio->refs, orefs, nrefs)) {
			break;
		}
		/* retry */
	}

	/*
	 * Mark the iocb as done and wakeup any waiters.  This is done after
	 * all iocb chains have been called back and after DIO_INPROG has been
	 * cleared.  This avoids races against ref count drops by the waiting
	 * threads (a hard but not impossible SMP race) which might result in
	 * a 1->0 transition of the refs while DIO_INPROG is still set.
	 */
	for (;;) {
		oflags = iocb->flags;
		cpu_ccfence();
		nflags = oflags;
		nflags &= ~(HAMMER2_IOCB_WAKEUP | HAMMER2_IOCB_INPROG);
		nflags |= HAMMER2_IOCB_DONE;

		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
			if (oflags & HAMMER2_IOCB_WAKEUP)
				wakeup(iocb);
			/* SMP: iocb is now stale */
			break;
		}
		/* retry */
	}
}
/*
 * Wait for an iocb's I/O to finish.
 */
void
hammer2_iocb_wait(hammer2_iocb_t *iocb)
{
	uint32_t oflags;
	uint32_t nflags;

	for (;;) {
		oflags = iocb->flags;
		cpu_ccfence();
		nflags = oflags | HAMMER2_IOCB_WAKEUP;
		if (oflags & HAMMER2_IOCB_DONE)
			break;
		tsleep_interlock(iocb, 0);
		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
			tsleep(iocb, PINTERLOCKED, "h2iocb", hz);
		}
	}
}
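
/*
 * Note on the loop above: tsleep_interlock() is issued before the
 * cmpset that publishes IOCB_WAKEUP.  If hammer2_io_complete() sets
 * IOCB_DONE and calls wakeup() in the window between our flag test and
 * the tsleep(), the PINTERLOCKED tsleep returns immediately rather
 * than missing the wakeup, and the loop re-tests IOCB_DONE.
 */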
/*
 * Release our ref on *diop.
 *
 * On the last ref we must atomically clear DIO_GOOD and set DIO_INPROG,
 * then dispose of the underlying buffer.
 */
void
hammer2_io_putblk(hammer2_io_t **diop)
{
	hammer2_mount_t *hmp;
	hammer2_io_t *dio;
	hammer2_iocb_t iocb;
	struct buf *bp;
	off_t peof;
	off_t pbase;
	int psize;
	uint32_t refs;

	dio = *diop;
	*diop = NULL;

	/*
	 * Drop refs, on 1->0 transition clear flags, set INPROG.
	 */
	for (;;) {
		refs = dio->refs;
		cpu_ccfence();

		if ((refs & HAMMER2_DIO_MASK) == 1) {
			if (refs & HAMMER2_DIO_INPROG) {
				hammer2_iocb_t *xcb;

				xcb = TAILQ_FIRST(&dio->iocbq);
				kprintf("BAD REFS dio %p %08x/%08x, cbio %p\n",
					dio, refs, dio->refs, xcb);
				if (xcb)
					kprintf("  IOCB: func=%p dio=%p cl=%p ch=%p ptr=%p\n",
						xcb->callback, xcb->dio,
						xcb->cluster, xcb->chain,
						xcb->ptr);
			}
			KKASSERT((refs & HAMMER2_DIO_INPROG) == 0);
			if (atomic_cmpset_int(&dio->refs, refs,
					      ((refs - 1) &
					       ~(HAMMER2_DIO_GOOD |
						 HAMMER2_DIO_DIRTY)) |
					      HAMMER2_DIO_INPROG)) {
				break;
			}
			/* retry */
		} else {
			if (atomic_cmpset_int(&dio->refs, refs, refs - 1))
				return;
			/* retry */
		}
	}

	/*
	 * We have set DIO_INPROG to gain control of the buffer and we have
	 * cleared DIO_GOOD to prevent other accessors from thinking it is
	 * still good.
	 *
	 * We can now dispose of the buffer, and should do it before calling
	 * io_complete() in case there's a race against a new reference
	 * which causes io_complete() to chain and instantiate the bp again.
	 */
	pbase = dio->pbase;
	psize = dio->psize;
	bp = dio->bp;
	dio->bp = NULL;

	if (refs & HAMMER2_DIO_GOOD) {
		KKASSERT(bp != NULL);
		if (refs & HAMMER2_DIO_DIRTY) {
			if (hammer2_cluster_enable) {
				peof = (pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				cluster_write(bp, peof, psize, 4);
			} else {
				bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	} else if (bp) {
		if (refs & HAMMER2_DIO_DIRTY)
			bdwrite(bp);
		else
			brelse(bp);
	}

	/*
	 * The instant we call io_complete the dio is a free agent again and
	 * can be ripped out from under us.  We can clean up our final
	 * DIO_INPROG by simulating an iocb completion.
	 */
	hmp = dio->hmp;				/* extract fields */
	atomic_add_int(&hmp->iofree_count, 1);
	cpu_ccfence();

	iocb.dio = dio;
	iocb.flags = HAMMER2_IOCB_INPROG;
	hammer2_io_complete(&iocb);
	dio = NULL;				/* dio stale */

	/*
	 * We cache free buffers so re-use cases can use a shared lock, but
	 * if too many build up we have to clean them out.
	 */
	if (hmp->iofree_count > 1000) {
		struct hammer2_cleanupcb_info info;

		RB_INIT(&info.tmptree);
		spin_lock(&hmp->io_spin);
		if (hmp->iofree_count > 1000) {
			info.count = hmp->iofree_count / 2;
			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
				hammer2_io_cleanup_callback, &info);
		}
		spin_unlock(&hmp->io_spin);
		hammer2_io_cleanup(hmp, &info.tmptree);
	}
}
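
/*
 * Reference summary of the buffer disposition performed by
 * hammer2_io_putblk() on the final ref, as implemented above:
 *
 *	GOOD + DIRTY		cluster_write() or bdwrite()
 *	GOOD, clean		bqrelse() (brelse() if B_ERROR/B_INVAL/
 *				B_RELBUF is set)
 *	not GOOD + DIRTY	bdwrite()
 *	not GOOD, clean		brelse()
 */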
/*
 * Cleanup any dio's with (INPROG | refs) == 0.
 *
 * Called to clean up cached DIOs on umount after all activity has been
 * flushed.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
	struct hammer2_cleanupcb_info *info = arg;
	hammer2_io_t *xio;

	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
		if (dio->act > 0) {
			--dio->act;
			return 0;
		}
		KKASSERT(dio->bp == NULL);
		RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
		xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
		KKASSERT(xio == NULL);
		if (--info->count <= 0)	/* limit scan */
			return(-1);
	}
	return 0;
}

void
hammer2_io_cleanup(hammer2_mount_t *hmp, struct hammer2_io_tree *tree)
{
	hammer2_io_t *dio;

	while ((dio = RB_ROOT(tree)) != NULL) {
		RB_REMOVE(hammer2_io_tree, tree, dio);
		KKASSERT(dio->bp == NULL &&
		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
		kfree(dio, M_HAMMER2);
		atomic_add_int(&hammer2_dio_count, -1);
		atomic_add_int(&hmp->iofree_count, -1);
	}
}
/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
	struct buf *bp;
	int off;

	bp = dio->bp;
	KKASSERT(bp != NULL);
	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
	KKASSERT(off >= 0 && off < bp->b_bufsize);
	return(bp->b_data + off);
}
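
/*
 * Worked example with hypothetical numbers: a 16KB logical block has
 * its size radix (14, since 1 << 14 == 16384) encoded in the low bits
 * of lbase.  For lbase = 0x1400E inside a 64KB physical buffer with
 * bp->b_loffset = 0x10000, masking off HAMMER2_OFF_MASK_RADIX yields
 * 0x14000 and off = 0x14000 - 0x10000 = 0x4000, i.e. the second 16KB
 * sub-block of the underlying buffer.
 */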
/*
 * Helpers for hammer2_io_new*() functions
 */
static
void
hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	int gbctl = (iocb->flags & HAMMER2_IOCB_QUICK) ? GETBLK_NOWAIT : 0;

	/*
	 * If IOCB_INPROG is not set the dio already has a good buffer and we
	 * can't mess with it other than zero the requested range.
	 *
	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
	 * do what needs to be done with dio->bp.
	 */
	if (iocb->flags & HAMMER2_IOCB_INPROG) {
		if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
			if (iocb->lsize == dio->psize) {
				/*
				 * Fully covered buffer, try to optimize to
				 * avoid any I/O.  We might already have the
				 * buffer due to iocb chaining.
				 */
				if (dio->bp == NULL) {
					dio->bp = getblk(dio->hmp->devvp,
							 dio->pbase,
							 dio->psize,
							 gbctl, 0);
				}
				if (dio->bp) {
					vfs_bio_clrbuf(dio->bp);
					dio->bp->b_flags |= B_CACHE;
				}
			} else if (iocb->flags & HAMMER2_IOCB_QUICK) {
				/*
				 * Partial buffer, quick mode.  Do nothing.
				 * Do not instantiate the buffer or try to
				 * mark it B_CACHE because other portions of
				 * the buffer might have to be read by other
				 * accessors.
				 */
			} else if (dio->bp == NULL ||
				   (dio->bp->b_flags & B_CACHE) == 0) {
				/*
				 * Partial buffer, normal mode, requires
				 * read-before-write.  Chain the read.
				 *
				 * We might already have the buffer due to
				 * iocb chaining.  XXX unclear if we really
				 * need to write/release it and reacquire
				 * it with a read.
				 *
				 * QUEUE ASYNC I/O, IOCB IS NOT YET COMPLETE.
				 */
				if (dio->bp) {
					if (dio->refs & HAMMER2_DIO_DIRTY)
						bdwrite(dio->bp);
					else
						bqrelse(dio->bp);
					dio->bp = NULL;
				}
				atomic_set_int(&iocb->flags,
					       HAMMER2_IOCB_READ);
				breadcb(dio->hmp->devvp,
					dio->pbase, dio->psize,
					hammer2_io_callback, iocb);
				return;
			} /* else buffer is good */
		} /* else callback from breadcb is complete */
	}
	if (dio->bp) {
		if (iocb->flags & HAMMER2_IOCB_ZERO)
			bzero(hammer2_io_data(dio, iocb->lbase), iocb->lsize);
		atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
	}
	hammer2_io_complete(iocb);
}
static
int
_hammer2_io_new(hammer2_mount_t *hmp, off_t lbase, int lsize,
		hammer2_io_t **diop, int flags)
{
	hammer2_iocb_t iocb;
	hammer2_io_t *dio;

	iocb.callback = hammer2_iocb_new_callback;
	iocb.cluster = NULL;
	iocb.chain = NULL;
	iocb.ptr = NULL;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	iocb.flags = flags;
	iocb.error = 0;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	dio = *diop = iocb.dio;

	return (iocb.error);
}

int
hammer2_io_new(hammer2_mount_t *hmp, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, lbase, lsize, diop, HAMMER2_IOCB_ZERO));
}

int
hammer2_io_newnz(hammer2_mount_t *hmp, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, lbase, lsize, diop, 0));
}

int
hammer2_io_newq(hammer2_mount_t *hmp, off_t lbase, int lsize,
		hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, lbase, lsize, diop, HAMMER2_IOCB_QUICK));
}
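
/*
 * Illustrative usage of the hammer2_io_new*() variants (hypothetical
 * caller, sketch only): hammer2_io_new() zeroes the requested range,
 * hammer2_io_newnz() skips the bzero when the caller will overwrite
 * the whole range anyway, and hammer2_io_newq() is the quick form
 * which never instantiates or reads a partially covered buffer.
 */
#if 0
	hammer2_io_t *dio;
	char *data;

	if (hammer2_io_new(hmp, lbase, lsize, &dio) == 0) {
		data = hammer2_io_data(dio, lbase);	/* zeroed range */
		/* ... fill in data ... */
		hammer2_io_bdwrite(&dio);	/* mark dirty, drop ref */
	}
#endif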
static
void
hammer2_iocb_bread_callback(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	off_t peof;
	int error;

	/*
	 * If IOCB_INPROG is not set the dio already has a good buffer and we
	 * can't mess with it other than zero the requested range.
	 *
	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
	 * do what needs to be done with dio->bp.
	 */
	if (iocb->flags & HAMMER2_IOCB_INPROG) {
		if (dio->bp && (dio->bp->b_flags & B_CACHE)) {
			/*
			 * Already good, likely due to being chained from
			 * another iocb.
			 */
			error = 0;
		} else if (hammer2_cluster_enable) {
			/*
			 * Synchronous cluster I/O for now.
			 */
			if (dio->bp) {
				bqrelse(dio->bp);
				dio->bp = NULL;
			}
			peof = (dio->pbase + HAMMER2_SEGMASK64) &
			       ~HAMMER2_SEGMASK64;
			error = cluster_read(dio->hmp->devvp, peof, dio->pbase,
					     dio->psize,
					     dio->psize, HAMMER2_PBUFSIZE*4,
					     &dio->bp);
		} else {
			/*
			 * Synchronous I/O for now.
			 */
			if (dio->bp) {
				bqrelse(dio->bp);
				dio->bp = NULL;
			}
			error = bread(dio->hmp->devvp, dio->pbase,
				      dio->psize, &dio->bp);
		}
		if (error) {
			brelse(dio->bp);
			dio->bp = NULL;
		}
		iocb->error = error;
	}
	hammer2_io_complete(iocb);
}
int
hammer2_io_bread(hammer2_mount_t *hmp, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	hammer2_iocb_t iocb;
	hammer2_io_t *dio;

	iocb.callback = hammer2_iocb_bread_callback;
	iocb.cluster = NULL;
	iocb.chain = NULL;
	iocb.ptr = NULL;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	iocb.flags = 0;
	iocb.error = 0;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	dio = *diop = iocb.dio;

	return (iocb.error);
}
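
/*
 * Illustrative read path (hypothetical caller, sketch only): bread the
 * logical range, access it in place, then release the ref.  The putblk
 * performed by hammer2_io_brelse() also NULLs the caller's dio pointer.
 */
#if 0
	hammer2_io_t *dio;
	char *data;
	int error;

	error = hammer2_io_bread(hmp, lbase, lsize, &dio);
	if (error == 0) {
		data = hammer2_io_data(dio, lbase);
		/* ... inspect data ... */
	}
	if (dio)
		hammer2_io_brelse(&dio);
#endif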
/*
 * System buf/bio async callback extracts the iocb and chains
 * to the iocb callback.
 */
void
hammer2_io_callback(struct bio *bio)
{
	struct buf *dbp = bio->bio_buf;
	hammer2_iocb_t *iocb = bio->bio_caller_info1.ptr;
	hammer2_io_t *dio;

	dio = iocb->dio;
	if ((bio->bio_flags & BIO_DONE) == 0)
		bpdone(dbp, 0);
	bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
	dio->bp = bio->bio_buf;
	iocb->callback(iocb);
}
void
hammer2_io_bawrite(hammer2_io_t **diop)
{
	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

void
hammer2_io_bdwrite(hammer2_io_t **diop)
{
	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

int
hammer2_io_bwrite(hammer2_io_t **diop)
{
	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
	return (0);	/* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
	atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
}

void
hammer2_io_setinval(hammer2_io_t *dio, u_int bytes)
{
	if ((u_int)dio->psize == bytes)
		dio->bp->b_flags |= B_INVAL | B_RELBUF;
}

void
hammer2_io_brelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}

void
hammer2_io_bqrelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}

int
hammer2_io_isdirty(hammer2_io_t *dio)
{
	return((dio->refs & HAMMER2_DIO_DIRTY) != 0);
}