/*
 * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "hammer2.h"

/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  Can be used for OS-abstraction but the main
 * purpose is to allow larger buffers to be used against hammer2_chain's
 * using smaller allocations, without causing deadlocks.
 */
static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
	/*
	 * Sort ascending by pbase; the keyed RB_LOOKUP generated below
	 * depends on this ordering.
	 */
	if (io2->pbase < io1->pbase)
		return(1);
	if (io2->pbase > io1->pbase)
		return(-1);
	return(0);
}

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
		off_t, pbase);

struct hammer2_cleanupcb_info {
	struct hammer2_io_tree tmptree;
	int	count;
};

#define HAMMER2_GETBLK_GOOD	0
#define HAMMER2_GETBLK_QUEUED	1
#define HAMMER2_GETBLK_OWNED	2
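
/*
 * Note on dio->refs (the flag values are defined in hammer2.h): the low
 * bits (HAMMER2_DIO_MASK) hold the reference count while the high bits
 * hold state flags (DIO_GOOD, DIO_INPROG, DIO_WAITING, DIO_DIRTY).
 * Packing both into one word lets the code below combine a flag change
 * and a reference transition in a single atomic_cmpset_int().
 */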

/*
 * Allocate/Locate the requested dio, reference it, issue or queue iocb.
 */
void
hammer2_io_getblk(hammer2_mount_t *hmp, off_t lbase, int lsize,
		  hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio;
	hammer2_io_t *xio;
	off_t pbase;
	off_t pmask;
	/*
	 * XXX after free, buffer reuse case w/ different size can clash
	 * with dio cache.  Let's avoid it for now.  Ultimately we need to
	 * invalidate the dio cache when freeing blocks to allow a mix
	 * of 16KB and 64KB block sizes.
	 */
	/*int psize = hammer2_devblksize(lsize);*/
	int psize = HAMMER2_PBUFSIZE;
	int refs;

	pmask = ~(hammer2_off_t)(psize - 1);

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
	lbase &= ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
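
	/*
	 * Worked example of the alignment math above (values assumed for
	 * illustration; the block radix occupies the low bits of the
	 * media offset per HAMMER2_OFF_MASK_RADIX).  With psize = 64KB,
	 * pmask is ~0xffff.  For lbase = 0x418c0e the low bits encode
	 * radix 14, so lsize must be 1 << 14 = 16384; stripping the
	 * radix yields lbase = 0x418c00 and pbase = 0x410000.  The
	 * second KKASSERT checks (0x418c00 + 0x4000 - 1) & pmask ==
	 * 0x410000, i.e. the logical range does not straddle two
	 * device buffers.
	 */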

	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 */
	spin_lock_shared(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio) {
		if ((atomic_fetchadd_int(&dio->refs, 1) &
		     HAMMER2_DIO_MASK) == 0) {
			atomic_add_int(&dio->hmp->iofree_count, -1);
		}
		spin_unlock_shared(&hmp->io_spin);
	} else {
		spin_unlock_shared(&hmp->io_spin);
		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
		dio->hmp = hmp;
		dio->pbase = pbase;
		dio->psize = psize;
		dio->refs = 1;
		spin_init(&dio->spin, "h2dio");
		TAILQ_INIT(&dio->iocbq);
		spin_lock(&hmp->io_spin);
		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
		if (xio == NULL) {
			atomic_add_int(&hammer2_dio_count, 1);
			spin_unlock(&hmp->io_spin);
		} else {
			if ((atomic_fetchadd_int(&xio->refs, 1) &
			     HAMMER2_DIO_MASK) == 0) {
				atomic_add_int(&xio->hmp->iofree_count, -1);
			}
			spin_unlock(&hmp->io_spin);
			kfree(dio, M_HAMMER2);
			dio = xio;
		}
	}

	/*
	 * Obtain/Validate the buffer.
	 */
	iocb->dio = dio;

	for (;;) {
		refs = dio->refs;
		cpu_ccfence();

		/*
		 * Issue the iocb immediately if the buffer is already good.
		 * Once set, GOOD cannot be cleared until refs drops to 0.
		 */
		if (refs & HAMMER2_DIO_GOOD) {
			iocb->callback(iocb);
			break;
		}

		/*
		 * Try to own the DIO by setting INPROG so we can issue
		 * the I/O on it.
		 */
		if (refs & HAMMER2_DIO_INPROG) {
			/*
			 * If DIO_INPROG is already set then set WAITING and
			 * queue the iocb.
			 */
			spin_lock(&dio->spin);
			if (atomic_cmpset_int(&dio->refs, refs,
					      refs | HAMMER2_DIO_WAITING)) {
				iocb->flags |= HAMMER2_IOCB_ONQ |
					       HAMMER2_IOCB_INPROG;
				TAILQ_INSERT_TAIL(&dio->iocbq, iocb, entry);
				spin_unlock(&dio->spin);
				break;
			}
			spin_unlock(&dio->spin);
			/* retry */
		} else {
			/*
			 * If DIO_INPROG is not set then set it and issue the
			 * callback immediately to start I/O.
			 */
			if (atomic_cmpset_int(&dio->refs, refs,
					      refs | HAMMER2_DIO_INPROG)) {
				iocb->flags |= HAMMER2_IOCB_INPROG;
				iocb->callback(iocb);
				break;
			}
			/* retry */
		}
	}
}
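
/*
 * Illustrative caller sketch (a minimal sketch, excluded from the
 * build; the completion callback name is hypothetical).  This mirrors
 * the pattern _hammer2_io_new() and hammer2_io_bread() use below:
 */
#if 0
	hammer2_iocb_t iocb;

	iocb.callback = my_iocb_callback;	/* hypothetical */
	iocb.cluster = NULL;
	iocb.chain = NULL;
	iocb.ptr = NULL;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	iocb.flags = 0;
	iocb.error = 0;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
#endif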

/*
 * The originator of the iocb is finished with it.
 */
void
hammer2_io_complete(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	uint32_t orefs;
	uint32_t nrefs;
	uint32_t oflags;
	uint32_t nflags;

	/*
	 * If IOCB_INPROG was not set completion is synchronous due to the
	 * buffer already being good.  We can simply set IOCB_DONE and
	 * return.  In this situation DIO_INPROG is not set and we have no
	 * visibility on dio->bp.
	 */
	if ((iocb->flags & HAMMER2_IOCB_INPROG) == 0) {
		iocb->flags |= HAMMER2_IOCB_DONE;
		return;
	}

	/*
	 * The iocb was queued, obtained DIO_INPROG, and its callback was
	 * made.  The callback is now complete.  We still own DIO_INPROG.
	 *
	 * We can set DIO_GOOD if no error occurred, which gives certain
	 * stability guarantees to dio->bp and allows other accessors to
	 * short-cut access.  DIO_GOOD cannot be cleared until the last
	 * reference is dropped.
	 */
	KKASSERT(dio->refs & HAMMER2_DIO_INPROG);
	if (dio->bp) {
		BUF_KERNPROC(dio->bp);
		if ((dio->bp->b_flags & B_ERROR) == 0) {
			KKASSERT(dio->bp->b_flags & B_CACHE);
			atomic_set_int(&dio->refs, HAMMER2_DIO_GOOD);
		}
	}

	/*
	 * Clear INPROG and WAKEUP, set DONE, and wake up any waiter.
	 */
	for (;;) {
		oflags = iocb->flags;
		cpu_ccfence();
		nflags = oflags;
		nflags &= ~(HAMMER2_IOCB_WAKEUP | HAMMER2_IOCB_INPROG);
		nflags |= HAMMER2_IOCB_DONE;

		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
			if (oflags & HAMMER2_IOCB_WAKEUP)
				wakeup(iocb);
			/* SMP: iocb is now stale */
			break;
		}
		/* retry */
	}

	/*
	 * Now finish up the dio.  If another iocb is pending, chain to it
	 * leaving DIO_INPROG set.  Otherwise clear DIO_INPROG
	 * (and DIO_WAITING).
	 *
	 * NOTE: The TAILQ is not stable until the spin-lock is held.
	 */
	for (;;) {
		orefs = dio->refs;
		nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);

		if (orefs & HAMMER2_DIO_WAITING) {
			spin_lock(&dio->spin);
			iocb = TAILQ_FIRST(&dio->iocbq);
			if (iocb) {
				TAILQ_REMOVE(&dio->iocbq, iocb, entry);
				spin_unlock(&dio->spin);
				iocb->callback(iocb);	/* chained */
				break;
			} else if (atomic_cmpset_int(&dio->refs,
						     orefs, nrefs)) {
				spin_unlock(&dio->spin);
				break;
			}
			spin_unlock(&dio->spin);
			/* retry */
		} else if (atomic_cmpset_int(&dio->refs, orefs, nrefs)) {
			break;
		}
		/* retry */
	}
	/* SMP: dio is stale now */
}
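
/*
 * Design note: chaining the next queued iocb while still holding
 * DIO_INPROG hands buffer ownership directly to the next waiter, so
 * the buffer never passes through an unowned state between consumers
 * and no release/reacquire race window exists.
 */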

/*
 * Wait for an iocb's I/O to finish.
 */
void
hammer2_iocb_wait(hammer2_iocb_t *iocb)
{
	uint32_t oflags;
	uint32_t nflags;

	for (;;) {
		oflags = iocb->flags;
		cpu_ccfence();
		nflags = oflags | HAMMER2_IOCB_WAKEUP;
		if (oflags & HAMMER2_IOCB_DONE)
			break;
		tsleep_interlock(iocb, 0);
		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
			tsleep(iocb, PINTERLOCKED, "h2iocb", hz);
		}
	}
}
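
/*
 * The tsleep_interlock()/PINTERLOCKED pair above closes the race where
 * hammer2_io_complete() sets IOCB_DONE and issues the wakeup between
 * our DONE test and the sleep: the interlock registers the sleep
 * address first, so a wakeup in that window is not lost, and the
 * cmpset only installs WAKEUP (and sleeps) if the flags were unchanged.
 */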

/*
 * Release our ref on *diop.
 *
 * On the last ref we must atomically clear DIO_GOOD and set DIO_INPROG,
 * then dispose of the underlying buffer.
 */
void
hammer2_io_putblk(hammer2_io_t **diop)
{
	hammer2_mount_t *hmp;
	hammer2_io_t *dio;
	hammer2_iocb_t iocb;
	struct buf *bp;
	off_t peof;
	off_t pbase;
	int psize;
	int refs;

	dio = *diop;
	*diop = NULL;

	/*
	 * Drop refs, on 1->0 transition clear flags, set INPROG.
	 */
	for (;;) {
		refs = dio->refs;

		if ((refs & HAMMER2_DIO_MASK) == 1) {
			KKASSERT((refs & HAMMER2_DIO_INPROG) == 0);
			if (atomic_cmpset_int(&dio->refs, refs,
					      ((refs - 1) &
					       ~(HAMMER2_DIO_GOOD |
						 HAMMER2_DIO_DIRTY)) |
					      HAMMER2_DIO_INPROG)) {
				break;
			}
			/* retry */
		} else {
			if (atomic_cmpset_int(&dio->refs, refs, refs - 1))
				return;
			/* retry */
		}
	}

	/*
	 * We have set DIO_INPROG to gain control of the buffer and we have
	 * cleared DIO_GOOD to prevent other accessors from thinking it is
	 * still good.
	 *
	 * We can now dispose of the buffer, and should do it before calling
	 * io_complete() in case there's a race against a new reference
	 * which causes io_complete() to chain and instantiate the bp again.
	 */
	pbase = dio->pbase;
	psize = dio->psize;
	bp = dio->bp;
	dio->bp = NULL;

	if (refs & HAMMER2_DIO_GOOD) {
		KKASSERT(bp != NULL);
		if (refs & HAMMER2_DIO_DIRTY) {
			if (hammer2_cluster_enable) {
				peof = (pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				cluster_write(bp, peof, psize, 4);
			} else {
				bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	} else if (bp) {
		if (refs & HAMMER2_DIO_DIRTY) {
			bdwrite(bp);
		} else {
			brelse(bp);
		}
	}

	/*
	 * The instant we call io_complete() the dio is a free agent again
	 * and can be ripped out from under us.  We can clean up our final
	 * DIO_INPROG by simulating an iocb completion.
	 */
	hmp = dio->hmp;				/* extract fields */
	atomic_add_int(&hmp->iofree_count, 1);
	cpu_ccfence();

	iocb.dio = dio;
	iocb.flags = HAMMER2_IOCB_INPROG;
	hammer2_io_complete(&iocb);
	dio = NULL;				/* dio stale */

	/*
	 * We cache free buffers so re-use cases can use a shared lock, but
	 * if too many build up we have to clean them out.
	 */
	if (hmp->iofree_count > 1000) {
		struct hammer2_cleanupcb_info info;

		RB_INIT(&info.tmptree);
		spin_lock(&hmp->io_spin);
		if (hmp->iofree_count > 1000) {
			info.count = hmp->iofree_count / 2;
			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
				hammer2_io_cleanup_callback, &info);
		}
		spin_unlock(&hmp->io_spin);
		hammer2_io_cleanup(hmp, &info.tmptree);
	}
}
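
/*
 * Design note: the iofree_count threshold is tested twice above, once
 * unlocked as a cheap filter and once under io_spin before committing
 * to the scan, so the common release path never takes the exclusive
 * lock.  Trimming only half the pool keeps some hysteresis and avoids
 * thrashing right at the threshold.
 */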

/*
 * Clean up any dio's with (INPROG | refs) == 0.
 *
 * Called to clean up cached DIOs on umount after all activity has been
 * flushed.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
	struct hammer2_cleanupcb_info *info = arg;
	hammer2_io_t *xio;

	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
		KKASSERT(dio->bp == NULL);
		RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
		xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
		KKASSERT(xio == NULL);
		if (--info->count <= 0)		/* limit scan */
			return(-1);
	}
	return(0);
}

void
hammer2_io_cleanup(hammer2_mount_t *hmp, struct hammer2_io_tree *tree)
{
	hammer2_io_t *dio;

	while ((dio = RB_ROOT(tree)) != NULL) {
		RB_REMOVE(hammer2_io_tree, tree, dio);
		KKASSERT(dio->bp == NULL &&
		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
		kfree(dio, M_HAMMER2);
		atomic_add_int(&hammer2_dio_count, -1);
		atomic_add_int(&hmp->iofree_count, -1);
	}
}

/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
	struct buf *bp;
	int off;

	bp = dio->bp;
	KKASSERT(bp != NULL);
	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
	KKASSERT(off >= 0 && off < bp->b_bufsize);
	return(bp->b_data + off);
}
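
/*
 * Example of the offset math above (illustrative values): with the
 * underlying 64KB device buffer loaded at b_loffset = 0x410000 and a
 * logical base of 0x418c0e, stripping the low radix bits gives
 * 0x418c00, so off = 0x418c00 - 0x410000 = 0x8c00 and the logical
 * block's data begins 0x8c00 bytes into bp->b_data.
 */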

/*
 * Helpers for hammer2_io_new*() functions
 */
static
void
hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	int gbctl = (iocb->flags & HAMMER2_IOCB_QUICK) ? GETBLK_NOWAIT : 0;

	/*
	 * If IOCB_INPROG is not set the dio already has a good buffer and we
	 * can't mess with it other than zero the requested range.
	 *
	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
	 * do what needs to be done with dio->bp.
	 */
	if (iocb->flags & HAMMER2_IOCB_INPROG) {
		if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
			if (iocb->lsize == dio->psize) {
				/*
				 * Fully covered buffer, try to optimize to
				 * avoid any I/O.  We might already have the
				 * buffer due to iocb chaining.
				 */
				if (dio->bp == NULL) {
					dio->bp = getblk(dio->hmp->devvp,
							 dio->pbase, dio->psize,
							 gbctl, 0);
				}
				if (dio->bp) {
					vfs_bio_clrbuf(dio->bp);
					dio->bp->b_flags |= B_CACHE;
				}
			} else if (iocb->flags & HAMMER2_IOCB_QUICK) {
				/*
				 * Partial buffer, quick mode.  Do nothing.
				 * Do not instantiate the buffer or try to
				 * mark it B_CACHE because other portions of
				 * the buffer might have to be read by other
				 * accessors.
				 */
			} else if (dio->bp == NULL ||
				   (dio->bp->b_flags & B_CACHE) == 0) {
				/*
				 * Partial buffer, normal mode, requires
				 * read-before-write.  Chain the read.
				 *
				 * We might already have the buffer due to
				 * iocb chaining.  XXX unclear if we really
				 * need to write/release it and reacquire
				 * it with a read.
				 *
				 * QUEUE ASYNC I/O, IOCB IS NOT YET COMPLETE.
				 */
				if (dio->bp) {
					if (dio->refs & HAMMER2_DIO_DIRTY)
						bdwrite(dio->bp);
					else
						bqrelse(dio->bp);
					dio->bp = NULL;
				}
				iocb->flags |= HAMMER2_IOCB_READ;
				breadcb(dio->hmp->devvp,
					dio->pbase, dio->psize,
					hammer2_io_callback, iocb);
				return;
			} /* else buffer is good */
		} /* else callback from breadcb is complete */
	}

	if (iocb->flags & HAMMER2_IOCB_ZERO)
		bzero(hammer2_io_data(dio, iocb->lbase), iocb->lsize);
	atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);

	hammer2_io_complete(iocb);
}
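
/*
 * Summary of the three write paths above: a fully covering buffer is
 * instantiated and cleared without any media read, a partial buffer in
 * QUICK mode is deliberately left untouched for other accessors, and a
 * partial buffer otherwise requires read-before-write, which is chained
 * asynchronously through breadcb() and re-enters this callback via
 * hammer2_io_callback() with IOCB_READ set.
 */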

static
int
_hammer2_io_new(hammer2_mount_t *hmp, off_t lbase, int lsize,
		hammer2_io_t **diop, int flags)
{
	hammer2_iocb_t iocb;
	hammer2_io_t *dio;

	iocb.callback = hammer2_iocb_new_callback;
	iocb.cluster = NULL;
	iocb.chain = NULL;
	iocb.ptr = NULL;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	iocb.flags = flags;
	iocb.error = 0;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	dio = *diop = iocb.dio;

	return (iocb.error);
}

int
hammer2_io_new(hammer2_mount_t *hmp, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, lbase, lsize, diop, HAMMER2_IOCB_ZERO));
}

int
hammer2_io_newnz(hammer2_mount_t *hmp, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, lbase, lsize, diop, 0));
}

int
hammer2_io_newq(hammer2_mount_t *hmp, off_t lbase, int lsize,
		hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, lbase, lsize, diop, HAMMER2_IOCB_QUICK));
}
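
/*
 * Usage sketch (a minimal sketch, excluded from the build; "src" is
 * hypothetical and error handling is elided).  hammer2_io_new()
 * returns a zeroed buffer for a freshly allocated block,
 * hammer2_io_newnz() skips the zero-fill when the caller overwrites
 * the whole range anyway, and hammer2_io_newq() is the QUICK
 * (non-blocking getblk) variant:
 */
#if 0
	hammer2_io_t *dio;

	if (hammer2_io_new(hmp, lbase, lsize, &dio) == 0) {
		bcopy(src, hammer2_io_data(dio, lbase), lsize);
		hammer2_io_bdwrite(&dio);	/* marks dirty, releases */
	}
#endif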

/*
 * Helper for hammer2_io_bread().
 */
static
void
hammer2_iocb_bread_callback(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	off_t peof;
	int error;

	/*
	 * If IOCB_INPROG is not set the dio already has a good buffer and we
	 * can't mess with it other than zero the requested range.
	 *
	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
	 * do what needs to be done with dio->bp.
	 */
	if (iocb->flags & HAMMER2_IOCB_INPROG) {
		if (dio->bp && (dio->bp->b_flags & B_CACHE)) {
			/*
			 * Already good, likely due to being chained from
			 * another iocb.
			 */
			error = 0;
		} else if (hammer2_cluster_enable) {
			/*
			 * Synchronous cluster I/O for now.
			 */
			if (dio->bp) {
				bqrelse(dio->bp);
				dio->bp = NULL;
			}
			peof = (dio->pbase + HAMMER2_SEGMASK64) &
			       ~HAMMER2_SEGMASK64;
			error = cluster_read(dio->hmp->devvp, peof, dio->pbase,
					     dio->psize,
					     dio->psize, HAMMER2_PBUFSIZE*4,
					     &dio->bp);
		} else {
			/*
			 * Synchronous I/O for now.
			 */
			if (dio->bp) {
				bqrelse(dio->bp);
				dio->bp = NULL;
			}
			error = bread(dio->hmp->devvp, dio->pbase,
				      dio->psize, &dio->bp);
		}
		if (error) {
			brelse(dio->bp);
			dio->bp = NULL;
		}
	}
	hammer2_io_complete(iocb);
}

int
hammer2_io_bread(hammer2_mount_t *hmp, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	hammer2_iocb_t iocb;
	hammer2_io_t *dio;

	iocb.callback = hammer2_iocb_bread_callback;
	iocb.cluster = NULL;
	iocb.chain = NULL;
	iocb.ptr = NULL;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	iocb.flags = 0;
	iocb.error = 0;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	dio = *diop = iocb.dio;

	return (iocb.error);
}
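
/*
 * Usage sketch for the read path (a minimal sketch, excluded from the
 * build; error handling is abbreviated and the consumer code is
 * hypothetical):
 */
#if 0
	hammer2_io_t *dio;
	char *data;

	if (hammer2_io_bread(hmp, lbase, lsize, &dio) == 0) {
		data = hammer2_io_data(dio, lbase);
		/* ... consume lsize bytes at data ... */
	}
	hammer2_io_brelse(&dio);	/* drops the ref taken by bread */
#endif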

/*
 * System buf/bio async callback extracts the iocb and chains
 * to the iocb callback.
 */
void
hammer2_io_callback(struct bio *bio)
{
	struct buf *dbp = bio->bio_buf;
	hammer2_iocb_t *iocb = bio->bio_caller_info1.ptr;
	hammer2_io_t *dio;

	dio = iocb->dio;
	if ((bio->bio_flags & BIO_DONE) == 0)
		bpdone(dbp, 0);
	bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
	dio->bp = bio->bio_buf;
	iocb->callback(iocb);
}
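
/*
 * Note: this is the completion routine handed to breadcb() in
 * hammer2_iocb_new_callback(); bio_caller_info1.ptr carries the iocb
 * across the async boundary and the iocb callback is re-entered with
 * IOCB_READ set to finish the read-before-write case.
 */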

void
hammer2_io_bawrite(hammer2_io_t **diop)
{
	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

void
hammer2_io_bdwrite(hammer2_io_t **diop)
{
	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

int
hammer2_io_bwrite(hammer2_io_t **diop)
{
	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
	return (0);	/* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
	atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
}

void
hammer2_io_setinval(hammer2_io_t *dio, u_int bytes)
{
	if ((u_int)dio->psize == bytes)
		dio->bp->b_flags |= B_INVAL | B_RELBUF;
}

void
hammer2_io_brelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}

void
hammer2_io_bqrelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}

int
hammer2_io_isdirty(hammer2_io_t *dio)
{
	return((dio->refs & HAMMER2_DIO_DIRTY) != 0);
}
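
/*
 * Note: bawrite/bdwrite/bwrite currently share one implementation --
 * each marks the dio dirty and releases it, with the actual write
 * strategy (clustered vs. delayed) chosen in hammer2_io_putblk().  The
 * XXX above marks the synchronous-write error return as unimplemented.
 */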