2 * Copyright (c) 2013-2018 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
/*
 * Operation codes accepted as the (op) argument of hammer2_io_getblk():
 *
 *   READ  - passed by hammer2_io_bread(); the buffer contents are read.
 *   NEW   - passed by hammer2_io_new(); the requested range is cleared
 *           with bzero().
 *   NEWNZ - passed by hammer2_io_newnz(); the DIO is flagged DIRTY and no
 *           bzero() appears under this case label.
 *   READQ - passed by hammer2_io_getquick(); the DIO is looked up without
 *           being created, after which the op is rewritten to READ.
 */
37 #define HAMMER2_DOP_READ 1
38 #define HAMMER2_DOP_NEW 2
39 #define HAMMER2_DOP_NEWNZ 3
40 #define HAMMER2_DOP_READQ 4
43 * Implements an abstraction layer for synchronous and asynchronous
44 * buffered device I/O. Can be used as an OS-abstraction but the main
45 * purpose is to allow larger buffers to be used against hammer2_chains
46 * using smaller allocations, without causing deadlocks.
48 * The DIOs also record temporary state with limited persistence. This
49 * feature is used to keep track of dedupable blocks.
/* Forward declarations of file-local helpers defined further down. */
51 static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
52 static void dio_write_stats_update(hammer2_io_t *dio, struct buf *bp);
/*
 * Ordering function for the DIO red-black tree: two DIOs are compared by
 * their pbase field only.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably negative / positive / zero in the usual comparator style.
 */
55 hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
57 if (io1->pbase < io2->pbase)
59 if (io1->pbase > io2->pbase)
/*
 * Instantiate the red-black tree support code for hammer2_io_tree. Nodes
 * are struct hammer2_io linked through the rbnode field and ordered by
 * hammer2_io_cmp(); the trailing off_t argument names the lookup key type
 * (RB_LOOKUP is called with a pbase value elsewhere in this file).
 */
64 RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
65 RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
/*
 * Scratch state handed to hammer2_io_cleanup_callback() through its (arg)
 * pointer while the DIO tree is scanned with RB_SCAN.
 */
68 struct hammer2_cleanupcb_info {
/* Idle DIOs pulled out of the device tree are parked here until freed */
69 struct hammer2_io_tree tmptree;
/*
 * Build a 64-bit mask with one bit per 1KB (1 << 10 bytes) unit for the
 * byte range (off, bytes) inside the DIO. (off) may carry radix bits in
 * its low part; they are masked off before use.
 *
 * i is the index of the first 1KB unit of the range relative to
 * dio->pbase, and the mask receives (bytes >> 10) consecutive low bits.
 *
 * NOTE(review): the return statements and the final shift of the mask by
 * i are not visible in this excerpt.
 */
76 hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
81 if (bytes < 1024) /* smaller chunks not supported */
85 * Calculate crc check mask for larger chunks
87 i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
88 HAMMER2_PBUFMASK) >> 10;
/*
 * A range covering the whole physical buffer is tested separately.
 * NOTE(review): presumably because the shift count below would reach the
 * full width of uint64_t for a 64KB buffer - confirm.
 */
89 if (i == 0 && bytes == HAMMER2_PBUFSIZE)
/* Set the low (bytes / 1024) bits */
91 mask = ((uint64_t)1U << (bytes >> 10)) - 1;
99 * Returns the DIO corresponding to the data|radix, creating it if necessary.
101 * If createit is 0, NULL can be returned indicating that the DIO does not
102 * exist. (btype) is ignored when createit is 0.
/*
 * Look up the DIO covering (data_off) in hmp->iotree and take a reference
 * on it. When no DIO exists and (createit) is non-zero a new DIO is
 * allocated with kmalloc() and inserted into the tree.
 *
 * (data_off) carries the size radix in its low bits. The HAMMER2_DIO_GOOD
 * bit of the sampled refs value is tested on both the lookup and the
 * insert-collision paths.
 *
 * NOTE(review): the statements guarded by those GOOD tests (presumably
 * stores to *isgoodp) and the return statement are not visible in this
 * excerpt.
 */
106 hammer2_io_alloc(hammer2_dev_t *hmp, hammer2_key_t data_off, uint8_t btype,
107 int createit, int *isgoodp)
/* Every DIO uses the fixed physical buffer size; pmask aligns down to it */
118 psize = HAMMER2_PBUFSIZE;
119 pmask = ~(hammer2_off_t)(psize - 1);
/* The radix bits of data_off give log2 of the logical size */
120 lsize = 1 << (int)(data_off & HAMMER2_OFF_MASK_RADIX);
121 lbase = data_off & ~HAMMER2_OFF_MASK_RADIX;
122 pbase = lbase & pmask;
/*
 * Reject a zero physical base and any logical range whose last byte falls
 * into a different physical buffer than its first byte.
 */
124 if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
125 kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
126 pbase, lbase, lsize, pmask);
128 KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
132 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
134 hammer2_spin_sh(&hmp->io_spin);
135 dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
/*
 * refs holds the value sampled by the fetch-add (presumably the value
 * before the increment). A zero reference count in it means the DIO was
 * idle, so it no longer counts as free.
 */
137 refs = atomic_fetchadd_64(&dio->refs, 1);
138 if ((refs & HAMMER2_DIO_MASK) == 0) {
139 atomic_add_int(&dio->hmp->iofree_count, -1);
141 if (refs & HAMMER2_DIO_GOOD)
143 hammer2_spin_unsh(&hmp->io_spin);
144 } else if (createit) {
/* Not found: the shared lock is released before kmalloc() is called */
146 hammer2_spin_unsh(&hmp->io_spin);
147 dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
152 dio->refs = refs + 1;
/*
 * Insert under the exclusive lock. xio appears to be the node already
 * present for this pbase when another thread won the insertion race.
 */
154 hammer2_spin_ex(&hmp->io_spin);
155 xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
157 atomic_add_int(&hammer2_dio_count, 1);
158 hammer2_spin_unex(&hmp->io_spin);
/* Collision path: reference the existing DIO and free the new one */
160 refs = atomic_fetchadd_64(&xio->refs, 1);
161 if ((refs & HAMMER2_DIO_MASK) == 0)
162 atomic_add_int(&xio->hmp->iofree_count, -1);
163 if (refs & HAMMER2_DIO_GOOD)
165 hammer2_spin_unex(&hmp->io_spin);
166 kfree(dio, M_HAMMER2);
/* NOTE(review): presumably the not-found, no-create path */
170 hammer2_spin_unsh(&hmp->io_spin);
181 * Acquire the requested dio. If DIO_GOOD is not set we must instantiate
182 * a buffer. If set the buffer already exists and is good to go.
/*
 * Obtain a referenced DIO for the logical block (lbase, lsize) and prepare
 * its buffer according to (op), one of the HAMMER2_DOP_* codes.
 *
 * (btype) is compared against HAMMER2_BREF_TYPE_DATA to choose the
 * B_NOTMETA buffer flag and to pick between the hammer2_cluster_data_read
 * and hammer2_cluster_meta_read settings.
 *
 * The dio->refs word doubles as a state machine: GOOD means a buffer is
 * already instantiated, INPROG marks a thread working on the buffer, and
 * WAITING asks that thread to wake sleepers when it finishes.
 *
 * NOTE(review): loop structure, break/return statements and several
 * argument lines are not visible in this excerpt.
 */
185 hammer2_io_getblk(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize, int op)
196 bflags = ((btype == HAMMER2_BREF_TYPE_DATA) ? B_NOTMETA : 0);
/* The radix encoded in the low bits of lbase must agree with lsize */
199 KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
/* READQ looks the DIO up with createit == 0, then proceeds as READ */
201 if (op == HAMMER2_DOP_READQ) {
202 dio = hammer2_io_alloc(hmp, lbase, btype, 0, &isgood);
205 op = HAMMER2_DOP_READ;
207 dio = hammer2_io_alloc(hmp, lbase, btype, 1, &isgood);
215 * Buffer is already good, handle the op and return.
217 if (orefs & HAMMER2_DIO_GOOD) {
223 case HAMMER2_DOP_NEW:
224 bzero(hammer2_io_data(dio, lbase), lsize);
226 case HAMMER2_DOP_NEWNZ:
227 atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
229 case HAMMER2_DOP_READ:
/*
 * INPROG is held by some other thread: record WAITING with a
 * compare-and-set and sleep on the dio, with a timeout of hz ticks.
 */
240 if (orefs & HAMMER2_DIO_INPROG) {
241 nrefs = orefs | HAMMER2_DIO_WAITING;
242 tsleep_interlock(dio, 0);
243 if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
244 tsleep(dio, PINTERLOCKED, "h2dio", hz);
/* INPROG is clear: try to claim it for this thread */
248 nrefs = orefs | HAMMER2_DIO_INPROG;
249 if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
256 * We break to here if GOOD is not set and we acquired INPROG for
259 KKASSERT(dio->bp == NULL);
/* Pick the read clustering setting by block type */
260 if (btype == HAMMER2_BREF_TYPE_DATA)
261 hce = hammer2_cluster_data_read;
263 hce = hammer2_cluster_meta_read;
/* Does the request cover the entire physical buffer? */
266 if (dio->pbase == (lbase & ~HAMMER2_OFF_MASK_RADIX) &&
267 dio->psize == lsize) {
269 case HAMMER2_DOP_NEW:
/* Whole-buffer NEW and NEWNZ obtain the buffer through getblk() */
270 case HAMMER2_DOP_NEWNZ:
271 dio->bp = getblk(dio->hmp->devvp,
272 dio->pbase, dio->psize,
274 if (op == HAMMER2_DOP_NEW) {
276 bzero(dio->bp->b_data, dio->psize);
278 atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
280 case HAMMER2_DOP_READ:
284 * Synchronous cluster I/O for now.
286 peof = (dio->pbase + HAMMER2_SEGMASK64) &
289 error = cluster_readx(dio->hmp->devvp,
293 HAMMER2_PBUFSIZE*hce,
/*
 * NOTE(review): breadnx() is the alternative to cluster_readx(); the
 * condition choosing between them is not visible in this excerpt.
 */
297 error = breadnx(dio->hmp->devvp, dio->pbase,
299 NULL, NULL, 0, &dio->bp);
305 * Synchronous cluster I/O for now.
307 peof = (dio->pbase + HAMMER2_SEGMASK64) &
309 error = cluster_readx(dio->hmp->devvp,
310 peof, dio->pbase, dio->psize,
312 dio->psize, HAMMER2_PBUFSIZE*hce,
315 error = breadnx(dio->hmp->devvp, dio->pbase,
317 NULL, NULL, 0, &dio->bp);
/*
 * This appears to be the partial-buffer path: the buffer is read first and
 * NEW / NEWNZ are then applied to the requested sub-range only.
 */
324 case HAMMER2_DOP_NEW:
326 bzero(hammer2_io_data(dio, lbase), lsize);
328 case HAMMER2_DOP_NEWNZ:
329 atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
331 case HAMMER2_DOP_READ:
337 * Tell the kernel that the buffer cache is not
338 * meta-data based on the btype. This allows
339 * swapcache to distinguish between data and
343 case HAMMER2_BREF_TYPE_DATA:
344 dio->bp->b_flags |= B_NOTMETA;
354 BUF_KERNPROC(dio->bp);
355 dio->bp->b_flags &= ~B_AGE;
360 * Clear INPROG and WAITING, set GOOD wake up anyone waiting.
365 nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_WAITING);
367 nrefs |= HAMMER2_DIO_GOOD;
368 if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
/* Sleepers only exist if WAITING was set in the value just replaced */
369 if (orefs & HAMMER2_DIO_WAITING)
376 /* XXX error handling */
382 * Release our ref on *diop.
384 * On the 1->0 transition we clear DIO_GOOD, set DIO_INPROG, and dispose
385 * of dio->bp. Then we clean up DIO_INPROG and DIO_WAITING.
/*
 * Drop the reference held through *diop.
 *
 * A drop that is not the final one only updates dio->refs. The final drop
 * (1->0) claims INPROG, disposes of the underlying buffer (dirty buffers
 * go through the write path below), bumps hmp->iofree_count, and finally
 * trims the cache of idle DIOs when iofree_count exceeds the limit derived
 * from hammer2_dio_limit.
 *
 * NOTE(review): loop structure, break/return statements and the actual
 * buffer release calls are not visible in this excerpt.
 */
388 hammer2_io_putblk(hammer2_io_t **diop)
/* The caller must actually hold a reference */
403 KKASSERT((dio->refs & HAMMER2_DIO_MASK) != 0);
408 * On the 1->0 transition clear GOOD and set INPROG, and break.
409 * On any other transition we can return early.
415 if ((orefs & HAMMER2_DIO_MASK) == 1 &&
416 (orefs & HAMMER2_DIO_INPROG) == 0) {
418 * Lastdrop case, INPROG can be set. GOOD must be
419 * cleared to prevent the getblk shortcut.
422 nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
/* orefs keeps the old GOOD and DIRTY bits for the disposal code below */
423 nrefs |= HAMMER2_DIO_INPROG;
424 if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
426 } else if ((orefs & HAMMER2_DIO_MASK) == 1) {
428 * Lastdrop case, INPROG already set. We must
429 * wait for INPROG to clear.
431 nrefs = orefs | HAMMER2_DIO_WAITING;
432 tsleep_interlock(dio, 0);
/* Sleep (timeout hz ticks) only once WAITING has been recorded */
433 if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
434 tsleep(dio, PINTERLOCKED, "h2dio", hz);
/* NOTE(review): presumably the plain decrement for non-final drops */
442 if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
451 * Lastdrop (1->0 transition). INPROG has been set, GOOD and DIRTY
452 * have been cleared. iofree_count has not yet been incremented,
453 * note that another accessor race will decrement iofree_count so
454 * we have to increment it regardless.
456 * We can now dispose of the buffer, and should do it before calling
457 * io_complete() in case there's a race against a new reference
458 * which causes io_complete() to chain and instantiate the bp again.
465 if ((orefs & HAMMER2_DIO_GOOD) && bp) {
467 * Non-errored disposal of bp
469 if (orefs & HAMMER2_DIO_DIRTY) {
470 dio_write_stats_update(dio, bp);
473 * Allows dirty buffers to accumulate and
474 * possibly be canceled (e.g. by a 'rm'),
475 * will burst-write later.
477 * We normally do not allow the kernel to
478 * cluster dirty buffers because H2 already
479 * uses a large block size.
481 * NOTE: Do not use cluster_write() here. The
482 * problem is that due to the way chains
483 * are locked, buffers are cycled in and out
484 * quite often so the disposal here is not
485 * necessarily the final disposal. Avoid
486 * excessive rewriting of the same blocks
487 * by using bdwrite().
493 if ((hce = hammer2_cluster_write) > 0) {
495 * Allows write-behind to keep the buffer
498 peof = (pbase + HAMMER2_SEGMASK64) &
500 bp->b_flags |= B_CLUSTEROK;
501 cluster_write(bp, peof, psize, hce);
504 if (hammer2_cluster_write)
505 bp->b_flags |= B_CLUSTEROK;
507 bp->b_flags &= ~B_CLUSTEROK;
509 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
516 * Errored disposal of bp
522 * Update iofree_count before disposing of the dio
525 atomic_add_int(&hmp->iofree_count, 1);
528 * Clear INPROG, GOOD, and WAITING (GOOD should already be clear).
533 nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_GOOD |
534 HAMMER2_DIO_WAITING);
535 if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
536 if (orefs & HAMMER2_DIO_WAITING)
544 * We cache free buffers so re-use cases can use a shared lock, but
545 * if too many build up we have to clean them out.
547 dio_limit = hammer2_dio_limit;
550 if (dio_limit > 1024*1024)
551 dio_limit = 1024*1024;
552 if (hmp->iofree_count > dio_limit) {
553 struct hammer2_cleanupcb_info info;
555 RB_INIT(&info.tmptree);
556 hammer2_spin_ex(&hmp->io_spin);
557 if (hmp->iofree_count > dio_limit) {
558 info.count = hmp->iofree_count / 5;
559 RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
560 hammer2_io_cleanup_callback, &info);
562 hammer2_spin_unex(&hmp->io_spin);
563 hammer2_io_cleanup(hmp, &info.tmptree);
568 * Clean up any DIOs with (INPROG | refs) == 0.
570 * Called to clean up cached DIOs on umount after all activity has been
/*
 * RB_SCAN callback. (arg) is a struct hammer2_cleanupcb_info. A DIO with
 * no references and INPROG clear is a candidate: while info->count is
 * positive it is removed from the device tree and inserted into
 * info->tmptree for later freeing.
 *
 * NOTE(review): what is done with the computed act value, the update of
 * info->count and the return value are not visible in this excerpt.
 */
575 hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
577 struct hammer2_cleanupcb_info *info = arg;
580 if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
/* dio->act reduced by the elapsed time in ticks / hz, plus one */
584 act = dio->act - (ticks - dio->ticks) / hz - 1;
/* An idle DIO must not still own a buffer */
591 KKASSERT(dio->bp == NULL);
592 if (info->count > 0) {
/* Move the DIO across; a collision in tmptree would be a bug */
593 RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
594 xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
595 KKASSERT(xio == NULL);
/*
 * Free every DIO found in (tree). Each DIO is asserted to have no buffer,
 * no references and INPROG clear before it is released with kfree().
 * The global hammer2_dio_count and the per-device iofree_count are both
 * decremented once per freed DIO.
 */
603 hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
/* Repeatedly take the root until the tree is empty */
607 while ((dio = RB_ROOT(tree)) != NULL) {
608 RB_REMOVE(hammer2_io_tree, tree, dio);
609 KKASSERT(dio->bp == NULL &&
610 (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
/* Report a DIO whose DIRTY flag is still set at this point */
611 if (dio->refs & HAMMER2_DIO_DIRTY) {
612 kprintf("hammer2_io_cleanup: Dirty buffer "
613 "%016jx/%d (bp=%p)\n",
614 dio->pbase, dio->psize, dio->bp);
616 kfree(dio, M_HAMMER2);
617 atomic_add_int(&hammer2_dio_count, -1);
618 atomic_add_int(&hmp->iofree_count, -1);
623 * Returns a pointer to the requested data.
/*
 * Translate (lbase) into an address inside the buffer data. The radix bits
 * of lbase are masked off and the buffer base offset (b_loffset) is
 * subtracted; the result is asserted to lie within b_bufsize.
 *
 * NOTE(review): the assignment of bp is not visible here; presumably it is
 * dio->bp.
 */
626 hammer2_io_data(hammer2_io_t *dio, off_t lbase)
/* A buffer must be attached */
632 KKASSERT(bp != NULL);
634 off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
635 KKASSERT(off >= 0 && off < bp->b_bufsize);
636 return(bp->b_data + off);
/*
 * Wrapper around hammer2_io_getblk() using HAMMER2_DOP_NEW. The DIO is
 * stored in *diop and its error field is returned.
 */
640 hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
643 *diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEW);
644 return ((*diop)->error);
/*
 * Wrapper around hammer2_io_getblk() using HAMMER2_DOP_NEWNZ. The DIO is
 * stored in *diop and its error field is returned.
 */
648 hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
651 *diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEWNZ);
652 return ((*diop)->error);
/*
 * Wrapper around hammer2_io_getblk() using HAMMER2_DOP_READ. The DIO is
 * stored in *diop and its error field is returned.
 */
656 hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
659 *diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_READ);
660 return ((*diop)->error);
/*
 * Wrapper around hammer2_io_getblk() using HAMMER2_DOP_READQ with a btype
 * of 0. With READQ the DIO is looked up without being created.
 *
 * NOTE(review): the return statement is not visible in this excerpt.
 */
664 hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
668 dio = hammer2_io_getblk(hmp, 0, lbase, lsize, HAMMER2_DOP_READQ);
/*
 * Flag the DIO as DIRTY and drop the reference through
 * hammer2_io_putblk(). In the visible code this matches
 * hammer2_io_bdwrite() line for line.
 */
673 hammer2_io_bawrite(hammer2_io_t **diop)
675 atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
676 hammer2_io_putblk(diop);
/*
 * Flag the DIO as DIRTY and drop the reference through
 * hammer2_io_putblk(). In the visible code this matches
 * hammer2_io_bawrite() line for line.
 */
680 hammer2_io_bdwrite(hammer2_io_t **diop)
682 atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
683 hammer2_io_putblk(diop);
/*
 * Flag the DIO as DIRTY and drop the reference through
 * hammer2_io_putblk(). Always returns 0; no write error is reported back
 * to the caller from here.
 */
687 hammer2_io_bwrite(hammer2_io_t **diop)
689 atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
690 hammer2_io_putblk(diop);
691 return (0); /* XXX */
/* Atomically set the DIRTY flag in dio->refs; the reference is kept. */
695 hammer2_io_setdirty(hammer2_io_t *dio)
697 atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
701 * This routine is called when a MODIFIED chain is being DESTROYED,
702 * in an attempt to allow the related buffer cache buffer to be
703 * invalidated and discarded instead of flushing it to disk.
705 * At the moment this case is only really useful for file meta-data.
706 * File data is already handled via the logical buffer cache associated
707 * with the vnode, and will be discarded if it was never flushed to disk.
708 * File meta-data may include inodes, directory entries, and indirect blocks.
711 * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being
712 * invalidated might be smaller. Most of the meta-data structures above
713 * are in the 'smaller' category. For now, don't try to invalidate the
/*
 * Invalidation hook for the range (data_off, bytes) of a DIO.
 * NOTE(review): the body is not visible in this excerpt - confirm what,
 * if anything, it does.
 */
717 hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes)
/* Release the reference; forwards directly to hammer2_io_putblk(). */
723 hammer2_io_brelse(hammer2_io_t **diop)
725 hammer2_io_putblk(diop);
/* Release the reference; forwards directly to hammer2_io_putblk(). */
729 hammer2_io_bqrelse(hammer2_io_t **diop)
731 hammer2_io_putblk(diop);
735 * Set dedup validation bits in a DIO. We do not need the buffer cache
736 * buffer for this. This must be done concurrent with setting bits in
737 * the freemap so as to interlock with bulkfree's clearing of those bits.
/*
 * Record the data range of (bref) in the dedup bitmaps of its DIO: the
 * matching bits are cleared in dedup_valid and set in dedup_alloc.
 * The DIO is created if it does not exist (createit == 1) and the
 * reference taken here is dropped again before returning.
 */
740 hammer2_io_dedup_set(hammer2_dev_t *hmp, hammer2_blockref_t *bref)
747 dio = hammer2_io_alloc(hmp, bref->data_off, bref->type, 1, &isgood);
/* The radix bits of data_off give log2 of the logical size */
748 lsize = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
749 mask = hammer2_dedup_mask(dio, bref->data_off, lsize);
750 atomic_clear_64(&dio->dedup_valid, mask);
751 atomic_set_64(&dio->dedup_alloc, mask);
752 hammer2_io_putblk(&dio);
756 * Clear dedup validation bits in a DIO. This is typically done when
757 * a modified chain is destroyed or by the bulkfree code. No buffer
758 * is needed for this operation. If the DIO no longer exists it is
759 * equivalent to the bits not being set.
/*
 * Clear the dedup_alloc and dedup_valid bits covering (data_off, bytes).
 * The DIO is only looked up, never created (createit == 0), and the
 * reference taken by the lookup is dropped afterwards.
 */
762 hammer2_io_dedup_delete(hammer2_dev_t *hmp, uint8_t btype,
763 hammer2_off_t data_off, u_int bytes)
/*
 * Up-front guards on a zero base offset and on non-DATA block types.
 * NOTE(review): presumably both return early; the guarded statements are
 * not visible in this excerpt.
 */
769 if ((data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
771 if (btype != HAMMER2_BREF_TYPE_DATA)
773 dio = hammer2_io_alloc(hmp, data_off, btype, 0, &isgood);
/* The range must lie entirely inside the DIO, otherwise panic */
775 if (data_off < dio->pbase ||
776 (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes >
777 dio->pbase + dio->psize) {
778 panic("hammer2_dedup_delete: DATAOFF BAD "
779 "%016jx/%d %016jx\n",
780 data_off, bytes, dio->pbase);
782 mask = hammer2_dedup_mask(dio, data_off, bytes);
783 atomic_clear_64(&dio->dedup_alloc, mask);
784 atomic_clear_64(&dio->dedup_valid, mask);
785 hammer2_io_putblk(&dio);
790 * Assert that dedup allocation bits in a DIO are not set. This operation
791 * does not require a buffer. The DIO does not need to exist.
/*
 * Assert (KASSERT) that none of the dedup_alloc bits covering
 * (data_off, bytes) are set in the DIO, then drop the reference taken by
 * the lookup. The lookup uses HAMMER2_BREF_TYPE_DATA as the block type.
 *
 * NOTE(review): the remaining hammer2_io_alloc() arguments and any NULL
 * check on dio are not visible in this excerpt.
 */
794 hammer2_io_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes)
799 dio = hammer2_io_alloc(hmp, data_off, HAMMER2_BREF_TYPE_DATA,
802 KASSERT((dio->dedup_alloc &
803 hammer2_dedup_mask(dio, data_off, bytes)) == 0,
804 ("hammer2_dedup_assert: %016jx/%d %016jx/%016jx",
807 hammer2_dedup_mask(dio, data_off, bytes),
809 hammer2_io_putblk(&dio);
/*
 * Add dio->psize to the write statistics counter selected by block type:
 * file data, meta-data (dirent / inode), indirect blocks, freemap
 * (node / leaf), or the volume counter.
 *
 * NOTE(review): the switch expression and the statement guarded by the
 * B_DELWRI test are not visible in this excerpt. Presumably the switch is
 * on dio->btype, the volume counter is the default case, and a buffer
 * already marked B_DELWRI is skipped - confirm.
 */
815 dio_write_stats_update(hammer2_io_t *dio, struct buf *bp)
819 if (bp->b_flags & B_DELWRI)
825 case HAMMER2_BREF_TYPE_DATA:
826 counterp = &hammer2_iod_file_write;
828 case HAMMER2_BREF_TYPE_DIRENT:
829 case HAMMER2_BREF_TYPE_INODE:
830 counterp = &hammer2_iod_meta_write;
832 case HAMMER2_BREF_TYPE_INDIRECT:
833 counterp = &hammer2_iod_indr_write;
835 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
836 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
837 counterp = &hammer2_iod_fmap_write;
840 counterp = &hammer2_iod_volu_write;
/* Counters are kept in bytes of physical buffer size */
843 *counterp += dio->psize;
/*
 * Requires that the DIO currently has a buffer attached.
 * NOTE(review): the rest of the body is not visible in this excerpt.
 */
847 hammer2_io_bkvasync(hammer2_io_t *dio)
849 KKASSERT(dio->bp != NULL);
854 * Ref a dio that is already owned
/*
 * Atomically add one to dio->refs. No tree lookup or iofree_count
 * adjustment is done here, so the caller is expected to hold a reference
 * on the DIO already.
 */
857 hammer2_io_ref(hammer2_io_t *dio)
859 atomic_add_64(&dio->refs, 1);