/*
 * Copyright (c) 2013-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "hammer2.h"

#define HAMMER2_DOP_READ	1
#define HAMMER2_DOP_NEW		2
#define HAMMER2_DOP_NEWNZ	3
#define HAMMER2_DOP_READQ	4
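/*
 * Rough meaning of the DOP (device I/O operation) codes above, as they
 * are handled by hammer2_io_getblk() later in this file:
 *
 *	DOP_READ  - acquire the DIO and make sure its backing buffer
 *		    contains valid data (reading from the device if needed).
 *	DOP_NEW	  - acquire the DIO for newly allocated media; the covered
 *		    range is zeroed and the DIO is marked dirty.
 *	DOP_NEWNZ - like DOP_NEW but without the zero-fill.
 *	DOP_READQ - "quick" read; only returns a DIO that already exists
 *		    (allocation is not forced), otherwise NULL.
 */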
/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  Can be used as an OS-abstraction but the main
 * purpose is to allow larger buffers to be used against hammer2_chains
 * using smaller allocations, without causing deadlocks.
 *
 * The DIOs also record temporary state with limited persistence.  This
 * feature is used to keep track of dedupable blocks.
 */
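/*
 * Typical calling pattern (sketch only; the names match their use later
 * in this file, error handling omitted):
 *
 *	hammer2_io_t *dio;
 *	char *data;
 *
 *	error = hammer2_io_bread(hmp, btype, lbase, lsize, &dio);
 *	data = hammer2_io_data(dio, lbase);
 *	... read or modify the lsize bytes at data ...
 *	hammer2_io_bqrelse(&dio);	(or hammer2_io_bdwrite() if dirtied)
 */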
static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
static void dio_write_stats_update(hammer2_io_t *dio, struct buf *bp);

static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
	if (io1->pbase < io2->pbase)
		return(-1);
	if (io1->pbase > io2->pbase)
		return(1);
	return(0);
}

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
		off_t, pbase);
struct hammer2_cleanupcb_info {
	struct hammer2_io_tree tmptree;
	int	count;
};
static __inline
uint64_t
hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
	uint64_t mask;
	int i;

	if (bytes < 1024)	/* smaller chunks not supported */
		return 0;

	/*
	 * Calculate crc check mask for larger chunks
	 */
	i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
	     HAMMER2_PBUFMASK) >> 10;
	if (i == 0 && bytes == HAMMER2_PBUFSIZE)
		return ((uint64_t)-1);
	mask = ((uint64_t)1U << (bytes >> 10)) - 1;
	mask <<= i;

	return mask;
}
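/*
 * Informational note: each bit of the returned 64-bit mask appears to
 * represent one 1KB sub-block of the 64KB physical buffer backing the
 * DIO.  For example, 4096 bytes starting 16KB into the buffer gives
 * i = 16 and a mask of 0xF << 16, while a request covering the whole
 * physical buffer returns all one bits.
 */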
/*
 * Returns the DIO corresponding to the data|radix, creating it if necessary.
 *
 * If createit is 0, NULL can be returned to indicate that the DIO does not
 * exist.  (btype) is ignored when createit is 0.
 */
hammer2_io_t *
hammer2_io_alloc(hammer2_dev_t *hmp, hammer2_key_t data_off, uint8_t btype,
		 int createit, int *isgoodp)
{
	psize = HAMMER2_PBUFSIZE;
	pmask = ~(hammer2_off_t)(psize - 1);
	lsize = 1 << (int)(data_off & HAMMER2_OFF_MASK_RADIX);
	lbase = data_off & ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;
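	/*
	 * Worked example (informational): HAMMER2_OFF_MASK_RADIX covers the
	 * low bits of data_off, which encode the size radix.  A data_off
	 * whose radix bits are 10 and whose remaining bits give a media
	 * offset of 0x12345400 yields lsize = 1 << 10 = 1024, lbase =
	 * 0x12345400, and pbase = 0x12340000 (lbase rounded down to the
	 * containing 64KB physical buffer).
	 */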
	if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
		kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
			pbase, lbase, lsize, pmask);
	}
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 */
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio) {
		refs = atomic_fetchadd_64(&dio->refs, 1);
		if ((refs & HAMMER2_DIO_MASK) == 0) {
			atomic_add_int(&dio->hmp->iofree_count, -1);
		}
		if (refs & HAMMER2_DIO_GOOD)
			*isgoodp = 1;
		hammer2_spin_unsh(&hmp->io_spin);
	} else if (createit) {
		refs = 0;
		hammer2_spin_unsh(&hmp->io_spin);
		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
		dio->hmp = hmp;
		dio->pbase = pbase;
		dio->psize = psize;
		dio->btype = btype;
		dio->refs = refs + 1;
		hammer2_spin_ex(&hmp->io_spin);
		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
		if (xio == NULL) {
			atomic_add_int(&hammer2_dio_count, 1);
			hammer2_spin_unex(&hmp->io_spin);
		} else {
			refs = atomic_fetchadd_64(&xio->refs, 1);
			if ((refs & HAMMER2_DIO_MASK) == 0)
				atomic_add_int(&xio->hmp->iofree_count, -1);
			if (refs & HAMMER2_DIO_GOOD)
				*isgoodp = 1;
			hammer2_spin_unex(&hmp->io_spin);
			kfree(dio, M_HAMMER2);
			dio = xio;
		}
	} else {
		hammer2_spin_unsh(&hmp->io_spin);
	}
/*
 * Acquire the requested dio.  If DIO_GOOD is not set we must instantiate
 * a buffer.  If it is set, the buffer already exists and is good to go.
 */
hammer2_io_t *
hammer2_io_getblk(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize, int op)
{
	bflags = ((btype == HAMMER2_BREF_TYPE_DATA) ? B_NOTMETA : 0);

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
	if (op == HAMMER2_DOP_READQ) {
		dio = hammer2_io_alloc(hmp, lbase, btype, 0, &isgood);
		if (dio == NULL)
			return NULL;
		op = HAMMER2_DOP_READ;
	} else {
		dio = hammer2_io_alloc(hmp, lbase, btype, 1, &isgood);
	}
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		/*
		 * Buffer is already good, handle the op and return.
		 */
		if (orefs & HAMMER2_DIO_GOOD) {
			switch (op) {
			case HAMMER2_DOP_NEW:
				bzero(hammer2_io_data(dio, lbase), lsize);
				/* fall through */
			case HAMMER2_DOP_NEWNZ:
				atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
				break;
			case HAMMER2_DOP_READ:
			default:
				break;
			}
			return dio;
		}
		if (orefs & HAMMER2_DIO_INPROG) {
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
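			/*
			 * Informational: tsleep_interlock() before the
			 * atomic_cmpset_64() that publishes DIO_WAITING,
			 * followed by tsleep() with PINTERLOCKED, closes the
			 * window in which the thread finishing the I/O could
			 * clear INPROG and issue its wakeup between our flag
			 * check and our sleep, so the wakeup cannot be lost.
			 */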
		} else {
			nrefs = orefs | HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				break;
			}
		}
	}
	/*
	 * We break to here if GOOD is not set and we acquired INPROG for
	 * the I/O.
	 */
	KKASSERT(dio->bp == NULL);
	if (btype == HAMMER2_BREF_TYPE_DATA)
		hce = hammer2_cluster_data_read;
	else
		hce = hammer2_cluster_meta_read;
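	/*
	 * Informational: hammer2_cluster_data_read and
	 * hammer2_cluster_meta_read appear to be tunables selecting how
	 * much read-ahead clustering to perform for data vs meta-data
	 * buffers; a value of 0 falls back to the plain breadnx() path
	 * below.
	 */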
	if (dio->pbase == (lbase & ~HAMMER2_OFF_MASK_RADIX) &&
	    dio->psize == lsize) {
		switch (op) {
		case HAMMER2_DOP_NEW:
		case HAMMER2_DOP_NEWNZ:
			dio->bp = getblk(dio->hmp->devvp,
					 dio->pbase, dio->psize,
					 0, 0);
			if (op == HAMMER2_DOP_NEW) {
				bzero(dio->bp->b_data, dio->psize);
			}
			atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
			break;
		case HAMMER2_DOP_READ:
		default:
			if (hce > 0) {
				/*
				 * Synchronous cluster I/O for now.
				 */
				peof = (dio->pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				error = cluster_readx(dio->hmp->devvp,
						      peof, dio->pbase,
						      dio->psize, bflags,
						      dio->psize,
						      HAMMER2_PBUFSIZE*hce,
						      &dio->bp);
			} else {
				error = breadnx(dio->hmp->devvp, dio->pbase,
						dio->psize, bflags,
						NULL, NULL, 0, &dio->bp);
			}
		}
	} else {
		if (hce > 0) {
			/*
			 * Synchronous cluster I/O for now.
			 */
			peof = (dio->pbase + HAMMER2_SEGMASK64) &
			       ~HAMMER2_SEGMASK64;
			error = cluster_readx(dio->hmp->devvp,
					      peof, dio->pbase, dio->psize,
					      bflags,
					      dio->psize, HAMMER2_PBUFSIZE*hce,
					      &dio->bp);
		} else {
			error = breadnx(dio->hmp->devvp, dio->pbase,
					dio->psize, bflags,
					NULL, NULL, 0, &dio->bp);
		}
	}
	if (dio->bp) {
		switch (op) {
		case HAMMER2_DOP_NEW:
			bzero(hammer2_io_data(dio, lbase), lsize);
			/* fall through */
		case HAMMER2_DOP_NEWNZ:
			atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
			break;
		case HAMMER2_DOP_READ:
			break;
		}

		/*
		 * Tell the kernel that the buffer cache is not
		 * meta-data based on the btype.  This allows
		 * swapcache to distinguish between data and
		 * meta-data.
		 */
		switch (btype) {
		case HAMMER2_BREF_TYPE_DATA:
			dio->bp->b_flags |= B_NOTMETA;
			break;
		}

		BUF_KERNPROC(dio->bp);
		dio->bp->b_flags &= ~B_AGE;
	}
	/*
	 * Clear INPROG and WAITING, set GOOD, and wake up anyone waiting.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_WAITING);
		if (error == 0)
			nrefs |= HAMMER2_DIO_GOOD;
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
	}

	/* XXX error handling */

	return dio;
}
/*
 * Release our ref on *diop.
 *
 * On the 1->0 transition we clear DIO_GOOD, set DIO_INPROG, and dispose
 * of dio->bp.  Then we clean up DIO_INPROG and DIO_WAITING.
 */
void
hammer2_io_putblk(hammer2_io_t **diop)
{
	dio = *diop;
	*diop = NULL;

	KKASSERT((dio->refs & HAMMER2_DIO_MASK) != 0);
	/*
	 * On the 1->0 transition clear GOOD and set INPROG, and break.
	 * On any other transition we can return early.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs - 1;

		if ((orefs & HAMMER2_DIO_MASK) == 1 &&
		    (orefs & HAMMER2_DIO_INPROG) == 0) {
			/*
			 * Lastdrop case, INPROG not set; we can set it
			 * ourselves.
			 */
			nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
			nrefs |= HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				break;
		} else if ((orefs & HAMMER2_DIO_MASK) == 1) {
			/*
			 * Lastdrop case, INPROG already set.  We must
			 * wait for INPROG to clear.
			 */
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
			/* retry */
		} else {
			/*
			 * Normal drop
			 */
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				return;
			/* retry */
		}
	}
	/*
	 * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
	 * have been cleared.  iofree_count has not yet been incremented;
	 * note that another accessor racing in will decrement iofree_count,
	 * so we have to increment it regardless.
	 *
	 * We can now dispose of the buffer, and should do it before calling
	 * io_complete() in case there's a race against a new reference
	 * which causes io_complete() to chain and instantiate the bp again.
	 */
	pbase = dio->pbase;
	psize = dio->psize;
	bp = dio->bp;
	dio->bp = NULL;

	if ((orefs & HAMMER2_DIO_GOOD) && bp) {
		/*
		 * Non-errored disposal of bp
		 */
		if (orefs & HAMMER2_DIO_DIRTY) {
			dio_write_stats_update(dio, bp);

			/*
			 * Allows dirty buffers to accumulate and
			 * possibly be canceled (e.g. by a 'rm'),
			 * will burst-write later.  Allow the kernel
			 * to cluster the dirty buffers.
			 *
			 * NOTE: Do not use cluster_write() here.  The
			 *	 problem is that due to the way chains
			 *	 are locked, buffers are cycled in and out
			 *	 quite often so the disposal here is not
			 *	 necessarily the final disposal.  Avoid
			 *	 excessive rewriting of the same blocks
			 *	 by using bdwrite().
			 */
			if ((hce = hammer2_cluster_write) > 0) {
				/*
				 * Allows write-behind to keep the buffer
				 * cache sane.
				 */
				peof = (pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(bp, peof, psize, hce);
			} else {
				bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	} else if (bp) {
		/*
		 * Errored disposal of bp
		 */
		brelse(bp);
	}
	/*
	 * Update iofree_count before disposing of the dio
	 */
	hmp = dio->hmp;
	atomic_add_int(&hmp->iofree_count, 1);

	/*
	 * Clear INPROG, GOOD, and WAITING
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_GOOD |
				  HAMMER2_DIO_WAITING);
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
	}
	/*
	 * We cache free buffers so re-use cases can use a shared lock, but
	 * if too many build up we have to clean them out.
	 */
	dio_limit = hammer2_dio_limit;
	if (dio_limit < 256)
		dio_limit = 256;
	if (dio_limit > 1024*1024)
		dio_limit = 1024*1024;
	if (hmp->iofree_count > dio_limit) {
		struct hammer2_cleanupcb_info info;

		RB_INIT(&info.tmptree);
		hammer2_spin_ex(&hmp->io_spin);
		if (hmp->iofree_count > dio_limit) {
			info.count = hmp->iofree_count / 5;
			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
				hammer2_io_cleanup_callback, &info);
		}
		hammer2_spin_unex(&hmp->io_spin);
		hammer2_io_cleanup(hmp, &info.tmptree);
	}
}
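/*
 * Informational note on the reclamation above: candidate DIOs are moved
 * into info.tmptree by hammer2_io_cleanup_callback() while io_spin is
 * held exclusively (roughly a fifth of the cached-free DIOs, per
 * info.count), and the actual teardown is then done by
 * hammer2_io_cleanup() after the spinlock has been released.
 */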
/*
 * Clean up any DIOs with (INPROG | refs) == 0.
 *
 * Called to clean up cached DIOs on umount after all activity has been
 * flushed.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
	struct hammer2_cleanupcb_info *info = arg;
	hammer2_io_t *xio;

	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
		if (dio->act > 0) {
			int act;

			act = dio->act - (ticks - dio->ticks) / hz - 1;
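			/*
			 * Informational: dio->act appears to be a small
			 * activity counter and dio->ticks a last-use
			 * timestamp; the expression above decays the counter
			 * by roughly one per elapsed second so that recently
			 * used DIOs survive this reclamation pass.
			 */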
			if (act > 0) {
				dio->act = act;
				return 0;
			}
			dio->act = 0;
		}
		KKASSERT(dio->bp == NULL);
		if (info->count > 0) {
			RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
			xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
			KKASSERT(xio == NULL);
			--info->count;
		}
	}
	return 0;
}
void
hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
{
	hammer2_io_t *dio;

	while ((dio = RB_ROOT(tree)) != NULL) {
		RB_REMOVE(hammer2_io_tree, tree, dio);
		KKASSERT(dio->bp == NULL &&
		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
		if (dio->refs & HAMMER2_DIO_DIRTY) {
			kprintf("hammer2_io_cleanup: Dirty buffer "
				"%016jx/%d (bp=%p)\n",
				dio->pbase, dio->psize, dio->bp);
		}
		kfree(dio, M_HAMMER2);
		atomic_add_int(&hammer2_dio_count, -1);
		atomic_add_int(&hmp->iofree_count, -1);
	}
}
/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
	struct buf *bp;
	intptr_t off;

	bp = dio->bp;
	KKASSERT(bp != NULL);
	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
	KKASSERT(off >= 0 && off < bp->b_bufsize);
	return(bp->b_data + off);
}
int
hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEW);
	return ((*diop)->error);
}

int
hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEWNZ);
	return ((*diop)->error);
}

int
hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_READ);
	return ((*diop)->error);
}

hammer2_io_t *
hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
{
	hammer2_io_t *dio;

	dio = hammer2_io_getblk(hmp, 0, lbase, lsize, HAMMER2_DOP_READQ);
	return dio;
}

void
hammer2_io_bawrite(hammer2_io_t **diop)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

void
hammer2_io_bdwrite(hammer2_io_t **diop)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

int
hammer2_io_bwrite(hammer2_io_t **diop)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
	return (0);	/* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
	atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
}
/*
 * This routine is called when a MODIFIED chain is being DESTROYED,
 * in an attempt to allow the related buffer cache buffer to be
 * invalidated and discarded instead of flushing it to disk.
 *
 * At the moment this case is only really useful for file meta-data.
 * File data is already handled via the logical buffer cache associated
 * with the vnode, and will be discarded if it was never flushed to disk.
 * File meta-data may include inodes, directory entries, and indirect blocks.
 *
 * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being
 * invalidated might be smaller.  Most of the meta-data structures above
 * are in the 'smaller' category.  For now, don't try to invalidate the
 * buffer cache buffer.
 */
void
hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes)
void
hammer2_io_brelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}

void
hammer2_io_bqrelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}
/*
 * Set dedup validation bits in a DIO.  We do not need the buffer cache
 * buffer for this.  This must be done concurrently with setting bits in
 * the freemap so as to interlock with bulkfree's clearing of those bits.
 */
void
hammer2_io_dedup_set(hammer2_dev_t *hmp, hammer2_blockref_t *bref)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int lsize;
	int isgood;

	dio = hammer2_io_alloc(hmp, bref->data_off, bref->type, 1, &isgood);
	lsize = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	mask = hammer2_dedup_mask(dio, bref->data_off, lsize);
	atomic_clear_64(&dio->dedup_valid, mask);
	atomic_set_64(&dio->dedup_alloc, mask);
	hammer2_io_putblk(&dio);
}
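/*
 * Informational note on the two bitmaps: dedup_alloc appears to track
 * which 1KB sub-blocks of the DIO have been allocated with dedup in
 * mind, while dedup_valid tracks which of those currently contain
 * verified, dedupable data.  hammer2_io_dedup_set() therefore marks the
 * range allocated but not yet valid; the delete and assert routines
 * below clear or check the same mask.
 */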
/*
 * Clear dedup validation bits in a DIO.  This is typically done when
 * a modified chain is destroyed or by the bulkfree code.  No buffer
 * is needed for this operation.  If the DIO no longer exists it is
 * equivalent to the bits not being set.
 */
void
hammer2_io_dedup_delete(hammer2_dev_t *hmp, uint8_t btype,
			hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int isgood;

	if ((data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;
	if (btype != HAMMER2_BREF_TYPE_DATA)
		return;
	dio = hammer2_io_alloc(hmp, data_off, btype, 0, &isgood);
	if (dio) {
		if (data_off < dio->pbase ||
		    (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes >
		    dio->pbase + dio->psize) {
			panic("hammer2_dedup_delete: DATAOFF BAD "
			      "%016jx/%d %016jx\n",
			      data_off, bytes, dio->pbase);
		}
		mask = hammer2_dedup_mask(dio, data_off, bytes);
		atomic_clear_64(&dio->dedup_alloc, mask);
		atomic_clear_64(&dio->dedup_valid, mask);
		hammer2_io_putblk(&dio);
	}
}
/*
 * Assert that dedup allocation bits in a DIO are not set.  This operation
 * does not require a buffer.  The DIO does not need to exist.
 */
void
hammer2_io_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	int isgood;

	dio = hammer2_io_alloc(hmp, data_off, HAMMER2_BREF_TYPE_DATA,
			       0, &isgood);
	if (dio) {
		KASSERT((dio->dedup_alloc &
			 hammer2_dedup_mask(dio, data_off, bytes)) == 0,
			("hammer2_dedup_assert: %016jx/%d %016jx/%016jx",
			 data_off, bytes,
			 hammer2_dedup_mask(dio, data_off, bytes),
			 dio->dedup_alloc));
		hammer2_io_putblk(&dio);
	}
}
static
void
dio_write_stats_update(hammer2_io_t *dio, struct buf *bp)
{
	long *counterp;

	if (bp->b_flags & B_DELWRI)
		return;

	switch(dio->btype) {
	case HAMMER2_BREF_TYPE_DATA:
		counterp = &hammer2_iod_file_write;
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		counterp = &hammer2_iod_meta_write;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		counterp = &hammer2_iod_indr_write;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		counterp = &hammer2_iod_fmap_write;
		break;
	default:
		counterp = &hammer2_iod_volu_write;
		break;
	}
	*counterp += dio->psize;
}
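/*
 * Informational: the hammer2_iod_*_write counters above tally bytes
 * written per block type (file data, meta-data, indirect blocks, the
 * freemap, and everything else such as the volume header); they are
 * presumably the write statistics hammer2 exports through its
 * vfs.hammer2 sysctl tree.
 */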
void
hammer2_io_bkvasync(hammer2_io_t *dio)
{
	KKASSERT(dio->bp != NULL);
	bkvasync(dio->bp);
}