From 125966e80c1aba734d3d5f12a8fcfde2bbcdb018 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sun, 30 Aug 2015 14:17:35 -0700
Subject: [PATCH] hammer2 - Refactor bulkfree

* Change the bulkfree scan from breadth-first to depth-first.  This
  improves disk performance significantly (~2x) and is also needed for
  the duplicate-detection feature.

* Create an 8-way set-associative hash table similar to what the live
  dedup code uses.  Record the data_off for elements we have processed
  and detect if a duplicate is encountered so we do not have to
  re-process the duplicate subtree.  Also prioritize the table based on
  the aggregate bottom-up inode count to reduce the chance that a
  top-level duplicate (aka snapshot) will get kicked out of the hash
  table.

* Clean up the hammer2_chain_scan() API, making it more bref-centric,
  which allows us to avoid instantiating chain structures for leaf
  entities.  This significantly improves performance and increases
  flexibility.

* Manual page adjustments for kern.maxvnodes settings suggestions.
---
 sbin/hammer/hammer.8               |  18 +-
 sbin/hammer2/hammer2.8             |  21 ++
 sys/vfs/hammer2/hammer2.h          |  13 +-
 sys/vfs/hammer2/hammer2_bulkfree.c | 338 +++++++++++++++++++----------
 sys/vfs/hammer2/hammer2_chain.c    | 173 ++++++++-------
 sys/vfs/hammer2/hammer2_flush.c    |   4 +-
 sys/vfs/hammer2/hammer2_strategy.c |  20 ++
 sys/vfs/hammer2/hammer2_vfsops.c   |  38 +++-
 8 files changed, 401 insertions(+), 224 deletions(-)

diff --git a/sbin/hammer/hammer.8 b/sbin/hammer/hammer.8
index 0af63f3879..d7351e6755 100644
--- a/sbin/hammer/hammer.8
+++ b/sbin/hammer/hammer.8
@@ -1590,14 +1590,16 @@ The swapcache will save the cached VM pages related to
 block device (which doesn't recycle unless you umount the filesystem)
 instead of the cached VM pages backing the file vnodes.
-.\".Pp
-.\"Double buffering should also be turned on if live dedup is enabled via
-.\"Va vfs.hammer.live_dedup .
-.\"This is because the live dedup must validate the contents of a potential
-.\"duplicate file block and it must run through the block device to do that
-.\"and not the file vnode.
-.\"If double buffering is not enabled then live dedup will create extra disk
-.\"reads to validate potential data duplicates.
+.Pp
+Double buffering is normally desirable when working with large filesystems,
+particularly when swapcache is used.
+The swapcache can only back active VM objects, including the block device,
+and large filesystems often have far more inodes than the kernel can support.
+In addition, when using this mode, you may wish to reduce the
+.Va kern.maxvnodes
+setting to force the system to do less caching of logical file buffers
+and more caching of device buffers, since the device buffers are backing
+the logical file buffers.
 .Sh UPGRADE INSTRUCTIONS HAMMER V1 TO V2
 This upgrade changes the way directory entries are stored.
 It is possible to upgrade a V1 file system to V2 in place, but
diff --git a/sbin/hammer2/hammer2.8 b/sbin/hammer2/hammer2.8
index b7dd0ef17b..bca2450b36 100644
--- a/sbin/hammer2/hammer2.8
+++ b/sbin/hammer2/hammer2.8
@@ -409,6 +409,27 @@ network connectivity is lost.
 TODO.
 .Sh RESTORING FROM A SNAPSHOT BACKUP
 TODO.
+.Sh PERFORMANCE TUNING
+Because HAMMER2 implements compression, decompression, and dedup natively,
+it always double-buffers file data.
+This means that the file data is cached via the device vnode (in
+compressed / dedupped form) and the same data is also cached by the file
+vnode (in decompressed / non-dedupped form).
+.Pp
+While HAMMER2 will try to age the logical file buffers on its own, some
+additional performance tuning may be necessary for optimal operation,
+whether swapcache is used or not.
+Our recommendation is to reduce the number of vnodes (and thus also the
+logical buffer cache behind the vnodes) that the system caches, via the
+.Va kern.maxvnodes
+sysctl.
+.Pp
+Too large a value will result in excessive double-caching and can cause
+unnecessary disk read I/O.
+We recommend a number between 25000 and 250000 vnodes, depending on your
+use case.
+Keep in mind that even though the vnode cache is smaller, this will make
+room for a great deal more device-level buffer caching, which can encompass
+far more data and meta-data than the vnode-level caching.
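+.Pp
+As a minimal illustration (the value shown is hypothetical, not a tuned
+recommendation for any particular machine), the current limit can be
+inspected and lowered at run-time with
+.Xr sysctl 8 :
+.Bd -literal -offset indent
+sysctl kern.maxvnodes
+sysctl kern.maxvnodes=100000
+.Ed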
 .Sh ENVIRONMENT
 TODO.
 .Sh FILES
diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h
index c788557b10..a700df3db0 100644
--- a/sys/vfs/hammer2/hammer2.h
+++ b/sys/vfs/hammer2/hammer2.h
@@ -377,7 +377,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 #define HAMMER2_CHAIN_INITIAL		0x00000020	/* initial create */
 #define HAMMER2_CHAIN_UPDATE		0x00000040	/* need parent update */
 #define HAMMER2_CHAIN_DEFERRED		0x00000080	/* flush depth defer */
-#define HAMMER2_CHAIN_IOFLUSH		0x00000100	/* bawrite on put */
+#define HAMMER2_CHAIN_UNUSED000001000	0x00000100
 #define HAMMER2_CHAIN_ONFLUSH		0x00000200	/* on a flush list */
 #define HAMMER2_CHAIN_FICTITIOUS	0x00000400	/* unsuitable for I/O */
 #define HAMMER2_CHAIN_VOLUMESYNC	0x00000800	/* needs volume sync */
@@ -1390,9 +1390,10 @@ hammer2_chain_t *hammer2_chain_next(hammer2_chain_t **parentp,
 				hammer2_key_t *key_nextp,
 				hammer2_key_t key_beg, hammer2_key_t key_end,
 				int *cache_indexp, int flags);
-hammer2_chain_t *hammer2_chain_scan(hammer2_chain_t *parent,
-				hammer2_chain_t *chain,
-				int *cache_indexp, int flags);
+hammer2_blockref_t *hammer2_chain_scan(hammer2_chain_t *parent,
+				hammer2_chain_t **chainp,
+				hammer2_blockref_t *bref,
+				int *firstp, int *cache_indexp, int flags);
 int hammer2_chain_create(hammer2_chain_t **parentp,
 				hammer2_chain_t **chainp,
 				hammer2_pfs_t *pmp,
@@ -1587,9 +1588,6 @@ void hammer2_cluster_forcegood(hammer2_cluster_t *cluster);
 hammer2_cluster_t *hammer2_cluster_copy(hammer2_cluster_t *ocluster);
 void hammer2_cluster_unlock(hammer2_cluster_t *cluster);
 
-int hammer2_bulk_scan(hammer2_chain_t *parent,
-			int (*func)(hammer2_chain_t *chain, void *info),
-			void *info);
 int hammer2_bulkfree_pass(hammer2_dev_t *hmp,
 			struct hammer2_ioc_bulkfree *bfi);
 
@@ -1607,6 +1605,7 @@ int hammer2_vop_strategy(struct vop_strategy_args *ap);
 int hammer2_vop_bmap(struct vop_bmap_args *ap);
 void hammer2_write_thread(void *arg);
 void hammer2_bioq_sync(hammer2_pfs_t *pmp);
+void hammer2_dedup_clear(hammer2_dev_t *hmp);
 
 #endif /* !_KERNEL */
 #endif /* !_VFS_HAMMER2_HAMMER2_H_ */
diff --git a/sys/vfs/hammer2/hammer2_bulkfree.c b/sys/vfs/hammer2/hammer2_bulkfree.c
index 8198f2a8ff..2b1cb11e1a 100644
--- a/sys/vfs/hammer2/hammer2_bulkfree.c
+++ b/sys/vfs/hammer2/hammer2_bulkfree.c
@@ -51,112 +51,158 @@
  */
 typedef struct hammer2_chain_save {
 	TAILQ_ENTRY(hammer2_chain_save)	entry;
-	hammer2_chain_t	*parent;
+	hammer2_chain_t	*chain;
+	int	pri;
 } hammer2_chain_save_t;
 
 TAILQ_HEAD(hammer2_chain_save_list, hammer2_chain_save);
 typedef struct hammer2_chain_save_list hammer2_chain_save_list_t;
 
+typedef struct hammer2_bulkfree_info {
+	hammer2_dev_t		*hmp;
+	kmem_anon_desc_t	kp;
+	hammer2_off_t		sbase;		/* sub-loop iteration */
+	hammer2_off_t		sstop;
+	hammer2_bmap_data_t	*bmap;
+	int			depth;
+	long			count_10_00;
+	long			count_11_10;
+	long			count_10_11;
+	long			count_l0cleans;
+	long			count_linadjusts;
+	long			count_inodes_scanned;
+	long			count_dedup_factor;
+	long			bytes_scanned;
+	hammer2_off_t		adj_free;
+	hammer2_tid_t		mtid;
+	hammer2_tid_t		saved_mirror_tid;
+	time_t			save_time;
+	hammer2_chain_save_list_t list;
+	hammer2_dedup_t		*dedup;
+	int			pri;
+} hammer2_bulkfree_info_t;
+
+static int h2_bulkfree_test(hammer2_bulkfree_info_t *info,
+			hammer2_blockref_t *bref, int pri);
+
 /*
  * General bulk scan function with callback.  Called with a referenced
- * but UNLOCKED parent.  The original parent is returned in the same state.
+ * but UNLOCKED parent.  The parent is returned in the same state.
  */
+static int
 hammer2_bulk_scan(hammer2_chain_t *parent,
-		  int (*func)(hammer2_chain_t *chain, void *info),
-		  void *info)
+		  int (*func)(hammer2_bulkfree_info_t *info,
+			      hammer2_blockref_t *bref),
+		  hammer2_bulkfree_info_t *info)
 {
-	hammer2_chain_save_list_t list;
-	hammer2_chain_save_t *save;
+	hammer2_blockref_t bref;
+	hammer2_chain_t *chain;
+	int cache_index = -1;
 	int doabort = 0;
+	int first = 1;
+
+	++info->pri;
 
-	TAILQ_INIT(&list);
-	hammer2_chain_ref(parent);
-	save = kmalloc(sizeof(*save), M_HAMMER2, M_WAITOK | M_ZERO);
-	save->parent = parent;
-	TAILQ_INSERT_TAIL(&list, save, entry);
+	hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
+				   HAMMER2_RESOLVE_SHARED);
+	chain = NULL;
 
-	while ((save = TAILQ_FIRST(&list)) != NULL && doabort == 0) {
-		hammer2_chain_t *chain;
-		int cache_index;
+	/*
+	 * Generally loop on the contents if we have not been flagged
+	 * for abort.
+	 *
+	 * Remember that these chains are completely isolated from
+	 * the frontend, so we can release locks temporarily without
+	 * imploding.
+	 */
+	while ((doabort & HAMMER2_BULK_ABORT) == 0 &&
+	       hammer2_chain_scan(parent, &chain, &bref, &first,
+				  &cache_index,
+				  HAMMER2_LOOKUP_NODATA |
+				  HAMMER2_LOOKUP_SHARED) != NULL) {
+		/*
+		 * Process bref, chain is only non-NULL if the bref
+		 * might be recursable (it's possible that we sometimes
+		 * get a non-NULL chain where the bref cannot be
+		 * recursed).
+		 */
+#if 0
+		kprintf("SCAN %016jx\n", bref.data_off);
+		int xerr = tsleep(&info->pri, PCATCH, "slp", hz / 10);
+		if (xerr == EINTR || xerr == ERESTART) {
+			doabort |= HAMMER2_BULK_ABORT;
+		}
+#endif
+		++info->pri;
+		if (h2_bulkfree_test(info, &bref, 1))
+			continue;
 
-		TAILQ_REMOVE(&list, save, entry);
+		doabort |= func(info, &bref);
 
-		parent = save->parent;
-		save->parent = NULL;
-		chain = NULL;
-		cache_index = -1;
+		if (doabort & HAMMER2_BULK_ABORT)
+			break;
 
 		/*
-		 * lock the parent, the lock eats the ref.
+		 * A non-NULL chain is always returned when the bref is
+		 * recursive; a non-NULL chain may also (rarely) be
+		 * returned when it is not.
 		 */
-		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
-					   HAMMER2_RESOLVE_SHARED);
+		if (chain == NULL)
+			continue;
 
 		/*
-		 * Generally loop on the contents if we have not been flagged
-		 * for abort.
+		 * Else check the type and set up the depth-first scan.
+		 *
+		 * Account for bytes actually read.
 		 */
-		while ((doabort & HAMMER2_BULK_ABORT) == 0) {
-			chain = hammer2_chain_scan(parent, chain, &cache_index,
-						   HAMMER2_LOOKUP_NODATA |
-						   HAMMER2_LOOKUP_SHARED);
-			if (chain == NULL)
-				break;
-			doabort |= func(chain, info);
-
-			if (doabort & HAMMER2_BULK_ABORT) {
-				hammer2_chain_unlock(chain);
-				hammer2_chain_drop(chain);
-				chain = NULL;
-				break;
-			}
-			switch(chain->bref.type) {
-			case HAMMER2_BREF_TYPE_INODE:
-			case HAMMER2_BREF_TYPE_FREEMAP_NODE:
-			case HAMMER2_BREF_TYPE_INDIRECT:
-			case HAMMER2_BREF_TYPE_VOLUME:
-			case HAMMER2_BREF_TYPE_FREEMAP:
-				/*
-				 * Breadth-first scan.  Chain is referenced
-				 * to save for later and will be unlocked on
-				 * our loop (so it isn't left locked while on
-				 * the list).
-				 */
-				if (save == NULL) {
-					save = kmalloc(sizeof(*save),
-						       M_HAMMER2,
-						       M_WAITOK | M_ZERO);
-				}
+		info->bytes_scanned += chain->bytes;
+
+		switch(chain->bref.type) {
+		case HAMMER2_BREF_TYPE_INODE:
+		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
+		case HAMMER2_BREF_TYPE_INDIRECT:
+		case HAMMER2_BREF_TYPE_VOLUME:
+		case HAMMER2_BREF_TYPE_FREEMAP:
+			++info->depth;
+			if (info->depth > 16) {
+				hammer2_chain_save_t *save;
+				save = kmalloc(sizeof(*save), M_HAMMER2,
+					       M_WAITOK | M_ZERO);
+				save->chain = chain;
 				hammer2_chain_ref(chain);
-				save->parent = chain;
-				TAILQ_INSERT_TAIL(&list, save, entry);
-				save = NULL;
-				break;
-			default:
-				/* does not recurse */
-				break;
+				TAILQ_INSERT_TAIL(&info->list, save, entry);
+
+				/* guess */
+				info->pri += 10;
+			} else {
+				int savepri = info->pri;
+
+				hammer2_chain_unlock(chain);
+				info->pri = 0;
+				doabort |= hammer2_bulk_scan(chain, func, info);
+				info->pri += savepri;
+				hammer2_chain_lock(chain,
+						   HAMMER2_RESOLVE_ALWAYS |
+						   HAMMER2_RESOLVE_SHARED);
 			}
+			--info->depth;
+			break;
+		default:
+			/* does not recurse */
+			break;
 		}
-
-		/*
-		 * Releases the lock and the ref the lock inherited.  Free
-		 * save structure if we didn't recycle it above.
-		 */
-		hammer2_chain_unlock(parent);
-		hammer2_chain_drop(parent);
-		if (save)
-			kfree(save, M_HAMMER2);
+	}
+	if (chain) {
+		hammer2_chain_unlock(chain);
+		hammer2_chain_drop(chain);
 	}
 
 	/*
-	 * Cleanup anything left undone due to an abort
+	 * Save with higher pri now that we know what it is.
 	 */
-	while ((save = TAILQ_FIRST(&list)) != NULL) {
-		TAILQ_REMOVE(&list, save, entry);
-		hammer2_chain_drop(save->parent);
-		kfree(save, M_HAMMER2);
-	}
+	h2_bulkfree_test(info, &parent->bref, info->pri + 1);
+
+	hammer2_chain_unlock(parent);
 
 	return doabort;
 }
@@ -193,24 +239,8 @@ hammer2_bulk_scan(hammer2_chain_t *parent,
 /*
  * Bulkfree callback info
  */
-typedef struct hammer2_bulkfree_info {
-	hammer2_dev_t		*hmp;
-	kmem_anon_desc_t	kp;
-	hammer2_off_t		sbase;		/* sub-loop iteration */
-	hammer2_off_t		sstop;
-	hammer2_bmap_data_t	*bmap;
-	long			count_10_00;
-	long			count_11_10;
-	long			count_10_11;
-	long			count_l0cleans;
-	long			count_linadjusts;
-	hammer2_off_t		adj_free;
-	hammer2_tid_t		mtid;
-	hammer2_tid_t		saved_mirror_tid;
-	time_t			save_time;
-} hammer2_bulkfree_info_t;
-
-static int h2_bulkfree_callback(hammer2_chain_t *chain, void *info);
+static int h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo,
+			hammer2_blockref_t *bref);
 static void h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo);
 static void h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
 			hammer2_bmap_data_t *live, hammer2_bmap_data_t *bmap,
@@ -221,6 +251,7 @@ hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
 {
 	hammer2_bulkfree_info_t cbinfo;
 	hammer2_chain_t *vchain;
+	hammer2_chain_save_t *save;
 	hammer2_off_t incr;
 	size_t size;
 	int doabort = 0;
@@ -232,6 +263,15 @@ hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
 	 */
 	lockmgr(&hmp->bulklk, LK_EXCLUSIVE);
 
+	/*
+	 * We have to clear the live dedup cache as it might have entries
+	 * that are freeable as of now.  Any new entries in the dedup cache
+	 * made after this point, even if they become freeable, will have
+	 * previously been fully allocated and will be protected by the
+	 * 2-stage bulkfree.
+	 */
+	hammer2_dedup_clear(hmp);
+
 #if 0
 	/*
 	 * XXX This has been removed.  Instead of trying to flush, which
@@ -260,6 +300,9 @@ hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
 	cbinfo.bmap = kmem_alloc_swapbacked(&cbinfo.kp, size);
 	cbinfo.saved_mirror_tid = hmp->voldata.mirror_tid;
 
+	cbinfo.dedup = kmalloc(sizeof(*cbinfo.dedup) * HAMMER2_DEDUP_HEUR_SIZE,
+			       M_HAMMER2, M_WAITOK | M_ZERO);
+
 	/*
 	 * Normalize start point to a 2GB boundary.  We operate on a
 	 * 64KB leaf bitmap boundary which represents 2GB of storage.
@@ -278,6 +321,7 @@ hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
 	 * storage allocated by the frontend.
 	 */
 	vchain = hammer2_chain_bulksnap(&hmp->vchain);
+	TAILQ_INIT(&cbinfo.list);
 
 	/*
 	 * Loop on a full meta-data scan as many times as required to
@@ -302,9 +346,27 @@ hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
 		hammer2_trans_init(hmp->spmp, 0);
 		cbinfo.mtid = hammer2_trans_sub(hmp->spmp);
-
+		cbinfo.pri = 0;
 		doabort |= hammer2_bulk_scan(vchain, h2_bulkfree_callback,
 					     &cbinfo);
+
+		while ((save = TAILQ_FIRST(&cbinfo.list)) != NULL &&
+		       doabort == 0) {
+			TAILQ_REMOVE(&cbinfo.list, save, entry);
+			cbinfo.pri = 0;
+			doabort |= hammer2_bulk_scan(save->chain,
+						     h2_bulkfree_callback,
+						     &cbinfo);
+			hammer2_chain_drop(save->chain);
+			kfree(save, M_HAMMER2);
+		}
+		while (save) {
+			TAILQ_REMOVE(&cbinfo.list, save, entry);
+			hammer2_chain_drop(save->chain);
+			kfree(save, M_HAMMER2);
+			save = TAILQ_FIRST(&cbinfo.list);
+		}
+
 		kprintf("bulkfree lastdrop %d %d\n",
 			vchain->refs, vchain->core.chain_count);
@@ -333,6 +395,8 @@ hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
 	}
 	hammer2_chain_bulkdrop(vchain);
 	kmem_free_swapbacked(&cbinfo.kp);
+	kfree(cbinfo.dedup, M_HAMMER2);
+	cbinfo.dedup = NULL;
 
 	bfi->sstop = cbinfo.sbase;
@@ -349,6 +413,7 @@ hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
 	kprintf("    raced on           %ld\n", cbinfo.count_10_11);
 	kprintf("    ~2MB segs cleaned  %ld\n", cbinfo.count_l0cleans);
 	kprintf("    linear adjusts     %ld\n", cbinfo.count_linadjusts);
+	kprintf("    dedup factor       %ld\n", cbinfo.count_dedup_factor);
 
 	lockmgr(&hmp->bulklk, LK_RELEASE);
 
@@ -356,9 +421,8 @@ hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
 
 static int
-h2_bulkfree_callback(hammer2_chain_t *chain, void *info)
+h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref)
 {
-	hammer2_bulkfree_info_t *cbinfo = info;
 	hammer2_bmap_data_t *bmap;
 	hammer2_off_t data_off;
 	uint16_t class;
@@ -371,26 +435,26 @@ h2_bulkfree_callback(hammer2_chain_t *chain, void *info)
 	 */
 	if (hammer2_signal_check(&cbinfo->save_time))
 		return HAMMER2_BULK_ABORT;
+	if (bref->type == HAMMER2_BREF_TYPE_INODE) {
+		++cbinfo->count_inodes_scanned;
+		if ((cbinfo->count_inodes_scanned & 1023) == 0)
+			kprintf(" inodes %6ld bytes %9ld\n",
+				cbinfo->count_inodes_scanned,
+				cbinfo->bytes_scanned);
+	}
 
-#if 0
-	kprintf("scan chain %016jx %016jx/%-2d type=%02x\n",
-		(intmax_t)chain->bref.data_off,
-		(intmax_t)chain->bref.key,
-		chain->bref.keybits,
-		chain->bref.type);
-#endif
 	/*
 	 * Calculate the data offset and determine if it is within
 	 * the current freemap range being gathered.
 	 */
 	error = 0;
-	data_off = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
+	data_off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
 	if (data_off < cbinfo->sbase || data_off > cbinfo->sstop)
 		return 0;
-	if (data_off < chain->hmp->voldata.allocator_beg)
+	if (data_off < cbinfo->hmp->voldata.allocator_beg)
 		return 0;
-	if (data_off > chain->hmp->voldata.volu_size)
+	if (data_off > cbinfo->hmp->voldata.volu_size)
 		return 0;
 
 	/*
@@ -400,16 +464,16 @@ h2_bulkfree_callback(hammer2_chain_t *chain, void *info)
 	 * Hammer2 does not allow allocations to cross the L1 (2GB) boundary,
 	 * it's a problem if it does.  (Or L0 (2MB) for that matter).
 	 */
-	radix = (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
+	radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
 	bytes = (size_t)1 << radix;
-	class = (chain->bref.type << 8) | hammer2_devblkradix(radix);
+	class = (bref->type << 8) | hammer2_devblkradix(radix);
 
 	if (data_off + bytes > cbinfo->sstop) {
 		kprintf("hammer2_bulkfree_scan: illegal 2GB boundary "
 			"%016jx %016jx/%d\n",
-			(intmax_t)chain->bref.data_off,
-			(intmax_t)chain->bref.key,
-			chain->bref.keybits);
+			(intmax_t)bref->data_off,
+			(intmax_t)bref->key,
+			bref->keybits);
 		bytes = cbinfo->sstop - data_off;	/* XXX */
 	}
@@ -430,9 +494,9 @@ h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref)
 	if (data_off + bytes > HAMMER2_FREEMAP_LEVEL0_SIZE) {
 		kprintf("hammer2_bulkfree_scan: illegal 2MB boundary "
 			"%016jx %016jx/%d\n",
-			(intmax_t)chain->bref.data_off,
-			(intmax_t)chain->bref.key,
-			chain->bref.keybits);
+			(intmax_t)bref->data_off,
+			(intmax_t)bref->key,
+			bref->keybits);
 		bytes = HAMMER2_FREEMAP_LEVEL0_SIZE - data_off;
 	}
@@ -443,9 +507,9 @@ h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref)
 	if (bmap->class != class) {
 		kprintf("hammer2_bulkfree_scan: illegal mixed class "
 			"%016jx %016jx/%d (%04x vs %04x)\n",
-			(intmax_t)chain->bref.data_off,
-			(intmax_t)chain->bref.key,
-			chain->bref.keybits,
+			(intmax_t)bref->data_off,
+			(intmax_t)bref->key,
+			bref->keybits,
 			class, bmap->class);
 	}
 	if (bmap->linear < (int32_t)data_off + (int32_t)bytes)
@@ -752,3 +816,39 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
 	}
 #endif
 }
+
+/*
+ * BULKFREE DEDUP HEURISTIC
+ *
+ * WARNING! This code is SMP safe but the heuristic allows SMP collisions.
+ *	    All fields must be loaded into locals and validated.
+ */
+static
+int
+h2_bulkfree_test(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref,
+		 int pri)
+{
+	hammer2_dedup_t *dedup;
+	int best;
+	int n;
+	int i;
+
+	n = hammer2_icrc32(&bref->data_off, sizeof(bref->data_off));
+	dedup = cbinfo->dedup + (n & (HAMMER2_DEDUP_HEUR_MASK & ~7));
+
+	for (i = best = 0; i < 8; ++i) {
+		if (dedup[i].data_off == bref->data_off) {
+			if (dedup[i].ticks < pri)
+				dedup[i].ticks = pri;
+			if (pri == 1)
+				cbinfo->count_dedup_factor += dedup[i].ticks;
+			return 1;
+		}
+		if (dedup[i].ticks < dedup[best].ticks)
+			best = i;
+	}
+	dedup[best].data_off = bref->data_off;
+	dedup[best].ticks = pri;
+
+	return 0;
+}
diff --git a/sys/vfs/hammer2/hammer2_chain.c b/sys/vfs/hammer2/hammer2_chain.c
index 23c0dd691c..dd5364abde 100644
--- a/sys/vfs/hammer2/hammer2_chain.c
+++ b/sys/vfs/hammer2/hammer2_chain.c
@@ -868,29 +868,7 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
 	/*
 	 * Statistics
 	 */
-	if (hammer2_io_isdirty(chain->dio) == 0) {
-		;
-	} else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
-		switch(chain->bref.type) {
-		case HAMMER2_BREF_TYPE_DATA:
-			counterp = &hammer2_ioa_file_write;
-			break;
-		case HAMMER2_BREF_TYPE_INODE:
-			counterp = &hammer2_ioa_meta_write;
-			break;
-		case HAMMER2_BREF_TYPE_INDIRECT:
-			counterp = &hammer2_ioa_indr_write;
-			break;
-		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
-		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
-			counterp = &hammer2_ioa_fmap_write;
-			break;
-		default:
-			counterp = &hammer2_ioa_volu_write;
-			break;
-		}
-		*counterp += chain->bytes;
-	} else {
+	if (hammer2_io_isdirty(chain->dio)) {
 		switch(chain->bref.type) {
 		case HAMMER2_BREF_TYPE_DATA:
 			counterp = &hammer2_iod_file_write;
@@ -915,26 +893,11 @@ hammer2_chain_unlock(hammer2_chain_t *chain)
 	/*
 	 * Clean out the dio.
 	 *
-	 * If a device buffer was used for data be sure to destroy the
-	 * buffer when we are done to avoid aliases (XXX what about the
-	 * underlying VM pages?).
-	 *
 	 * NOTE: Freemap leaf's use reserved blocks and thus no aliasing
 	 *	 is possible.
-	 *
-	 * NOTE: The isdirty check tracks whether we have to bdwrite() the
-	 *	 buffer or not.  The buffer might already be dirty.  The
-	 *	 flag is re-set when chain_modify() is called, even if
-	 *	 MODIFIED is already set, allowing the OS to retire the
-	 *	 buffer independent of a hammer2 flush.
 	 */
 	chain->data = NULL;
-	if ((chain->flags & HAMMER2_CHAIN_IOFLUSH) &&
-	    hammer2_io_isdirty(chain->dio)) {
-		hammer2_io_bawrite(&chain->dio);
-	} else {
-		hammer2_io_bqrelse(&chain->dio);
-	}
+	hammer2_io_bqrelse(&chain->dio);
 	hammer2_mtx_unlock(&chain->lock);
 }
 
@@ -1132,7 +1095,10 @@ hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
 		 * possible COW.
 		 *
 		 * If dedup_off is non-zero, caller already has a data offset
-		 * containing the caller's desired data.
+		 * containing the caller's desired data.  The dedup offset is
+		 * allowed to be in a partially free state and we must be sure
+		 * to reset it to a fully allocated state to force two bulkfree
+		 * passes to free it again.
 		 *
 		 * XXX can a chain already be marked MODIFIED without a data
 		 * assignment?  If not, assert here instead of testing the case.
@@ -1145,6 +1111,8 @@ hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid,
 			chain->bref.data_off = dedup_off;
 			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEDUP);
+			hammer2_freemap_adjust(hmp, &chain->bref,
+					       HAMMER2_FREEMAP_DORECOVER);
 		} else {
 			hammer2_freemap_alloc(chain, chain->bytes);
 		}
@@ -2023,24 +1991,32 @@ hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain,
 
 /*
  * The raw scan function is similar to lookup/next but does not seek to a key.
- * Blockrefs are iterated via first_chain = (parent, NULL) and
- * next_chain = (parent, chain).
+ * Blockrefs are iterated via first_bref = (parent, NULL) and
+ * next_bref = (parent, bref).
 *
- * The passed-in parent must be locked and its data resolved.  The returned
- * chain will be locked.  Pass chain == NULL to acquire the first sub-chain
- * under parent and then iterate with the passed-in chain (which this
- * function will unlock).
+ * The passed-in parent must be locked and its data resolved.  The function
+ * nominally returns a locked and referenced *chainp != NULL for chains
+ * the caller might need to recurse on (and will dispose of any *chainp
+ * passed in).  The caller must check chain->bref.type either way.
+ *
+ * *chainp is not set for leaf elements.
+ *
+ * This function takes a pointer to a stack-based bref structure whose
+ * contents are updated for each iteration.  The same pointer is returned,
+ * or NULL when the iteration is complete.  *firstp must be set to 1 for
+ * the first iteration.  This function will set it to 0.
  */
-hammer2_chain_t *
-hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t *chain,
+hammer2_blockref_t *
+hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t **chainp,
+		   hammer2_blockref_t *bref, int *firstp,
 		   int *cache_indexp, int flags)
 {
 	hammer2_dev_t *hmp;
 	hammer2_blockref_t *base;
-	hammer2_blockref_t *bref;
-	hammer2_blockref_t bcopy;
+	hammer2_blockref_t *bref_ptr;
 	hammer2_key_t key;
 	hammer2_key_t next_key;
+	hammer2_chain_t *chain = NULL;
 	int count = 0;
 	int how_always = HAMMER2_RESOLVE_ALWAYS;
 	int how_maybe = HAMMER2_RESOLVE_MAYBE;
@@ -2070,17 +2046,24 @@ hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t *chain,
 	/*
 	 * Calculate key to locate first/next element, unlocking the previous
 	 * element as we go.  Be careful, the key calculation can overflow.
+	 *
+	 * (also reset bref to NULL)
 	 */
-	if (chain) {
-		key = chain->bref.key +
-		      ((hammer2_key_t)1 << chain->bref.keybits);
-		hammer2_chain_unlock(chain);
-		hammer2_chain_drop(chain);
-		chain = NULL;
-		if (key == 0)
-			goto done;
-	} else {
+	if (*firstp) {
 		key = 0;
+		*firstp = 0;
+	} else {
+		key = bref->key + ((hammer2_key_t)1 << bref->keybits);
+		if ((chain = *chainp) != NULL) {
+			*chainp = NULL;
+			hammer2_chain_unlock(chain);
+			hammer2_chain_drop(chain);
+			chain = NULL;
+		}
+		if (key == 0) {
+			bref = NULL;
+			goto done;
+		}
 	}
 
 again:
@@ -2102,6 +2085,7 @@ again:
 		 */
 		if (parent->data->ipdata.meta.op_flags &
 		    HAMMER2_OPFLAG_DIRECTDATA) {
+			bref = NULL;
 			goto done;
 		}
 		base = &parent->data->ipdata.u.blockset.blockref[0];
@@ -2150,40 +2134,68 @@ again:
 		hammer2_chain_countbrefs(parent, base, count);
 
 	next_key = 0;
+	bref_ptr = NULL;
 	hammer2_spin_ex(&parent->core.spin);
 	chain = hammer2_combined_find(parent, base, count,
 				      cache_indexp, &next_key,
 				      key, HAMMER2_KEY_MAX,
-				      &bref);
+				      &bref_ptr);
 	generation = parent->core.generation;
 
 	/*
 	 * Exhausted parent chain, we're done.
 	 */
-	if (bref == NULL) {
+	if (bref_ptr == NULL) {
 		hammer2_spin_unex(&parent->core.spin);
 		KKASSERT(chain == NULL);
+		bref = NULL;
 		goto done;
 	}
 
+	/*
+	 * Copy into the supplied stack-based blockref.
+	 */
+	*bref = *bref_ptr;
+
 	/*
 	 * Selected from blockref or in-memory chain.
 	 */
 	if (chain == NULL) {
-		bcopy = *bref;
-		hammer2_spin_unex(&parent->core.spin);
-		chain = hammer2_chain_get(parent, generation, &bcopy);
-		if (chain == NULL) {
-			kprintf("retry scan parent %p keys %016jx\n",
-				parent, key);
-			goto again;
-		}
-		if (bcmp(&bcopy, bref, sizeof(bcopy))) {
-			hammer2_chain_drop(chain);
-			chain = NULL;
-			goto again;
+		switch(bref->type) {
+		case HAMMER2_BREF_TYPE_INODE:
+		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
+		case HAMMER2_BREF_TYPE_INDIRECT:
+		case HAMMER2_BREF_TYPE_VOLUME:
+		case HAMMER2_BREF_TYPE_FREEMAP:
+			/*
+			 * Recursion, always get the chain
+			 */
+			hammer2_spin_unex(&parent->core.spin);
+			chain = hammer2_chain_get(parent, generation, bref);
+			if (chain == NULL) {
+				kprintf("retry scan parent %p keys %016jx\n",
+					parent, key);
+				goto again;
+			}
+			if (bcmp(bref, bref_ptr, sizeof(*bref))) {
+				hammer2_chain_drop(chain);
+				chain = NULL;
+				goto again;
+			}
+			break;
+		default:
+			/*
+			 * No recursion, do not waste time instantiating
+			 * a chain, just iterate using the bref.
+			 */
+			hammer2_spin_unex(&parent->core.spin);
+			break;
+		}
 	} else {
+		/*
+		 * Recursion or not, we need the chain in order to supply
+		 * the bref.
+		 */
 		hammer2_chain_ref(chain);
 		hammer2_spin_unex(&parent->core.spin);
 	}
 
 	/*
 	 * chain is referenced but not locked.  We must lock the chain
 	 * to obtain definitive DUPLICATED/DELETED state
 	 */
-	hammer2_chain_lock(chain, how);
+	if (chain)
+		hammer2_chain_lock(chain, how);
 
 	/*
 	 * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
@@ -2210,22 +2223,26 @@ again:
 	 * releasing the spinlock with the flag after locking the
 	 * chain.
 	 */
-	if (chain->flags & HAMMER2_CHAIN_DELETED) {
+	if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
 		hammer2_chain_unlock(chain);
 		hammer2_chain_drop(chain);
 		chain = NULL;
 
 		key = next_key;
-		if (key == 0)
+		if (key == 0) {
+			bref = NULL;
 			goto done;
+		}
 		goto again;
 	}
 
done:
 	/*
-	 * All done, return the chain or NULL
+	 * All done, return the bref or NULL, supply chain if necessary.
 	 */
-	return (chain);
+	if (chain)
+		*chainp = chain;
+	return (bref);
 }
 
 /*
diff --git a/sys/vfs/hammer2/hammer2_flush.c b/sys/vfs/hammer2/hammer2_flush.c
index ef3769f11b..faebb73612 100644
--- a/sys/vfs/hammer2/hammer2_flush.c
+++ b/sys/vfs/hammer2/hammer2_flush.c
@@ -787,7 +787,9 @@ again:
 		 * (this only really works if the DIO system buffer is the
 		 * same size as chain->bytes).
 		 */
-		if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
+		if ((chain->flags & HAMMER2_CHAIN_DESTROY) &&
+		    (chain->flags & HAMMER2_CHAIN_DEDUP) == 0 &&
+		    chain->dio) {
 			hammer2_io_setinval(chain->dio, chain->bytes);
 		}
 	}
diff --git a/sys/vfs/hammer2/hammer2_strategy.c b/sys/vfs/hammer2/hammer2_strategy.c
index 06d56b3d21..7ec8e31343 100644
--- a/sys/vfs/hammer2/hammer2_strategy.c
+++ b/sys/vfs/hammer2/hammer2_strategy.c
@@ -1287,3 +1287,23 @@ hammer2_dedup_lookup(hammer2_dev_t *hmp, char **datap, int pblksize)
 	}
 	return 0;
 }
+
+/*
+ * Poof.  Races are OK: if someone gets in and reuses a dedup offset
+ * before or while we are clearing it, they will also recover the freemap
+ * entry (set it to fully allocated), so a bulkfree race can only set it
+ * to a possibly-free state.
+ *
+ * XXX ok, well, not really sure races are ok but going to run with it
+ * for the moment.
+ */
+void
+hammer2_dedup_clear(hammer2_dev_t *hmp)
+{
+	int i;
+
+	for (i = 0; i < HAMMER2_DEDUP_HEUR_SIZE; ++i) {
+		hmp->heur_dedup[i].data_off = 0;
+		hmp->heur_dedup[i].ticks = ticks - 1;
+	}
+}
diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c
index 46d41049cc..b29b0aee45 100644
--- a/sys/vfs/hammer2/hammer2_vfsops.c
+++ b/sys/vfs/hammer2/hammer2_vfsops.c
@@ -1780,8 +1780,7 @@ hammer2_recovery(hammer2_dev_t *hmp)
 	TAILQ_INIT(&info.list);
 	info.depth = 0;
 	parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
-	cumulative_error = hammer2_recovery_scan(hmp, parent,
-						 &info, sync_tid);
+	cumulative_error = hammer2_recovery_scan(hmp, parent, &info, sync_tid);
 	hammer2_chain_lookup_done(parent);
 
 	while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
@@ -1791,8 +1790,8 @@ hammer2_recovery(hammer2_dev_t *hmp)
 		kfree(elm, M_HAMMER2);
 
 		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
-		error = hammer2_recovery_scan(hmp, parent,
-					      &info, hmp->voldata.freemap_tid);
+		error = hammer2_recovery_scan(hmp, parent, &info,
+					      hmp->voldata.freemap_tid);
 		hammer2_chain_unlock(parent);
 		hammer2_chain_drop(parent);	/* drop elm->chain ref */
 		if (error)
@@ -1811,9 +1810,11 @@ hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent,
 {
 	const hammer2_inode_data_t *ripdata;
 	hammer2_chain_t *chain;
+	hammer2_blockref_t bref;
 	int cache_index;
 	int cumulative_error = 0;
 	int error;
+	int first;
 
 	/*
 	 * Adjust freemap to ensure that the block(s) are marked allocated.
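As a quick orientation before the final hunk, here is a minimal
caller-side sketch of the bref-centric iteration idiom this refactor
introduces, mirroring the hammer2_bulk_scan() and hammer2_recovery_scan()
loops in this patch.  The process_bref() callback is a hypothetical
stand-in for real per-element work; the sketch is illustrative and not
part of the patch itself:

	hammer2_blockref_t bref;
	hammer2_chain_t *chain = NULL;
	int cache_index = -1;
	int first = 1;

	/* parent must be locked with its data resolved */
	while (hammer2_chain_scan(parent, &chain, &bref, &first,
				  &cache_index,
				  HAMMER2_LOOKUP_NODATA) != NULL) {
		if (chain == NULL) {
			/* leaf: only the stack-based bref is valid */
			process_bref(&bref);		/* hypothetical */
			continue;
		}
		/*
		 * Possibly recursive element: chain is returned locked
		 * and referenced; the next hammer2_chain_scan() call
		 * disposes of it.
		 */
		process_bref(&bref);			/* hypothetical */
	}
	if (chain) {
		/* dispose of a chain left over from an early break */
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}

Note that leaf brefs never instantiate a chain, which is where the bulk
of the performance win comes from.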
@@ -1886,11 +1887,28 @@ hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent,
 	 * hanging around after we are done with them.
 	 */
 	cache_index = 0;
-	chain = hammer2_chain_scan(parent, NULL, &cache_index,
-				   HAMMER2_LOOKUP_NODATA);
-	while (chain) {
+	chain = NULL;
+	first = 1;
+
+	while (hammer2_chain_scan(parent, &chain, &bref,
+				  &first, &cache_index,
+				  HAMMER2_LOOKUP_NODATA) != NULL) {
+		/*
+		 * If this is a leaf there is no chain to recurse on,
+		 * adjust the bref directly.
+		 */
+		if (chain == NULL) {
+			if (bref.mirror_tid > sync_tid) {
+				hammer2_freemap_adjust(hmp, &bref,
+						     HAMMER2_FREEMAP_DORECOVER);
+			}
+			continue;
+		}
+
+		/*
+		 * This may or may not be a recursive node.
+		 */
 		atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
-		if (chain->bref.mirror_tid > sync_tid) {
+		if (bref.mirror_tid > sync_tid) {
 			++info->depth;
 			error = hammer2_recovery_scan(hmp, chain,
 						      info, sync_tid);
@@ -1903,12 +1921,10 @@ hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent,
 		 * Flush the recovery at the PFS boundary to stage it for
 		 * the final flush of the super-root topology.
 		 */
-		if ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
+		if ((bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
 		    (chain->flags & HAMMER2_CHAIN_ONFLUSH)) {
 			hammer2_flush(chain, HAMMER2_FLUSH_TOP);
 		}
-		chain = hammer2_chain_scan(parent, chain, &cache_index,
-					   HAMMER2_LOOKUP_NODATA);
 	}
 
 	return cumulative_error;
-- 
2.41.0
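The 8-way set-associative heuristic in h2_bulkfree_test() can also be
exercised in isolation.  The following userland sketch reproduces the same
lookup and lowest-priority-eviction policy; the table size, hash function,
and all names here are illustrative stand-ins for HAMMER2_DEDUP_HEUR_SIZE,
hammer2_icrc32(), and the kernel structures, and the code is not part of
the patch:

#include <stdint.h>
#include <stdio.h>

#define SETS	8			/* 8-way set-associative */
#define TABLE	(1024 * SETS)		/* stand-in for HAMMER2_DEDUP_HEUR_SIZE */

struct entry {
	uint64_t	data_off;
	int		ticks;
};

static struct entry table[TABLE];

/* trivial stand-in hash; the kernel uses hammer2_icrc32() */
static unsigned
hashval(uint64_t v)
{
	return (unsigned)((v * 0x9E3779B97F4A7C15ULL) >> 32);
}

/*
 * Return 1 if data_off was seen before (duplicate subtree), else record
 * it.  Higher-pri entries (e.g. snapshot roots) resist eviction because
 * the lowest-ticks way in the set is always the one replaced.
 */
static int
dedup_test(uint64_t data_off, int pri)
{
	struct entry *set = &table[hashval(data_off) &
				   ((TABLE - 1) & ~(SETS - 1))];
	int best = 0;
	int i;

	for (i = 0; i < SETS; ++i) {
		if (set[i].data_off == data_off) {
			if (set[i].ticks < pri)
				set[i].ticks = pri;	/* re-prioritize */
			return 1;
		}
		if (set[i].ticks < set[best].ticks)
			best = i;		/* eviction candidate */
	}
	set[best].data_off = data_off;
	set[best].ticks = pri;
	return 0;
}

int
main(void)
{
	/* first call inserts (prints 0), second detects the dup (prints 1) */
	printf("%d %d\n", dedup_test(0x10000, 1), dedup_test(0x10000, 1));
	return 0;
}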