From 10136ab6cde1969ab6ca22168b2a10ed5d9cc557 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 13 Nov 2013 20:23:02 -0800 Subject: [PATCH] hammer2 - Implement crash recovery, cleanups, stabilization * Allocations which are made by the flush itself run in the next transaction instead of the current transaction. We do this so the flush code can flush a stable version of the freemap itself. * Implement crash recovery. Due to the above mechanics an incremental scan of all chains belonging to the last transaction must be run at mount-time to ensure that their blocks are marked allocated in the freemap. Since the scan is incremental this doesn't take very long. * Add some chain API infrastructure to support the incremental scan. * Allow transactions to operate on a single media mount point (hmp) (versus a pfsmount (pmp)). Used by the recovery code. * Take another pass on the flush algorithm, fixing a few bugs. The filter is still pretty fragile unfortunately. Having to special-case the root chains (hmp->vchain and hmp->fchain) is causing problems. Add debugging to help figure out an assertion that still occurs. 
--- sbin/hammer2/cmd_debug.c | 3 + sys/vfs/hammer2/hammer2.h | 27 +++- sys/vfs/hammer2/hammer2_chain.c | 257 +++++++++++++++++++++++++++--- sys/vfs/hammer2/hammer2_flush.c | 142 ++++++++++------- sys/vfs/hammer2/hammer2_freemap.c | 149 +++++++++++++---- sys/vfs/hammer2/hammer2_inode.c | 14 +- sys/vfs/hammer2/hammer2_ioctl.c | 8 +- sys/vfs/hammer2/hammer2_vfsops.c | 218 +++++++++++++++++++++++-- sys/vfs/hammer2/hammer2_vnops.c | 29 ++-- 9 files changed, 689 insertions(+), 158 deletions(-) diff --git a/sbin/hammer2/cmd_debug.c b/sbin/hammer2/cmd_debug.c index 7e81bf3917..c565517578 100644 --- a/sbin/hammer2/cmd_debug.c +++ b/sbin/hammer2/cmd_debug.c @@ -548,6 +548,9 @@ show_bref(int fd, int tab, int bi, hammer2_blockref_t *bref, int dofreemap) } break; case HAMMER2_BREF_TYPE_VOLUME: + printf("alloc_tid=%016jx freemap_tid=%016jx ", + media.voldata.alloc_tid, + media.voldata.freemap_tid); if (dofreemap) { bscan = &media.voldata.freemap_blockset.blockref[0]; bcount = HAMMER2_SET_COUNT; diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h index 3ba54b49fd..3c4d6f059f 100644 --- a/sys/vfs/hammer2/hammer2.h +++ b/sys/vfs/hammer2/hammer2.h @@ -205,6 +205,7 @@ struct hammer2_chain { u_int refs; u_int lockcnt; int debug_reason; + int duplicate_reason; hammer2_media_data_t *data; /* data pointer shortcut */ TAILQ_ENTRY(hammer2_chain) flush_node; /* flush deferral list */ }; @@ -245,7 +246,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); #define HAMMER2_CHAIN_ONRBTREE 0x00004000 /* on parent RB tree */ #define HAMMER2_CHAIN_SNAPSHOT 0x00008000 /* snapshot special */ #define HAMMER2_CHAIN_EMBEDDED 0x00010000 /* embedded data */ -#define HAMMER2_CHAIN_UNUSED20000 0x00020000 +#define HAMMER2_CHAIN_RELEASE 0x00020000 /* don't keep around */ #define HAMMER2_CHAIN_UNUSED40000 0x00040000 #define HAMMER2_CHAIN_UNUSED80000 0x00080000 #define HAMMER2_CHAIN_DUPLICATED 0x00100000 /* fwd delete-dup */ @@ -261,7 +262,7 @@ 
RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); #define HAMMER2_LOOKUP_NOLOCK 0x00000001 /* ref only */ #define HAMMER2_LOOKUP_NODATA 0x00000002 /* data left NULL */ #define HAMMER2_LOOKUP_SHARED 0x00000100 -#define HAMMER2_LOOKUP_MATCHIND 0x00000200 +#define HAMMER2_LOOKUP_MATCHIND 0x00000200 /* return all chains */ #define HAMMER2_LOOKUP_FREEMAP 0x00000400 /* freemap base */ #define HAMMER2_LOOKUP_ALWAYS 0x00000800 /* resolve data */ @@ -323,6 +324,13 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); */ #define HAMMER2_FLUSH_DEPTH_LIMIT 10 /* stack recursion limit */ +/* + * hammer2_freemap_adjust() + */ +#define HAMMER2_FREEMAP_DORECOVER 1 +#define HAMMER2_FREEMAP_DOMAYFREE 2 +#define HAMMER2_FREEMAP_DOREALFREE 3 + /* * HAMMER2 IN-MEMORY CACHE OF MEDIA STRUCTURES * @@ -461,7 +469,8 @@ RB_PROTOTYPE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp, */ struct hammer2_trans { TAILQ_ENTRY(hammer2_trans) entry; - struct hammer2_pfsmount *pmp; + struct hammer2_pfsmount *pmp; /* might be NULL */ + struct hammer2_mount *hmp_single; /* if single-targetted */ hammer2_tid_t real_tid; hammer2_tid_t sync_tid; hammer2_tid_t inode_tid; @@ -780,6 +789,9 @@ hammer2_chain_t *hammer2_chain_next(hammer2_chain_t **parentp, hammer2_key_t *key_nextp, hammer2_key_t key_beg, hammer2_key_t key_end, int *cache_indexp, int flags); +hammer2_chain_t *hammer2_chain_scan(hammer2_chain_t *parent, + hammer2_chain_t *chain, + int *cache_indexp, int flags); int hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp, @@ -788,7 +800,8 @@ int hammer2_chain_create(hammer2_trans_t *trans, int type, size_t bytes); void hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp, hammer2_chain_t **chainp, - hammer2_blockref_t *bref, int snapshot); + hammer2_blockref_t *bref, int snapshot, + int duplicate_reason); int hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_chain_t **chainp, hammer2_ioc_pfs_t 
*pfs); void hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain, @@ -820,8 +833,8 @@ void hammer2_base_insert(hammer2_chain_t *chain, /* * hammer2_trans.c */ -void hammer2_trans_init(hammer2_trans_t *trans, - hammer2_pfsmount_t *pmp, int flags); +void hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, + hammer2_mount_t *hmp, int flags); void hammer2_trans_clear_invfsync(hammer2_trans_t *trans); void hammer2_trans_done(hammer2_trans_t *trans); @@ -886,7 +899,7 @@ void hammer2_lwinprog_wait(hammer2_pfsmount_t *pmp); */ int hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_mount_t *hmp, hammer2_blockref_t *bref, size_t bytes); -void hammer2_freemap_free(hammer2_trans_t *trans, hammer2_mount_t *hmp, +void hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp, hammer2_blockref_t *bref, int how); diff --git a/sys/vfs/hammer2/hammer2_chain.c b/sys/vfs/hammer2/hammer2_chain.c index c2f4d11f94..b07c53fe55 100644 --- a/sys/vfs/hammer2/hammer2_chain.c +++ b/sys/vfs/hammer2/hammer2_chain.c @@ -1417,13 +1417,6 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp, if (chain != &hmp->fchain && chain != &hmp->vchain) { KKASSERT((flags & HAMMER2_MODIFY_ASSERTNOCOPY) == 0); hammer2_chain_delete_duplicate(trans, chainp, 0); -#if 0 - kprintf("RET1A %p.%d flags %08x mod=%016jx del=%016jx\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid); -#endif -#if 0 - chain = *chainp; - kprintf("RET1B %p.%d flags %08x mod=%016jx del=%016jx\n", chain, chain->bref.type, chain->flags, chain->modify_tid, chain->delete_tid); -#endif return; } @@ -1468,7 +1461,19 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp, atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FORCECOW); } - chain->modify_tid = trans->sync_tid; + /* + * Update modify_tid. XXX special-case vchain/fchain because they + * are always modified in-place. 
Otherwise the chain being modified + * must not be part of a future transaction. + */ + if (chain == &hmp->vchain || chain == &hmp->fchain) { + if (chain->modify_tid <= trans->sync_tid) + chain->modify_tid = trans->sync_tid; + } else { + KKASSERT(chain->modify_tid <= trans->sync_tid); + chain->modify_tid = trans->sync_tid; + } + if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0) chain->bref.modify_tid = trans->sync_tid; @@ -1665,17 +1670,12 @@ hammer2_chain_find_callback(hammer2_chain_t *child, void *data) hammer2_chain_t *best; hammer2_key_t child_end; -#if 0 /* - * Skip deleted chains which have been flushed (MOVED no longer set), - * causes caller to check blockref array. + * This is a live-view find. Ignore chains which have been + * delete-duplicated. */ - if ((child->flags & (HAMMER2_CHAIN_DELETED | HAMMER2_CHAIN_MOVED)) == - HAMMER2_CHAIN_DELETED) { - /* continue scan */ + if (child->flags & HAMMER2_CHAIN_DUPLICATED) return(0); - } -#endif /* * General cases @@ -2200,6 +2200,191 @@ hammer2_chain_next(hammer2_chain_t **parentp, hammer2_chain_t *chain, cache_indexp, flags)); } +/* + * Raw scan functions are similar to lookup/next but do not seek the parent + * chain and do not skip stale chains. These functions are primarily used + * by the recovery code. + * + * Parent and chain are locked, parent's data must be resolved. To acquire + * the first sub-chain under parent pass chain == NULL. 
+ */ +hammer2_chain_t * +hammer2_chain_scan(hammer2_chain_t *parent, hammer2_chain_t *chain, + int *cache_indexp, int flags) +{ + hammer2_mount_t *hmp; + hammer2_blockref_t *base; + hammer2_blockref_t *bref; + hammer2_blockref_t bcopy; + hammer2_chain_core_t *above; + hammer2_key_t key; + hammer2_key_t next_key; + int count = 0; + int how_always = HAMMER2_RESOLVE_ALWAYS; + int how_maybe = HAMMER2_RESOLVE_MAYBE; + int how; + + hmp = parent->hmp; + + /* + * Scan flags borrowed from lookup + */ + if (flags & HAMMER2_LOOKUP_ALWAYS) { + how_maybe = how_always; + how = HAMMER2_RESOLVE_ALWAYS; + } else if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK)) { + how = HAMMER2_RESOLVE_NEVER; + } else { + how = HAMMER2_RESOLVE_MAYBE; + } + if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) { + how_maybe |= HAMMER2_RESOLVE_SHARED; + how_always |= HAMMER2_RESOLVE_SHARED; + how |= HAMMER2_RESOLVE_SHARED; + } + + /* + * Calculate key to locate first/next element, unlocking the previous + * element as we go. Be careful, the key calculation can overflow. + */ + if (chain) { + key = chain->bref.key + + ((hammer2_key_t)1 << chain->bref.keybits); + hammer2_chain_unlock(chain); + chain = NULL; + if (key == 0) + goto done; + } else { + key = 0; + } + +again: + /* + * Locate the blockref array. Currently we do a fully associative + * search through the array. + */ + switch(parent->bref.type) { + case HAMMER2_BREF_TYPE_INODE: + /* + * An inode with embedded data has no sub-chains. + */ + if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) + goto done; + base = &parent->data->ipdata.u.blockset.blockref[0]; + count = HAMMER2_SET_COUNT; + break; + case HAMMER2_BREF_TYPE_FREEMAP_NODE: + case HAMMER2_BREF_TYPE_INDIRECT: + /* + * Optimize indirect blocks in the INITIAL state to avoid + * I/O. 
+ */ + if (parent->flags & HAMMER2_CHAIN_INITIAL) { + base = NULL; + } else { + if (parent->data == NULL) + panic("parent->data is NULL"); + base = &parent->data->npdata[0]; + } + count = parent->bytes / sizeof(hammer2_blockref_t); + break; + case HAMMER2_BREF_TYPE_VOLUME: + base = &hmp->voldata.sroot_blockset.blockref[0]; + count = HAMMER2_SET_COUNT; + break; + case HAMMER2_BREF_TYPE_FREEMAP: + base = &hmp->voldata.freemap_blockset.blockref[0]; + count = HAMMER2_SET_COUNT; + break; + default: + panic("hammer2_chain_lookup: unrecognized blockref type: %d", + parent->bref.type); + base = NULL; /* safety */ + count = 0; /* safety */ + } + + /* + * Merged scan to find next candidate. + * + * hammer2_base_*() functions require the above->live_* fields + * to be synchronized. + * + * We need to hold the spinlock to access the block array and RB tree + * and to interlock chain creation. + */ + if ((parent->core->flags & HAMMER2_CORE_COUNTEDBREFS) == 0) + hammer2_chain_countbrefs(parent, base, count); + + above = parent->core; + next_key = 0; + spin_lock(&above->cst.spin); + chain = hammer2_combined_find(parent, base, count, + cache_indexp, &next_key, + key, HAMMER2_MAX_KEY, &bref); + + /* + * Exhausted parent chain, we're done. + */ + if (bref == NULL) { + spin_unlock(&above->cst.spin); + KKASSERT(chain == NULL); + goto done; + } + + /* + * Selected from blockref or in-memory chain. + */ + if (chain == NULL) { + bcopy = *bref; + spin_unlock(&above->cst.spin); + chain = hammer2_chain_get(parent, &bcopy); + if (chain == NULL) { + kprintf("retry scan parent %p keys %016jx\n", + parent, key); + goto again; + } + if (bcmp(&bcopy, bref, sizeof(bcopy))) { + hammer2_chain_drop(chain); + chain = NULL; + goto again; + } + } else { + hammer2_chain_ref(chain); + spin_unlock(&above->cst.spin); + } + /* chain is referenced but not locked */ + + /* + * Skip deleted chains (XXX cache 'i' end-of-block-array? 
XXX) + * + * NOTE: chain's key range is not relevant as there might be + * one-offs within the range that are not deleted. + * + * NOTE: XXX this could create problems with scans used in + * situations other than mount-time recovery. + */ + if (chain->flags & HAMMER2_CHAIN_DELETED) { + hammer2_chain_drop(chain); + chain = NULL; + + key = next_key; + if (key == 0) + goto done; + goto again; + } + + /* + * Lock as requested + */ + hammer2_chain_lock(chain, how | HAMMER2_RESOLVE_NOREF); + +done: + /* + * All done, return the chain or NULL + */ + return (chain); +} + /* * Create and return a new hammer2 system memory structure of the specified * key, type and size and insert it under (*parentp). This is a full @@ -2316,12 +2501,27 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp, } } else { /* - * Potentially update the existing chain's key/keybits. + * We are reattaching a chain that has been duplicated and + * left disconnected under a DIFFERENT parent with potentially + * different key/keybits. + * + * The chain must be modified in the current transaction + * (the duplication code should have done that for us), + * and it's modify_tid should be greater than the parent's + * bref.mirror_tid. This should cause it to be created under + * the new parent. + * + * If deleted in the same transaction, the create/delete TIDs + * will be the same and effective the chain will not have + * existed at all from the point of view of the parent. * * Do NOT mess with the current state of the INITIAL flag. 
*/ + KKASSERT(chain->modify_tid > parent->bref.mirror_tid); + KKASSERT(chain->modify_tid == trans->sync_tid); chain->bref.key = key; chain->bref.keybits = keybits; + /* chain->modify_tid = chain->bref.mirror_tid; */ KKASSERT(chain->above == NULL); } @@ -2496,7 +2696,7 @@ static void hammer2_chain_dup_fixup(hammer2_chain_t *ochain, void hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp, hammer2_chain_t **chainp, hammer2_blockref_t *bref, - int snapshot) + int snapshot, int duplicate_reason) { hammer2_mount_t *hmp; hammer2_chain_t *parent; @@ -2520,9 +2720,11 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp, ochain = *chainp; hmp = ochain->hmp; if (parentp) - ochain->debug_reason += 0x10000; + ochain->debug_reason += 0x10000; else - ochain->debug_reason += 0x100000; + ochain->debug_reason += 0x100000; + ochain->duplicate_reason = (ochain->duplicate_reason << 8) | + (duplicate_reason | 0x10); #if 0 if (ochain->bref.type == HAMMER2_BREF_TYPE_DATA) { @@ -2563,6 +2765,7 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp, (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); nchain->bytes = bytes; nchain->modify_tid = ochain->modify_tid; + nchain->duplicate_reason = duplicate_reason; if (ochain->flags & HAMMER2_CHAIN_INITIAL) atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL); @@ -3042,6 +3245,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent, int nbytes; int cache_index; int loops; + int reason; /* * Calculate the base blockref pointer or NULL if the chain @@ -3190,12 +3394,13 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent, cache_index = 0; spin_lock(&above->cst.spin); loops = 0; + reason = 0; for (;;) { - if (++loops > 8192) { + if (++loops > 100000) { spin_unlock(&above->cst.spin); - panic("shit parent=%p base/count %p:%d\n", - parent, base, count); + panic("excessive loops r=%d p=%p base/count %p:%d %016jx\n", + reason, parent, 
base, count, key_next); } /* @@ -3244,10 +3449,12 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent, spin_unlock(&above->cst.spin); chain = hammer2_chain_get(parent, bref); if (chain == NULL) { + reason = 1; spin_lock(&above->cst.spin); continue; } if (bcmp(&bcopy, bref, sizeof(bcopy))) { + reason = 2; hammer2_chain_drop(chain); spin_lock(&above->cst.spin); continue; @@ -3264,6 +3471,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent, * deleted we move on to the next key. */ if (chain->flags & HAMMER2_CHAIN_DUPLICATED) { + reason = 3; hammer2_chain_unlock(chain); spin_lock(&above->cst.spin); continue; @@ -3277,13 +3485,14 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent, * Shift the chain to the indirect block. */ hammer2_chain_delete(trans, chain, HAMMER2_DELETE_WILLDUP); - hammer2_chain_duplicate(trans, &ichain, &chain, NULL, 0); + hammer2_chain_duplicate(trans, &ichain, &chain, NULL, 0, 1); hammer2_chain_unlock(chain); KKASSERT(parent->refs > 0); chain = NULL; next_key: spin_lock(&above->cst.spin); next_key_spinlocked: + reason = 4; if (key_next == 0 || key_next > key_end) break; key_beg = key_next; diff --git a/sys/vfs/hammer2/hammer2_flush.c b/sys/vfs/hammer2/hammer2_flush.c index 72545e83a5..ea37b3df89 100644 --- a/sys/vfs/hammer2/hammer2_flush.c +++ b/sys/vfs/hammer2/hammer2_flush.c @@ -95,8 +95,12 @@ hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref, /* * Transaction support functions for writing to the filesystem. * - * Initializing a new transaction allocates a transaction ID. We - * don't bother marking the volume header MODIFIED. Instead, the volume + * Initializing a new transaction allocates a transaction ID. Typically + * passed a pmp (hmp passed as NULL), indicating a cluster transaction. Can + * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single + * media target. The latter mode is used by the recovery code. 
+ * + * We don't bother marking the volume header MODIFIED. Instead, the volume * will be synchronized at a later time as part of a larger flush sequence. * * Non-flush transactions can typically run concurrently. However if @@ -116,14 +120,20 @@ hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref, * wind up (harmlessly) more advanced on flush. */ void -hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags) +hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, + hammer2_mount_t *hmp, int flags) { - hammer2_mount_t *hmp; hammer2_trans_t *head; bzero(trans, sizeof(*trans)); - trans->pmp = pmp; - hmp = pmp->cluster.chains[0]->hmp; /* XXX */ + if (pmp) { + trans->pmp = pmp; + KKASSERT(hmp == NULL); + hmp = pmp->cluster.chains[0]->hmp; /* XXX */ + } else { + trans->hmp_single = hmp; + KKASSERT(hmp); + } hammer2_voldata_lock(hmp); trans->flags = flags; @@ -198,7 +208,10 @@ hammer2_trans_done(hammer2_trans_t *trans) hammer2_trans_t *head; hammer2_trans_t *scan; - hmp = trans->pmp->cluster.chains[0]->hmp; + if (trans->pmp) + hmp = trans->pmp->cluster.chains[0]->hmp; + else + hmp = trans->hmp_single; /* * Remove and adjust flushcnt @@ -375,7 +388,6 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp) char *bdata; hammer2_io_t *dio; int error; - int wasmodified; int diddeferral; hmp = chain->hmp; @@ -450,10 +462,6 @@ retry: * is terminal and we must recurse to deal with any dirty chains * under the deletion, including possibly flushing them out (e.g. * open descriptor on an unlinked file). - * - * Do not update bref.mirror_tid here, the chain still has a data - * state based on mirror_tid and might be duplicated again (though - * I don't think this can occur). 
*/ if (chain->delete_tid <= info->sync_tid && (chain->flags & HAMMER2_CHAIN_DUPLICATED)) { @@ -475,6 +483,11 @@ retry: atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); hammer2_chain_drop(chain); } + + /* + * Update mirror_tid, indicating that chain is synchronized + * on its modification and block table. + */ if (chain->bref.mirror_tid < info->sync_tid) chain->bref.mirror_tid = info->sync_tid; /* do not update core->update_lo, there may be another path */ @@ -694,8 +707,6 @@ retry: * * (no deferral in this path) */ - if (chain->bref.mirror_tid < info->sync_tid) - chain->bref.mirror_tid = info->sync_tid; if (core->update_lo < info->sync_tid) core->update_lo = info->sync_tid; @@ -732,8 +743,6 @@ retry: * * (no deferral in this path) */ - if (chain->bref.mirror_tid < info->sync_tid) - chain->bref.mirror_tid = info->sync_tid; if (core->update_lo < info->sync_tid) core->update_lo = info->sync_tid; @@ -806,34 +815,44 @@ retry: atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); hammer2_chain_drop(chain); } + + /* + * Update mirror_tid, indicating that chain is synchronized + * on its modification and block table. + */ + if (chain->bref.mirror_tid < info->sync_tid) + chain->bref.mirror_tid = info->sync_tid; return; } /* * A degenerate flush might not have flushed anything and thus not * processed modified blocks on the way back up. Detect the case. - * bref.mirror_tid may still propagate upward but won't be flushed - * if no modifications were actually made. - * - * Note that MOVED can be set without MODIFIED being set due to - * a deletion, in which case it is handled by Scan2 later on. - * - * Both bits can be set along with DELETED due to a deletion if - * modified data within the synchronization zone and the chain - * was then deleted beyond the zone, in which case we still have - * to flush for synchronization point consistency. Otherwise though - * DELETED and MODIFIED are treated as separate flags. 
*/ if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) { - kprintf("chain %p.%d %08x recursed but wasn't modified mirr=%016jx update_lo=%016jx synctid=%016jx\n", - chain, chain->bref.type, chain->flags, chain->bref.mirror_tid, core->update_lo, info->sync_tid); + kprintf("chain %p.%d %08x recursed but wasn't " + "modified mirr=%016jx " + "update_lo=%016jx synctid=%016jx\n", + chain, chain->bref.type, chain->flags, + chain->bref.mirror_tid, + core->update_lo, info->sync_tid); +#if 0 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { hammer2_chain_ref(chain); atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); } +#endif chain->debug_reason = (chain->debug_reason & ~255) | 10; + + /* + * Update mirror_tid, indicating that chain is synchronized + * on its modification and block table. + */ + if (chain->bref.mirror_tid < info->sync_tid) + chain->bref.mirror_tid = info->sync_tid; return; } + chain->debug_reason = (chain->debug_reason & ~255) | 11; /* @@ -841,10 +860,8 @@ retry: * * A DESTROYED node that reaches this point must be flushed for * synchronization point consistency. - */ - - /* - * Update mirror_tid, clear MODIFIED, and set MOVED. + * + * Update bref.mirror_tid, clear MODIFIED, and set MOVED. * * The caller will update the parent's reference to this chain * by testing MOVED as long as the modification was in-bounds. @@ -861,33 +878,34 @@ retry: if (hammer2_debug & 0x2000) { Debugger("Flush hell"); } - wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0; + atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); - if (chain == &hmp->vchain) - kprintf("(FLUSHED VOLUME HEADER)\n"); - if (chain == &hmp->fchain) - kprintf("(FLUSHED FREEMAP HEADER)\n"); if ((chain->flags & HAMMER2_CHAIN_MOVED) || chain == &hmp->vchain || chain == &hmp->fchain) { /* - * Drop the ref from the MODIFIED bit we cleared. - * Net is -0 or -1 ref depending. + * Drop the ref from the MODIFIED bit we cleared, + * net -1 ref. 
*/ - if (wasmodified) - hammer2_chain_drop(chain); + hammer2_chain_drop(chain); } else { /* * Drop the ref from the MODIFIED bit we cleared and - * set a ref for the MOVED bit we are setting. Net - * is +0 or +1 ref depending. + * set a ref for the MOVED bit we are setting. Net 0 refs. */ - if (wasmodified == 0) - hammer2_chain_ref(chain); atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); } + /* + * We are writing out the parent (writing out of the volume root is + * deferred but we still do some hand-waving). + * + * Update mirror_tid on the parent. + */ + if (chain->bref.mirror_tid < info->sync_tid) + chain->bref.mirror_tid = info->sync_tid; + /* * If this is part of a recursive flush we can go ahead and write * out the buffer cache buffer and pass a new bref back up the chain @@ -899,16 +917,16 @@ retry: switch(chain->bref.type) { case HAMMER2_BREF_TYPE_FREEMAP: hammer2_modify_volume(hmp); + hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid; break; case HAMMER2_BREF_TYPE_VOLUME: /* - * We should flush the free block table before we calculate - * CRCs and copy voldata -> volsync. - * - * To prevent SMP races, fchain must remain locked until - * voldata is copied to volsync. + * The free block table is flushed by hammer2_vfs_sync() + * before it flushes vchain. We must still hold fchain + * locked while copying voldata to volsync, however. */ hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS); +#if 0 if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) || hmp->voldata.freemap_tid < info->trans->sync_tid) { /* this will modify vchain as a side effect */ @@ -916,10 +934,19 @@ retry: hammer2_chain_flush(info->trans, &tmp); KKASSERT(tmp == &hmp->fchain); } +#endif + + /* + * There is no parent to our root vchain and fchain to + * synchronize the bref to, their updated mirror_tid's + * must be synchronized to the volume header. 
+ */ + hmp->voldata.mirror_tid = chain->bref.mirror_tid; + /*hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;*/ /* * The volume header is flushed manually by the syncer, not - * here. All we do is adjust the crc's. + * here. All we do here is adjust the crc's. */ KKASSERT(chain->data != NULL); KKASSERT(chain->dio == NULL); @@ -1074,6 +1101,9 @@ hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data) * * Or MODIFIED is not set and child is already fully synchronized * with its sub-tree. Don't persue. + * + * (child can never be fchain or vchain so a special check isn't + * needed). */ if (child->modify_tid > trans->sync_tid) { KKASSERT(child->delete_tid >= child->modify_tid); @@ -1471,7 +1501,9 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data) child->delete_tid > parent->bref.mirror_tid && child->modify_tid <= parent->bref.mirror_tid) { KKASSERT(child->flags & HAMMER2_CHAIN_MOVED); - KKASSERT(parent->modify_tid == trans->sync_tid); + KKASSERT(parent->modify_tid == trans->sync_tid || + (parent == &hmp->vchain || + parent == &hmp->fchain)); hammer2_rollup_stats(parent, child, -1); spin_lock(&above->cst.spin); #if FLUSH_DEBUG @@ -1503,7 +1535,9 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data) if (base && child->modify_tid > parent->bref.mirror_tid) { KKASSERT(child->flags & HAMMER2_CHAIN_MOVED); - KKASSERT(parent->modify_tid == trans->sync_tid); + KKASSERT(parent->modify_tid == trans->sync_tid || + (parent == &hmp->vchain || + parent == &hmp->fchain)); hammer2_rollup_stats(parent, child, 1); spin_lock(&above->cst.spin); #if FLUSH_DEBUG diff --git a/sys/vfs/hammer2/hammer2_freemap.c b/sys/vfs/hammer2/hammer2_freemap.c index dc648350ed..d219045396 100644 --- a/sys/vfs/hammer2/hammer2_freemap.c +++ b/sys/vfs/hammer2/hammer2_freemap.c @@ -211,8 +211,15 @@ hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_mount_t *hmp, return(hammer2_freemap_reserve(hmp, bref, radix)); } - if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX) - 
hammer2_freemap_free(trans, hmp, bref, 0); + /* + * Mark previously allocated block as possibly freeable. There might + * be snapshots and other races so we can't just mark it fully free. + * (XXX optimize this for the current-transaction create+delete case) + */ + if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX) { + hammer2_freemap_adjust(trans, hmp, bref, + HAMMER2_FREEMAP_DOMAYFREE); + } /* * Setting ISALLOCATING ensures correct operation even when the @@ -334,7 +341,7 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp, &cache_index, HAMMER2_LOOKUP_FREEMAP | HAMMER2_LOOKUP_ALWAYS | - HAMMER2_LOOKUP_MATCHIND/*XXX*/); + HAMMER2_LOOKUP_MATCHIND); if (chain == NULL) { /* * Create the missing leaf, be sure to initialize @@ -381,6 +388,7 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp, int start; int n; + KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF); start = (int)((iter->bnext - key) >> HAMMER2_FREEMAP_LEVEL0_RADIX); KKASSERT(start >= 0 && start < HAMMER2_FREEMAP_COUNT); @@ -737,8 +745,8 @@ hammer2_freemap_iterate(hammer2_trans_t *trans, hammer2_chain_t **parentp, * any lost allocations. */ void -hammer2_freemap_free(hammer2_trans_t *trans, hammer2_mount_t *hmp, - hammer2_blockref_t *bref, int how) +hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp, + hammer2_blockref_t *bref, int how) { hammer2_off_t data_off = bref->data_off; hammer2_chain_t *chain; @@ -761,6 +769,7 @@ hammer2_freemap_free(hammer2_trans_t *trans, hammer2_mount_t *hmp, int count; int modified = 0; int cache_index = -1; + int error; radix = (int)data_off & HAMMER2_OFF_MASK_RADIX; data_off &= ~HAMMER2_OFF_MASK_RADIX; @@ -770,13 +779,13 @@ hammer2_freemap_free(hammer2_trans_t *trans, hammer2_mount_t *hmp, class = (bref->type << 8) | hammer2_devblkradix(radix); /* - * We can't free data allocated by newfs_hammer2. - * Assert validity. + * We can't adjust thre freemap for data allocations made by + * newfs_hammer2. 
*/ - KKASSERT((data_off & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG); if (data_off < hmp->voldata.allocator_beg) return; + KKASSERT((data_off & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG); KKASSERT((trans->flags & HAMMER2_TRANS_ISALLOCATING) == 0); atomic_set_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING); if (trans->flags & HAMMER2_TRANS_ISFLUSH) @@ -797,74 +806,143 @@ hammer2_freemap_free(hammer2_trans_t *trans, hammer2_mount_t *hmp, &cache_index, HAMMER2_LOOKUP_FREEMAP | HAMMER2_LOOKUP_ALWAYS | - HAMMER2_LOOKUP_MATCHIND/*XXX*/); - if (chain == NULL) { - kprintf("hammer2_freemap_free: %016jx: no chain\n", + HAMMER2_LOOKUP_MATCHIND); + + /* + * Stop early if we are trying to free something but no leaf exists. + */ + if (chain == NULL && how != HAMMER2_FREEMAP_DORECOVER) { + kprintf("hammer2_freemap_adjust: %016jx: no chain\n", (intmax_t)bref->data_off); hammer2_chain_unlock(parent); return; } - KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF); /* - * Find the bmap entry (covering a 2MB swath) - * Find the bitmap array index - * Find the bitmap bit index (runs in 2-bit pairs) + * Create any missing leaf(s) if we are doing a recovery (marking + * the block(s) as being allocated instead of being freed). Be sure + * to initialize the auxillary freemap tracking info in the + * bref.check.freemap structure. 
*/ - bmap = &chain->data->bmdata[(int)(data_off >> HAMMER2_SEGRADIX) & - (HAMMER2_FREEMAP_COUNT - 1)]; - bitmap = &bmap->bitmap[(int)(data_off >> (HAMMER2_SEGRADIX - 3)) & 7]; + if (chain == NULL && how == HAMMER2_FREEMAP_DORECOVER) { + error = hammer2_chain_create(trans, &parent, &chain, + key, HAMMER2_FREEMAP_LEVEL1_RADIX, + HAMMER2_BREF_TYPE_FREEMAP_LEAF, + HAMMER2_FREEMAP_LEVELN_PSIZE); + kprintf("fixup create chain %p %016jx:%d\n", chain, chain->bref.key, chain->bref.keybits); + + if (error == 0) { + hammer2_chain_modify(trans, &chain, 0); + bzero(&chain->data->bmdata[0], + HAMMER2_FREEMAP_LEVELN_PSIZE); + chain->bref.check.freemap.bigmask = (uint32_t)-1; + chain->bref.check.freemap.avail = l1size; + /* bref.methods should already be inherited */ + hammer2_freemap_init(trans, hmp, key, chain); + } + /* XXX handle error */ + } + + /* + * Calculate the bitmask (runs in 2-bit pairs). + */ start = ((int)(data_off >> HAMMER2_FREEMAP_BLOCK_RADIX) & 15) * 2; bmmask01 = 1 << start; bmmask10 = 2 << start; bmmask11 = 3 << start; /* - * Fixup the bitmap + * Fixup the bitmap. Partial blocks cannot be fully freed unless + * a bulk scan is able to roll them up. */ if (radix < HAMMER2_FREEMAP_BLOCK_RADIX) { count = 1; - how = 0; /* partial block, cannot set to 00 */ + if (how == HAMMER2_FREEMAP_DOREALFREE) + how = HAMMER2_FREEMAP_DOMAYFREE; } else { count = 1 << (radix - HAMMER2_FREEMAP_BLOCK_RADIX); } + /* + * [re]load the bmap and bitmap pointers. Each bmap entry covers + * a 2MB swath. The bmap itself (LEVEL1) covers 2GB. + */ +again: + bmap = &chain->data->bmdata[(int)(data_off >> HAMMER2_SEGRADIX) & + (HAMMER2_FREEMAP_COUNT - 1)]; + bitmap = &bmap->bitmap[(int)(data_off >> (HAMMER2_SEGRADIX - 3)) & 7]; + + while (count) { KKASSERT(bmmask11); - KKASSERT((*bitmap & bmmask11) != bmmask00); - if ((*bitmap & bmmask11) == bmmask11) { + if (how == HAMMER2_FREEMAP_DORECOVER) { + /* + * Recovery request, mark as allocated. 
+ */ + if ((*bitmap & bmmask11) != bmmask11) { + if (modified == 0) { + hammer2_chain_modify(trans, &chain, 0); + modified = 1; + goto again; + } + if ((*bitmap & bmmask11) == bmmask00) + bmap->avail -= 1 << radix; + if (bmap->class == 0) + bmap->class = class; + *bitmap |= bmmask11; + kprintf("hammer2_freemap_recover: fixup " + "type=%02x block=%016jx/%zd\n", + bref->type, data_off, bytes); + } else { + /* + kprintf("hammer2_freemap_recover: good " + "type=%02x block=%016jx/%zd\n", + bref->type, data_off, bytes); + */ + } + } else if ((*bitmap & bmmask11) == bmmask11) { + /* + * Mayfree/Realfree request and bitmap is currently + * marked as being fully allocated. + */ if (!modified) { hammer2_chain_modify(trans, &chain, 0); modified = 1; - bmap = &chain->data->bmdata[(int)(data_off >> HAMMER2_SEGRADIX) & - (HAMMER2_FREEMAP_COUNT - 1)]; - bitmap = &bmap->bitmap[(int)(data_off >> (HAMMER2_SEGRADIX - 3)) & 7]; + goto again; } - if (how) + if (how == HAMMER2_FREEMAP_DOREALFREE) *bitmap &= ~bmmask11; else *bitmap = (*bitmap & ~bmmask11) | bmmask10; } else if ((*bitmap & bmmask11) == bmmask10) { - if (how) { + /* + * Mayfree/Realfree request and bitmap is currently + * marked as being possibly freeable. + */ + if (how == HAMMER2_FREEMAP_DOREALFREE) { if (!modified) { hammer2_chain_modify(trans, &chain, 0); modified = 1; - bmap = &chain->data->bmdata[(int)(data_off >> HAMMER2_SEGRADIX) & - (HAMMER2_FREEMAP_COUNT - 1)]; - bitmap = &bmap->bitmap[(int)(data_off >> (HAMMER2_SEGRADIX - 3)) & 7]; + goto again; } *bitmap &= ~bmmask11; } - } else if ((*bitmap & bmmask11) == bmmask01) { - KKASSERT(0); + } else { + /* + * 01 - Not implemented, currently illegal state + * 00 - Not allocated at all, illegal free. 
+ */ + panic("hammer2_freemap_adjust: " + "Illegal state %08x(%08x)", + *bitmap, *bitmap & bmmask11); } --count; bmmask01 <<= 2; bmmask10 <<= 2; bmmask11 <<= 2; } - if (how && modified) { + if (how == HAMMER2_FREEMAP_DOREALFREE && modified) { bmap->avail += 1 << radix; KKASSERT(bmap->avail <= HAMMER2_SEGSIZE); if (bmap->avail == HAMMER2_SEGSIZE && @@ -884,6 +962,11 @@ hammer2_freemap_free(hammer2_trans_t *trans, hammer2_mount_t *hmp, /* * chain->bref.check.freemap.bigmask (XXX) + * + * Setting bigmask is a hint to the allocation code that there might + * be something allocatable. We also set this in recovery... it + * doesn't hurt and we might want to use the hint for other validation + * operations later on. */ if (modified) chain->bref.check.freemap.bigmask |= 1 << radix; diff --git a/sys/vfs/hammer2/hammer2_inode.c b/sys/vfs/hammer2/hammer2_inode.c index 2c40a9e3ee..45f6acb68d 100644 --- a/sys/vfs/hammer2/hammer2_inode.c +++ b/sys/vfs/hammer2/hammer2_inode.c @@ -847,7 +847,7 @@ retry: bref = tmp->bref; bref.key = lhc; /* invisible dir entry key */ bref.keybits = 0; - hammer2_chain_duplicate(trans, &parent, &tmp, &bref, 0); + hammer2_chain_duplicate(trans, &parent, &tmp, &bref, 0, 2); hammer2_inode_unlock_ex(dip, parent); /*hammer2_chain_lookup_done(parent);*/ hammer2_chain_unlock(nchain); /* no longer needed */ @@ -956,7 +956,8 @@ hammer2_inode_connect(hammer2_trans_t *trans, int hlink, */ nchain = ochain; ochain = NULL; - hammer2_chain_duplicate(trans, NULL, &nchain, NULL, 0); + hammer2_chain_duplicate(trans, NULL, &nchain, NULL, + 0, 3); error = hammer2_chain_create(trans, &parent, &nchain, lhc, 0, HAMMER2_BREF_TYPE_INODE, @@ -996,8 +997,7 @@ hammer2_inode_connect(hammer2_trans_t *trans, int hlink, * * We will return ochain (the hardlink target). 
*/ - hammer2_chain_modify(trans, &nchain, - HAMMER2_MODIFY_ASSERTNOCOPY); + hammer2_chain_modify(trans, &nchain, 0); KKASSERT(name_len < HAMMER2_INODE_MAXNAME); ipdata = &nchain->data->ipdata; bcopy(name, ipdata->filename, name_len); @@ -1019,8 +1019,7 @@ hammer2_inode_connect(hammer2_trans_t *trans, int hlink, * Since this is a snapshot we return nchain in the fake * hardlink case. */ - hammer2_chain_modify(trans, &nchain, - HAMMER2_MODIFY_ASSERTNOCOPY); + hammer2_chain_modify(trans, &nchain, 0); KKASSERT(name_len < HAMMER2_INODE_MAXNAME); ipdata = &nchain->data->ipdata; *ipdata = ochain->data->ipdata; @@ -1037,8 +1036,7 @@ hammer2_inode_connect(hammer2_trans_t *trans, int hlink, * We must fixup the name stored in oip. The bref key * has already been set up. */ - hammer2_chain_modify(trans, &nchain, - HAMMER2_MODIFY_ASSERTNOCOPY); + hammer2_chain_modify(trans, &nchain, 0); ipdata = &nchain->data->ipdata; KKASSERT(name_len < HAMMER2_INODE_MAXNAME); diff --git a/sys/vfs/hammer2/hammer2_ioctl.c b/sys/vfs/hammer2/hammer2_ioctl.c index 78656f6b71..ae4b74a114 100644 --- a/sys/vfs/hammer2/hammer2_ioctl.c +++ b/sys/vfs/hammer2/hammer2_ioctl.c @@ -510,7 +510,7 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data) return(EINVAL); pfs->name[sizeof(pfs->name) - 1] = 0; /* ensure 0-termination */ - hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_NEWINODE); + hammer2_trans_init(&trans, ip->pmp, NULL, HAMMER2_TRANS_NEWINODE); nip = hammer2_inode_create(&trans, hmp->sroot, NULL, NULL, pfs->name, strlen(pfs->name), &nchain, &error); @@ -545,7 +545,7 @@ hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data) int error; hmp = ip->pmp->cluster.chains[0]->hmp; /* XXX */ - hammer2_trans_init(&trans, ip->pmp, 0); + hammer2_trans_init(&trans, ip->pmp, NULL, 0); error = hammer2_unlink_file(&trans, hmp->sroot, pfs->name, strlen(pfs->name), 2, NULL); @@ -569,7 +569,7 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data) hammer2_vfs_sync(ip->pmp->mp, MNT_WAIT); - 
hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_NEWINODE); + hammer2_trans_init(&trans, ip->pmp, NULL, HAMMER2_TRANS_NEWINODE); parent = hammer2_inode_lock_ex(ip); error = hammer2_chain_snapshot(&trans, &parent, pfs); hammer2_inode_unlock_ex(ip, parent); @@ -608,7 +608,7 @@ hammer2_ioctl_inode_set(hammer2_inode_t *ip, void *data) hammer2_trans_t trans; int error = 0; - hammer2_trans_init(&trans, ip->pmp, 0); + hammer2_trans_init(&trans, ip->pmp, NULL, 0); chain = hammer2_inode_lock_ex(ip); if (ino->ip_data.comp_algo != chain->data->ipdata.comp_algo) { diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c index 7ec4a7705d..095ab61cb2 100644 --- a/sys/vfs/hammer2/hammer2_vfsops.c +++ b/sys/vfs/hammer2/hammer2_vfsops.c @@ -168,8 +168,9 @@ static int hammer2_vfs_init(struct vfsconf *conf); static int hammer2_vfs_uninit(struct vfsconf *vfsp); static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, struct ucred *cred); -static int hammer2_remount(hammer2_mount_t *, char *, struct vnode *, - struct ucred *); +static int hammer2_remount(hammer2_mount_t *, struct mount *, char *, + struct vnode *, struct ucred *); +static int hammer2_recovery(hammer2_mount_t *hmp); static int hammer2_vfs_unmount(struct mount *mp, int mntflags); static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp); static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, @@ -391,7 +392,8 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, for (i = 0; i < pmp->cluster.nchains; ++i) { hmp = pmp->cluster.chains[i]->hmp; devvp = hmp->devvp; - error = hammer2_remount(hmp, path, devvp, cred); + error = hammer2_remount(hmp, mp, path, + devvp, cred); if (error) break; } @@ -471,6 +473,8 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, /* * vchain setup. vchain.data is embedded. * vchain.refs is initialized and will never drop to 0. + * + * NOTE! voldata is not yet loaded. 
*/ hmp->vchain.hmp = hmp; hmp->vchain.refs = 1; @@ -478,6 +482,7 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME; hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX; hmp->vchain.delete_tid = HAMMER2_MAX_TID; + hammer2_chain_core_alloc(NULL, &hmp->vchain, NULL); /* hmp->vchain.u.xxx is left NULL */ @@ -505,7 +510,8 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, /* hmp->fchain.u.xxx is left NULL */ /* - * Install the volume header + * Install the volume header and initialize fields from + * voldata. */ error = hammer2_install_volume_header(hmp); if (error) { @@ -514,7 +520,10 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, } hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid; + hmp->vchain.modify_tid = hmp->voldata.mirror_tid; hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid; + hmp->fchain.modify_tid = hmp->voldata.freemap_tid; + /* * First locate the super-root inode, which is key 0 @@ -543,6 +552,11 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data, hammer2_inode_unlock_ex(hmp->sroot, schain); schain = NULL; /* leave hmp->sroot with one ref */ + + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + error = hammer2_recovery(hmp); + /* XXX do something with error */ + } } /* @@ -729,7 +743,7 @@ hammer2_write_thread(void *arg) parent = NULL; parentp = &parent; - hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_BUFCACHE); + hammer2_trans_init(&trans, pmp, NULL, HAMMER2_TRANS_BUFCACHE); while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) { /* @@ -740,7 +754,7 @@ hammer2_write_thread(void *arg) bio->bio_flags |= BIO_DONE; wakeup(bio); hammer2_trans_done(&trans); - hammer2_trans_init(&trans, pmp, + hammer2_trans_init(&trans, pmp, NULL, HAMMER2_TRANS_BUFCACHE); continue; } @@ -1334,10 +1348,17 @@ hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag, static int -hammer2_remount(hammer2_mount_t *hmp, char *path, struct vnode *devvp, - struct 
ucred *cred) +hammer2_remount(hammer2_mount_t *hmp, struct mount *mp, char *path, + struct vnode *devvp, struct ucred *cred) { - return (0); + int error; + + if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { + error = hammer2_recovery(hmp); + } else { + error = 0; + } + return error; } static @@ -1400,6 +1421,12 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags) /* * Flush any left over chains. The voldata lock is only used * to synchronize against HAMMER2_CHAIN_MODIFIED_AUX. + * + * Flush twice to ensure that the freemap is completely + * synchronized. If we only do it once the next mount's + * recovery scan will have to do some fixups (which isn't + * bad, but we don't want it to have to do it except when + * recovering from a crash). */ hammer2_voldata_lock(hmp); if (((hmp->vchain.flags | hmp->fchain.flags) & @@ -1408,7 +1435,7 @@ hammer2_vfs_unmount(struct mount *mp, int mntflags) hmp->fchain.core->update_hi > hmp->voldata.freemap_tid) { hammer2_voldata_unlock(hmp, 0); hammer2_vfs_sync(mp, MNT_WAIT); - hammer2_vfs_sync(mp, MNT_WAIT); + /*hammer2_vfs_sync(mp, MNT_WAIT);*/ } else { hammer2_voldata_unlock(hmp, 0); } @@ -1613,6 +1640,157 @@ hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred) return (0); } +/* + * Mount-time recovery (RW mounts) + * + * Updates to the free block table are allowed to lag flushes by one + * transaction. In case of a crash, then on a fresh mount we must do an + * incremental scan of transaction id voldata.mirror_tid and make sure the + * related blocks have been marked allocated. 
+ * + */ +struct hammer2_recovery_elm { + TAILQ_ENTRY(hammer2_recovery_elm) entry; + hammer2_chain_t *chain; +}; + +TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm); + +static int hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp, + hammer2_chain_t *parent, + struct hammer2_recovery_list *list, int depth); + +#define HAMMER2_RECOVERY_MAXDEPTH 10 + +static +int +hammer2_recovery(hammer2_mount_t *hmp) +{ + hammer2_trans_t trans; + struct hammer2_recovery_list list; + struct hammer2_recovery_elm *elm; + hammer2_chain_t *parent; + int error; + int cumulative_error = 0; + + hammer2_trans_init(&trans, NULL, hmp, 0); + + TAILQ_INIT(&list); + parent = hammer2_chain_lookup_init(&hmp->vchain, 0); + cumulative_error = hammer2_recovery_scan(&trans, hmp, parent, &list, 0); + hammer2_chain_lookup_done(parent); + + while ((elm = TAILQ_FIRST(&list)) != NULL) { + TAILQ_REMOVE(&list, elm, entry); + parent = elm->chain; + kfree(elm, M_HAMMER2); + + hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS | + HAMMER2_RESOLVE_NOREF); + error = hammer2_recovery_scan(&trans, hmp, parent, &list, 0); + hammer2_chain_unlock(parent); + if (error) + cumulative_error = error; + } + hammer2_trans_done(&trans); + + return cumulative_error; +} + +static +int +hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp, + hammer2_chain_t *parent, + struct hammer2_recovery_list *list, int depth) +{ + hammer2_chain_t *chain; + int cache_index; + int cumulative_error = 0; + int error; + + /* + * Defer operation if depth limit reached. + */ + if (depth >= HAMMER2_RECOVERY_MAXDEPTH) { + struct hammer2_recovery_elm *elm; + + elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK); + elm->chain = parent; + hammer2_chain_ref(parent); + TAILQ_INSERT_TAIL(list, elm, entry); + /* unlocked by caller */ + + return(0); + } + + /* + * Adjust freemap to ensure that the block(s) are marked allocated. 
+ */ + if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) { + hammer2_freemap_adjust(trans, hmp, &parent->bref, + HAMMER2_FREEMAP_DORECOVER); + } + + /* + * Check type for recursive scan + */ + switch(parent->bref.type) { + case HAMMER2_BREF_TYPE_VOLUME: + /* data already instantiated */ + break; + case HAMMER2_BREF_TYPE_INODE: + /* + * Must instantiate data for DIRECTDATA test and also + * for recursion. + */ + hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); + hammer2_chain_unlock(parent); + if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) { + /* not applicable to recovery scan */ + return 0; + } + break; + case HAMMER2_BREF_TYPE_INDIRECT: + /* + * Must instantiate data for recursion + */ + hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); + hammer2_chain_unlock(parent); + break; + case HAMMER2_BREF_TYPE_DATA: + case HAMMER2_BREF_TYPE_FREEMAP: + case HAMMER2_BREF_TYPE_FREEMAP_NODE: + case HAMMER2_BREF_TYPE_FREEMAP_LEAF: + /* not applicable to recovery scan */ + return 0; + break; + default: + return EDOM; + } + + /* + * Recursive scan of the last flushed transaction only. We are + * doing this without pmp assignments so don't leave the chains + * hanging around after we are done with them. 
+ */ + cache_index = 0; + chain = hammer2_chain_scan(parent, NULL, &cache_index, + HAMMER2_LOOKUP_NODATA); + while (chain) { + atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); + if (chain->bref.mirror_tid >= hmp->voldata.mirror_tid) { + error = hammer2_recovery_scan(trans, hmp, chain, + list, depth + 1); + if (error) + cumulative_error = error; + } + chain = hammer2_chain_scan(parent, chain, &cache_index, + HAMMER2_LOOKUP_NODATA); + } + + return cumulative_error; +} + /* * Sync the entire filesystem; this is called from the filesystem syncer * process periodically and whenever a user calls sync(1) on the hammer @@ -1665,7 +1843,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor) * wait for pending I/O to finish (so it gets a transaction id * that the meta-data flush will catch). */ - hammer2_trans_init(&info.trans, pmp, 0); + hammer2_trans_init(&info.trans, pmp, NULL, 0); info.error = 0; info.waitfor = MNT_NOWAIT; vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info); @@ -1681,7 +1859,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor) /* * Start the flush transaction and flush all meta-data. */ - hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH); + hammer2_trans_init(&info.trans, pmp, NULL, HAMMER2_TRANS_ISFLUSH); total_error = 0; for (i = 0; i < pmp->cluster.nchains; ++i) { @@ -1696,19 +1874,31 @@ hammer2_vfs_sync(struct mount *mp, int waitfor) * ahead of the topology. We depend on the bulk free scan * code to deal with any loose ends. 
*/ +#if 1 + hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS); + if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) || + hmp->fchain.core->update_hi > hmp->voldata.freemap_tid) { + /* this will also modify vchain as a side effect */ + chain = &hmp->fchain; + hammer2_chain_flush(&info.trans, &chain); + KKASSERT(chain == &hmp->fchain); + } + hammer2_chain_unlock(&hmp->fchain); +#endif + hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS); if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) || hmp->vchain.core->update_hi > hmp->voldata.mirror_tid) { chain = &hmp->vchain; hammer2_chain_flush(&info.trans, &chain); KKASSERT(chain == &hmp->vchain); - hmp->voldata.mirror_tid = chain->bref.mirror_tid; force_fchain = 1; } else { force_fchain = 0; } hammer2_chain_unlock(&hmp->vchain); +#if 0 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS); if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) || hmp->fchain.core->update_hi > hmp->voldata.freemap_tid || @@ -1717,9 +1907,9 @@ hammer2_vfs_sync(struct mount *mp, int waitfor) chain = &hmp->fchain; hammer2_chain_flush(&info.trans, &chain); KKASSERT(chain == &hmp->fchain); - hmp->voldata.freemap_tid = chain->bref.mirror_tid; } hammer2_chain_unlock(&hmp->fchain); +#endif error = 0; diff --git a/sys/vfs/hammer2/hammer2_vnops.c b/sys/vfs/hammer2/hammer2_vnops.c index abba4e34eb..5747337789 100644 --- a/sys/vfs/hammer2/hammer2_vnops.c +++ b/sys/vfs/hammer2/hammer2_vnops.c @@ -337,7 +337,8 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap) ip->vp = NULL; if (chain->flags & HAMMER2_CHAIN_DELETED) { atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROYED); - hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_BUFCACHE); + hammer2_trans_init(&trans, ip->pmp, NULL, + HAMMER2_TRANS_BUFCACHE); hammer2_chain_setsubmod(&trans, chain); spin_lock(&chain->core->cst.spin); if (chain->core->update_hi < trans.sync_tid) @@ -381,11 +382,11 @@ hammer2_vop_fsync(struct vop_fsync_args *ap) * WARNING: Cannot use TRANS_ISFLUSH for partial syncs. 
*/ #if 0 - hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_ISFLUSH); + hammer2_trans_init(&trans, ip->pmp, NULL, HAMMER2_TRANS_ISFLUSH); vfsync(vp, ap->a_waitfor, 1, NULL, NULL); hammer2_trans_clear_invfsync(&trans); #endif - hammer2_trans_init(&trans, ip->pmp, 0); + hammer2_trans_init(&trans, ip->pmp, NULL, 0); vfsync(vp, ap->a_waitfor, 1, NULL, NULL); /* @@ -510,7 +511,7 @@ hammer2_vop_setattr(struct vop_setattr_args *ap) return(EROFS); hammer2_chain_memory_wait(ip->pmp); - hammer2_trans_init(&trans, ip->pmp, 0); + hammer2_trans_init(&trans, ip->pmp, NULL, 0); chain = hammer2_inode_lock_ex(ip); ipdata = &chain->data->ipdata; error = 0; @@ -925,7 +926,7 @@ hammer2_vop_write(struct vop_write_args *ap) * The transaction interlocks against flushes initiations * (note: but will run concurrently with the actual flush). */ - hammer2_trans_init(&trans, ip->pmp, 0); + hammer2_trans_init(&trans, ip->pmp, NULL, 0); error = hammer2_write_file(ip, uio, ap->a_ioflag, seqcount); hammer2_trans_done(&trans); @@ -1301,7 +1302,7 @@ hammer2_vop_nresolve(struct vop_nresolve_args *ap) kprintf("hammer2: need to unconsolidate hardlink for %s\n", chain->data->ipdata.filename); /* XXX retain shared lock on dip? 
(currently not held) */ - hammer2_trans_init(&trans, dip->pmp, 0); + hammer2_trans_init(&trans, dip->pmp, NULL, 0); hammer2_hardlink_deconsolidate(&trans, dip, &chain, &ochain); hammer2_trans_done(&trans); } @@ -1397,7 +1398,7 @@ hammer2_vop_nmkdir(struct vop_nmkdir_args *ap) name_len = ncp->nc_nlen; hammer2_chain_memory_wait(dip->pmp); - hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE); + hammer2_trans_init(&trans, dip->pmp, NULL, HAMMER2_TRANS_NEWINODE); nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred, name, name_len, &chain, &error); if (error) { @@ -1508,7 +1509,7 @@ hammer2_vop_nlink(struct vop_nlink_args *ap) */ ip = VTOI(ap->a_vp); hammer2_chain_memory_wait(ip->pmp); - hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_NEWINODE); + hammer2_trans_init(&trans, ip->pmp, NULL, HAMMER2_TRANS_NEWINODE); chain = hammer2_inode_lock_ex(ip); error = hammer2_hardlink_consolidate(&trans, ip, &chain, dip, 1); @@ -1565,7 +1566,7 @@ hammer2_vop_ncreate(struct vop_ncreate_args *ap) name = ncp->nc_name; name_len = ncp->nc_nlen; hammer2_chain_memory_wait(dip->pmp); - hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE); + hammer2_trans_init(&trans, dip->pmp, NULL, HAMMER2_TRANS_NEWINODE); nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred, name, name_len, &nchain, &error); @@ -1609,7 +1610,7 @@ hammer2_vop_nmknod(struct vop_nmknod_args *ap) name = ncp->nc_name; name_len = ncp->nc_nlen; hammer2_chain_memory_wait(dip->pmp); - hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE); + hammer2_trans_init(&trans, dip->pmp, NULL, HAMMER2_TRANS_NEWINODE); nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred, name, name_len, &nchain, &error); @@ -1653,7 +1654,7 @@ hammer2_vop_nsymlink(struct vop_nsymlink_args *ap) name = ncp->nc_name; name_len = ncp->nc_nlen; hammer2_chain_memory_wait(dip->pmp); - hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE); + hammer2_trans_init(&trans, dip->pmp, NULL, HAMMER2_TRANS_NEWINODE); 
ap->a_vap->va_type = VLNK; /* enforce type */ @@ -1741,7 +1742,7 @@ hammer2_vop_nremove(struct vop_nremove_args *ap) name = ncp->nc_name; name_len = ncp->nc_nlen; hammer2_chain_memory_wait(dip->pmp); - hammer2_trans_init(&trans, dip->pmp, 0); + hammer2_trans_init(&trans, dip->pmp, NULL, 0); error = hammer2_unlink_file(&trans, dip, name, name_len, 0, NULL); hammer2_trans_done(&trans); if (error == 0) { @@ -1773,7 +1774,7 @@ hammer2_vop_nrmdir(struct vop_nrmdir_args *ap) name_len = ncp->nc_nlen; hammer2_chain_memory_wait(dip->pmp); - hammer2_trans_init(&trans, dip->pmp, 0); + hammer2_trans_init(&trans, dip->pmp, NULL, 0); error = hammer2_unlink_file(&trans, dip, name, name_len, 1, NULL); hammer2_trans_done(&trans); if (error == 0) { @@ -1823,7 +1824,7 @@ hammer2_vop_nrename(struct vop_nrename_args *ap) tname_len = tncp->nc_nlen; hammer2_chain_memory_wait(tdip->pmp); - hammer2_trans_init(&trans, tdip->pmp, 0); + hammer2_trans_init(&trans, tdip->pmp, NULL, 0); /* * ip is the inode being renamed. If this is a hardlink then -- 2.41.0