From 40498d1c88122d4588f3b021c4ec7c0444b2164b Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 13 Mar 2018 18:00:44 -0700 Subject: [PATCH] hammer2 - Flush asynchronization, bug fixes, stabilization * Better-asynchronize the flush (sync) code. Generally split flushing of inodes and flushing of topology above the inodes such that we can flush the inodes inside a normal transaction. This leaves only the topology flush in the flush transaction (which locks out all other modifying transactions). Concurrent read/write/ flush performance is significantly improved. * Fix numerous bugs in the inode tracking code where the chain topology under an inode could wind up remaining in a modified state when the vnode is in a clean state. This could cause hefty 'sync' latency on clean trees (that had already been flushed). * Fix numerous bugs in the handling of lost parent links, which occurs due to locking races primarily when indirect blocks have to be inserted or deleted. * Deleted-chain sub-topology tracking had some issues which could lead to chains being lost, resulting in malloc complaints on umount. * Rename a few H2 sysctl variables for readability. * Adjust the "dumpchain" debugging directive to allow us to trace the ONFLUSH topology. 
--- sbin/hammer2/cmd_debug.c | 4 +- sbin/hammer2/hammer2.h | 2 +- sbin/hammer2/main.c | 14 ++- sys/vfs/hammer2/hammer2.h | 23 ++-- sys/vfs/hammer2/hammer2_chain.c | 167 ++++++++++++++--------------- sys/vfs/hammer2/hammer2_flush.c | 116 ++++++++++++++++++-- sys/vfs/hammer2/hammer2_inode.c | 67 +++++++++--- sys/vfs/hammer2/hammer2_io.c | 16 +-- sys/vfs/hammer2/hammer2_ioctl.c | 12 ++- sys/vfs/hammer2/hammer2_strategy.c | 2 +- sys/vfs/hammer2/hammer2_vfsops.c | 152 +++++++++++++++----------- sys/vfs/hammer2/hammer2_vnops.c | 72 ++++++++++--- 12 files changed, 428 insertions(+), 219 deletions(-) diff --git a/sbin/hammer2/cmd_debug.c b/sbin/hammer2/cmd_debug.c index d95ed50adb..8c7a8367fd 100644 --- a/sbin/hammer2/cmd_debug.c +++ b/sbin/hammer2/cmd_debug.c @@ -860,9 +860,9 @@ cmd_hash(int ac, const char **av) } int -cmd_chaindump(const char *path) +cmd_dumpchain(const char *path, u_int flags) { - int dummy = 0; + int dummy = (int)flags; int fd; fd = open(path, O_RDONLY); diff --git a/sbin/hammer2/hammer2.h b/sbin/hammer2/hammer2.h index 8a76b4d80e..fd2cb4581c 100644 --- a/sbin/hammer2/hammer2.h +++ b/sbin/hammer2/hammer2.h @@ -143,7 +143,7 @@ int cmd_leaf(const char *sel_path); int cmd_shell(const char *hostname); int cmd_debugspan(const char *hostname); int cmd_destroy_path(int ac, const char **av); -int cmd_chaindump(const char *path); +int cmd_dumpchain(const char *path, u_int flags); int cmd_show(const char *devpath, int dofreemap); int cmd_rsainit(const char *dir_path); int cmd_rsaenc(const char **keys, int nkeys); diff --git a/sbin/hammer2/main.c b/sbin/hammer2/main.c index e44fd6c56a..d434401bf0 100644 --- a/sbin/hammer2/main.c +++ b/sbin/hammer2/main.c @@ -149,11 +149,14 @@ main(int ac, char **av) usage(1); } ecode = cmd_remote_connect(sel_path, av[1]); - } else if (strcmp(av[0], "chaindump") == 0) { + } else if (strcmp(av[0], "dumpchain") == 0) { if (ac < 2) - ecode = cmd_chaindump("."); + ecode = cmd_dumpchain(".", (u_int)-1); + else if (ac < 3) + ecode = 
cmd_dumpchain(av[1], (u_int)-1); else - ecode = cmd_chaindump(av[1]); + ecode = cmd_dumpchain(av[1], + (u_int)strtoul(av[2], NULL, 0)); } else if (strcmp(av[0], "debugspan") == 0) { /* * Debug connection to the target hammer2 service and run @@ -474,8 +477,9 @@ usage(int code) "Add cluster link\n" " debugspan " "Debug spanning tree\n" - " chaindump " - "Dump in-memory chain topo at\n" + " dumpchain [chnflags] " + "Dump in-memory chain topo from\n" + "NOTE: ONFLUSH flag is 0x200\n" " destroy * " "Destroy a directory entry (only use if inode bad)\n" " disconnect " diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h index 3726c16c93..6da23a5d4f 100644 --- a/sys/vfs/hammer2/hammer2.h +++ b/sys/vfs/hammer2/hammer2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2017 The DragonFly Project. All rights reserved. + * Copyright (c) 2011-2018 The DragonFly Project. All rights reserved. * * This code is derived from software contributed to The DragonFly Project * by Matthew Dillon @@ -780,6 +780,7 @@ typedef struct hammer2_trans hammer2_trans_t; #define HAMMER2_FLUSH_TOP 0x0001 #define HAMMER2_FLUSH_ALL 0x0002 +#define HAMMER2_FLUSH_INODE_STOP 0x0004 /* stop at sub-inode */ /* @@ -1043,9 +1044,11 @@ typedef struct hammer2_xop_group hammer2_xop_group_t; * flags to hammer2_xop_alloc() * * MODIFYING - This is a modifying transaction, allocate a mtid. 
+ * RECURSE - Recurse top-level inode (for root flushes) */ #define HAMMER2_XOP_MODIFYING 0x00000001 #define HAMMER2_XOP_STRATEGY 0x00000002 +#define HAMMER2_XOP_INODE_STOP 0x00000004 /* * Global (per partition) management structure, represents a hard block @@ -1362,9 +1365,8 @@ extern int hammer2_dedup_enable; extern int hammer2_always_compress; extern int hammer2_inval_enable; extern int hammer2_flush_pipe; -extern int hammer2_synchronous_flush; extern int hammer2_dio_count; -extern int hammer2_limit_dio; +extern int hammer2_dio_limit; extern int hammer2_bulkfree_tps; extern long hammer2_chain_allocs; extern long hammer2_chain_frees; @@ -1385,8 +1387,8 @@ extern long hammer2_iod_indr_write; extern long hammer2_iod_fmap_write; extern long hammer2_iod_volu_write; -extern long hammer2_check_xxhash64; -extern long hammer2_check_icrc32; +extern long hammer2_process_xxhash64; +extern long hammer2_process_icrc32; extern struct objcache *cache_buffer_read; extern struct objcache *cache_buffer_write; @@ -1458,7 +1460,8 @@ hammer2_inode_t *hammer2_inode_create(hammer2_inode_t *dip, const uint8_t *name, size_t name_len, hammer2_key_t lhc, hammer2_key_t inum, uint8_t type, uint8_t target_type, int flags, int *errorp); -void hammer2_inode_chain_sync(hammer2_inode_t *ip); +int hammer2_inode_chain_sync(hammer2_inode_t *ip); +int hammer2_inode_chain_flush(hammer2_inode_t *ip); int hammer2_inode_unlink_finisher(hammer2_inode_t *ip, int isopen); int hammer2_dirent_create(hammer2_inode_t *dip, const char *name, size_t name_len, hammer2_key_t inum, uint8_t type); @@ -1498,7 +1501,7 @@ void hammer2_chain_unlock(hammer2_chain_t *chain); void hammer2_chain_unlock_hold(hammer2_chain_t *chain); void hammer2_chain_wait(hammer2_chain_t *chain); hammer2_chain_t *hammer2_chain_get(hammer2_chain_t *parent, int generation, - hammer2_blockref_t *bref); + hammer2_blockref_t *bref, int how); hammer2_chain_t *hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags); void 
hammer2_chain_lookup_done(hammer2_chain_t *parent); hammer2_chain_t *hammer2_chain_getparent(hammer2_chain_t *chain, int flags); @@ -1522,8 +1525,7 @@ int hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp, hammer2_key_t key, int keybits, int type, size_t bytes, hammer2_tid_t mtid, hammer2_off_t dedup_off, int flags); -void hammer2_chain_rename(hammer2_blockref_t *bref, - hammer2_chain_t **parentp, +void hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain, hammer2_tid_t mtid, int flags); int hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain, @@ -1686,7 +1688,8 @@ int hammer2_msg_adhoc_input(kdmsg_msg_t *msg); * hammer2_vfsops.c */ void hammer2_volconf_update(hammer2_dev_t *hmp, int index); -void hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx); +void hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx, + u_int flags); int hammer2_vfs_sync(struct mount *mp, int waitflags); int hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred); diff --git a/sys/vfs/hammer2/hammer2_chain.c b/sys/vfs/hammer2/hammer2_chain.c index f17f86db03..57dc1bbd7c 100644 --- a/sys/vfs/hammer2/hammer2_chain.c +++ b/sys/vfs/hammer2/hammer2_chain.c @@ -2088,9 +2088,9 @@ hammer2_chain_find_callback(hammer2_chain_t *child, void *data) /* * Retrieve the specified chain from a media blockref, creating the * in-memory chain structure which reflects it. The returned chain is - * held but not locked. The caller must lock it to crc-check and - * dereference its data, and should check chain->error after locking - * before assuming that the data is good. + * held and locked according to (how) (HAMMER2_RESOLVE_*). The caller must + * handle crc-checks and so forth, and should check chain->error before + * assuming that the data is good. * * To handle insertion races pass the INSERT_RACE flag along with the * generation number of the core. 
NULL will be returned if the generation @@ -2105,7 +2105,7 @@ hammer2_chain_find_callback(hammer2_chain_t *child, void *data) */ hammer2_chain_t * hammer2_chain_get(hammer2_chain_t *parent, int generation, - hammer2_blockref_t *bref) + hammer2_blockref_t *bref, int how) { hammer2_dev_t *hmp = parent->hmp; hammer2_chain_t *chain; @@ -2127,6 +2127,11 @@ hammer2_chain_get(hammer2_chain_t *parent, int generation, */ atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED); + /* + * chain must be locked to avoid unexpected ripouts + */ + hammer2_chain_lock(chain, how); + /* * Link the chain into its parent. A spinlock is required to safely * access the RBTREE, and it is possible to collide with another @@ -2144,6 +2149,7 @@ hammer2_chain_get(hammer2_chain_t *parent, int generation, if (error) { KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0); /*kprintf("chain %p get race\n", chain);*/ + hammer2_chain_unlock(chain); hammer2_chain_drop(chain); chain = NULL; } else { @@ -2553,39 +2559,43 @@ again: /* * Selected from blockref or in-memory chain. */ + bcopy = *bref; if (chain == NULL) { - bcopy = *bref; hammer2_spin_unex(&parent->core.spin); - chain = hammer2_chain_get(parent, generation, - &bcopy); - if (chain == NULL) { - /* - kprintf("retry lookup parent %p keys %016jx:%016jx\n", - parent, key_beg, key_end); - */ - goto again; + if (bcopy.type == HAMMER2_BREF_TYPE_INDIRECT || + bcopy.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { + chain = hammer2_chain_get(parent, generation, + &bcopy, how_maybe); + } else { + chain = hammer2_chain_get(parent, generation, + &bcopy, how); } - if (bcmp(&bcopy, bref, sizeof(bcopy))) { - hammer2_chain_drop(chain); - chain = NULL; /* SAFETY */ + if (chain == NULL) goto again; - } } else { hammer2_chain_ref(chain); hammer2_spin_unex(&parent->core.spin); - } - /* - * chain is referenced but not locked. We must lock the chain - * to obtain definitive state. 
- */ - if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || - chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { - hammer2_chain_lock(chain, how_maybe); - } else { - hammer2_chain_lock(chain, how); + /* + * chain is referenced but not locked. We must lock the + * chain to obtain definitive state. + */ + if (bcopy.type == HAMMER2_BREF_TYPE_INDIRECT || + bcopy.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) { + hammer2_chain_lock(chain, how_maybe); + } else { + hammer2_chain_lock(chain, how); + } + KKASSERT(chain->parent == parent); } - KKASSERT(chain->parent == parent); + if (bcmp(&bcopy, &chain->bref, sizeof(bcopy)) || + chain->parent != parent) { + hammer2_chain_unlock(chain); + hammer2_chain_drop(chain); + chain = NULL; /* SAFETY */ + goto again; + } + /* * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) @@ -2943,17 +2953,10 @@ again: * Recursion, always get the chain */ hammer2_spin_unex(&parent->core.spin); - chain = hammer2_chain_get(parent, generation, bref); - if (chain == NULL) { - kprintf("retry scan parent %p keys %016jx\n", - parent, key); + chain = hammer2_chain_get(parent, generation, + bref, how); + if (chain == NULL) goto again; - } - if (bcmp(bref, bref_ptr, sizeof(*bref))) { - hammer2_chain_drop(chain); - chain = NULL; - goto again; - } break; default: /* @@ -2970,14 +2973,16 @@ again: */ hammer2_chain_ref(chain); hammer2_spin_unex(&parent->core.spin); - } - - /* - * chain is referenced but not locked. We must lock the chain - * to obtain definitive state. - */ - if (chain) hammer2_chain_lock(chain, how); + } + if (chain && + (bcmp(bref, &chain->bref, sizeof(*bref)) || + chain->parent != parent)) { + hammer2_chain_unlock(chain); + hammer2_chain_drop(chain); + chain = NULL; + goto again; + } /* * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX) @@ -3354,10 +3359,6 @@ done: * FULL. This typically means that the caller is creating the chain after * doing a hammer2_chain_lookup(). 
* - * A non-NULL bref is typically passed when key and keybits must be overridden. - * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields - * from a passed-in bref and uses the old chain's bref for everything else. - * * Neither (parent) or (chain) can be errored. * * If (parent) is non-NULL then the chain is inserted under the parent. @@ -3372,10 +3373,10 @@ done: * specific chain. */ void -hammer2_chain_rename(hammer2_blockref_t *bref, - hammer2_chain_t **parentp, hammer2_chain_t *chain, +hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain, hammer2_tid_t mtid, int flags) { + hammer2_blockref_t *bref; hammer2_dev_t *hmp; hammer2_chain_t *parent; size_t bytes; @@ -3398,8 +3399,7 @@ hammer2_chain_rename(hammer2_blockref_t *bref, * * NOTE: Handle special radix == 0 case (means 0 bytes). */ - if (bref == NULL) - bref = &chain->bref; + bref = &chain->bref; bytes = (size_t)(bref->data_off & HAMMER2_OFF_MASK_RADIX); if (bytes) bytes = (hammer2_off_t)1 << bytes; @@ -3858,6 +3858,7 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent, * chain is referenced but not locked. We must lock the * chain to obtain definitive state. */ + bcopy = *bref; if (chain) { /* * Use chain already present in the RBTREE @@ -3870,22 +3871,14 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent, * Get chain for blockref element. _get returns NULL * on insertion race. 
*/ - bcopy = *bref; hammer2_spin_unex(&parent->core.spin); - chain = hammer2_chain_get(parent, generation, &bcopy); + chain = hammer2_chain_get(parent, generation, &bcopy, + HAMMER2_RESOLVE_NEVER); if (chain == NULL) { reason = 1; hammer2_spin_ex(&parent->core.spin); continue; } - hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER); - if (bcmp(&bcopy, bref, sizeof(bcopy))) { - reason = 2; - hammer2_chain_unlock(chain); - hammer2_chain_drop(chain); - hammer2_spin_ex(&parent->core.spin); - continue; - } } /* @@ -3898,7 +3891,8 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent, * * (note reversed logic for this one) */ - if (chain->parent != parent || + if (bcmp(&bcopy, &chain->bref, sizeof(bcopy)) || + chain->parent != parent || (chain->flags & HAMMER2_CHAIN_DELETED)) { hammer2_chain_unlock(chain); hammer2_chain_drop(chain); @@ -3925,7 +3919,7 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent, */ error = hammer2_chain_delete(parent, chain, mtid, 0); KKASSERT(error == 0); - hammer2_chain_rename(NULL, &ichain, chain, mtid, 0); + hammer2_chain_rename(&ichain, chain, mtid, 0); hammer2_chain_unlock(chain); hammer2_chain_drop(chain); KKASSERT(parent->refs > 0); @@ -4120,37 +4114,33 @@ hammer2_chain_indirect_maintenance(hammer2_chain_t *parent, break; key_next = bref->key + ((hammer2_key_t)1 << bref->keybits); + bcopy = *bref; if (sub) { hammer2_chain_ref(sub); hammer2_spin_unex(&chain->core.spin); hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER); - if (sub->parent != chain || - (sub->flags & HAMMER2_CHAIN_DELETED)) { - hammer2_chain_unlock(sub); - hammer2_chain_drop(sub); - hammer2_spin_ex(&chain->core.spin); - continue; - } } else { - bcopy = *bref; hammer2_spin_unex(&chain->core.spin); - sub = hammer2_chain_get(chain, generation, &bcopy); + sub = hammer2_chain_get(chain, generation, &bcopy, + HAMMER2_RESOLVE_NEVER); if (sub == NULL) { hammer2_spin_ex(&chain->core.spin); continue; } - hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER); - if (bcmp(&bcopy, bref, 
sizeof(bcopy)) != 0) { - hammer2_chain_unlock(sub); - hammer2_chain_drop(sub); - hammer2_spin_ex(&chain->core.spin); - continue; - } + } + if (bcmp(&bcopy, &sub->bref, sizeof(bcopy)) || + sub->parent != chain || + (sub->flags & HAMMER2_CHAIN_DELETED)) { + hammer2_chain_unlock(sub); + hammer2_chain_drop(sub); + hammer2_spin_ex(&chain->core.spin); + sub = NULL; /* safety */ + continue; } error = hammer2_chain_delete(chain, sub, sub->bref.modify_tid, 0); KKASSERT(error == 0); - hammer2_chain_rename(NULL, &parent, sub, + hammer2_chain_rename(&parent, sub, sub->bref.modify_tid, HAMMER2_INSERT_SAMEPARENT); hammer2_chain_unlock(sub); @@ -4795,11 +4785,16 @@ hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain, /* * Permanent deletions mark the chain as destroyed. + * + * NOTE: We do not setflush the chain unless the deletion is + * permanent, since the deletion of a chain does not actually + * require it to be flushed. */ if (error == 0) { - if (flags & HAMMER2_DELETE_PERMANENT) + if (flags & HAMMER2_DELETE_PERMANENT) { atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY); - hammer2_chain_setflush(chain); + hammer2_chain_setflush(chain); + } } return error; @@ -5465,7 +5460,7 @@ hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) chain->bref.check.iscsi32.value, check32); } - hammer2_check_icrc32 += chain->bytes; + hammer2_process_icrc32 += chain->bytes; break; case HAMMER2_CHECK_XXHASH64: check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED); @@ -5482,7 +5477,7 @@ hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata) chain->bref.check.xxhash64.value, check64); } - hammer2_check_xxhash64 += chain->bytes; + hammer2_process_xxhash64 += chain->bytes; break; case HAMMER2_CHECK_SHA192: { diff --git a/sys/vfs/hammer2/hammer2_flush.c b/sys/vfs/hammer2/hammer2_flush.c index 611ab21cdd..0b458e897d 100644 --- a/sys/vfs/hammer2/hammer2_flush.c +++ b/sys/vfs/hammer2/hammer2_flush.c @@ -298,6 +298,9 @@ hammer2_trans_assert_strategy(hammer2_pfs_t *pmp) 
* recursive deletions (rm -rf, etc) a chance to remove more of the * hierarchy, potentially allowing an enormous amount of write I/O to * be avoided. + * + * NOTE: The flush code tests HAMMER2_CHAIN_DESTROY to differentiate + * between these chains and the deep-recursion requeue. */ void hammer2_delayed_flush(hammer2_chain_t *chain) @@ -337,6 +340,17 @@ hammer2_delayed_flush(hammer2_chain_t *chain) * chain is locked on call and will remain locked on return. The chain's * UPDATE flag indicates that its parent's block table (which is not yet * part of the flush) should be updated. + * + * flags: + * HAMMER2_FLUSH_TOP Indicates that this is the top of the flush. + * Is cleared for the recursion. + * + * HAMMER2_FLUSH_ALL Recurse everything + * + * HAMMER2_FLUSH_INODE_RECURSE + * Recurse one inode level, flush includes + * sub-inodes but do not go deeper (thus UPDATE + * can wind up remaining set). */ int hammer2_flush(hammer2_chain_t *chain, int flags) @@ -403,24 +417,44 @@ hammer2_flush(hammer2_chain_t *chain, int flags) * Now that we've popped back up we can do a secondary * recursion on the deferred elements. * + * NOTE: hmp->flushq chains (marked DESTROY) must be + * handled unconditionally so they can be cleaned + * out. + * * NOTE: hammer2_flush() may replace scan. */ if (hammer2_debug & 0x0040) kprintf("deferred flush %p\n", scan); hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE); if (scan->error == 0) { - hammer2_flush(scan, flags & ~HAMMER2_FLUSH_TOP); - hammer2_chain_unlock(scan); - hammer2_chain_drop(scan);/* ref from defer */ + if (scan->flags & HAMMER2_CHAIN_DESTROY) { + hammer2_flush(scan, + flags | + HAMMER2_FLUSH_TOP | + HAMMER2_FLUSH_ALL); + } else { + hammer2_flush(scan, + flags & ~HAMMER2_FLUSH_TOP); + } } else { info.error |= scan->error; } + hammer2_chain_unlock(scan); + hammer2_chain_drop(scan);/* ref from defer */ } /* - * [re]flush chain. + * [re]flush chain as the deep recursion may have generated + * additional modifications. 
*/ info.diddeferral = 0; + if (info.parent != chain->parent) { + kprintf("LOST CHILD4 %p->%p (actual parent %p)\n", + info.parent, chain, chain->parent); + hammer2_chain_drop(info.parent); + info.parent = chain->parent; + hammer2_chain_ref(info.parent); + } hammer2_flush_core(&info, chain, flags); /* @@ -522,11 +556,22 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain, } hmp = chain->hmp; - parent = info->parent; /* can be NULL */ + + /* + * NOTE: parent can be NULL, usually due to destroy races. + */ + parent = info->parent; KKASSERT(chain->parent == parent); /* * Downward search recursion + * + * We must be careful on cold stops. If CHAIN_UPDATE is set and + * we stop cold (verses a deferral which will re-run the chain later), + * the update can wind up never being applied. This situation most + * typically occurs on inode boundaries due to the way + * hammer2_vfs_sync() breaks-up the flush. As a safety, we + * flush-through such situations. */ if (chain->flags & (HAMMER2_CHAIN_DEFERRED | HAMMER2_CHAIN_DELAYED)) { /* @@ -534,6 +579,7 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain, */ ++info->diddeferral; } else if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) && + (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 && (flags & HAMMER2_FLUSH_ALL) == 0 && (flags & HAMMER2_FLUSH_TOP) == 0 && chain->pmp && chain->pmp->mp) { @@ -566,6 +612,27 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain, HAMMER2_CHAIN_MODIFIED)) { hammer2_chain_setflush(parent); } + } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && + (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 && + (flags & HAMMER2_FLUSH_INODE_STOP) && + (flags & HAMMER2_FLUSH_ALL) == 0 && + (flags & HAMMER2_FLUSH_TOP) == 0 && + chain->pmp && chain->pmp->mp) { + /* + * If FLUSH_INODE_STOP is specified and both ALL and TOP + * are clear, we must not flush the chain. 
The chain should + * have already been flushed and any further ONFLUSH/UPDATE + * setting will be related to the next flush. + * + * This features allows us to flush inodes independently of + * each other and meta-data above the inodes separately. + */ + if (chain->flags & (HAMMER2_CHAIN_ONFLUSH | + HAMMER2_CHAIN_DESTROY | + HAMMER2_CHAIN_MODIFIED)) { + if (parent) + hammer2_chain_setflush(parent); + } } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) { /* * Recursion depth reached. @@ -1255,7 +1322,22 @@ done: /* * flush helper (backend threaded) * - * Flushes core chains, issues disk sync, flushes volume roots. + * Flushes chain topology for the specified inode. + * + * If HAMMER2_XOP_FLUSH is set we flush all chains from the current inode + * through but stop at sub-inodes (we flush the inode chains for sub-inodes, + * but do not go further as deeper modifications do not belong to the current + * flush cycle). + * + * If HAMMER2_XOP_FLUSH is not set we flush the current inode's chains only + * and do not recurse through sub-inodes, including not including those + * sub-inodes. + * + * Remember that HAMMER2 is currently using a flat inode model, so directory + * hierarchies do not translate to inode hierarchies. PFS ROOTs, however, + * do. + * + * chain->parent can be NULL, usually due to destroy races. * * Primarily called from vfs_sync(). 
*/ @@ -1270,6 +1352,12 @@ hammer2_inode_xop_flush(hammer2_thread_t *thr, hammer2_xop_t *arg) int fsync_error = 0; int total_error = 0; int j; + int xflags; + int ispfsroot = 0; + + xflags = HAMMER2_FLUSH_TOP; + if (xop->head.flags & HAMMER2_XOP_INODE_STOP) + xflags |= HAMMER2_FLUSH_INODE_STOP; /* * Flush core chains @@ -1280,11 +1368,13 @@ hammer2_inode_xop_flush(hammer2_thread_t *thr, hammer2_xop_t *arg) hmp = chain->hmp; if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) || TAILQ_FIRST(&hmp->flushq) != NULL) { - hammer2_flush(chain, HAMMER2_FLUSH_TOP); + hammer2_flush(chain, xflags); parent = chain->parent; - KKASSERT(chain->pmp != parent->pmp); - hammer2_chain_setflush(parent); + if (parent) + hammer2_chain_setflush(parent); } + if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) + ispfsroot = 1; hammer2_chain_unlock(chain); hammer2_chain_drop(chain); chain = NULL; @@ -1292,6 +1382,14 @@ hammer2_inode_xop_flush(hammer2_thread_t *thr, hammer2_xop_t *arg) hmp = NULL; } + /* + * Don't flush from the volume root to the PFSROOT unless ip was + * a PFSROOT. If it isn't then this flush is probably related to + * a VOP_FSYNC. + */ + if (ispfsroot == 0) + goto skip; + /* * Flush volume roots. Avoid replication, we only want to * flush each hammer2_dev (hmp) once. diff --git a/sys/vfs/hammer2/hammer2_inode.c b/sys/vfs/hammer2/hammer2_inode.c index a47162e64f..26c447f771 100644 --- a/sys/vfs/hammer2/hammer2_inode.c +++ b/sys/vfs/hammer2/hammer2_inode.c @@ -1165,12 +1165,18 @@ hammer2_inode_unlink_finisher(hammer2_inode_t *ip, int isopen) hammer2_knote(ip->vp, NOTE_DELETE); /* - * nlinks is now zero, delete the inode if not open. + * nlinks is now an implied zero, delete the inode if not open. + * We avoid unnecessary media updates by not bothering to actually + * decrement nlinks for the 1->0 transition + * + * Put the inode on the sideq to ensure that any disconnected chains + * get properly flushed (so they can be freed). 
*/ if (isopen == 0) { hammer2_xop_destroy_t *xop; killit: + hammer2_inode_delayed_sideq(ip); atomic_set_int(&ip->flags, HAMMER2_INODE_ISDELETED); xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); hammer2_xop_start(&xop->head, hammer2_inode_xop_destroy); @@ -1200,29 +1206,30 @@ killit: void hammer2_inode_modify(hammer2_inode_t *ip) { - hammer2_pfs_t *pmp; - atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED); if (ip->vp) { vsetisdirty(ip->vp); - } else if ((pmp = ip->pmp) != NULL && - (ip->flags & HAMMER2_INODE_NOSIDEQ) == 0) { + } else if (ip->pmp && (ip->flags & HAMMER2_INODE_NOSIDEQ) == 0) { hammer2_inode_delayed_sideq(ip); } } /* * Synchronize the inode's frontend state with the chain state prior - * to any explicit flush of the inode or any strategy write call. + * to any explicit flush of the inode or any strategy write call. This + * does not flush the inode's chain or its sub-topology to media (higher + * level layers are responsible for doing that). * - * Called with a locked inode inside a transaction. + * Called with a locked inode inside a normal transaction. */ -void +int hammer2_inode_chain_sync(hammer2_inode_t *ip) { + int error; + + error = 0; if (ip->flags & (HAMMER2_INODE_RESIZED | HAMMER2_INODE_MODIFIED)) { hammer2_xop_fsync_t *xop; - int error; xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING); xop->clear_directdata = 0; @@ -1256,6 +1263,28 @@ hammer2_inode_chain_sync(hammer2_inode_t *ip) /* XXX return error somehow? */ } } + return error; +} + +/* + * Flushes the inode's chain and its sub-topology to media. Usually called + * after hammer2_inode_chain_sync() is called. 
+ */ +int +hammer2_inode_chain_flush(hammer2_inode_t *ip) +{ + hammer2_xop_fsync_t *xop; + int error; + + xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING | + HAMMER2_XOP_INODE_STOP); + hammer2_xop_start(&xop->head, hammer2_inode_xop_flush); + error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_WAITALL); + hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP); + if (error == HAMMER2_ERROR_ENOENT) + error = 0; + + return error; } /* @@ -1286,8 +1315,10 @@ hammer2_inode_run_sideq(hammer2_pfs_t *pmp, int doall) return; if (doall == 0) { if (pmp->sideq_count > (pmp->inum_count >> 3)) { - kprintf("hammer2: flush sideq %ld/%ld\n", - pmp->sideq_count, pmp->inum_count); + if (hammer2_debug & 0x0001) { + kprintf("hammer2: flush sideq %ld/%ld\n", + pmp->sideq_count, pmp->inum_count); + } } } @@ -1312,6 +1343,7 @@ hammer2_inode_run_sideq(hammer2_pfs_t *pmp, int doall) * the boat and synchronize it normally. */ hammer2_inode_chain_sync(ip); + hammer2_inode_chain_flush(ip); } else if (ip->flags & HAMMER2_INODE_ISUNLINKED) { /* * The inode was unlinked while open. The inode must @@ -1330,6 +1362,7 @@ hammer2_inode_run_sideq(hammer2_pfs_t *pmp, int doall) * chains. */ hammer2_inode_chain_sync(ip); + hammer2_inode_chain_flush(ip); } hammer2_inode_unlock(ip); @@ -1343,8 +1376,10 @@ hammer2_inode_run_sideq(hammer2_pfs_t *pmp, int doall) * don't stop flushing until sideq_count drops below 1/16. */ if (doall == 0 && pmp->sideq_count <= (pmp->inum_count >> 4)) { - kprintf("hammer2: flush sideq %ld/%ld (end)\n", - pmp->sideq_count, pmp->inum_count); + if (hammer2_debug & 0x0001) { + kprintf("hammer2: flush sideq %ld/%ld (end)\n", + pmp->sideq_count, pmp->inum_count); + } break; } } @@ -1678,7 +1713,11 @@ fail: } /* - * Synchronize the in-memory inode with the chain. + * Synchronize the in-memory inode with the chain. This does not flush + * the chain to disk. Instead, it makes front-end inode changes visible + * in the chain topology, thus visible to the backend. 
This is done in an + * ad-hoc manner outside of the filesystem vfs_sync, and in a controlled + * manner inside the vfs_sync. */ void hammer2_inode_xop_chain_sync(hammer2_thread_t *thr, hammer2_xop_t *arg) diff --git a/sys/vfs/hammer2/hammer2_io.c b/sys/vfs/hammer2/hammer2_io.c index ca5e54f1ed..4e2bb6d0be 100644 --- a/sys/vfs/hammer2/hammer2_io.c +++ b/sys/vfs/hammer2/hammer2_io.c @@ -392,7 +392,7 @@ hammer2_io_putblk(hammer2_io_t **diop) struct buf *bp; off_t pbase; int psize; - int limit_dio; + int dio_limit; uint64_t orefs; uint64_t nrefs; @@ -539,17 +539,17 @@ hammer2_io_putblk(hammer2_io_t **diop) * We cache free buffers so re-use cases can use a shared lock, but * if too many build up we have to clean them out. */ - limit_dio = hammer2_limit_dio; - if (limit_dio < 256) - limit_dio = 256; - if (limit_dio > 1024*1024) - limit_dio = 1024*1024; - if (hmp->iofree_count > limit_dio) { + dio_limit = hammer2_dio_limit; + if (dio_limit < 256) + dio_limit = 256; + if (dio_limit > 1024*1024) + dio_limit = 1024*1024; + if (hmp->iofree_count > dio_limit) { struct hammer2_cleanupcb_info info; RB_INIT(&info.tmptree); hammer2_spin_ex(&hmp->io_spin); - if (hmp->iofree_count > limit_dio) { + if (hmp->iofree_count > dio_limit) { info.count = hmp->iofree_count / 5; RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL, hammer2_io_cleanup_callback, &info); diff --git a/sys/vfs/hammer2/hammer2_ioctl.c b/sys/vfs/hammer2/hammer2_ioctl.c index c521772f1a..00b4ae11a2 100644 --- a/sys/vfs/hammer2/hammer2_ioctl.c +++ b/sys/vfs/hammer2/hammer2_ioctl.c @@ -57,7 +57,7 @@ static int hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data); static int hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data); static int hammer2_ioctl_inode_get(hammer2_inode_t *ip, void *data); static int hammer2_ioctl_inode_set(hammer2_inode_t *ip, void *data); -static int hammer2_ioctl_debug_dump(hammer2_inode_t *ip); +static int hammer2_ioctl_debug_dump(hammer2_inode_t *ip, u_int flags); //static int 
hammer2_ioctl_inode_comp_set(hammer2_inode_t *ip, void *data); //static int hammer2_ioctl_inode_comp_rec_set(hammer2_inode_t *ip, void *data); //static int hammer2_ioctl_inode_comp_rec_set2(hammer2_inode_t *ip, void *data); @@ -155,7 +155,7 @@ hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data, int fflag, error = hammer2_ioctl_destroy(ip, data); break; case HAMMER2IOC_DEBUG_DUMP: - error = hammer2_ioctl_debug_dump(ip); + error = hammer2_ioctl_debug_dump(ip, *(u_int *)data); break; default: error = EOPNOTSUPP; @@ -661,6 +661,7 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data) hammer2_inode_ref(nip); hammer2_inode_unlock(nip); hammer2_inode_chain_sync(nip); + hammer2_inode_chain_flush(nip); KKASSERT(nip->refs == 1); hammer2_inode_drop(nip); @@ -906,6 +907,7 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data) hammer2_inode_ref(nip); hammer2_inode_unlock(nip); hammer2_inode_chain_sync(nip); + hammer2_inode_chain_flush(nip); KKASSERT(nip->refs == 1); hammer2_inode_drop(nip); @@ -1021,17 +1023,17 @@ hammer2_ioctl_inode_set(hammer2_inode_t *ip, void *data) static int -hammer2_ioctl_debug_dump(hammer2_inode_t *ip) +hammer2_ioctl_debug_dump(hammer2_inode_t *ip, u_int flags) { hammer2_chain_t *chain; - int count = 1000; + int count = 100000; int i; for (i = 0; i < ip->cluster.nchains; ++i) { chain = ip->cluster.array[i].chain; if (chain == NULL) continue; - hammer2_dump_chain(chain, 0, &count, 'i'); + hammer2_dump_chain(chain, 0, &count, 'i', flags); } return 0; } diff --git a/sys/vfs/hammer2/hammer2_strategy.c b/sys/vfs/hammer2/hammer2_strategy.c index 5bbabe0819..6319e04a74 100644 --- a/sys/vfs/hammer2/hammer2_strategy.c +++ b/sys/vfs/hammer2/hammer2_strategy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2015 The DragonFly Project. All rights reserved. + * Copyright (c) 2011-2018 The DragonFly Project. All rights reserved. 
* * This code is derived from software contributed to The DragonFly Project * by Matthew Dillon diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c index b8db5e4457..1dfd452aef 100644 --- a/sys/vfs/hammer2/hammer2_vfsops.c +++ b/sys/vfs/hammer2/hammer2_vfsops.c @@ -69,6 +69,7 @@ MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache"); struct hammer2_sync_info { int error; int waitfor; + int pass; }; TAILQ_HEAD(hammer2_mntlist, hammer2_dev); @@ -85,9 +86,8 @@ int hammer2_dedup_enable = 1; int hammer2_always_compress = 0; /* always try to compress */ int hammer2_inval_enable = 0; int hammer2_flush_pipe = 100; -int hammer2_synchronous_flush = 1; int hammer2_dio_count; -int hammer2_limit_dio = 256; +int hammer2_dio_limit = 256; int hammer2_bulkfree_tps = 5000; long hammer2_chain_allocs; long hammer2_chain_frees; @@ -134,8 +134,6 @@ SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW, &hammer2_inval_enable, 0, ""); SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW, &hammer2_flush_pipe, 0, ""); -SYSCTL_INT(_vfs_hammer2, OID_AUTO, synchronous_flush, CTLFLAG_RW, - &hammer2_synchronous_flush, 0, ""); SYSCTL_INT(_vfs_hammer2, OID_AUTO, bulkfree_tps, CTLFLAG_RW, &hammer2_bulkfree_tps, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_allocs, CTLFLAG_RW, @@ -148,8 +146,8 @@ SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW, &hammer2_count_modified_chains, 0, ""); SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD, &hammer2_dio_count, 0, ""); -SYSCTL_INT(_vfs_hammer2, OID_AUTO, limit_dio, CTLFLAG_RW, - &hammer2_limit_dio, 0, ""); +SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_limit, CTLFLAG_RW, + &hammer2_dio_limit, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_invals, CTLFLAG_RW, &hammer2_iod_invals, 0, ""); @@ -181,12 +179,12 @@ SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW, SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW, &hammer2_iod_volu_write, 0, ""); -long 
hammer2_check_icrc32; -long hammer2_check_xxhash64; -SYSCTL_LONG(_vfs_hammer2, OID_AUTO, check_icrc32, CTLFLAG_RW, - &hammer2_check_icrc32, 0, ""); -SYSCTL_LONG(_vfs_hammer2, OID_AUTO, check_xxhash64, CTLFLAG_RW, - &hammer2_check_xxhash64, 0, ""); +long hammer2_process_icrc32; +long hammer2_process_xxhash64; +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_icrc32, CTLFLAG_RW, + &hammer2_process_icrc32, 0, ""); +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_xxhash64, CTLFLAG_RW, + &hammer2_process_xxhash64, 0, ""); static int hammer2_vfs_init(struct vfsconf *conf); static int hammer2_vfs_uninit(struct vfsconf *vfsp); @@ -259,12 +257,12 @@ hammer2_vfs_init(struct vfsconf *conf) * * NOTE: A large buffer cache can actually interfere with dedup * operation because we dedup based on media physical buffers - * and not logical buffers. Try to make the DIO chace large + * and not logical buffers. Try to make the DIO case large * enough to avoid this problem, but also cap it. */ - hammer2_limit_dio = nbuf * 2; - if (hammer2_limit_dio > 100000) - hammer2_limit_dio = 100000; + hammer2_dio_limit = nbuf * 2; + if (hammer2_dio_limit > 100000) + hammer2_dio_limit = 100000; if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref)) error = EINVAL; @@ -1769,9 +1767,9 @@ again: * rot). 
*/ dumpcnt = 50; - hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v'); + hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1); dumpcnt = 50; - hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f'); + hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1); hammer2_dev_unlock(hmp); hammer2_chain_drop(&hmp->vchain); @@ -2246,7 +2244,8 @@ hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent, if (tmp_error == 0 && (bref.flags & HAMMER2_BREF_FLAG_PFSROOT) && (chain->flags & HAMMER2_CHAIN_ONFLUSH)) { - hammer2_flush(chain, HAMMER2_FLUSH_TOP); + hammer2_flush(chain, HAMMER2_FLUSH_TOP | + HAMMER2_FLUSH_ALL); } rup_error |= tmp_error; } @@ -2254,9 +2253,8 @@ hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent, } /* - * Sync a mount point; this is called on a per-mount basis from the - * filesystem syncer process periodically and whenever a user issues - * a sync. + * Sync a mount point; this is called periodically on a per-mount basis from + * the filesystem syncer, and whenever a user issues a sync. */ int hammer2_vfs_sync(struct mount *mp, int waitfor) @@ -2290,26 +2288,40 @@ hammer2_vfs_sync(struct mount *mp, int waitfor) flags |= VMSC_ONEPASS; /* - * Preflush the vnodes using a normal transaction before interlocking - * with a flush transaction. We do this to try to run as much of - * the compression as possible outside the flush transaction. + * Flush vnodes individually using a normal transaction to avoid + * stalling any concurrent operations. This will flush the related + * buffer cache buffers and inodes to the media. * * For efficiency do an async pass before making sure with a * synchronous pass on all related buffer cache buffers. + * + * Do a single synchronous pass to avoid double-flushing vnodes, + * which can waste copy-on-write blocks. XXX do not do two passes. 
*/ hammer2_trans_init(pmp, 0); info.error = 0; info.waitfor = MNT_NOWAIT; + info.pass = 1; vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info); info.waitfor = MNT_WAIT; vsyncscan(mp, flags, hammer2_sync_scan2, &info); + + info.pass = 2; + info.waitfor = MNT_WAIT; + vsyncscan(mp, flags, hammer2_sync_scan2, &info); + + /* + * We must also run the sideq to handle any disconnected inodes + * as the vnode scan will not see these. + */ + hammer2_inode_run_sideq(pmp, 1); hammer2_trans_done(pmp); /* - * Start our flush transaction. This does not return until all - * concurrent transactions have completed and will prevent any - * new transactions from running concurrently, except for the - * buffer cache transactions. + * Start our flush transaction and flush the root topology down to + * the inodes, but not the inodes themselves (which we already flushed + * above). Any concurrent activity affecting inode contents will not + * be part of this flush cycle. * * (1) vfsync() all dirty vnodes via vfsyncscan(). * @@ -2326,14 +2338,6 @@ hammer2_vfs_sync(struct mount *mp, int waitfor) */ hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH); - info.error = 0; - info.waitfor = MNT_NOWAIT; - vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info); - info.waitfor = MNT_WAIT; - vsyncscan(mp, flags, hammer2_sync_scan2, &info); - hammer2_inode_run_sideq(pmp, 1); - hammer2_bioq_sync(pmp); - /* * Use the XOP interface to concurrently flush all nodes to * synchronize the PFSROOT subtopology to the media. A standard * @@ -2345,7 +2349,18 @@ * XXX For now wait for all flushes to complete. */ if (iroot) { - xop = hammer2_xop_alloc(iroot, HAMMER2_XOP_MODIFYING); + /* + * If unmounting try to flush everything including any + * sub-trees under inodes, just in case there is dangling + * modified data, as a safety. Otherwise just flush up to + * the inodes in this stage.
+ */ + if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + xop = hammer2_xop_alloc(iroot, HAMMER2_XOP_MODIFYING); + } else { + xop = hammer2_xop_alloc(iroot, HAMMER2_XOP_MODIFYING | + HAMMER2_XOP_INODE_STOP); + } hammer2_xop_start(&xop->head, hammer2_inode_xop_flush); error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_WAITALL); @@ -2368,6 +2383,11 @@ * Note that we ignore the tranasction mtid we got above. Instead, * each vfsync below will ultimately get its own via TRANS_BUFCACHE * transactions. + * + * WARNING! The frontend might be waiting on chnmem (limit_dirty_chains) + * while holding a vnode locked. When this situation occurs we cannot + * safely test whether it is ok to clear the dirty bit on the vnode. + * However, we can still flush the inode's topology. */ static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data) { @@ -2391,34 +2411,39 @@ } /* - * VOP_FSYNC will start a new transaction so replicate some code - * here to do it inline (see hammer2_vop_fsync()). + * Synchronize the buffer cache and inode meta-data to the backing + * chain topology. * - * WARNING: The vfsync interacts with the buffer cache and might - * block, we can't hold the inode lock at that time. - * However, we MUST ref ip before blocking to ensure that - * it isn't ripped out from under us (since we do not - * hold a lock on the vnode). + * vfsync is not necessarily synchronous, so it is best NOT to try + * to flush the backing topology to media at this point.
*/ hammer2_inode_ref(ip); - if ((ip->flags & HAMMER2_INODE_MODIFIED) || + if ((ip->flags & (HAMMER2_INODE_RESIZED|HAMMER2_INODE_MODIFIED)) || !RB_EMPTY(&vp->v_rbdirty_tree)) { - vfsync(vp, info->waitfor, 1, NULL, NULL); - if (ip->flags & (HAMMER2_INODE_RESIZED | - HAMMER2_INODE_MODIFIED)) { + if (info->pass == 1) + vfsync(vp, info->waitfor, 1, NULL, NULL); + else + bio_track_wait(&vp->v_track_write, 0, 0); + } + if (info->pass == 2 && (vp->v_flag & VISDIRTY)) { + if (vx_get_nonblock(vp) == 0) { hammer2_inode_lock(ip, 0); - if (ip->flags & (HAMMER2_INODE_RESIZED | - HAMMER2_INODE_MODIFIED)) { - hammer2_inode_chain_sync(ip); + if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 && + RB_EMPTY(&vp->v_rbdirty_tree) && + !bio_track_active(&vp->v_track_write)) { + vclrisdirty(vp); } + hammer2_inode_chain_sync(ip); + hammer2_inode_chain_flush(ip); + hammer2_inode_unlock(ip); + vx_put(vp); + } else { + /* can't safely clear isdirty */ + hammer2_inode_lock(ip, 0); + hammer2_inode_chain_flush(ip); hammer2_inode_unlock(ip); } } - if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 && - RB_EMPTY(&vp->v_rbdirty_tree)) { - vclrisdirty(vp); - } - hammer2_inode_drop(ip); #if 1 error = 0; @@ -2701,7 +2726,7 @@ hammer2_pfs_memory_wait(hammer2_pfs_t *pmp) /* * Try to start an early flush before we are forced to block. 
*/ - if (count > limit * 7 / 10) + if (count > limit * 5 / 10) speedup_syncer(pmp->mp); break; } @@ -2794,7 +2819,8 @@ hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred) * Debugging */ void -hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx) +hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx, + u_int flags) { hammer2_chain_t *scan; hammer2_chain_t *parent; @@ -2828,8 +2854,12 @@ hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx) kprintf("\n"); } else { kprintf(" {\n"); - RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) - hammer2_dump_chain(scan, tab + 4, countp, 'a'); + RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) { + if ((scan->flags & flags) || flags == (u_int)-1) { + hammer2_dump_chain(scan, tab + 4, countp, 'a', + flags); + } + } if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data) kprintf("%*.*s}(%s)\n", tab, tab, "", chain->data->ipdata.filename); diff --git a/sys/vfs/hammer2/hammer2_vnops.c b/sys/vfs/hammer2/hammer2_vnops.c index fb1788424c..d57dc953a7 100644 --- a/sys/vfs/hammer2/hammer2_vnops.c +++ b/sys/vfs/hammer2/hammer2_vnops.c @@ -163,7 +163,7 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap) /* * A modified inode may require chain synchronization. This - * synchronization is usually handled by VOP_SNYC / VOP_FSYNC + * synchronization is usually handled by VOP_SYNC / VOP_FSYNC * when vfsync() is called. However, that requires a vnode. * * When the vnode is disassociated we must keep track of any modified @@ -207,39 +207,54 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap) return (0); } +/* + * Currently this function synchronizes the front-end inode state to the + * backend chain topology, then flushes the inode's chain and sub-topology + * to backend media. This function does not flush the root topology down to + * the inode. 
+ */ static int hammer2_vop_fsync(struct vop_fsync_args *ap) { hammer2_inode_t *ip; struct vnode *vp; + int error1; + int error2; vp = ap->a_vp; ip = VTOI(vp); + error1 = 0; -#if 0 - /* XXX can't do this yet */ - hammer2_trans_init(ip->pmp, HAMMER2_TRANS_ISFLUSH); - vfsync(vp, ap->a_waitfor, 1, NULL, NULL); -#endif hammer2_trans_init(ip->pmp, 0); + + /* + * Clean out buffer cache, wait for I/O's to complete. + */ vfsync(vp, ap->a_waitfor, 1, NULL, NULL); + bio_track_wait(&vp->v_track_write, 0, 0); /* - * Calling chain_flush here creates a lot of duplicative - * COW operations due to non-optimal vnode ordering. - * - * Only do it for an actual fsync() syscall. The other forms - * which call this function will eventually call chain_flush - * on the volume root as a catch-all, which is far more optimal. + * Flush any inode changes */ hammer2_inode_lock(ip, 0); - if (ip->flags & HAMMER2_INODE_MODIFIED) - hammer2_inode_chain_sync(ip); + if (ip->flags & (HAMMER2_INODE_RESIZED|HAMMER2_INODE_MODIFIED)) + error1 = hammer2_inode_chain_sync(ip); + + /* + * Flush dirty chains related to the inode. + * + * NOTE! XXX We do not currently flush to the volume root, ultimately + * we will want to have a shortcut for the flushed inode stored + * in the volume root for recovery purposes. + */ + error2 = hammer2_inode_chain_flush(ip); + if (error2) + error1 = error2; hammer2_inode_unlock(ip); hammer2_trans_done(ip->pmp); - return (0); + return (error1); } static @@ -461,13 +476,18 @@ hammer2_vop_setattr(struct vop_setattr_args *ap) done: /* - * If a truncation occurred we must call inode_fsync() now in order + * If a truncation occurred we must call chain_sync() now in order * to trim the related data chains, otherwise a later expansion can * cause havoc. * * If an extend occured that changed the DIRECTDATA state, we must * call inode_fsync now in order to prepare the inode's indirect * block table. + * + * WARNING! 
This means we are making an adjustment to the inode's + * chain outside of sync/fsync, and not just to inode->meta, which + * may result in some consistency issues if a crash were to occur + * at just the wrong time. */ if (ip->flags & HAMMER2_INODE_RESIZED) hammer2_inode_chain_sync(ip); @@ -1104,8 +1124,19 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio, hammer2_update_time(&ip->meta.mtime); vclrflags(vp, VLASTWRITETS); } - if (ip->flags & HAMMER2_INODE_MODIFIED) + +#if 0 + /* + * REMOVED - handled by hammer2_extend_file(). Do not issue + * a chain_sync() outside of a sync/fsync except for DIRECTDATA + * state changes. + * + * Under normal conditions we only issue a chain_sync if + * the inode's DIRECTDATA state changed. + */ + if (ip->flags & HAMMER2_INODE_RESIZED) hammer2_inode_chain_sync(ip); +#endif hammer2_mtx_unlock(&ip->lock); hammer2_knote(ip->vp, kflags); } @@ -1184,6 +1215,13 @@ hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize) ip->osize = osize; ip->meta.size = nsize; + /* + * We must issue a chain_sync() when the DIRECTDATA state changes + * to prevent confusion between the flush code and the in-memory + * state. This is not perfect because we are doing it outside of + * a sync/fsync operation, so it might not be fully synchronized + * with the meta-data topology flush. + */ if (osize <= HAMMER2_EMBEDDED_BYTES && nsize > HAMMER2_EMBEDDED_BYTES) { atomic_set_int(&ip->flags, HAMMER2_INODE_RESIZED); hammer2_inode_chain_sync(ip); -- 2.41.0