hammer2 - Flush asynchronization, bug fixes, stabilization
authorMatthew Dillon <dillon@apollo.backplane.com>
Wed, 14 Mar 2018 01:00:44 +0000 (18:00 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Sat, 17 Mar 2018 08:31:35 +0000 (01:31 -0700)
* Better-asynchronize the flush (sync) code.  Generally split flushing
  of inodes and flushing of topology above the inodes such that we can
  flush the inodes inside a normal transaction.

  This leaves only the topology flush in the flush transaction (which
  locks out all other modifying transactions).  Concurrent read/write/
  flush performance is significantly improved.

* Fix numerous bugs in the inode tracking code where the chain topology
  under an inode could wind up remaining in a modified state when the
  vnode is in a clean state.  This could cause hefty 'sync' latency on
  clean trees (that had already been flushed).

* Fix numerous bugs in the handling of lost parent links, which occurs
  due to locking races primarily when indirect blocks have to be
  inserted or deleted.

* Deleted-chain sub-topology tracking had some issues which could lead
  to chains being lost, resulting in malloc complaints on umount.

* Rename a few H2 sysctl variables for readability.

* Adjust the "dumpchain" debugging directive to allow us to trace
  the ONFLUSH topology.

12 files changed:
sbin/hammer2/cmd_debug.c
sbin/hammer2/hammer2.h
sbin/hammer2/main.c
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_io.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_strategy.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c

index d95ed50..8c7a836 100644 (file)
@@ -860,9 +860,9 @@ cmd_hash(int ac, const char **av)
 }
 
 int
-cmd_chaindump(const char *path)
+cmd_dumpchain(const char *path, u_int flags)
 {
-       int dummy = 0;
+       int dummy = (int)flags;
        int fd;
 
        fd = open(path, O_RDONLY);
index 8a76b4d..fd2cb45 100644 (file)
@@ -143,7 +143,7 @@ int cmd_leaf(const char *sel_path);
 int cmd_shell(const char *hostname);
 int cmd_debugspan(const char *hostname);
 int cmd_destroy_path(int ac, const char **av);
-int cmd_chaindump(const char *path);
+int cmd_dumpchain(const char *path, u_int flags);
 int cmd_show(const char *devpath, int dofreemap);
 int cmd_rsainit(const char *dir_path);
 int cmd_rsaenc(const char **keys, int nkeys);
index e44fd6c..d434401 100644 (file)
@@ -149,11 +149,14 @@ main(int ac, char **av)
                        usage(1);
                }
                ecode = cmd_remote_connect(sel_path, av[1]);
-       } else if (strcmp(av[0], "chaindump") == 0) {
+       } else if (strcmp(av[0], "dumpchain") == 0) {
                if (ac < 2)
-                       ecode = cmd_chaindump(".");
+                       ecode = cmd_dumpchain(".", (u_int)-1);
+               else if (ac < 3)
+                       ecode = cmd_dumpchain(av[1], (u_int)-1);
                else
-                       ecode = cmd_chaindump(av[1]);
+                       ecode = cmd_dumpchain(av[1],
+                                             (u_int)strtoul(av[2], NULL, 0));
        } else if (strcmp(av[0], "debugspan") == 0) {
                /*
                 * Debug connection to the target hammer2 service and run
@@ -474,8 +477,9 @@ usage(int code)
                        "Add cluster link\n"
                "    debugspan <target>           "
                        "Debug spanning tree\n"
-               "    chaindump <path>             "
-                       "Dump in-memory chain topo at\n"
+               "    dumpchain <path> [chnflags]  "
+                       "Dump in-memory chain topo from\n"
+                       "NOTE: ONFLUSH flag is 0x200\n"
                "    destroy <path>*              "
                        "Destroy a directory entry (only use if inode bad)\n"
                "    disconnect <target>          "
index 3726c16..6da23a5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2017 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -780,6 +780,7 @@ typedef struct hammer2_trans hammer2_trans_t;
 
 #define HAMMER2_FLUSH_TOP              0x0001
 #define HAMMER2_FLUSH_ALL              0x0002
+#define HAMMER2_FLUSH_INODE_STOP       0x0004  /* stop at sub-inode */
 
 
 /*
@@ -1043,9 +1044,11 @@ typedef struct hammer2_xop_group hammer2_xop_group_t;
  * flags to hammer2_xop_alloc()
  *
  * MODIFYING   - This is a modifying transaction, allocate a mtid.
+ * INODE_STOP  - Stop the flush recursion at sub-inode boundaries
  */
 #define HAMMER2_XOP_MODIFYING          0x00000001
 #define HAMMER2_XOP_STRATEGY           0x00000002
+#define HAMMER2_XOP_INODE_STOP         0x00000004
 
 /*
  * Global (per partition) management structure, represents a hard block
@@ -1362,9 +1365,8 @@ extern int hammer2_dedup_enable;
 extern int hammer2_always_compress;
 extern int hammer2_inval_enable;
 extern int hammer2_flush_pipe;
-extern int hammer2_synchronous_flush;
 extern int hammer2_dio_count;
-extern int hammer2_limit_dio;
+extern int hammer2_dio_limit;
 extern int hammer2_bulkfree_tps;
 extern long hammer2_chain_allocs;
 extern long hammer2_chain_frees;
@@ -1385,8 +1387,8 @@ extern long hammer2_iod_indr_write;
 extern long hammer2_iod_fmap_write;
 extern long hammer2_iod_volu_write;
 
-extern long hammer2_check_xxhash64;
-extern long hammer2_check_icrc32;
+extern long hammer2_process_xxhash64;
+extern long hammer2_process_icrc32;
 
 extern struct objcache *cache_buffer_read;
 extern struct objcache *cache_buffer_write;
@@ -1458,7 +1460,8 @@ hammer2_inode_t *hammer2_inode_create(hammer2_inode_t *dip,
                        const uint8_t *name, size_t name_len, hammer2_key_t lhc,
                        hammer2_key_t inum, uint8_t type, uint8_t target_type,
                        int flags, int *errorp);
-void hammer2_inode_chain_sync(hammer2_inode_t *ip);
+int hammer2_inode_chain_sync(hammer2_inode_t *ip);
+int hammer2_inode_chain_flush(hammer2_inode_t *ip);
 int hammer2_inode_unlink_finisher(hammer2_inode_t *ip, int isopen);
 int hammer2_dirent_create(hammer2_inode_t *dip, const char *name,
                        size_t name_len, hammer2_key_t inum, uint8_t type);
@@ -1498,7 +1501,7 @@ void hammer2_chain_unlock(hammer2_chain_t *chain);
 void hammer2_chain_unlock_hold(hammer2_chain_t *chain);
 void hammer2_chain_wait(hammer2_chain_t *chain);
 hammer2_chain_t *hammer2_chain_get(hammer2_chain_t *parent, int generation,
-                               hammer2_blockref_t *bref);
+                               hammer2_blockref_t *bref, int how);
 hammer2_chain_t *hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags);
 void hammer2_chain_lookup_done(hammer2_chain_t *parent);
 hammer2_chain_t *hammer2_chain_getparent(hammer2_chain_t *chain, int flags);
@@ -1522,8 +1525,7 @@ int hammer2_chain_create(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
                                hammer2_key_t key, int keybits,
                                int type, size_t bytes, hammer2_tid_t mtid,
                                hammer2_off_t dedup_off, int flags);
-void hammer2_chain_rename(hammer2_blockref_t *bref,
-                               hammer2_chain_t **parentp,
+void hammer2_chain_rename(hammer2_chain_t **parentp,
                                hammer2_chain_t *chain,
                                hammer2_tid_t mtid, int flags);
 int hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
@@ -1686,7 +1688,8 @@ int hammer2_msg_adhoc_input(kdmsg_msg_t *msg);
  * hammer2_vfsops.c
  */
 void hammer2_volconf_update(hammer2_dev_t *hmp, int index);
-void hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx);
+void hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx,
+                               u_int flags);
 int hammer2_vfs_sync(struct mount *mp, int waitflags);
 int hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred);
 
index f17f86d..57dc1bb 100644 (file)
@@ -2088,9 +2088,9 @@ hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
 /*
  * Retrieve the specified chain from a media blockref, creating the
  * in-memory chain structure which reflects it.  The returned chain is
- * held but not locked.  The caller must lock it to crc-check and
- * dereference its data, and should check chain->error after locking
- * before assuming that the data is good.
+ * held and locked according to (how) (HAMMER2_RESOLVE_*).  The caller must
+ * handle crc-checks and so forth, and should check chain->error before
+ * assuming that the data is good.
  *
  * To handle insertion races pass the INSERT_RACE flag along with the
  * generation number of the core.  NULL will be returned if the generation
@@ -2105,7 +2105,7 @@ hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
  */
 hammer2_chain_t *
 hammer2_chain_get(hammer2_chain_t *parent, int generation,
-                 hammer2_blockref_t *bref)
+                 hammer2_blockref_t *bref, int how)
 {
        hammer2_dev_t *hmp = parent->hmp;
        hammer2_chain_t *chain;
@@ -2127,6 +2127,11 @@ hammer2_chain_get(hammer2_chain_t *parent, int generation,
         */
        atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
 
+       /*
+        * chain must be locked to avoid unexpected ripouts
+        */
+       hammer2_chain_lock(chain, how);
+
        /*
         * Link the chain into its parent.  A spinlock is required to safely
         * access the RBTREE, and it is possible to collide with another
@@ -2144,6 +2149,7 @@ hammer2_chain_get(hammer2_chain_t *parent, int generation,
        if (error) {
                KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
                /*kprintf("chain %p get race\n", chain);*/
+               hammer2_chain_unlock(chain);
                hammer2_chain_drop(chain);
                chain = NULL;
        } else {
@@ -2553,39 +2559,43 @@ again:
        /*
         * Selected from blockref or in-memory chain.
         */
+       bcopy = *bref;
        if (chain == NULL) {
-               bcopy = *bref;
                hammer2_spin_unex(&parent->core.spin);
-               chain = hammer2_chain_get(parent, generation,
-                                         &bcopy);
-               if (chain == NULL) {
-                       /*
-                       kprintf("retry lookup parent %p keys %016jx:%016jx\n",
-                               parent, key_beg, key_end);
-                       */
-                       goto again;
+               if (bcopy.type == HAMMER2_BREF_TYPE_INDIRECT ||
+                   bcopy.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
+                       chain = hammer2_chain_get(parent, generation,
+                                                 &bcopy, how_maybe);
+               } else {
+                       chain = hammer2_chain_get(parent, generation,
+                                                 &bcopy, how);
                }
-               if (bcmp(&bcopy, bref, sizeof(bcopy))) {
-                       hammer2_chain_drop(chain);
-                       chain = NULL;   /* SAFETY */
+               if (chain == NULL)
                        goto again;
-               }
        } else {
                hammer2_chain_ref(chain);
                hammer2_spin_unex(&parent->core.spin);
-       }
 
-       /*
-        * chain is referenced but not locked.  We must lock the chain
-        * to obtain definitive state.
-        */
-       if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
-           chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
-               hammer2_chain_lock(chain, how_maybe);
-       } else {
-               hammer2_chain_lock(chain, how);
+               /*
+                * chain is referenced but not locked.  We must lock the
+                * chain to obtain definitive state.
+                */
+               if (bcopy.type == HAMMER2_BREF_TYPE_INDIRECT ||
+                   bcopy.type == HAMMER2_BREF_TYPE_FREEMAP_NODE) {
+                       hammer2_chain_lock(chain, how_maybe);
+               } else {
+                       hammer2_chain_lock(chain, how);
+               }
+               KKASSERT(chain->parent == parent);
        }
-       KKASSERT(chain->parent == parent);
+       if (bcmp(&bcopy, &chain->bref, sizeof(bcopy)) ||
+           chain->parent != parent) {
+               hammer2_chain_unlock(chain);
+               hammer2_chain_drop(chain);
+               chain = NULL;   /* SAFETY */
+               goto again;
+       }
+
 
        /*
         * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
@@ -2943,17 +2953,10 @@ again:
                         * Recursion, always get the chain
                         */
                        hammer2_spin_unex(&parent->core.spin);
-                       chain = hammer2_chain_get(parent, generation, bref);
-                       if (chain == NULL) {
-                               kprintf("retry scan parent %p keys %016jx\n",
-                                       parent, key);
+                       chain = hammer2_chain_get(parent, generation,
+                                                 bref, how);
+                       if (chain == NULL)
                                goto again;
-                       }
-                       if (bcmp(bref, bref_ptr, sizeof(*bref))) {
-                               hammer2_chain_drop(chain);
-                               chain = NULL;
-                               goto again;
-                       }
                        break;
                default:
                        /*
@@ -2970,14 +2973,16 @@ again:
                 */
                hammer2_chain_ref(chain);
                hammer2_spin_unex(&parent->core.spin);
-       }
-
-       /*
-        * chain is referenced but not locked.  We must lock the chain
-        * to obtain definitive state.
-        */
-       if (chain)
                hammer2_chain_lock(chain, how);
+       }
+       if (chain &&
+           (bcmp(bref, &chain->bref, sizeof(*bref)) ||
+            chain->parent != parent)) {
+               hammer2_chain_unlock(chain);
+               hammer2_chain_drop(chain);
+               chain = NULL;
+               goto again;
+       }
 
        /*
         * Skip deleted chains (XXX cache 'i' end-of-block-array? XXX)
@@ -3354,10 +3359,6 @@ done:
  * FULL.  This typically means that the caller is creating the chain after
  * doing a hammer2_chain_lookup().
  *
- * A non-NULL bref is typically passed when key and keybits must be overridden.
- * Note that hammer2_cluster_duplicate() *ONLY* uses the key and keybits fields
- * from a passed-in bref and uses the old chain's bref for everything else.
- *
  * Neither (parent) or (chain) can be errored.
  *
  * If (parent) is non-NULL then the chain is inserted under the parent.
@@ -3372,10 +3373,10 @@ done:
  *         specific chain.
  */
 void
-hammer2_chain_rename(hammer2_blockref_t *bref,
-                    hammer2_chain_t **parentp, hammer2_chain_t *chain,
+hammer2_chain_rename(hammer2_chain_t **parentp, hammer2_chain_t *chain,
                     hammer2_tid_t mtid, int flags)
 {
+       hammer2_blockref_t *bref;
        hammer2_dev_t *hmp;
        hammer2_chain_t *parent;
        size_t bytes;
@@ -3398,8 +3399,7 @@ hammer2_chain_rename(hammer2_blockref_t *bref,
         *
         * NOTE: Handle special radix == 0 case (means 0 bytes).
         */
-       if (bref == NULL)
-               bref = &chain->bref;
+       bref = &chain->bref;
        bytes = (size_t)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
        if (bytes)
                bytes = (hammer2_off_t)1 << bytes;
@@ -3858,6 +3858,7 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
                 * chain is referenced but not locked.  We must lock the
                 * chain to obtain definitive state.
                 */
+               bcopy = *bref;
                if (chain) {
                        /*
                         * Use chain already present in the RBTREE
@@ -3870,22 +3871,14 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
                         * Get chain for blockref element.  _get returns NULL
                         * on insertion race.
                         */
-                       bcopy = *bref;
                        hammer2_spin_unex(&parent->core.spin);
-                       chain = hammer2_chain_get(parent, generation, &bcopy);
+                       chain = hammer2_chain_get(parent, generation, &bcopy,
+                                                 HAMMER2_RESOLVE_NEVER);
                        if (chain == NULL) {
                                reason = 1;
                                hammer2_spin_ex(&parent->core.spin);
                                continue;
                        }
-                       hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
-                       if (bcmp(&bcopy, bref, sizeof(bcopy))) {
-                               reason = 2;
-                               hammer2_chain_unlock(chain);
-                               hammer2_chain_drop(chain);
-                               hammer2_spin_ex(&parent->core.spin);
-                               continue;
-                       }
                }
 
                /*
@@ -3898,7 +3891,8 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
                 *
                 *       (note reversed logic for this one)
                 */
-               if (chain->parent != parent ||
+               if (bcmp(&bcopy, &chain->bref, sizeof(bcopy)) ||
+                   chain->parent != parent ||
                    (chain->flags & HAMMER2_CHAIN_DELETED)) {
                        hammer2_chain_unlock(chain);
                        hammer2_chain_drop(chain);
@@ -3925,7 +3919,7 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent,
                 */
                error = hammer2_chain_delete(parent, chain, mtid, 0);
                KKASSERT(error == 0);
-               hammer2_chain_rename(NULL, &ichain, chain, mtid, 0);
+               hammer2_chain_rename(&ichain, chain, mtid, 0);
                hammer2_chain_unlock(chain);
                hammer2_chain_drop(chain);
                KKASSERT(parent->refs > 0);
@@ -4120,37 +4114,33 @@ hammer2_chain_indirect_maintenance(hammer2_chain_t *parent,
                        break;
                key_next = bref->key + ((hammer2_key_t)1 << bref->keybits);
 
+               bcopy = *bref;
                if (sub) {
                        hammer2_chain_ref(sub);
                        hammer2_spin_unex(&chain->core.spin);
                        hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
-                       if (sub->parent != chain ||
-                           (sub->flags & HAMMER2_CHAIN_DELETED)) {
-                               hammer2_chain_unlock(sub);
-                               hammer2_chain_drop(sub);
-                               hammer2_spin_ex(&chain->core.spin);
-                               continue;
-                       }
                } else {
-                       bcopy = *bref;
                        hammer2_spin_unex(&chain->core.spin);
-                       sub = hammer2_chain_get(chain, generation, &bcopy);
+                       sub = hammer2_chain_get(chain, generation, &bcopy,
+                                               HAMMER2_RESOLVE_NEVER);
                        if (sub == NULL) {
                                hammer2_spin_ex(&chain->core.spin);
                                continue;
                        }
-                       hammer2_chain_lock(sub, HAMMER2_RESOLVE_NEVER);
-                       if (bcmp(&bcopy, bref, sizeof(bcopy)) != 0) {
-                               hammer2_chain_unlock(sub);
-                               hammer2_chain_drop(sub);
-                               hammer2_spin_ex(&chain->core.spin);
-                               continue;
-                       }
+               }
+               if (bcmp(&bcopy, &sub->bref, sizeof(bcopy)) ||
+                   sub->parent != chain ||
+                   (sub->flags & HAMMER2_CHAIN_DELETED)) {
+                       hammer2_chain_unlock(sub);
+                       hammer2_chain_drop(sub);
+                       hammer2_spin_ex(&chain->core.spin);
+                       sub = NULL;     /* safety */
+                       continue;
                }
                error = hammer2_chain_delete(chain, sub,
                                             sub->bref.modify_tid, 0);
                KKASSERT(error == 0);
-               hammer2_chain_rename(NULL, &parent, sub,
+               hammer2_chain_rename(&parent, sub,
                                     sub->bref.modify_tid,
                                     HAMMER2_INSERT_SAMEPARENT);
                hammer2_chain_unlock(sub);
@@ -4795,11 +4785,16 @@ hammer2_chain_delete(hammer2_chain_t *parent, hammer2_chain_t *chain,
 
        /*
         * Permanent deletions mark the chain as destroyed.
+        *
+        * NOTE: We do not setflush the chain unless the deletion is
+        *       permanent, since the deletion of a chain does not actually
+        *       require it to be flushed.
         */
        if (error == 0) {
-               if (flags & HAMMER2_DELETE_PERMANENT)
+               if (flags & HAMMER2_DELETE_PERMANENT) {
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
-               hammer2_chain_setflush(chain);
+                       hammer2_chain_setflush(chain);
+               }
        }
 
        return error;
@@ -5465,7 +5460,7 @@ hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata)
                                chain->bref.check.iscsi32.value,
                                check32);
                }
-               hammer2_check_icrc32 += chain->bytes;
+               hammer2_process_icrc32 += chain->bytes;
                break;
        case HAMMER2_CHECK_XXHASH64:
                check64 = XXH64(bdata, chain->bytes, XXH_HAMMER2_SEED);
@@ -5482,7 +5477,7 @@ hammer2_chain_testcheck(hammer2_chain_t *chain, void *bdata)
                                chain->bref.check.xxhash64.value,
                                check64);
                }
-               hammer2_check_xxhash64 += chain->bytes;
+               hammer2_process_xxhash64 += chain->bytes;
                break;
        case HAMMER2_CHECK_SHA192:
                {
index 611ab21..0b458e8 100644 (file)
@@ -298,6 +298,9 @@ hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
  * recursive deletions (rm -rf, etc) a chance to remove more of the
  * hierarchy, potentially allowing an enormous amount of write I/O to
  * be avoided.
+ *
+ * NOTE: The flush code tests HAMMER2_CHAIN_DESTROY to differentiate
+ *      between these chains and the deep-recursion requeue.
  */
 void
 hammer2_delayed_flush(hammer2_chain_t *chain)
@@ -337,6 +340,17 @@ hammer2_delayed_flush(hammer2_chain_t *chain)
  * chain is locked on call and will remain locked on return.  The chain's
  * UPDATE flag indicates that its parent's block table (which is not yet
  * part of the flush) should be updated.
+ *
+ * flags:
+ *     HAMMER2_FLUSH_TOP       Indicates that this is the top of the flush.
+ *                             Is cleared for the recursion.
+ *
+ *     HAMMER2_FLUSH_ALL       Recurse everything
+ *
+ *     HAMMER2_FLUSH_INODE_STOP
+ *                             Recurse one inode level, flush includes
+ *                             sub-inodes but do not go deeper (thus UPDATE
+ *                             can wind up remaining set).
  */
 int
 hammer2_flush(hammer2_chain_t *chain, int flags)
@@ -403,24 +417,44 @@ hammer2_flush(hammer2_chain_t *chain, int flags)
                         * Now that we've popped back up we can do a secondary
                         * recursion on the deferred elements.
                         *
+                        * NOTE: hmp->flushq chains (marked DESTROY) must be
+                        *       handled unconditionally so they can be cleaned
+                        *       out.
+                        *
                         * NOTE: hammer2_flush() may replace scan.
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        if (scan->error == 0) {
-                               hammer2_flush(scan, flags & ~HAMMER2_FLUSH_TOP);
-                               hammer2_chain_unlock(scan);
-                               hammer2_chain_drop(scan);/* ref from defer */
+                               if (scan->flags & HAMMER2_CHAIN_DESTROY) {
+                                       hammer2_flush(scan,
+                                                   flags |
+                                                   HAMMER2_FLUSH_TOP |
+                                                   HAMMER2_FLUSH_ALL);
+                               } else {
+                                       hammer2_flush(scan,
+                                                   flags & ~HAMMER2_FLUSH_TOP);
+                               }
                        } else {
                                info.error |= scan->error;
                        }
+                       hammer2_chain_unlock(scan);
+                       hammer2_chain_drop(scan);/* ref from defer */
                }
 
                /*
-                * [re]flush chain.
+                * [re]flush chain as the deep recursion may have generated
+                * additional modifications.
                 */
                info.diddeferral = 0;
+               if (info.parent != chain->parent) {
+                       kprintf("LOST CHILD4 %p->%p (actual parent %p)\n",
+                               info.parent, chain, chain->parent);
+                       hammer2_chain_drop(info.parent);
+                       info.parent = chain->parent;
+                       hammer2_chain_ref(info.parent);
+               }
                hammer2_flush_core(&info, chain, flags);
 
                /*
@@ -522,11 +556,22 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
        }
 
        hmp = chain->hmp;
-       parent = info->parent;          /* can be NULL */
+
+       /*
+        * NOTE: parent can be NULL, usually due to destroy races.
+        */
+       parent = info->parent;
        KKASSERT(chain->parent == parent);
 
        /*
         * Downward search recursion
+        *
+        * We must be careful on cold stops.  If CHAIN_UPDATE is set and
+        * we stop cold (verses a deferral which will re-run the chain later),
+        * the update can wind up never being applied.  This situation most
+        * typically occurs on inode boundaries due to the way
+        * hammer2_vfs_sync() breaks-up the flush.  As a safety, we
+        * flush-through such situations.
         */
        if (chain->flags & (HAMMER2_CHAIN_DEFERRED | HAMMER2_CHAIN_DELAYED)) {
                /*
@@ -534,6 +579,7 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                 */
                ++info->diddeferral;
        } else if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) &&
+                  (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 &&
                   (flags & HAMMER2_FLUSH_ALL) == 0 &&
                   (flags & HAMMER2_FLUSH_TOP) == 0 &&
                   chain->pmp && chain->pmp->mp) {
@@ -566,6 +612,27 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                                    HAMMER2_CHAIN_MODIFIED)) {
                        hammer2_chain_setflush(parent);
                }
+       } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
+                  (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 &&
+                  (flags & HAMMER2_FLUSH_INODE_STOP) &&
+                  (flags & HAMMER2_FLUSH_ALL) == 0 &&
+                  (flags & HAMMER2_FLUSH_TOP) == 0 &&
+                  chain->pmp && chain->pmp->mp) {
+               /*
+                * If FLUSH_INODE_STOP is specified and both ALL and TOP
+                * are clear, we must not flush the chain.  The chain should
+                * have already been flushed and any further ONFLUSH/UPDATE
+                * setting will be related to the next flush.
+                *
+                * This feature allows us to flush inodes independently of
+                * each other and meta-data above the inodes separately.
+                */
+               if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
+                                   HAMMER2_CHAIN_DESTROY |
+                                   HAMMER2_CHAIN_MODIFIED)) {
+                       if (parent)
+                               hammer2_chain_setflush(parent);
+               }
        } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                /*
                 * Recursion depth reached.
@@ -1255,7 +1322,22 @@ done:
 /*
  * flush helper (backend threaded)
  *
- * Flushes core chains, issues disk sync, flushes volume roots.
+ * Flushes chain topology for the specified inode.
+ *
+ * If HAMMER2_XOP_INODE_STOP is set we flush all chains from the current
+ * inode through but stop at sub-inodes (we flush the inode chains for
+ * sub-inodes, but do not go further as deeper modifications do not belong
+ * to the current flush cycle).
+ *
+ * If HAMMER2_XOP_INODE_STOP is not set the flush does not stop at sub-inode
+ * boundaries (no HAMMER2_FLUSH_INODE_STOP is passed to hammer2_flush(), so
+ * the recursion proceeds through the sub-topology).
+ *
+ * Remember that HAMMER2 is currently using a flat inode model, so directory
+ * hierarchies do not translate to inode hierarchies.  PFS ROOTs, however,
+ * do.
+ *
+ * chain->parent can be NULL, usually due to destroy races.
  *
  * Primarily called from vfs_sync().
  */
@@ -1270,6 +1352,12 @@ hammer2_inode_xop_flush(hammer2_thread_t *thr, hammer2_xop_t *arg)
        int fsync_error = 0;
        int total_error = 0;
        int j;
+       int xflags;
+       int ispfsroot = 0;
+
+       xflags = HAMMER2_FLUSH_TOP;
+       if (xop->head.flags & HAMMER2_XOP_INODE_STOP)
+               xflags |= HAMMER2_FLUSH_INODE_STOP;
 
        /*
         * Flush core chains
@@ -1280,11 +1368,13 @@ hammer2_inode_xop_flush(hammer2_thread_t *thr, hammer2_xop_t *arg)
                hmp = chain->hmp;
                if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) ||
                    TAILQ_FIRST(&hmp->flushq) != NULL) {
-                       hammer2_flush(chain, HAMMER2_FLUSH_TOP);
+                       hammer2_flush(chain, xflags);
                        parent = chain->parent;
-                       KKASSERT(chain->pmp != parent->pmp);
-                       hammer2_chain_setflush(parent);
+                       if (parent)
+                               hammer2_chain_setflush(parent);
                }
+               if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY)
+                       ispfsroot = 1;
                hammer2_chain_unlock(chain);
                hammer2_chain_drop(chain);
                chain = NULL;
@@ -1292,6 +1382,14 @@ hammer2_inode_xop_flush(hammer2_thread_t *thr, hammer2_xop_t *arg)
                hmp = NULL;
        }
 
+       /*
+        * Don't flush from the volume root to the PFSROOT unless ip was
+        * a PFSROOT.  If it isn't then this flush is probably related to
+        * a VOP_FSYNC.
+        */
+       if (ispfsroot == 0)
+               goto skip;
+
        /*
         * Flush volume roots.  Avoid replication, we only want to
         * flush each hammer2_dev (hmp) once.
index a47162e..26c447f 100644 (file)
@@ -1165,12 +1165,18 @@ hammer2_inode_unlink_finisher(hammer2_inode_t *ip, int isopen)
                hammer2_knote(ip->vp, NOTE_DELETE);
 
        /*
-        * nlinks is now zero, delete the inode if not open.
+        * nlinks is now an implied zero, delete the inode if not open.
+        * We avoid unnecessary media updates by not bothering to actually
+        * decrement nlinks for the 1->0 transition.
+        *
+        * Put the inode on the sideq to ensure that any disconnected chains
+        * get properly flushed (so they can be freed).
         */
        if (isopen == 0) {
                hammer2_xop_destroy_t *xop;
 
 killit:
+               hammer2_inode_delayed_sideq(ip);
                atomic_set_int(&ip->flags, HAMMER2_INODE_ISDELETED);
                xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
                hammer2_xop_start(&xop->head, hammer2_inode_xop_destroy);
@@ -1200,29 +1206,30 @@ killit:
 void
 hammer2_inode_modify(hammer2_inode_t *ip)
 {
-       hammer2_pfs_t *pmp;
-
        atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        if (ip->vp) {
                vsetisdirty(ip->vp);
-       } else if ((pmp = ip->pmp) != NULL &&
-                  (ip->flags & HAMMER2_INODE_NOSIDEQ) == 0) {
+       } else if (ip->pmp && (ip->flags & HAMMER2_INODE_NOSIDEQ) == 0) {
                hammer2_inode_delayed_sideq(ip);
        }
 }
 
 /*
  * Synchronize the inode's frontend state with the chain state prior
- * to any explicit flush of the inode or any strategy write call.
+ * to any explicit flush of the inode or any strategy write call.  This
+ * does not flush the inode's chain or its sub-topology to media (higher
+ * level layers are responsible for doing that).
  *
- * Called with a locked inode inside a transaction.
+ * Called with a locked inode inside a normal transaction.
  */
-void
+int
 hammer2_inode_chain_sync(hammer2_inode_t *ip)
 {
+       int error;
+
+       error = 0;
        if (ip->flags & (HAMMER2_INODE_RESIZED | HAMMER2_INODE_MODIFIED)) {
                hammer2_xop_fsync_t *xop;
-               int error;
 
                xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
                xop->clear_directdata = 0;
@@ -1256,6 +1263,28 @@ hammer2_inode_chain_sync(hammer2_inode_t *ip)
                        /* XXX return error somehow? */
                }
        }
+       return error;
+}
+
+/*
+ * Flushes the inode's chain and its sub-topology to media.  Usually called
+ * after hammer2_inode_chain_sync() is called.
+ */
+int
+hammer2_inode_chain_flush(hammer2_inode_t *ip)
+{
+       hammer2_xop_fsync_t *xop;
+       int error;
+
+       xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING |
+                                   HAMMER2_XOP_INODE_STOP);
+       hammer2_xop_start(&xop->head, hammer2_inode_xop_flush);
+       error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_WAITALL);
+       hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
+       if (error == HAMMER2_ERROR_ENOENT)
+               error = 0;
+
+       return error;
 }
 
 /*
@@ -1286,8 +1315,10 @@ hammer2_inode_run_sideq(hammer2_pfs_t *pmp, int doall)
                return;
        if (doall == 0) {
                if (pmp->sideq_count > (pmp->inum_count >> 3)) {
-                       kprintf("hammer2: flush sideq %ld/%ld\n",
-                               pmp->sideq_count, pmp->inum_count);
+                       if (hammer2_debug & 0x0001) {
+                               kprintf("hammer2: flush sideq %ld/%ld\n",
+                                       pmp->sideq_count, pmp->inum_count);
+                       }
                }
        }
 
@@ -1312,6 +1343,7 @@ hammer2_inode_run_sideq(hammer2_pfs_t *pmp, int doall)
                         * the boat and synchronize it normally.
                         */
                        hammer2_inode_chain_sync(ip);
+                       hammer2_inode_chain_flush(ip);
                } else if (ip->flags & HAMMER2_INODE_ISUNLINKED) {
                        /*
                         * The inode was unlinked while open.  The inode must
@@ -1330,6 +1362,7 @@ hammer2_inode_run_sideq(hammer2_pfs_t *pmp, int doall)
                         * chains.
                         */
                        hammer2_inode_chain_sync(ip);
+                       hammer2_inode_chain_flush(ip);
                }
 
                hammer2_inode_unlock(ip);
@@ -1343,8 +1376,10 @@ hammer2_inode_run_sideq(hammer2_pfs_t *pmp, int doall)
                 * don't stop flushing until sideq_count drops below 1/16.
                 */
                if (doall == 0 && pmp->sideq_count <= (pmp->inum_count >> 4)) {
-                       kprintf("hammer2: flush sideq %ld/%ld (end)\n",
-                               pmp->sideq_count, pmp->inum_count);
+                       if (hammer2_debug & 0x0001) {
+                               kprintf("hammer2: flush sideq %ld/%ld (end)\n",
+                                       pmp->sideq_count, pmp->inum_count);
+                       }
                        break;
                }
        }
@@ -1678,7 +1713,11 @@ fail:
 }
 
 /*
- * Synchronize the in-memory inode with the chain.
+ * Synchronize the in-memory inode with the chain.  This does not flush
+ * the chain to disk.  Instead, it makes front-end inode changes visible
+ * in the chain topology, thus visible to the backend.  This is done in an
+ * ad-hoc manner outside of the filesystem vfs_sync, and in a controlled
+ * manner inside the vfs_sync.
  */
 void
 hammer2_inode_xop_chain_sync(hammer2_thread_t *thr, hammer2_xop_t *arg)
index ca5e54f..4e2bb6d 100644 (file)
@@ -392,7 +392,7 @@ hammer2_io_putblk(hammer2_io_t **diop)
        struct buf *bp;
        off_t pbase;
        int psize;
-       int limit_dio;
+       int dio_limit;
        uint64_t orefs;
        uint64_t nrefs;
 
@@ -539,17 +539,17 @@ hammer2_io_putblk(hammer2_io_t **diop)
         * We cache free buffers so re-use cases can use a shared lock, but
         * if too many build up we have to clean them out.
         */
-       limit_dio = hammer2_limit_dio;
-       if (limit_dio < 256)
-               limit_dio = 256;
-       if (limit_dio > 1024*1024)
-               limit_dio = 1024*1024;
-       if (hmp->iofree_count > limit_dio) {
+       dio_limit = hammer2_dio_limit;
+       if (dio_limit < 256)
+               dio_limit = 256;
+       if (dio_limit > 1024*1024)
+               dio_limit = 1024*1024;
+       if (hmp->iofree_count > dio_limit) {
                struct hammer2_cleanupcb_info info;
 
                RB_INIT(&info.tmptree);
                hammer2_spin_ex(&hmp->io_spin);
-               if (hmp->iofree_count > limit_dio) {
+               if (hmp->iofree_count > dio_limit) {
                        info.count = hmp->iofree_count / 5;
                        RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
                                hammer2_io_cleanup_callback, &info);
index c521772..00b4ae1 100644 (file)
@@ -57,7 +57,7 @@ static int hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data);
 static int hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data);
 static int hammer2_ioctl_inode_get(hammer2_inode_t *ip, void *data);
 static int hammer2_ioctl_inode_set(hammer2_inode_t *ip, void *data);
-static int hammer2_ioctl_debug_dump(hammer2_inode_t *ip);
+static int hammer2_ioctl_debug_dump(hammer2_inode_t *ip, u_int flags);
 //static int hammer2_ioctl_inode_comp_set(hammer2_inode_t *ip, void *data);
 //static int hammer2_ioctl_inode_comp_rec_set(hammer2_inode_t *ip, void *data);
 //static int hammer2_ioctl_inode_comp_rec_set2(hammer2_inode_t *ip, void *data);
@@ -155,7 +155,7 @@ hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data, int fflag,
                        error = hammer2_ioctl_destroy(ip, data);
                break;
        case HAMMER2IOC_DEBUG_DUMP:
-               error = hammer2_ioctl_debug_dump(ip);
+               error = hammer2_ioctl_debug_dump(ip, *(u_int *)data);
                break;
        default:
                error = EOPNOTSUPP;
@@ -661,6 +661,7 @@ hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
                hammer2_inode_ref(nip);
                hammer2_inode_unlock(nip);
                hammer2_inode_chain_sync(nip);
+               hammer2_inode_chain_flush(nip);
                KKASSERT(nip->refs == 1);
                hammer2_inode_drop(nip);
 
@@ -906,6 +907,7 @@ hammer2_ioctl_pfs_snapshot(hammer2_inode_t *ip, void *data)
                hammer2_inode_ref(nip);
                hammer2_inode_unlock(nip);
                hammer2_inode_chain_sync(nip);
+               hammer2_inode_chain_flush(nip);
                KKASSERT(nip->refs == 1);
                hammer2_inode_drop(nip);
 
@@ -1021,17 +1023,17 @@ hammer2_ioctl_inode_set(hammer2_inode_t *ip, void *data)
 
 static
 int
-hammer2_ioctl_debug_dump(hammer2_inode_t *ip)
+hammer2_ioctl_debug_dump(hammer2_inode_t *ip, u_int flags)
 {
        hammer2_chain_t *chain;
-       int count = 1000;
+       int count = 100000;
        int i;
 
        for (i = 0; i < ip->cluster.nchains; ++i) {
                chain = ip->cluster.array[i].chain;
                if (chain == NULL)
                        continue;
-               hammer2_dump_chain(chain, 0, &count, 'i');
+               hammer2_dump_chain(chain, 0, &count, 'i', flags);
        }
        return 0;
 }
index 5bbabe0..6319e04 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
index b8db5e4..1dfd452 100644 (file)
@@ -69,6 +69,7 @@ MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");
 struct hammer2_sync_info {
        int error;
        int waitfor;
+       int pass;
 };
 
 TAILQ_HEAD(hammer2_mntlist, hammer2_dev);
@@ -85,9 +86,8 @@ int hammer2_dedup_enable = 1;
 int hammer2_always_compress = 0;       /* always try to compress */
 int hammer2_inval_enable = 0;
 int hammer2_flush_pipe = 100;
-int hammer2_synchronous_flush = 1;
 int hammer2_dio_count;
-int hammer2_limit_dio = 256;
+int hammer2_dio_limit = 256;
 int hammer2_bulkfree_tps = 5000;
 long hammer2_chain_allocs;
 long hammer2_chain_frees;
@@ -134,8 +134,6 @@ SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW,
           &hammer2_inval_enable, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
           &hammer2_flush_pipe, 0, "");
-SYSCTL_INT(_vfs_hammer2, OID_AUTO, synchronous_flush, CTLFLAG_RW,
-          &hammer2_synchronous_flush, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, bulkfree_tps, CTLFLAG_RW,
           &hammer2_bulkfree_tps, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_allocs, CTLFLAG_RW,
@@ -148,8 +146,8 @@ SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW,
           &hammer2_count_modified_chains, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
           &hammer2_dio_count, 0, "");
-SYSCTL_INT(_vfs_hammer2, OID_AUTO, limit_dio, CTLFLAG_RW,
-          &hammer2_limit_dio, 0, "");
+SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_limit, CTLFLAG_RW,
+          &hammer2_dio_limit, 0, "");
 
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_invals, CTLFLAG_RW,
           &hammer2_iod_invals, 0, "");
@@ -181,12 +179,12 @@ SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
           &hammer2_iod_volu_write, 0, "");
 
-long hammer2_check_icrc32;
-long hammer2_check_xxhash64;
-SYSCTL_LONG(_vfs_hammer2, OID_AUTO, check_icrc32, CTLFLAG_RW,
-          &hammer2_check_icrc32, 0, "");
-SYSCTL_LONG(_vfs_hammer2, OID_AUTO, check_xxhash64, CTLFLAG_RW,
-          &hammer2_check_xxhash64, 0, "");
+long hammer2_process_icrc32;
+long hammer2_process_xxhash64;
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_icrc32, CTLFLAG_RW,
+          &hammer2_process_icrc32, 0, "");
+SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_xxhash64, CTLFLAG_RW,
+          &hammer2_process_xxhash64, 0, "");
 
 static int hammer2_vfs_init(struct vfsconf *conf);
 static int hammer2_vfs_uninit(struct vfsconf *vfsp);
@@ -259,12 +257,12 @@ hammer2_vfs_init(struct vfsconf *conf)
         *
         * NOTE: A large buffer cache can actually interfere with dedup
         *       operation because we dedup based on media physical buffers
-        *       and not logical buffers.  Try to make the DIO chace large
+        *       and not logical buffers.  Try to make the DIO cache large
         *       enough to avoid this problem, but also cap it.
         */
-       hammer2_limit_dio = nbuf * 2;
-       if (hammer2_limit_dio > 100000)
-               hammer2_limit_dio = 100000;
+       hammer2_dio_limit = nbuf * 2;
+       if (hammer2_dio_limit > 100000)
+               hammer2_dio_limit = 100000;
 
        if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
                error = EINVAL;
@@ -1769,9 +1767,9 @@ again:
         * rot).
         */
        dumpcnt = 50;
-       hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v');
+       hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1);
        dumpcnt = 50;
-       hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f');
+       hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1);
        hammer2_dev_unlock(hmp);
        hammer2_chain_drop(&hmp->vchain);
 
@@ -2246,7 +2244,8 @@ hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent,
                if (tmp_error == 0 &&
                    (bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
                    (chain->flags & HAMMER2_CHAIN_ONFLUSH)) {
-                       hammer2_flush(chain, HAMMER2_FLUSH_TOP);
+                       hammer2_flush(chain, HAMMER2_FLUSH_TOP |
+                                            HAMMER2_FLUSH_ALL);
                }
                rup_error |= tmp_error;
        }
@@ -2254,9 +2253,8 @@ hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent,
 }
 
 /*
- * Sync a mount point; this is called on a per-mount basis from the
- * filesystem syncer process periodically and whenever a user issues
- * a sync.
+ * Sync a mount point; this is called periodically on a per-mount basis from
+ * the filesystem syncer, and whenever a user issues a sync.
  */
 int
 hammer2_vfs_sync(struct mount *mp, int waitfor)
@@ -2290,26 +2288,40 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                flags |= VMSC_ONEPASS;
 
        /*
-        * Preflush the vnodes using a normal transaction before interlocking
-        * with a flush transaction.  We do this to try to run as much of
-        * the compression as possible outside the flush transaction.
+        * Flush vnodes individually using a normal transaction to avoid
+        * stalling any concurrent operations.  This will flush the related
+        * buffer cache buffers and inodes to the media.
         *
         * For efficiency do an async pass before making sure with a
         * synchronous pass on all related buffer cache buffers.
+        *
+        * Do a single synchronous pass to avoid double-flushing vnodes,
+        * which can waste copy-on-write blocks.  XXX do not do two passes.
         */
        hammer2_trans_init(pmp, 0);
        info.error = 0;
        info.waitfor = MNT_NOWAIT;
+       info.pass = 1;
        vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
        info.waitfor = MNT_WAIT;
        vsyncscan(mp, flags, hammer2_sync_scan2, &info);
+
+       info.pass = 2;
+       info.waitfor = MNT_WAIT;
+       vsyncscan(mp, flags, hammer2_sync_scan2, &info);
+
+       /*
+        * We must also run the sideq to handle any disconnected inodes
+        * as the vnode scan will not see these.
+        */
+       hammer2_inode_run_sideq(pmp, 1);
        hammer2_trans_done(pmp);
 
        /*
-        * Start our flush transaction.  This does not return until all
-        * concurrent transactions have completed and will prevent any
-        * new transactions from running concurrently, except for the
-        * buffer cache transactions.
+        * Start our flush transaction and flush the root topology down to
+        * the inodes, but not the inodes themselves (which we already flushed
+        * above).  Any concurrent activity affecting inode contents will not
+        * be part of this flush cycle.
         *
         * (1) vfsync() all dirty vnodes via vfsyncscan().
         *
@@ -2326,14 +2338,6 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
         */
        hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH);
 
-       info.error = 0;
-       info.waitfor = MNT_NOWAIT;
-       vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
-       info.waitfor = MNT_WAIT;
-       vsyncscan(mp, flags, hammer2_sync_scan2, &info);
-       hammer2_inode_run_sideq(pmp, 1);
-       hammer2_bioq_sync(pmp);
-
        /*
         * Use the XOP interface to concurrently flush all nodes to
         * synchronize the PFSROOT subtopology to the media.  A standard
@@ -2345,7 +2349,18 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
         * XXX For now wait for all flushes to complete.
         */
        if (iroot) {
-               xop = hammer2_xop_alloc(iroot, HAMMER2_XOP_MODIFYING);
+               /*
+                * If unmounting try to flush everything including any
+                * sub-trees under inodes, just in case there is dangling
+                * modified data, as a safety.  Otherwise just flush up to
+                * the inodes in this stage.
+                */
+               if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
+                       xop = hammer2_xop_alloc(iroot, HAMMER2_XOP_MODIFYING);
+               } else {
+                       xop = hammer2_xop_alloc(iroot, HAMMER2_XOP_MODIFYING |
+                                                      HAMMER2_XOP_INODE_STOP);
+               }
                hammer2_xop_start(&xop->head, hammer2_inode_xop_flush);
                error = hammer2_xop_collect(&xop->head,
                                            HAMMER2_XOP_COLLECT_WAITALL);
@@ -2368,6 +2383,11 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
  * Note that we ignore the tranasction mtid we got above.  Instead,
  * each vfsync below will ultimately get its own via TRANS_BUFCACHE
  * transactions.
+ *
+ * WARNING! The frontend might be waiting on chnmem (limit_dirty_chains)
+ * while holding a vnode locked.  When this situation occurs we cannot
+ * safely test whether it is ok to clear the dirty bit on the vnode.
+ * However, we can still flush the inode's topology.
  */
 static int
 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
@@ -2391,34 +2411,39 @@ hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
        }
 
        /*
-        * VOP_FSYNC will start a new transaction so replicate some code
-        * here to do it inline (see hammer2_vop_fsync()).
+        * Synchronize the buffer cache and inode meta-data to the backing
+        * chain topology.
         *
-        * WARNING: The vfsync interacts with the buffer cache and might
-        *          block, we can't hold the inode lock at that time.
-        *          However, we MUST ref ip before blocking to ensure that
-        *          it isn't ripped out from under us (since we do not
-        *          hold a lock on the vnode).
+        * vfsync is not necessarily synchronous, so it is best NOT to try
+        * to flush the backing topology to media at this point.
         */
        hammer2_inode_ref(ip);
-       if ((ip->flags & HAMMER2_INODE_MODIFIED) ||
+       if ((ip->flags & (HAMMER2_INODE_RESIZED|HAMMER2_INODE_MODIFIED)) ||
            !RB_EMPTY(&vp->v_rbdirty_tree)) {
-               vfsync(vp, info->waitfor, 1, NULL, NULL);
-               if (ip->flags & (HAMMER2_INODE_RESIZED |
-                                HAMMER2_INODE_MODIFIED)) {
+               if (info->pass == 1)
+                       vfsync(vp, info->waitfor, 1, NULL, NULL);
+               else
+                       bio_track_wait(&vp->v_track_write, 0, 0);
+       }
+       if (info->pass == 2 && (vp->v_flag & VISDIRTY)) {
+               if (vx_get_nonblock(vp) == 0) {
                        hammer2_inode_lock(ip, 0);
-                       if (ip->flags & (HAMMER2_INODE_RESIZED |
-                                        HAMMER2_INODE_MODIFIED)) {
-                               hammer2_inode_chain_sync(ip);
+                       if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
+                           RB_EMPTY(&vp->v_rbdirty_tree) &&
+                           !bio_track_active(&vp->v_track_write)) {
+                               vclrisdirty(vp);
                        }
+                       hammer2_inode_chain_sync(ip);
+                       hammer2_inode_chain_flush(ip);
+                       hammer2_inode_unlock(ip);
+                       vx_put(vp);
+               } else {
+                       /* can't safely clear isdirty */
+                       hammer2_inode_lock(ip, 0);
+                       hammer2_inode_chain_flush(ip);
                        hammer2_inode_unlock(ip);
                }
        }
-       if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
-           RB_EMPTY(&vp->v_rbdirty_tree)) {
-               vclrisdirty(vp);
-       }
-
        hammer2_inode_drop(ip);
 #if 1
        error = 0;
@@ -2701,7 +2726,7 @@ hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
                /*
                 * Try to start an early flush before we are forced to block.
                 */
-               if (count > limit * 7 / 10)
+               if (count > limit * 5 / 10)
                        speedup_syncer(pmp->mp);
                break;
        }
@@ -2794,7 +2819,8 @@ hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred)
  * Debugging
  */
 void
-hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
+hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx,
+                  u_int flags)
 {
        hammer2_chain_t *scan;
        hammer2_chain_t *parent;
@@ -2828,8 +2854,12 @@ hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
                kprintf("\n");
        } else {
                kprintf(" {\n");
-               RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree)
-                       hammer2_dump_chain(scan, tab + 4, countp, 'a');
+               RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) {
+                       if ((scan->flags & flags) || flags == (u_int)-1) {
+                               hammer2_dump_chain(scan, tab + 4, countp, 'a',
+                                                  flags);
+                       }
+               }
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
                        kprintf("%*.*s}(%s)\n", tab, tab, "",
                                chain->data->ipdata.filename);
index fb17884..d57dc95 100644 (file)
@@ -163,7 +163,7 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
 
        /*
         * A modified inode may require chain synchronization.  This
-        * synchronization is usually handled by VOP_SNYC / VOP_FSYNC
+        * synchronization is usually handled by VOP_SYNC / VOP_FSYNC
         * when vfsync() is called.  However, that requires a vnode.
         *
         * When the vnode is disassociated we must keep track of any modified
@@ -207,39 +207,54 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
        return (0);
 }
 
+/*
+ * Currently this function synchronizes the front-end inode state to the
+ * backend chain topology, then flushes the inode's chain and sub-topology
+ * to backend media.  This function does not flush the root topology down to
+ * the inode.
+ */
 static
 int
 hammer2_vop_fsync(struct vop_fsync_args *ap)
 {
        hammer2_inode_t *ip;
        struct vnode *vp;
+       int error1;
+       int error2;
 
        vp = ap->a_vp;
        ip = VTOI(vp);
+       error1 = 0;
 
-#if 0
-       /* XXX can't do this yet */
-       hammer2_trans_init(ip->pmp, HAMMER2_TRANS_ISFLUSH);
-       vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
-#endif
        hammer2_trans_init(ip->pmp, 0);
+
+       /*
+        * Clean out buffer cache, wait for I/O's to complete.
+        */
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
+       bio_track_wait(&vp->v_track_write, 0, 0);
 
        /*
-        * Calling chain_flush here creates a lot of duplicative
-        * COW operations due to non-optimal vnode ordering.
-        *
-        * Only do it for an actual fsync() syscall.  The other forms
-        * which call this function will eventually call chain_flush
-        * on the volume root as a catch-all, which is far more optimal.
+        * Flush any inode changes
         */
        hammer2_inode_lock(ip, 0);
-       if (ip->flags & HAMMER2_INODE_MODIFIED)
-               hammer2_inode_chain_sync(ip);
+       if (ip->flags & (HAMMER2_INODE_RESIZED|HAMMER2_INODE_MODIFIED))
+               error1 = hammer2_inode_chain_sync(ip);
+
+       /*
+        * Flush dirty chains related to the inode.
+        *
+        * NOTE! XXX We do not currently flush to the volume root, ultimately
+        *       we will want to have a shortcut for the flushed inode stored
+        *       in the volume root for recovery purposes.
+        */
+       error2 = hammer2_inode_chain_flush(ip);
+       if (error2)
+               error1 = error2;
        hammer2_inode_unlock(ip);
        hammer2_trans_done(ip->pmp);
 
-       return (0);
+       return (error1);
 }
 
 static
@@ -461,13 +476,18 @@ hammer2_vop_setattr(struct vop_setattr_args *ap)
 
 done:
        /*
-        * If a truncation occurred we must call inode_fsync() now in order
+        * If a truncation occurred we must call chain_sync() now in order
         * to trim the related data chains, otherwise a later expansion can
         * cause havoc.
         *
         * If an extend occured that changed the DIRECTDATA state, we must
         * call inode_fsync now in order to prepare the inode's indirect
         * block table.
+        *
+        * WARNING! This means we are making an adjustment to the inode's
+        * chain outside of sync/fsync, and not just to inode->meta, which
+        * may result in some consistency issues if a crash were to occur
+        * at just the wrong time.
         */
        if (ip->flags & HAMMER2_INODE_RESIZED)
                hammer2_inode_chain_sync(ip);
@@ -1104,8 +1124,19 @@ hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
                        hammer2_update_time(&ip->meta.mtime);
                        vclrflags(vp, VLASTWRITETS);
                }
-               if (ip->flags & HAMMER2_INODE_MODIFIED)
+
+#if 0
+               /*
+                * REMOVED - handled by hammer2_extend_file().  Do not issue
+                * a chain_sync() outside of a sync/fsync except for DIRECTDATA
+                * state changes.
+                *
+                * Under normal conditions we only issue a chain_sync if
+                * the inode's DIRECTDATA state changed.
+                */
+               if (ip->flags & HAMMER2_INODE_RESIZED)
                        hammer2_inode_chain_sync(ip);
+#endif
                hammer2_mtx_unlock(&ip->lock);
                hammer2_knote(ip->vp, kflags);
        }
@@ -1184,6 +1215,13 @@ hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
        ip->osize = osize;
        ip->meta.size = nsize;
 
+       /*
+        * We must issue a chain_sync() when the DIRECTDATA state changes
+        * to prevent confusion between the flush code and the in-memory
+        * state.  This is not perfect because we are doing it outside of
+        * a sync/fsync operation, so it might not be fully synchronized
+        * with the meta-data topology flush.
+        */
        if (osize <= HAMMER2_EMBEDDED_BYTES && nsize > HAMMER2_EMBEDDED_BYTES) {
                atomic_set_int(&ip->flags, HAMMER2_INODE_RESIZED);
                hammer2_inode_chain_sync(ip);