hammer2 - Refactor flush mechanics
authorMatthew Dillon <dillon@apollo.backplane.com>
Wed, 19 Feb 2014 07:20:34 +0000 (23:20 -0800)
committerMatthew Dillon <dillon@apollo.backplane.com>
Tue, 25 Feb 2014 22:48:11 +0000 (14:48 -0800)
* Greatly simplify and reduce special cases in the flush code

* Remove the multi-layer rbtree and replace with two discrete rbtree's
  (rbtree and dbtree) representing the live state and one linked list (dbq)
  representing set-aside deleted chains that are not part of the live state.

* Cleanup some debugging junk, add more debugging junk.

* Separate flushing state flags and TIDs into their own fields instead of
  trying to use the live state flags and bref TIDs.

* Simplify transaction TID tracking.

22 files changed:
sys/vfs/hammer2/donew [deleted file]
sys/vfs/hammer2/donew2 [deleted file]
sys/vfs/hammer2/dossd [deleted file]
sys/vfs/hammer2/dossd2 [deleted file]
sys/vfs/hammer2/dotest [deleted file]
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_ccms.c
sys/vfs/hammer2/hammer2_ccms.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_disk.h
sys/vfs/hammer2/hammer2_flush.c
sys/vfs/hammer2/hammer2_freemap.c
sys/vfs/hammer2/hammer2_inode.c
sys/vfs/hammer2/hammer2_io.c
sys/vfs/hammer2/hammer2_ioctl.c
sys/vfs/hammer2/hammer2_ioctl.h
sys/vfs/hammer2/hammer2_mount.h
sys/vfs/hammer2/hammer2_msgops.c
sys/vfs/hammer2/hammer2_subr.c
sys/vfs/hammer2/hammer2_vfsops.c
sys/vfs/hammer2/hammer2_vnops.c
sys/vfs/hammer2/mkvntest [deleted file]

diff --git a/sys/vfs/hammer2/donew b/sys/vfs/hammer2/donew
deleted file mode 100755 (executable)
index db573ea..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/csh
-#
-
-umount /mnt
-newfs_hammer2 -L ROOT /dev/da0s1d
diff --git a/sys/vfs/hammer2/donew2 b/sys/vfs/hammer2/donew2
deleted file mode 100755 (executable)
index d98c5a2..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/csh
-#
-
-umount /mnt
-newfs_hammer2 -L ROOT /dev/da0s1b
diff --git a/sys/vfs/hammer2/dossd b/sys/vfs/hammer2/dossd
deleted file mode 100755 (executable)
index 946bf7b..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/csh
-#
-
-umount /mnt >& /dev/null
-kldunload hammer2.ko >& /dev/null
-kldstat | fgrep hammer2.ko >& /dev/null
-if ( $status > 0 ) then
-    kldload /usr/obj/usr/src/sys/vfs/hammer2/hammer2.ko
-endif
-mount_hammer2 /dev/da0s1d@ROOT /mnt
-sysctl vfs.hammer2.debug=0
diff --git a/sys/vfs/hammer2/dossd2 b/sys/vfs/hammer2/dossd2
deleted file mode 100755 (executable)
index 124869c..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/csh
-#
-
-umount /mnt >& /dev/null
-kldunload hammer2.ko >& /dev/null
-kldstat | fgrep hammer2.ko >& /dev/null
-if ( $status > 0 ) then
-    kldload /usr/obj/usr/src/sys/vfs/hammer2/hammer2.ko
-endif
-mount_hammer2 /dev/da0s1b@ROOT /mnt
-sysctl vfs.hammer2.debug=0
diff --git a/sys/vfs/hammer2/dotest b/sys/vfs/hammer2/dotest
deleted file mode 100755 (executable)
index 803de0c..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/csh
-#
-
-# ./mkvntest
-umount /mnt >& /dev/null
-kldunload hammer2.ko >& /dev/null
-kldstat | fgrep hammer2.ko >& /dev/null
-if ( $status > 0 ) then
-    kldload /usr/obj/usr/src/sys/vfs/hammer2/hammer2.ko
-endif
-mount_hammer2 /dev/vn0@ROOT /mnt
index bbacbc7..aa352c4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -91,14 +91,29 @@ struct hammer2_msg;
  * boundaries, renames, resizing, or any move of the chain to elsewhere
  * in the topology is accomplished via the DELETE-DUPLICATE mechanism.
  *
- * DELETE-DUPLICATE allows HAMMER2 to track work across flush synchronization
- * points without stalling the filesystem or corrupting the flush
- * sychronization point.  When necessary a chain will be marked DELETED
- * and a new, duplicate chain will be allocated.
+ * Deletions and delete-duplicates:
  *
- * This mechanism necessarily requires that we be able to overload chains
- * at any given layer in the topology.  Overloading is accomplished via a
- * RBTREE recursion through chain->rbtree.
+ *     Any movement of chains within the topology utilize a delete-duplicate
+ *     operation instead of a simple rename.  That is, the chain must be
+ *     deleted from its original location and then duplicated to the new
+ *     location.  A new chain structure is allocated while the old is
+ *     deleted.  Deleted chains are removed from the above chain_core's
+ *     rbtree but remain linked via the shadow topology for flush
+ *     synchronization purposes.
+ *
+ *     delete_bmap is allocated and a bit set if the chain was originally
+ *     loaded via the blockmap.
+ *
+ * Flush synchronization:
+ *
+ *     Flushes must synchronize chains up through the root.  To do this
+ *     the in-memory topology would normally have to be frozen during the
+ *     flush.  To avoid freezing the topology and to allow concurrent
+ *     foreground / flush activity, any new modifications made while a
+ *     flush is in progress retains the original chain in a shadow topology
+ *     that is only visible to the flush code.  Only one flush can be
+ *     running at a time so the shadow hierarchy can be implemented with
+ *     just a few link fields in our in-memory data structures.
  *
  * Advantages:
  *
@@ -123,29 +138,22 @@ struct hammer2_msg;
 RB_HEAD(hammer2_chain_tree, hammer2_chain);
 TAILQ_HEAD(h2_flush_deferral_list, hammer2_chain);
 TAILQ_HEAD(h2_core_list, hammer2_chain);
-TAILQ_HEAD(h2_layer_list, hammer2_chain_layer);
-
-struct hammer2_chain_layer {
-       int             good;
-       TAILQ_ENTRY(hammer2_chain_layer) entry;
-       struct hammer2_chain_tree rbtree;
-       int             refs;           /* prevent destruction */
-};
 
-typedef struct hammer2_chain_layer hammer2_chain_layer_t;
+#define CHAIN_CORE_DELETE_BMAP_ENTRIES \
+       (HAMMER2_PBUFSIZE / sizeof(hammer2_blockref_t) / sizeof(uint32_t))
 
 struct hammer2_chain_core {
        int             good;
        struct ccms_cst cst;
-       struct h2_core_list ownerq;     /* all chains sharing this core */
-       struct h2_layer_list layerq;
+       struct h2_core_list ownerq;       /* all chains sharing this core */
+       struct hammer2_chain_tree rbtree; /* live chains */
+       struct hammer2_chain_tree dbtree; /* bmapped deletions */
+       struct h2_core_list dbq;          /* other deletions */
        int             live_zero;      /* blockref array opt */
-       hammer2_tid_t   update_lo;      /* check update against parent */
-       hammer2_tid_t   update_hi;      /* check update against parent */
-       u_int           chain_count;    /* total chains in layers */
        u_int           sharecnt;
        u_int           flags;
        u_int           live_count;     /* live (not deleted) chains in tree */
+       u_int           chain_count;    /* live + deleted chains under core */
        int             generation;     /* generation number (inserts only) */
 };
 
@@ -186,9 +194,9 @@ typedef struct hammer2_io hammer2_io_t;
  * Primary chain structure keeps track of the topology in-memory.
  */
 struct hammer2_chain {
-       RB_ENTRY(hammer2_chain) rbnode;         /* node */
        TAILQ_ENTRY(hammer2_chain) core_entry;  /* contemporary chains */
-       hammer2_chain_layer_t   *inlayer;
+       RB_ENTRY(hammer2_chain) rbnode;         /* live chain(s) */
+       TAILQ_ENTRY(hammer2_chain) db_entry;    /* non bmapped deletions */
        hammer2_blockref_t      bref;
        hammer2_chain_core_t    *core;
        hammer2_chain_core_t    *above;
@@ -196,8 +204,23 @@ struct hammer2_chain {
        struct hammer2_mount    *hmp;
        struct hammer2_pfsmount *pmp;           /* can be NULL */
 
-       hammer2_tid_t   modify_tid;             /* snapshot/flush filter */
-       hammer2_tid_t   delete_tid;
+       hammer2_blockref_t      dsrc;                   /* DEBUG */
+       int                     ninserts;               /* DEBUG */
+       int                     nremoves;               /* DEBUG */
+       hammer2_tid_t           dsrc_dupfromat;         /* DEBUG */
+       uint32_t                dsrc_dupfromflags;      /* DEBUG */
+       int                     dsrc_reason;            /* DEBUG */
+       int                     dsrc_ninserts;          /* DEBUG */
+       uint32_t                dsrc_flags;             /* DEBUG */
+       hammer2_tid_t           dsrc_modify;            /* DEBUG */
+       hammer2_tid_t           dsrc_delete;            /* DEBUG */
+       hammer2_tid_t           dsrc_update_lo;         /* DEBUG */
+       struct hammer2_chain    *dsrc_original;         /* DEBUG */
+
+       hammer2_tid_t   modify_tid;             /* flush filter */
+       hammer2_tid_t   delete_tid;             /* flush filter */
+       hammer2_tid_t   update_lo;              /* flush propagation */
+       hammer2_tid_t   update_hi;              /* setsubmod propagation */
        hammer2_key_t   data_count;             /* delta's to apply */
        hammer2_key_t   inode_count;            /* delta's to apply */
        hammer2_io_t    *dio;                   /* physical data buffer */
@@ -224,32 +247,28 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
  *          related buffer.  The data is assumed to be all-zeros.  It
  *          is primarily used for indirect blocks.
  *
- * MOVED   - A modified chain becomes MOVED after it flushes.  A chain
- *          can also become MOVED if it is moved within the topology
- *          (even if not modified).
- *
  * MODIFIED- The chain's media data has been modified.
  */
 #define HAMMER2_CHAIN_MODIFIED         0x00000001      /* dirty chain data */
 #define HAMMER2_CHAIN_ALLOCATED                0x00000002      /* kmalloc'd chain */
-#define HAMMER2_CHAIN_UNUSED0004       0x00000004
+#define HAMMER2_CHAIN_FLUSH_TEMPORARY  0x00000004
 #define HAMMER2_CHAIN_FORCECOW         0x00000008      /* force copy-on-wr */
 #define HAMMER2_CHAIN_DELETED          0x00000010      /* deleted chain */
 #define HAMMER2_CHAIN_INITIAL          0x00000020      /* initial create */
-#define HAMMER2_CHAIN_FLUSHED          0x00000040      /* blktable updated */
-#define HAMMER2_CHAIN_MOVED            0x00000080      /* bref changed */
+#define HAMMER2_CHAIN_FLUSH_CREATE     0x00000040      /* needs flush blkadd */
+#define HAMMER2_CHAIN_FLUSH_DELETE     0x00000080      /* needs flush blkdel */
 #define HAMMER2_CHAIN_IOFLUSH          0x00000100      /* bawrite on put */
 #define HAMMER2_CHAIN_DEFERRED         0x00000200      /* on a deferral list */
 #define HAMMER2_CHAIN_UNLINKED         0x00000400      /* delete on reclaim */
 #define HAMMER2_CHAIN_VOLUMESYNC       0x00000800      /* needs volume sync */
-#define HAMMER2_CHAIN_UNUSED01000      0x00001000
+#define HAMMER2_CHAIN_ONDBQ            0x00001000      /* !bmapped deletes */
 #define HAMMER2_CHAIN_MOUNTED          0x00002000      /* PFS is mounted */
 #define HAMMER2_CHAIN_ONRBTREE         0x00004000      /* on parent RB tree */
 #define HAMMER2_CHAIN_SNAPSHOT         0x00008000      /* snapshot special */
 #define HAMMER2_CHAIN_EMBEDDED         0x00010000      /* embedded data */
 #define HAMMER2_CHAIN_RELEASE          0x00020000      /* don't keep around */
-#define HAMMER2_CHAIN_UNUSED40000      0x00040000
-#define HAMMER2_CHAIN_UNUSED80000      0x00080000
+#define HAMMER2_CHAIN_BMAPPED          0x00040000      /* in parent blkmap */
+#define HAMMER2_CHAIN_ONDBTREE         0x00080000      /* bmapped deletes */
 #define HAMMER2_CHAIN_DUPLICATED       0x00100000      /* fwd delete-dup */
 #define HAMMER2_CHAIN_PFSROOT          0x00200000      /* in pfs->cluster */
 
@@ -472,8 +491,8 @@ struct hammer2_trans {
        TAILQ_ENTRY(hammer2_trans) entry;
        struct hammer2_pfsmount *pmp;           /* might be NULL */
        struct hammer2_mount    *hmp_single;    /* if single-targetted */
-       hammer2_tid_t           sync_tid;
-       hammer2_tid_t           real_tid;
+       hammer2_tid_t           orig_tid;
+       hammer2_tid_t           sync_tid;       /* effective transaction id */
        hammer2_tid_t           inode_tid;
        thread_t                td;             /* pointer */
        int                     flags;
@@ -485,7 +504,7 @@ struct hammer2_trans {
 typedef struct hammer2_trans hammer2_trans_t;
 
 #define HAMMER2_TRANS_ISFLUSH          0x0001  /* formal flush */
-#define HAMMER2_TRANS_UNUSED0002       0x0002
+#define HAMMER2_TRANS_CONCURRENT       0x0002  /* concurrent w/flush */
 #define HAMMER2_TRANS_BUFCACHE         0x0004  /* from bioq strategy write */
 #define HAMMER2_TRANS_NEWINODE         0x0008  /* caller allocating inode */
 #define HAMMER2_TRANS_ISALLOCATING     0x0010  /* in allocator */
@@ -644,6 +663,7 @@ extern int hammer2_debug;
 extern int hammer2_cluster_enable;
 extern int hammer2_hardlink_enable;
 extern int hammer2_flush_pipe;
+extern int hammer2_synchronous_flush;
 extern long hammer2_limit_dirty_chains;
 extern long hammer2_iod_file_read;
 extern long hammer2_iod_meta_read;
@@ -786,8 +806,8 @@ void hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
                                int nradix, int flags);
 void hammer2_chain_unlock(hammer2_chain_t *chain);
 void hammer2_chain_wait(hammer2_chain_t *chain);
-hammer2_chain_t *hammer2_chain_get(hammer2_chain_t *parent,
-                               hammer2_blockref_t *bref, int generation);
+hammer2_chain_t *hammer2_chain_get(hammer2_chain_t *parent, int generation,
+                               hammer2_blockref_t *bref);
 hammer2_chain_t *hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags);
 void hammer2_chain_lookup_done(hammer2_chain_t *parent);
 hammer2_chain_t *hammer2_chain_lookup(hammer2_chain_t **parentp,
@@ -818,7 +838,7 @@ void hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain,
                                int flags);
 void hammer2_chain_delete_duplicate(hammer2_trans_t *trans,
                                hammer2_chain_t **chainp, int flags);
-void hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp);
+void hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp);
 void hammer2_chain_commit(hammer2_trans_t *trans, hammer2_chain_t *chain);
 void hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain);
 
@@ -827,13 +847,12 @@ void hammer2_chain_memory_inc(hammer2_pfsmount_t *pmp);
 void hammer2_chain_memory_wakeup(hammer2_pfsmount_t *pmp);
 void hammer2_chain_countbrefs(hammer2_chain_t *chain,
                                hammer2_blockref_t *base, int count);
-void hammer2_chain_layer_check_locked(hammer2_mount_t *hmp,
-                               hammer2_chain_core_t *core);
 
 int hammer2_base_find(hammer2_chain_t *chain,
                                hammer2_blockref_t *base, int count,
                                int *cache_indexp, hammer2_key_t *key_nextp,
-                               hammer2_key_t key_beg, hammer2_key_t key_end);
+                               hammer2_key_t key_beg, hammer2_key_t key_end,
+                               int delete_filter);
 void hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *chain,
                                hammer2_blockref_t *base, int count,
                                int *cache_indexp, hammer2_chain_t *child);
@@ -880,11 +899,11 @@ void hammer2_io_breadcb(hammer2_mount_t *hmp, off_t lbase, int lsize,
 void hammer2_io_bawrite(hammer2_io_t **diop);
 void hammer2_io_bdwrite(hammer2_io_t **diop);
 int hammer2_io_bwrite(hammer2_io_t **diop);
+int hammer2_io_isdirty(hammer2_io_t *dio);
 void hammer2_io_setdirty(hammer2_io_t *dio);
 void hammer2_io_setinval(hammer2_io_t *dio, u_int bytes);
 void hammer2_io_brelse(hammer2_io_t **diop);
 void hammer2_io_bqrelse(hammer2_io_t **diop);
-int hammer2_io_isdirty(hammer2_io_t *dio);
 
 /*
  * hammer2_msgops.c
@@ -898,7 +917,7 @@ int hammer2_msg_adhoc_input(kdmsg_msg_t *msg);
 void hammer2_clusterctl_wakeup(kdmsg_iocom_t *iocom);
 void hammer2_volconf_update(hammer2_pfsmount_t *pmp, int index);
 void hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp);
-void hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp);
+void hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx);
 void hammer2_bioq_sync(hammer2_pfsmount_t *pmp);
 int hammer2_vfs_sync(struct mount *mp, int waitflags);
 void hammer2_lwinprog_ref(hammer2_pfsmount_t *pmp);
index ae500ca..47e9edb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006,2012-2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2006,2012-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
index c13ac27..cbbb7f7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006,2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2006,2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
index ac0cc08..6aee4bb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -93,19 +93,14 @@ static hammer2_chain_t *hammer2_combined_find(
                int *cache_indexp, hammer2_key_t *key_nextp,
                hammer2_key_t key_beg, hammer2_key_t key_end,
                hammer2_blockref_t **bresp);
-static void hammer2_chain_assert_not_present(hammer2_chain_core_t *above,
-                                hammer2_chain_t *chain);
 
 /*
- * Basic RBTree for chains.  Chains cannot overlap within any given
- * core->rbtree without recursing through chain->rbtree.  We effectively
- * guarantee this by checking the full range rather than just the first
- * key element.  By matching on the full range callers can detect when
- * recursrion through chain->rbtree is needed.
+ * Basic RBTree for chains (core->rbtree and core->dbtree).  Chains cannot
+ * overlap in the RB trees.  Deleted chains are moved from rbtree to either
+ * dbtree or to dbq.
  *
- * NOTE: This also means the a delete-duplicate on the same key will
- *      overload by placing the deleted element in the new element's
- *      chain->rbtree (when doing a direct replacement).
+ * Chains in delete-duplicate sequences can always iterate through core_entry
+ * to locate the live version of the chain.
  */
 RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
 
@@ -117,6 +112,10 @@ hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
        hammer2_key_t c2_beg;
        hammer2_key_t c2_end;
 
+       /*
+        * Compare chains.  Overlaps are not supposed to happen and catch
+        * any software issues early we count overlaps as a match.
+        */
        c1_beg = chain1->bref.key;
        c1_end = c1_beg + ((hammer2_key_t)1 << chain1->bref.keybits) - 1;
        c2_beg = chain2->bref.key;
@@ -144,40 +143,33 @@ hammer2_isclusterable(hammer2_chain_t *chain)
 }
 
 /*
- * Recursively set the update_hi flag up to the root starting at chain's
- * parent->core.  update_hi is not set in chain's core.
+ * Recursively set update_hi starting at chain up through to the root.
  *
  * This controls top-down visibility for flushes.  The child has just one
  * 'above' core, but the core itself can be multi-homed with parents iterated
- * via core->ownerq.
+ * via core->ownerq.  The last parent is the 'live' parent (all others had to
+ * have been delete-duplicated).  We always propagate upward through the live
+ * parent.
  *
  * This function is not used during a flush (except when the flush is
  * allocating which requires the live tree).  The flush keeps track of its
  * recursion itself.
  *
- * XXX needs to be optimized to use roll-up TIDs.  update_hi is only really
- * compared against bref.mirror_tid which itself is only updated by a flush.
+ * XXX SMP races
  */
 void
 hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain)
 {
        hammer2_chain_core_t *above;
 
+       if (chain->update_hi < trans->sync_tid)
+               chain->update_hi = trans->sync_tid;
+
        while ((above = chain->above) != NULL) {
                spin_lock(&above->cst.spin);
-               /* XXX optimize */
-               if (above->update_hi < trans->sync_tid)
-                       above->update_hi = trans->sync_tid;
                chain = TAILQ_LAST(&above->ownerq, h2_core_list);
-#if 0
-               TAILQ_FOREACH_REVERSE(chain, &above->ownerq,
-                                     h2_core_list, core_entry) {
-                       if (trans->sync_tid >= chain->modify_tid &&
-                           trans->sync_tid <= chain->delete_tid) {
-                               break;
-                       }
-               }
-#endif
+               if (chain->update_hi < trans->sync_tid)
+                       chain->update_hi = trans->sync_tid;
                spin_unlock(&above->cst.spin);
        }
 }
@@ -236,12 +228,18 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_pfsmount_t *pmp,
        chain->delete_tid = HAMMER2_MAX_TID;
 
        /*
-        * Set modify_tid if a transaction is creating the chain.  When
-        * loading a chain from backing store trans is passed as NULL and
-        * modify_tid is left set to 0.
+        * Set modify_tid if a transaction is creating the inode.
+        * Enforce update_lo = 0 so nearby transactions do not think
+        * it has been flushed when it hasn't.
+        *
+        * NOTE: When loading a chain from backing store or creating a
+        *       snapshot, trans will be NULL and the caller is responsible
+        *       for setting these fields.
         */
-       if (trans)
+       if (trans) {
                chain->modify_tid = trans->sync_tid;
+               chain->update_lo = 0;
+       }
 
        return (chain);
 }
@@ -271,14 +269,12 @@ hammer2_chain_core_alloc(hammer2_trans_t *trans,
                 */
                core = kmalloc(sizeof(*core), nchain->hmp->mchain,
                               M_WAITOK | M_ZERO);
-               TAILQ_INIT(&core->layerq);
                TAILQ_INIT(&core->ownerq);
+               TAILQ_INIT(&core->dbq);
+               RB_INIT(&core->rbtree); /* live chains */
+               RB_INIT(&core->dbtree); /* deleted original (bmapped) chains */
                core->sharecnt = 1;
                core->good = 0x1234;
-               if (trans)
-                       core->update_hi = trans->sync_tid;
-               else
-                       core->update_hi = nchain->bref.mirror_tid;
                nchain->core = core;
                ccms_cst_init(&core->cst, nchain);
                TAILQ_INSERT_TAIL(&core->ownerq, nchain, core_entry);
@@ -328,21 +324,20 @@ hammer2_chain_core_alloc(hammer2_trans_t *trans,
                spin_lock(&core->cst.spin);
                nchain->core = core;
 
-#if 0
-               if (core->update_hi < trans->sync_tid)
-                       core->update_hi = trans->sync_tid;
-#endif
-
                /*
                 * Maintain ordering for refactor test so we don't skip over
                 * a snapshot.  Also, during flushes, delete-duplications
-                * for block-table updates can occur on blocks already
-                * deleted (delete-duplicated by a later transaction).  We
-                * must insert nchain after ochain but before the later
-                * transaction's copy.
+                * for block-table updates can occur on ochains already
+                * deleted (delete-duplicated by a later transaction), or
+                * on forward-indexed ochains.  We must properly insert
+                * nchain relative to ochain.
                 */
-               TAILQ_INSERT_AFTER(&core->ownerq, ochain, nchain, core_entry);
-
+               if (trans && trans->sync_tid < ochain->modify_tid) {
+                       TAILQ_INSERT_BEFORE(ochain, nchain, core_entry);
+               } else {
+                       TAILQ_INSERT_AFTER(&core->ownerq, ochain,
+                                          nchain, core_entry);
+               }
                spin_unlock(&core->cst.spin);
        }
 }
@@ -357,8 +352,11 @@ hammer2_chain_ref(hammer2_chain_t *chain)
 }
 
 /*
- * Insert the chain in the core rbtree at the first layer
- * which accepts it (for now we don't sort layers by the transaction tid)
+ * Insert the chain in the core rbtree.
+ *
+ * Normal insertions are placed in the live rbtree.  Insertion of a deleted
+ * chain is a special case used by the flush code that is placed on the
+ * unstaged deleted list to avoid confusing the live view.
  */
 #define HAMMER2_CHAIN_INSERT_SPIN      0x0001
 #define HAMMER2_CHAIN_INSERT_LIVE      0x0002
@@ -366,46 +364,16 @@ hammer2_chain_ref(hammer2_chain_t *chain)
 
 static
 int
-hammer2_chain_insert(hammer2_chain_core_t *above, hammer2_chain_layer_t *layer,
-                    hammer2_chain_t *chain, int flags, int generation)
+hammer2_chain_insert(hammer2_chain_core_t *above,
+                    hammer2_chain_t *ochain, hammer2_chain_t *nchain,
+                    int flags, int generation)
 {
        hammer2_chain_t *xchain;
-       hammer2_chain_layer_t *nlayer;
        int error = 0;
 
        if (flags & HAMMER2_CHAIN_INSERT_SPIN)
                spin_lock(&above->cst.spin);
 
-       /*
-        * Special case, place the chain in the next most-recent layer as the
-        * specified layer, inserting a layer inbetween if necessary.
-        */
-       if (layer) {
-               KKASSERT((flags & HAMMER2_CHAIN_INSERT_RACE) == 0);
-               nlayer = TAILQ_PREV(layer, h2_layer_list, entry);
-               if (nlayer && RB_INSERT(hammer2_chain_tree,
-                                       &nlayer->rbtree, chain) == NULL) {
-                       layer = nlayer;
-                       goto done;
-               }
-
-               spin_unlock(&above->cst.spin);
-               KKASSERT((flags & HAMMER2_CHAIN_INSERT_LIVE) == 0);
-               nlayer = kmalloc(sizeof(*nlayer), chain->hmp->mchain,
-                                M_WAITOK | M_ZERO);
-               RB_INIT(&nlayer->rbtree);
-               nlayer->good = 0xABCD;
-               spin_lock(&above->cst.spin);
-
-               if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0)
-                       hammer2_chain_assert_not_present(above, chain);
-
-               TAILQ_INSERT_BEFORE(layer, nlayer, entry);
-               RB_INSERT(hammer2_chain_tree, &nlayer->rbtree, chain);
-               layer = nlayer;
-               goto done;
-       }
-
        /*
         * Interlocked by spinlock, check for race
         */
@@ -416,54 +384,50 @@ hammer2_chain_insert(hammer2_chain_core_t *above, hammer2_chain_layer_t *layer,
        }
 
        /*
-        * Try to insert, allocate a new layer if a nominal collision
-        * occurs (a collision is different from a SMP race).
+        * Insert nchain
         */
-       layer = TAILQ_FIRST(&above->layerq);
-       xchain = NULL;
-
-       if (layer == NULL ||
-           (xchain = RB_INSERT(hammer2_chain_tree,
-                               &layer->rbtree, chain)) != NULL) {
-
-               /*
-                * Allocate a new layer to resolve the issue.
-                */
-               spin_unlock(&above->cst.spin);
-               layer = kmalloc(sizeof(*layer), chain->hmp->mchain,
-                               M_WAITOK | M_ZERO);
-               RB_INIT(&layer->rbtree);
-               layer->good = 0xABCD;
-               spin_lock(&above->cst.spin);
+       if (nchain->flags & HAMMER2_CHAIN_DELETED) {
+               if (ochain && (ochain->flags & HAMMER2_CHAIN_BMAPPED)) {
+                       if (ochain->flags & HAMMER2_CHAIN_ONDBTREE) {
+                               RB_REMOVE(hammer2_chain_tree,
+                                         &above->dbtree, ochain);
+                               atomic_clear_int(&ochain->flags,
+                                                HAMMER2_CHAIN_ONDBTREE);
+                               TAILQ_INSERT_TAIL(&above->dbq,
+                                                 ochain, db_entry);
+                               atomic_set_int(&ochain->flags,
+                                               HAMMER2_CHAIN_ONDBQ);
+                       }
+                       /* clear BMAPPED (DBTREE, sometimes RBTREE) */
+                       atomic_clear_int(&ochain->flags, HAMMER2_CHAIN_BMAPPED);
 
-               if ((flags & HAMMER2_CHAIN_INSERT_RACE) &&
-                   above->generation != generation) {
-                       spin_unlock(&above->cst.spin);
-                       kfree(layer, chain->hmp->mchain);
-                       spin_lock(&above->cst.spin);
-                       error = EAGAIN;
-                       goto failed;
+                       xchain = RB_INSERT(hammer2_chain_tree,
+                                          &above->dbtree, nchain);
+                       KKASSERT(xchain == NULL);
+                       atomic_set_int(&nchain->flags,
+                                      HAMMER2_CHAIN_ONDBTREE |
+                                      HAMMER2_CHAIN_BMAPPED);
+               } else {
+                       TAILQ_INSERT_TAIL(&above->dbq, nchain, db_entry);
+                       atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONDBQ);
                }
-               hammer2_chain_assert_not_present(above, chain);
-
-               TAILQ_INSERT_HEAD(&above->layerq, layer, entry);
-               RB_INSERT(hammer2_chain_tree, &layer->rbtree, chain);
+       } else {
+               xchain = RB_INSERT(hammer2_chain_tree, &above->rbtree, nchain);
+               KASSERT(xchain == NULL,
+                       ("hammer2_chain_insert: collision %p", nchain));
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_ONRBTREE);
        }
-done:
-       chain->above = above;
-       chain->inlayer = layer;
+
+       nchain->above = above;
        ++above->chain_count;
        ++above->generation;
-       atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
 
        /*
         * We have to keep track of the effective live-view blockref count
         * so the create code knows when to push an indirect block.
         */
-       if ((flags & HAMMER2_CHAIN_INSERT_LIVE) &&
-           (chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
+       if (flags & HAMMER2_CHAIN_INSERT_LIVE)
                atomic_add_int(&above->live_count, 1);
-       }
 failed:
        if (flags & HAMMER2_CHAIN_INSERT_SPIN)
                spin_unlock(&above->cst.spin);
@@ -490,7 +454,9 @@ hammer2_chain_drop(hammer2_chain_t *chain)
        if (hammer2_debug & 0x200000)
                Debugger("drop");
 
-       if (chain->flags & HAMMER2_CHAIN_MOVED)
+       if (chain->flags & HAMMER2_CHAIN_FLUSH_CREATE)
+               ++need;
+       if (chain->flags & HAMMER2_CHAIN_FLUSH_DELETE)
                ++need;
        if (chain->flags & HAMMER2_CHAIN_MODIFIED)
                ++need;
@@ -554,7 +520,6 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
        hammer2_mount_t *hmp;
        hammer2_chain_core_t *above;
        hammer2_chain_core_t *core;
-       hammer2_chain_layer_t *layer;
        hammer2_chain_t *rdrop1;
        hammer2_chain_t *rdrop2;
 
@@ -574,7 +539,7 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
                 * have their deleted flag set) short-cut recursive flush
                 * dependencies and can be freed here.  Any flushes which run
                 * through stale children due to the flush synchronization
-                * point should have the MOVED bit set in the chain and not
+                * point should have a FLUSH_* bit set in the chain and not
                 * reach lastdrop at this time.
                 *
                 * NOTE: We return (chain) on failure to retry.
@@ -626,7 +591,6 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
        pmp = chain->pmp;       /* can be NULL */
        rdrop1 = NULL;
        rdrop2 = NULL;
-       layer = NULL;
 
        /*
         * Spinlock the parent and try to drop the last ref on chain.
@@ -647,21 +611,31 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
 
                /*
                 * 1->0 transition successful, remove chain from its
-                * above core.  Track layer for removal/freeing.
+                * above core.
                 */
-               KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
-               layer = chain->inlayer;
-               RB_REMOVE(hammer2_chain_tree, &layer->rbtree, chain);
+               switch (chain->flags & (HAMMER2_CHAIN_ONRBTREE |
+                                       HAMMER2_CHAIN_ONDBTREE |
+                                       HAMMER2_CHAIN_ONDBQ)) {
+               case HAMMER2_CHAIN_ONRBTREE:
+                       RB_REMOVE(hammer2_chain_tree, &above->rbtree, chain);
+                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
+                       break;
+               case HAMMER2_CHAIN_ONDBTREE:
+                       RB_REMOVE(hammer2_chain_tree, &above->dbtree, chain);
+                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONDBTREE);
+                       break;
+               case HAMMER2_CHAIN_ONDBQ:
+                       TAILQ_REMOVE(&above->dbq, chain, db_entry);
+                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONDBQ);
+                       break;
+               default:
+                       panic("hammer2_chain_lastdrop: chain %p badflags %08x",
+                             chain, chain->flags);
+                       break;
+               }
+
                --above->chain_count;
-               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
                chain->above = NULL;
-               chain->inlayer = NULL;
-
-               if (RB_EMPTY(&layer->rbtree) && layer->refs == 0) {
-                       TAILQ_REMOVE(&above->layerq, layer, entry);
-               } else {
-                       layer = NULL;
-               }
 
                /*
                 * If our chain was the last chain in the parent's core the
@@ -708,22 +682,13 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
                if (atomic_fetchadd_int(&core->sharecnt, -1) == 1) {
                        /*
                         * On the 1->0 transition of core we can destroy
-                        * it.  Any remaining layers should no longer be
-                        * referenced or visibile to other threads.
+                        * it.
                         */
                        KKASSERT(TAILQ_EMPTY(&core->ownerq));
-                       if (layer) {
-                               layer->good = 0xEF00;
-                               kfree(layer, hmp->mchain);
-                       }
-                       while ((layer = TAILQ_FIRST(&core->layerq)) != NULL) {
-                               KKASSERT(layer->refs == 0 &&
-                                        RB_EMPTY(&layer->rbtree));
-                               TAILQ_REMOVE(&core->layerq, layer, entry);
-                               layer->good = 0xEF01;
-                               kfree(layer, hmp->mchain);
-                       }
-                       /* layer now NULL */
+                       KKASSERT(RB_EMPTY(&core->rbtree) &&
+                                RB_EMPTY(&core->dbtree) &&
+                                TAILQ_EMPTY(&core->dbq) &&
+                                core->chain_count == 0);
                        KKASSERT(core->cst.count == 0);
                        KKASSERT(core->cst.upgrade == 0);
                        core->good = 0x5678;
@@ -735,20 +700,13 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain, struct h2_core_list *delayq)
        /*
         * All spin locks are gone, finish freeing stuff.
         */
-       KKASSERT((chain->flags & (HAMMER2_CHAIN_MOVED |
+       KKASSERT((chain->flags & (HAMMER2_CHAIN_FLUSH_CREATE |
+                                 HAMMER2_CHAIN_FLUSH_DELETE |
                                  HAMMER2_CHAIN_MODIFIED)) == 0);
        hammer2_chain_drop_data(chain, 1);
 
        KKASSERT(chain->dio == NULL);
 
-       /*
-        * Free saved empty layer and return chained drop.
-        */
-       if (layer) {
-               layer->good = 0xEF02;
-               kfree(layer, hmp->mchain);
-       }
-
        /*
         * Once chain resources are gone we can use the now dead chain
         * structure to placehold what might otherwise require a recursive
@@ -945,6 +903,17 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how)
                return (error);
        }
 
+#if 0
+       /*
+        * No need for this, always require that hammer2_chain_modify()
+        * be called before any modifying operations.
+        */
+       if ((chain->flags & HAMMER2_CHAIN_MODIFIED) &&
+           !hammer2_io_isdirty(chain->dio)) {
+               hammer2_io_setdirty(chain->dio);
+       }
+#endif
+
        /*
         * We can clear the INITIAL state now, we've resolved the buffer
         * to zeros and marked it dirty with hammer2_io_new().
@@ -1301,6 +1270,8 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
        /*
         * Relocate the block, even if making it smaller (because different
         * block sizes may be in different regions).
+        *
+        * (data blocks only, we aren't copying the storage here).
         */
        hammer2_freemap_alloc(trans, chain, nbytes);
        chain->bytes = nbytes;
@@ -1313,17 +1284,6 @@ hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
         */
        KKASSERT(chain->dio == NULL);
 
-#if 0
-       /*
-        * Make sure the chain is marked MOVED and propagate the update
-        * to the root for flush.
-        */
-       if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-               hammer2_chain_ref(chain);
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
-       }
-       hammer2_chain_setsubmod(trans, chain);
-#endif
        *chainp = chain;
 }
 
@@ -1402,15 +1362,6 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
                        hammer2_chain_delete_duplicate(trans, chainp, 0);
                        chain = *chainp;
                }
-
-               /*
-                * Fall through if fchain or vchain, clearing the CHAIN_FLUSHED
-                * flag.  Basically other chains are delete-duplicated and so
-                * the duplicated chains of course will not have the FLUSHED
-                * flag set, but fchain and vchain are special-cased and the
-                * flag must be cleared when changing modify_tid.
-                */
-               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_FLUSHED);
        }
 
        /*
@@ -1424,20 +1375,20 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        }
 
        /*
-        * Otherwise do initial-chain handling
+        * Otherwise do initial-chain handling.  Set MODIFIED to indicate
+        * that the chain has been modified.  Set FLUSH_CREATE to flush
+        * the new blockref (the D-D set FLUSH_DELETE on the old chain to
+        * delete the old blockref).
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                hammer2_chain_ref(chain);
                hammer2_chain_memory_inc(chain->pmp);
        }
-#if 0
-       /* shouldn't be needed */
-       if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
+       if ((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0) {
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_CREATE);
                hammer2_chain_ref(chain);
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
        }
-#endif
 
        /*
         * The modification or re-modification requires an allocation and
@@ -1622,44 +1573,12 @@ struct hammer2_chain_find_info {
 static int hammer2_chain_find_cmp(hammer2_chain_t *child, void *data);
 static int hammer2_chain_find_callback(hammer2_chain_t *child, void *data);
 
-/*
- * DEBUGGING - Assert that the chain will not collide.
- */
-static
-void
-hammer2_chain_assert_not_present(hammer2_chain_core_t *core,
-                                hammer2_chain_t *chain)
-{
-       struct hammer2_chain_find_info info;
-       hammer2_chain_layer_t *layer;
-
-       if (chain->flags & HAMMER2_CHAIN_DELETED)
-               return;
-
-       info.best = NULL;
-       info.key_beg = chain->bref.key;
-       info.key_end = chain->bref.key +
-                      ((hammer2_key_t)1 << chain->bref.keybits) - 1;
-       info.key_next = HAMMER2_MAX_KEY;
-
-       TAILQ_FOREACH(layer, &core->layerq, entry) {
-               KKASSERT(layer->good == 0xABCD);
-               RB_SCAN(hammer2_chain_tree, &layer->rbtree,
-                       hammer2_chain_find_cmp, hammer2_chain_find_callback,
-                       &info);
-       }
-       if (info.best && (info.best->flags & HAMMER2_CHAIN_DELETED) == 0)
-               panic("hammer2_chain_assert_not_present: %p/%p\n",
-                       chain, info.best);
-}
-
 static
 hammer2_chain_t *
 hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
                          hammer2_key_t key_beg, hammer2_key_t key_end)
 {
        struct hammer2_chain_find_info info;
-       hammer2_chain_layer_t *layer;
 
        info.best = NULL;
        info.key_beg = key_beg;
@@ -1667,12 +1586,9 @@ hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
        info.key_next = *key_nextp;
 
        KKASSERT(parent->core->good == 0x1234);
-       TAILQ_FOREACH(layer, &parent->core->layerq, entry) {
-               KKASSERT(layer->good == 0xABCD);
-               RB_SCAN(hammer2_chain_tree, &layer->rbtree,
-                       hammer2_chain_find_cmp, hammer2_chain_find_callback,
-                       &info);
-       }
+       RB_SCAN(hammer2_chain_tree, &parent->core->rbtree,
+               hammer2_chain_find_cmp, hammer2_chain_find_callback,
+               &info);
        *key_nextp = info.key_next;
 #if 0
        kprintf("chain_find %p %016jx:%016jx next=%016jx\n",
@@ -1682,6 +1598,36 @@ hammer2_chain_find(hammer2_chain_t *parent, hammer2_key_t *key_nextp,
        return (info.best);
 }
 
+/*
+ * Find a deleted chain covering a block table entry.  Be careful to deal
+ * with the race condition where the block table has been updated but the
+ * chain has not yet been removed from dbtree (due to multiple parents having
+ * to be updated).
+ */
+static
+hammer2_chain_t *
+hammer2_chain_find_deleted(hammer2_chain_t *parent,
+                         hammer2_key_t key_beg, hammer2_key_t key_end)
+{
+       struct hammer2_chain_find_info info;
+       hammer2_chain_t *child;
+
+       info.best = NULL;
+       info.key_beg = key_beg;
+       info.key_end = key_end;
+       info.key_next = 0;
+
+       KKASSERT(parent->core->good == 0x1234);
+       RB_SCAN(hammer2_chain_tree, &parent->core->dbtree,
+               hammer2_chain_find_cmp, hammer2_chain_find_callback,
+               &info);
+       if ((child = info.best) != NULL) {
+               if (child->delete_tid <= parent->update_lo)
+                       child = NULL;
+       }
+       return child;
+}
+
 static
 int
 hammer2_chain_find_cmp(hammer2_chain_t *child, void *data)
@@ -1791,8 +1737,8 @@ hammer2_chain_find_callback(hammer2_chain_t *child, void *data)
  * need the parent's bref array to find our block.
  */
 hammer2_chain_t *
-hammer2_chain_get(hammer2_chain_t *parent, hammer2_blockref_t *bref,
-                 int generation)
+hammer2_chain_get(hammer2_chain_t *parent, int generation,
+                 hammer2_blockref_t *bref)
 {
        hammer2_mount_t *hmp = parent->hmp;
        hammer2_chain_core_t *above = parent->core;
@@ -1806,7 +1752,14 @@ hammer2_chain_get(hammer2_chain_t *parent, hammer2_blockref_t *bref,
        chain = hammer2_chain_alloc(hmp, parent->pmp, NULL, bref);
        hammer2_chain_core_alloc(NULL, chain, NULL);
        /* ref'd chain returned */
+
+       /*
+        * Set modify_tid and update_lo to the chain's synchronization
+        * point from the media.
+        */
        chain->modify_tid = chain->bref.mirror_tid;
+       chain->update_lo = chain->bref.mirror_tid;
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_BMAPPED);
 
        /*
         * Link the chain into its parent.  A spinlock is required to safely
@@ -1820,7 +1773,9 @@ hammer2_chain_get(hammer2_chain_t *parent, hammer2_blockref_t *bref,
                                     HAMMER2_CHAIN_INSERT_RACE,
                                     generation);
        if (error) {
-               KKASSERT((chain->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
+               KKASSERT((chain->flags & (HAMMER2_CHAIN_ONRBTREE |
+                                         HAMMER2_CHAIN_ONDBTREE |
+                                         HAMMER2_CHAIN_ONDBQ)) == 0);
                kprintf("chain %p get race\n", chain);
                hammer2_chain_drop(chain);
                chain = NULL;
@@ -2093,7 +2048,8 @@ again:
        spin_lock(&above->cst.spin);
        chain = hammer2_combined_find(parent, base, count,
                                      cache_indexp, key_nextp,
-                                     key_beg, key_end, &bref);
+                                     key_beg, key_end,
+                                     &bref);
        generation = above->generation;
 
        /*
@@ -2114,7 +2070,8 @@ again:
        if (chain == NULL) {
                bcopy = *bref;
                spin_unlock(&above->cst.spin);
-               chain = hammer2_chain_get(parent, &bcopy, generation);
+               chain = hammer2_chain_get(parent, generation,
+                                         &bcopy);
                if (chain == NULL) {
                        kprintf("retry lookup parent %p keys %016jx:%016jx\n",
                                parent, key_beg, key_end);
@@ -2398,7 +2355,8 @@ again:
        spin_lock(&above->cst.spin);
        chain = hammer2_combined_find(parent, base, count,
                                      cache_indexp, &next_key,
-                                     key, HAMMER2_MAX_KEY, &bref);
+                                     key, HAMMER2_MAX_KEY,
+                                     &bref);
        generation = above->generation;
 
        /*
@@ -2416,7 +2374,7 @@ again:
        if (chain == NULL) {
                bcopy = *bref;
                spin_unlock(&above->cst.spin);
-               chain = hammer2_chain_get(parent, &bcopy, generation);
+               chain = hammer2_chain_get(parent, generation, &bcopy);
                if (chain == NULL) {
                        kprintf("retry scan parent %p keys %016jx\n",
                                parent, key);
@@ -2605,11 +2563,9 @@ hammer2_chain_create(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                 *
                 * Do NOT mess with the current state of the INITIAL flag.
                 */
-               KKASSERT(chain->modify_tid > parent->bref.mirror_tid);
                KKASSERT(chain->modify_tid == trans->sync_tid);
                chain->bref.key = key;
                chain->bref.keybits = keybits;
-               /* chain->modify_tid = chain->bref.mirror_tid; */
                KKASSERT(chain->above == NULL);
        }
 
@@ -2693,14 +2649,11 @@ again:
        }
 
        /*
-        * Link the chain into its parent.  Later on we will have to set
-        * the MOVED bit in situations where we don't mark the new chain
-        * as being modified.
+        * Link the chain into its parent.
         */
        if (chain->above != NULL)
                panic("hammer2: hammer2_chain_create: chain already connected");
        KKASSERT(chain->above == NULL);
-       KKASSERT((chain->flags & HAMMER2_CHAIN_DELETED) == 0);
        hammer2_chain_insert(above, NULL, chain,
                             HAMMER2_CHAIN_INSERT_SPIN |
                             HAMMER2_CHAIN_INSERT_LIVE,
@@ -2708,7 +2661,8 @@ again:
 
        if (allocated) {
                /*
-                * Mark the newly created chain modified.
+                * Mark the newly created chain modified.  This will cause
+                * FLUSH_CREATE to be set.
                 *
                 * Device buffers are not instantiated for DATA elements
                 * as these are handled by logical buffers.
@@ -2741,13 +2695,14 @@ again:
                }
        } else {
                /*
-                * When reconnecting a chain we must set MOVED and setsubmod
-                * so the flush recognizes that it must update the bref in
-                * the parent.
+                * When reconnecting a chain we must set FLUSH_CREATE and
+                * setsubmod so the flush recognizes that it must update
+                * the bref in the parent.
                 */
-               if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
+               if ((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0) {
                        hammer2_chain_ref(chain);
-                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
+                       atomic_set_int(&chain->flags,
+                                      HAMMER2_CHAIN_FLUSH_CREATE);
                }
        }
        hammer2_chain_setsubmod(trans, chain);
@@ -2819,16 +2774,15 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
        hmp = ochain->hmp;
        KKASSERT(snapshot == 1 || (ochain->flags & HAMMER2_CHAIN_DELETED));
 
-       atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
-
        /*
         * Now create a duplicate of the chain structure, associating
         * it with the same core, making it the same size, pointing it
         * to the same bref (the same media block).
         *
-        * Give the duplicate the same modify_tid that we previously
-        * ensured was sufficiently advanced to trigger a block table
-        * insertion on flush.
+        * Give nchain the same modify_tid that we previously ensured was
+        * sufficiently advanced to trigger a block table insertion on flush.
+        *
+        * nchain copies ochain's data and must inherit ochain->update_lo.
         *
         * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
         *       hammer2_chain_alloc()
@@ -2846,11 +2800,14 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
        nchain->bytes = bytes;
        nchain->modify_tid = ochain->modify_tid;
+       nchain->update_lo = ochain->update_lo;
        nchain->inode_reason = ochain->inode_reason + 0x100000;
-       if (ochain->flags & HAMMER2_CHAIN_INITIAL)
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_INITIAL);
-       if (ochain->flags & HAMMER2_CHAIN_UNLINKED)
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_UNLINKED);
+       atomic_set_int(&nchain->flags,
+                      ochain->flags & (HAMMER2_CHAIN_INITIAL |
+                                       HAMMER2_CHAIN_FORCECOW |
+                                       HAMMER2_CHAIN_UNLINKED));
+       if (ochain->modify_tid == trans->sync_tid)
+               atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
 
        /*
         * Switch from ochain to nchain
@@ -2885,7 +2842,8 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
 
        /*
         * If parent is not NULL the duplicated chain will be entered under
-        * the parent and the MOVED bit set.
+        * the parent and the FLUSH_CREATE bit set to tell flush to update
+        * the blockref.
         *
         * Having both chains locked is extremely important for atomicy.
         */
@@ -2900,28 +2858,85 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
                                     nchain->bref.type, nchain->bytes);
                parent = NULL;
 
-               if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-                       hammer2_chain_ref(nchain);
-                       atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
-               }
+               KKASSERT(nchain->flags & HAMMER2_CHAIN_FLUSH_CREATE);
                hammer2_chain_setsubmod(trans, nchain);
        }
 
-#if 0
+       *chainp = nchain;
+}
+
+/*
+ * Helper function for deleting chains.
+ *
+ * The chain is removed from the live view (the RBTREE).
+ *
+ * If appropriate, the chain is added to the shadow topology and FLUSH_DELETE
+ * is set for flusher visbility.  The caller is responsible for calling
+ * setsubmod on chain, so we do not adjust update_hi here.
+ */
+static void
+_hammer2_chain_delete_helper(hammer2_trans_t *trans,
+                            hammer2_chain_core_t *above,
+                            hammer2_chain_t *chain)
+{
+       hammer2_mount_t *hmp;
+       hammer2_chain_t *xchain;
+
+       KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
+       KKASSERT(trans->sync_tid >= chain->modify_tid);
+       KKASSERT((chain->flags & (HAMMER2_CHAIN_DELETED |
+                                 HAMMER2_CHAIN_ONDBQ |
+                                 HAMMER2_CHAIN_ONDBTREE |
+                                 HAMMER2_CHAIN_FLUSH_DELETE)) == 0);
+
        /*
-        * Unconditionally set MOVED to force the parent blockrefs to
-        * update, and adjust update_hi below nchain so nchain's
-        * blockrefs are updated with the new attachment.
+        * Flag as deleted, reduce live_count and bump the above core's
+        * generation.
         */
-       if (nchain->core->update_hi < trans->sync_tid) {
-               spin_lock(&nchain->core->cst.spin);
-               if (nchain->core->update_hi < trans->sync_tid)
-                       nchain->core->update_hi = trans->sync_tid;
-               spin_unlock(&nchain->core->cst.spin);
-       }
-#endif
+       chain->delete_tid = trans->sync_tid;
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
+       atomic_add_int(&above->live_count, -1);
+       ++above->generation;
+       hmp = chain->hmp;
 
-       *chainp = nchain;
+       /*
+        * Remove from live tree
+        */
+       RB_REMOVE(hammer2_chain_tree, &above->rbtree, chain);
+       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONRBTREE);
+
+       if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
+               /*
+                * If the chain was originally bmapped we must place on the
+                * deleted tree and set FLUSH_DELETE (+ref) to prevent
+                * destruction of the chain until the flush can reconcile
+                * the parent's block table.
+                *
+                * NOTE! DBTREE is only representitive of the live view,
+                *       the flush must check both DBTREE and DBQ.
+                */
+               xchain = RB_INSERT(hammer2_chain_tree, &above->dbtree, chain);
+               KKASSERT(xchain == NULL);
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONDBTREE);
+
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_DELETE);
+               hammer2_chain_ref(chain);
+       } else {
+               /*
+                * If the chain no longer (and never had) an actual blockmap
+                * entry we must place it on the dbq list and set FLUSH_DELETE
+                * (+ref) to prevent destruction of the chain until the flush
+                * can reconcile the parent's block table.
+                *
+                * NOTE! DBTREE is only representitive of the live view,
+                *       the flush must check both DBTREE and DBQ.
+                */
+               TAILQ_INSERT_TAIL(&above->dbq, chain, db_entry);
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_ONDBQ);
+
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_DELETE);
+               hammer2_chain_ref(chain);
+       }
 }
 
 /*
@@ -2930,11 +2945,9 @@ hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
  * with a duplicate.  Atomicy is at the very-fine spin-lock level in
  * order to ensure that lookups do not race us.
  *
- * If the old chain is already marked deleted the new chain will also be
- * marked deleted.  This case can occur when an inode is removed from the
- * filesystem but programs still have an open descriptor to it, and during
- * flushes when the flush needs to operate on a chain that is deleted in
- * the live view but still alive in the flush view.
+ * The flush code will sometimes call this function with a deleted chain.
+ * In this situation the old chain's memory is reallocated without
+ * duplicating it.
  *
  * The new chain will be marked modified for the current transaction.
  */
@@ -2969,8 +2982,21 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
         * In the case where nchain inherits ochains core, nchain is
         * effectively locked due to ochain being locked (and sharing the
         * core), until we can give nchain its own official ock.
+        *
+        * WARNING! Flusher concurrency can create two cases.  The first is
+        *          that the flusher might be working on a chain that has
+        *          been deleted in the live view but is live in the flusher's
+        *          view.  In the second case the flusher may be duplicating
+        *          a forward-transacted chain.  In both situations nchain
+        *          must be marked deleted.
+        *
+        * WARNING! hammer2_chain_core_alloc() also acts on these issues.
         */
        nchain = hammer2_chain_alloc(hmp, ochain->pmp, trans, &ochain->bref);
+       if ((ochain->flags & HAMMER2_CHAIN_DELETED) ||
+           (ochain->modify_tid > trans->sync_tid)) {
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DELETED);
+       }
        if (flags & HAMMER2_DELDUP_RECORE)
                hammer2_chain_core_alloc(trans, nchain, NULL);
        else
@@ -2982,17 +3008,17 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        nchain->bytes = bytes;
 
        /*
-        * Duplicate inherits ochain's live state including its modification
+        * nchain inherits ochain's live state including its modification
         * state.  This function disposes of the original.  Because we are
         * doing this in-place under the same parent the block array
         * inserted/deleted state does not change.
         *
-        * The caller isn't expected to make further modifications of ochain
-        * but set the FORCECOW bit anyway, just in case it does.  If ochain
-        * was previously marked FORCECOW we also flag nchain FORCECOW
-        * (used during hardlink splits).  FORCECOW forces a reallocation
-        * of the block when we modify the chain a little later, it does
-        * not force another delete-duplicate.
+        * nchain copies ochain's data and must inherit ochain->update_lo.
+        *
+        * If ochain was previously marked FORCECOW we also flag nchain
+        * FORCECOW (used during hardlink splits).  FORCECOW forces a
+        * reallocation of the block when we modify the chain a little later,
+        * it does not force another delete-duplicate.
         *
         * NOTE: bref.mirror_tid duplicated by virtue of bref copy in
         *       hammer2_chain_alloc()
@@ -3003,106 +3029,73 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
                       ochain->flags & (HAMMER2_CHAIN_INITIAL |
                                        HAMMER2_CHAIN_FORCECOW |
                                        HAMMER2_CHAIN_UNLINKED));
-       atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
+       if (ochain->modify_tid == trans->sync_tid)
+               atomic_set_int(&ochain->flags, HAMMER2_CHAIN_FORCECOW);
        nchain->inode_reason = ochain->inode_reason + 0x1000;
+       nchain->update_lo = ochain->update_lo;
+       nchain->dsrc = ochain->bref;                    /* DEBUG */
+       nchain->dsrc_dupfromat = trans->sync_tid;       /* DEBUG */
+       nchain->dsrc_dupfromflags = trans->flags;       /* DEBUG */
+       nchain->dsrc_reason = ochain->inode_reason;     /* DEBUG */
+       nchain->dsrc_ninserts = ochain->ninserts;       /* DEBUG */
+       nchain->dsrc_flags = ochain->flags;             /* DEBUG */
+       nchain->dsrc_modify = ochain->modify_tid;       /* DEBUG */
+       nchain->dsrc_delete = ochain->delete_tid;       /* DEBUG */
+       nchain->dsrc_update_lo = ochain->update_lo;     /* DEBUG */
+       nchain->dsrc_original = ochain;                 /* DEBUG */
 
        /*
         * Lock nchain so both chains are now locked (extremely important
-        * for atomicy).  Mark ochain deleted and reinsert into the topology
-        * and insert nchain all in one go.
-        *
-        * If the ochain is already deleted it is left alone and nchain
-        * is inserted into the topology as a deleted chain.  This is
-        * important because it allows ongoing operations to be executed
-        * on a deleted inode which still has open descriptors.
-        *
-        * The deleted case can also occur when a flush delete-duplicates
-        * a node which is being concurrently modified by ongoing operations
-        * in a later transaction.  This creates a problem because the flush
-        * is intended to update blockrefs which then propagate, allowing
-        * the original covering in-memory chains to be freed up.  In this
-        * situation the flush code does NOT free the original covering
-        * chains and will re-apply them to successive copies.
+        * for atomicy).  The shared core allows us to unlock ochain without
+        * actually unlocking ochain.
         */
        hammer2_chain_lock(nchain, HAMMER2_RESOLVE_NEVER);
        /* extra ref still present from original allocation */
 
-       KKASSERT(ochain->flags & HAMMER2_CHAIN_ONRBTREE);
+       KKASSERT(ochain->flags & (HAMMER2_CHAIN_ONRBTREE |
+                                 HAMMER2_CHAIN_ONDBTREE |
+                                 HAMMER2_CHAIN_ONDBQ));
        spin_lock(&above->cst.spin);
-       KKASSERT(ochain->flags & HAMMER2_CHAIN_ONRBTREE);
 
-       /*
-        * Ultimately nchain->modify_tid will be set to trans->sync_tid,
-        * but we can't do that here because we want to call
-        * hammer2_chain_modify() to reallocate the block (if necessary).
-        */
        nchain->modify_tid = ochain->modify_tid;
+       nchain->delete_tid = HAMMER2_MAX_TID;
 
-       if (ochain->flags & HAMMER2_CHAIN_DELETED) {
+       if (nchain->flags & HAMMER2_CHAIN_DELETED) {
                /*
-                * ochain was deleted
+                * Special case, used by the flush code only in two cases:
+                *
+                * (1) The flush must operate on a chain that is visible to
+                *     the flush but deleted in the live view.
+                *
+                * (2) The flush must operate on a forward-indexed chain in
+                *     the live view, which typically
+                *
+                * In these situations nchain will be marked deleted and
+                * insert before ochain.  nchain must inherit certain features
+                * of ochain such as the BMAPPED state.
                 */
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_DELETED);
-               if (ochain->delete_tid > trans->sync_tid) {
-                       /*
-                        * delete-duplicate a chain deleted in a later
-                        * transaction.  Only allowed on chains created
-                        * before or during the current transaction (flush
-                        * code should filter out chains created after the
-                        * current transaction).
-                        *
-                        * To make this work is a bit of a hack.  We convert
-                        * ochain's delete_tid to the current sync_tid and
-                        * create a nchain which sets up ochains original
-                        * delete_tid.
-                        *
-                        * This effectively forces ochain to flush as a
-                        * deletion and nchain as a creation.  Thus MOVED
-                        * must be set in ochain (it should already be
-                        * set since it's original delete_tid could not
-                        * have been flushed yet).  Since ochain's delete_tid
-                        * has been moved down to sync_tid, a re-flush at
-                        * sync_tid won't try to delete-duplicate ochain
-                        * again.
-                        */
-                       KKASSERT(ochain->modify_tid <= trans->sync_tid);
+               KKASSERT(trans->flags & HAMMER2_TRANS_ISFLUSH);
+               KKASSERT(ochain->modify_tid < trans->sync_tid);
+               KKASSERT(ochain->delete_tid > trans->sync_tid);
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_FLUSH_TEMPORARY);
+               hammer2_chain_insert(above, ochain, nchain, 0, 0);
+
+               if ((ochain->flags & HAMMER2_CHAIN_DELETED) &&
+                   ochain->modify_tid < trans->sync_tid) {
                        nchain->delete_tid = ochain->delete_tid;
                        ochain->delete_tid = trans->sync_tid;
-                       KKASSERT(ochain->flags & HAMMER2_CHAIN_MOVED);
-               } else if (ochain->delete_tid == trans->sync_tid) {
-                       /*
-                        * ochain was deleted in the current transaction
-                        */
-                       nchain->delete_tid = trans->sync_tid;
-               } else {
-                       /*
-                        * ochain was deleted in a prior transaction.
-                        * create and delete nchain in the current
-                        * transaction.
-                        *
-                        * (delete_tid might represent a deleted inode
-                        *  which still has an open descriptor).
-                        */
-                       nchain->delete_tid = trans->sync_tid;
+               } else if (ochain->modify_tid > trans->sync_tid) {
+                       nchain->delete_tid = ochain->modify_tid;
                }
-               hammer2_chain_insert(above, ochain->inlayer, nchain, 0, 0);
        } else {
                /*
-                * ochain was not deleted, delete it in the current
-                * transaction.
+                * Normal case, delete-duplicate deletes ochain and nchain
+                * is the new live chain.
                 */
-               KKASSERT(trans->sync_tid >= ochain->modify_tid);
-               ochain->delete_tid = trans->sync_tid;
-               atomic_set_int(&ochain->flags, HAMMER2_CHAIN_DELETED);
-               atomic_add_int(&above->live_count, -1);
-               hammer2_chain_insert(above, NULL, nchain,
+               _hammer2_chain_delete_helper(trans, above, ochain);
+               hammer2_chain_insert(above, ochain, nchain,
                                     HAMMER2_CHAIN_INSERT_LIVE, 0);
        }
-
-       if ((ochain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-               hammer2_chain_ref(ochain);
-               atomic_set_int(&ochain->flags, HAMMER2_CHAIN_MOVED);
-       }
        spin_unlock(&above->cst.spin);
 
        /*
@@ -3110,6 +3103,7 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
         * a buffer cache buffer, so we need to release it so nchain can
         * potentially obtain it.
         */
+       hammer2_chain_setsubmod(trans, ochain);
        hammer2_chain_unlock(ochain);
 
        /*
@@ -3135,22 +3129,24 @@ hammer2_chain_delete_duplicate(hammer2_trans_t *trans, hammer2_chain_t **chainp,
        hammer2_chain_drop(nchain);
 
        /*
-        * Unconditionally set MOVED to force the parent blockrefs to
+        * Unconditionally set FLUSH_CREATE to force the parent blockrefs to
         * update as the chain_modify() above won't necessarily do it.
-        *
-        * Adjust update_hi below nchain so nchain's blockrefs are updated
-        * with the new attachment.
         */
-       if ((nchain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_MOVED);
+       if ((nchain->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0) {
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_FLUSH_CREATE);
                hammer2_chain_ref(nchain);
        }
-#if 0
-       if (nchain->core->update_hi < trans->sync_tid) {
-               spin_lock(&nchain->core->cst.spin);
-               if (nchain->core->update_hi < trans->sync_tid)
-                       nchain->core->update_hi = trans->sync_tid;
-               spin_unlock(&nchain->core->cst.spin);
+
+       /*
+        * If nchain is in a DELETED state we must set FLUSH_DELETE
+        */
+       if (nchain->flags & HAMMER2_CHAIN_DELETED)
+               KKASSERT((nchain->flags & HAMMER2_CHAIN_FLUSH_DELETE) == 0);
+#if 1
+       if ((nchain->flags & HAMMER2_CHAIN_FLUSH_DELETE) == 0 &&
+           (nchain->flags & HAMMER2_CHAIN_DELETED)) {
+               atomic_set_int(&nchain->flags, HAMMER2_CHAIN_FLUSH_DELETE);
+               hammer2_chain_ref(nchain);
        }
 #endif
        hammer2_chain_setsubmod(trans, nchain);
@@ -3432,9 +3428,6 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
         * We have to mark it modified to allocate its block, but use
         * OPTDATA to allow it to remain in the INITIAL state.  Otherwise
         * it won't be acted upon by the flush code.
-        *
-        * XXX leave the node unmodified, depend on the update_hi
-        * flush to assign and modify parent blocks.
         */
        hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);
 
@@ -3509,7 +3502,7 @@ hammer2_chain_create_indirect(hammer2_trans_t *trans, hammer2_chain_t *parent,
                         */
                        bcopy = *bref;
                        spin_unlock(&above->cst.spin);
-                       chain = hammer2_chain_get(parent, &bcopy, generation);
+                       chain = hammer2_chain_get(parent, generation, &bcopy);
                        if (chain == NULL) {
                                reason = 1;
                                spin_lock(&above->cst.spin);
@@ -3580,9 +3573,9 @@ next_key_spinlocked:
         * cleared out some entries in the parent.  We calculated a good
         * insertion index in the loop above (ichain->index).
         *
-        * We don't have to set MOVED here because we mark ichain modified
-        * down below (so the normal modified -> flush -> set-moved sequence
-        * applies).
+        * We don't have to set FLUSH_CREATE here because we mark ichain
+        * modified down below (so the normal modified -> flush -> set-moved
+        * sequence applies).
         *
         * The insertion shouldn't race as this is a completely new block
         * and the parent is locked.
@@ -3599,19 +3592,8 @@ next_key_spinlocked:
         * also allocate the physical block in ichain for our caller,
         * and assign ichain->data to a pre-zero'd space (because there
         * is not prior data to copy into it).
-        *
-        * We have to set update_hi in ichain's flags manually so the
-        * flusher knows it has to recurse through it to get to all of
-        * our moved blocks, then call setsubmod() to set the bit
-        * recursively.
         */
        /*hammer2_chain_modify(trans, &ichain, HAMMER2_MODIFY_OPTDATA);*/
-       if (ichain->core->update_hi < trans->sync_tid) {
-               spin_lock(&ichain->core->cst.spin);
-               if (ichain->core->update_hi < trans->sync_tid)
-                       ichain->core->update_hi = trans->sync_tid;
-               spin_unlock(&ichain->core->cst.spin);
-       }
        hammer2_chain_setsubmod(trans, ichain);
 
        /*
@@ -3685,7 +3667,8 @@ hammer2_chain_indkey_freemap(hammer2_chain_t *parent, hammer2_key_t *keyp,
                }
                chain = hammer2_combined_find(parent, base, count,
                                              &cache_index, &key_next,
-                                             key_beg, key_end, &bref);
+                                             key_beg, key_end,
+                                             &bref);
 
                /*
                 * Exhausted search
@@ -3797,7 +3780,8 @@ hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
                }
                chain = hammer2_combined_find(parent, base, count,
                                              &cache_index, &key_next,
-                                             key_beg, key_end, &bref);
+                                             key_beg, key_end,
+                                             &bref);
 
                /*
                 * Exhausted search
@@ -3916,7 +3900,7 @@ hammer2_chain_indkey_normal(hammer2_chain_t *parent, hammer2_key_t *keyp,
 }
 
 /*
- * Sets CHAIN_DELETED and CHAIN_MOVED in the chain being deleted and
+ * Sets CHAIN_DELETED and CHAIN_FLUSH_DELETE in the chain being deleted and
  * set chain->delete_tid.  The chain is not actually marked possibly-free
  * in the freemap until the deletion is completely flushed out (because
  * a flush which doesn't cover the entire deletion is flushing the deleted
@@ -3963,61 +3947,17 @@ hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain, int flags)
         * We need the spinlock on the core whos RBTREE contains chain
         * to protect against races.
         */
-       KKASSERT(chain->flags & HAMMER2_CHAIN_ONRBTREE);
        spin_lock(&chain->above->cst.spin);
-
-       KKASSERT(trans->sync_tid >= chain->modify_tid);
-       chain->delete_tid = trans->sync_tid;
-       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
-       atomic_add_int(&chain->above->live_count, -1);
-       ++chain->above->generation;
-
-       /*
-        * We must set MOVED along with DELETED for the flush code to
-        * recognize the operation and properly disconnect the chain
-        * in-memory.
-        */
-       if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-               hammer2_chain_ref(chain);
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
-       }
+       _hammer2_chain_delete_helper(trans, chain->above, chain);
        spin_unlock(&chain->above->cst.spin);
 
        hammer2_chain_setsubmod(trans, chain);
 }
 
-/*
- * Called with the core spinlock held to check for freeable layers.
- * Used by the flush code.  Layers can wind up not being freed due
- * to the temporary layer->refs count.  This function frees up any
- * layers that were missed.
- */
-void
-hammer2_chain_layer_check_locked(hammer2_mount_t *hmp,
-                                hammer2_chain_core_t *core)
-{
-       hammer2_chain_layer_t *layer;
-       hammer2_chain_layer_t *tmp;
-
-       tmp = TAILQ_FIRST(&core->layerq);
-       while ((layer = tmp) != NULL) {
-               tmp = TAILQ_NEXT(tmp, entry);
-               if (layer->refs == 0 && RB_EMPTY(&layer->rbtree)) {
-                       TAILQ_REMOVE(&core->layerq, layer, entry);
-                       if (tmp)
-                               ++tmp->refs;
-                       spin_unlock(&core->cst.spin);
-                       kfree(layer, hmp->mchain);
-                       spin_lock(&core->cst.spin);
-                       if (tmp)
-                               --tmp->refs;
-               }
-       }
-}
-
 /*
  * Returns the index of the nearest element in the blockref array >= elm.
- * Returns (count) if no element could be found.
+ * Returns (count) if no element could be found.  If delete_filter is non-zero
+ * the scan filters out any blockrefs which match deleted chains on dbtree.
  *
  * Sets *key_nextp to the next key for loop purposes but does not modify
  * it if the next key would be higher than the current value of *key_nextp.
@@ -4030,12 +3970,13 @@ hammer2_chain_layer_check_locked(hammer2_mount_t *hmp,
  * The spin lock on the related chain must be held.
  */
 int
-hammer2_base_find(hammer2_chain_t *chain,
+hammer2_base_find(hammer2_chain_t *parent,
                  hammer2_blockref_t *base, int count,
                  int *cache_indexp, hammer2_key_t *key_nextp,
-                 hammer2_key_t key_beg, hammer2_key_t key_end)
+                 hammer2_key_t key_beg, hammer2_key_t key_end,
+                 int delete_filter)
 {
-       hammer2_chain_core_t *core = chain->core;
+       hammer2_chain_core_t *core = parent->core;
        hammer2_blockref_t *scan;
        hammer2_key_t scan_end;
        int i;
@@ -4045,7 +3986,7 @@ hammer2_base_find(hammer2_chain_t *chain,
         * Require the live chain's already have their core's counted
         * so we can optimize operations.
         */
-        KKASSERT((chain->flags & HAMMER2_CHAIN_DUPLICATED) ||
+        KKASSERT((parent->flags & HAMMER2_CHAIN_DUPLICATED) ||
                 core->flags & HAMMER2_CORE_COUNTEDBREFS);
 
        /*
@@ -4063,7 +4004,7 @@ hammer2_base_find(hammer2_chain_t *chain,
         */
        i = *cache_indexp;
        cpu_ccfence();
-       if (chain->flags & HAMMER2_CHAIN_DUPLICATED)
+       if (parent->flags & HAMMER2_CHAIN_DUPLICATED)
                limit = count;
        else
                limit = core->live_zero;
@@ -4090,12 +4031,21 @@ hammer2_base_find(hammer2_chain_t *chain,
         */
        while (i < count) {
                if (scan->type != 0) {
-                       if (scan->key > key_beg)
-                               break;
                        scan_end = scan->key +
                                   ((hammer2_key_t)1 << scan->keybits) - 1;
-                       if (scan_end >= key_beg)
-                               break;
+                       if (scan->key > key_beg || scan_end >= key_beg) {
+                               /*
+                                * Check to see if the entry is covered by
+                                * a deleted chain and ignore the entry if
+                                * it is and delete_filter != 0.
+                                */
+                               if (delete_filter == 0)
+                                       break;
+                               if (hammer2_chain_find_deleted(
+                                       parent, scan->key, scan_end) == NULL) {
+                                       break;
+                               }
+                       }
                }
                if (i >= limit)
                        return (count);
@@ -4124,11 +4074,15 @@ hammer2_base_find(hammer2_chain_t *chain,
  * both cases, or sets it to NULL if the search exhausted.  Only returns
  * a non-NULL chain if the search matched from the in-memory chain.
  *
+ * When no in-memory chain has been found and a non-NULL bref is returned
+ * in *bresp.
+ *
  * Must be called with above's spinlock held.  Spinlock remains held
  * through the operation.
  *
  * The returned chain is not locked or referenced.  Use the returned bref
- * to determine if the search exhausted or not.
+ * to determine if the search exhausted or not.  Iterate if the base find
+ * is chosen but matches a deleted chain.
  */
 static hammer2_chain_t *
 hammer2_combined_find(hammer2_chain_t *parent,
@@ -4141,9 +4095,12 @@ hammer2_combined_find(hammer2_chain_t *parent,
        hammer2_chain_t *chain;
        int i;
 
+       /*
+        * Lookup in block array and in rbtree.
+        */
        *key_nextp = key_end + 1;
        i = hammer2_base_find(parent, base, count, cache_indexp,
-                             key_nextp, key_beg, key_end);
+                             key_nextp, key_beg, key_end, 1);
        chain = hammer2_chain_find(parent, key_nextp, key_beg, key_end);
 
        /*
@@ -4151,11 +4108,11 @@ hammer2_combined_find(hammer2_chain_t *parent,
         */
        if (i == count && chain == NULL) {
                *bresp = NULL;
-               return(chain);  /* NULL */
+               return(NULL);
        }
 
        /*
-        * Only chain matched
+        * Only chain matched.
         */
        if (i == count) {
                bref = &chain->bref;
@@ -4171,38 +4128,27 @@ hammer2_combined_find(hammer2_chain_t *parent,
        }
 
        /*
-        * Both in-memory and blockref match.  Select the nearer element.
-        * If both are flush with the left-hand side they are considered
-        * to be the same distance.
+        * Both in-memory and blockref matched, select the nearer element.
         *
-        * When both are the same distance away select the chain if it is
-        * live or if it's delete_tid is greater than the parent's
-        * synchronized bref.mirror_tid (a single test suffices for both
-        * conditions), otherwise select the element.
-        *
-        * (It is possible for an old deletion to linger after a rename-over
-        *  and flush, which would make the media copy the correct choice).
-        */
-
-       /*
-        * Either both are flush with the left-hand side or they are the
-        * same distance away.  Select the chain if it is not deleted
-        * or it has a higher delete_tid, else select the media.
+        * If both are flush with the left-hand side or both are the
+        * same distance away, select the chain.  In this situation the
+        * chain must have been loaded from the matching blockmap.
         */
        if ((chain->bref.key <= key_beg && base[i].key <= key_beg) ||
            chain->bref.key == base[i].key) {
-               if (chain->delete_tid > base[i].mirror_tid) {
-                       bref = &chain->bref;
-               } else {
-                       KKASSERT(chain->flags & HAMMER2_CHAIN_DELETED);
-                       bref = &base[i];
-                       chain = NULL;
+               KKASSERT(chain->bref.key == base[i].key);
+               if ((chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
+                       kprintf("chain not bmapped %p.%d %08x\n", chain, chain->bref.type, chain->flags);
+                       kprintf("in chain mod/del %016jx %016jx\n", chain->modify_tid, chain->delete_tid);
+                       kprintf("and updlo/hi %016jx %016jx\n", chain->update_lo, chain->update_hi);
                }
+               KKASSERT(chain->flags & HAMMER2_CHAIN_BMAPPED);
+               bref = &chain->bref;
                goto found;
        }
 
        /*
-        * Select the nearer key.
+        * Select the nearer key
         */
        if (chain->bref.key < base[i].key) {
                bref = &chain->bref;
@@ -4251,17 +4197,21 @@ hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *parent,
         */
        key_next = 0; /* max range */
        i = hammer2_base_find(parent, base, count, cache_indexp,
-                             &key_next, elm->key, elm->key);
+                             &key_next, elm->key, elm->key, 0);
        if (i == count || base[i].type == 0 ||
            base[i].key != elm->key || base[i].keybits != elm->keybits) {
-               panic("delete base %p element not found at %d/%d elm %p\n",
-                     base, i, count, elm);
+               spin_unlock(&core->cst.spin);
+               panic("delete base %p element not found at %d/%d elm %p\n"
+                     "child ino_reason=%08x\n",
+                     base, i, count, elm,
+                     child->inode_reason);
                return;
        }
        bzero(&base[i], sizeof(*base));
-       base[i].mirror_tid = (intptr_t)parent;          /* debug */
-       base[i].modify_tid = (intptr_t)child;           /* debug */
-       base[i].check.debug.sync_tid = trans->sync_tid; /* debug */
+       base[i].mirror_tid = (intptr_t)parent;          /* MEDIA DEBUG */
+       base[i].modify_tid = (intptr_t)child;           /* MEDIA DEBUG */
+       base[i].check.debug.sync_tid = trans->sync_tid; /* MEDIA DEBUG */
+       ++parent->nremoves;                             /* DEBUG */
 
        /*
         * We can only optimize core->live_zero for live chains.
@@ -4308,7 +4258,7 @@ hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
         */
        key_next = 0; /* max range */
        i = hammer2_base_find(parent, base, count, cache_indexp,
-                             &key_next, elm->key, elm->key);
+                             &key_next, elm->key, elm->key, 0);
 
        /*
         * Shortcut fill optimization, typical ordered insertion(s) may not
@@ -4323,14 +4273,23 @@ hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
                if ((parent->flags & HAMMER2_CHAIN_DUPLICATED) == 0) {
                        i = core->live_zero++;
                        base[i] = *elm;
+       ++parent->ninserts;     /* DEBUG */
                        return;
                }
        }
 
        xkey = elm->key + ((hammer2_key_t)1 << elm->keybits) - 1;
        if (i != count && (base[i].key < elm->key || xkey >= base[i].key)) {
-               panic("insert base %p overlapping elements at %d elm %p\n",
-                     base, i, elm);
+               if (child->flags & HAMMER2_CHAIN_FLUSH_TEMPORARY) {
+                       kprintf("child %p special replace\n", child);
+                       base[i] = *elm;
+                       return;
+               } else {
+                       spin_unlock(&core->cst.spin);
+                       panic("insert base %p overlapping "
+                             "elements at %d elm %p\n",
+                             base, i, elm);
+               }
        }
 
        /*
@@ -4348,6 +4307,7 @@ hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
                                      (i - j - 1) * sizeof(*base));
                                base[i - 1] = *elm;
                        }
+       ++parent->ninserts;     /* DEBUG */
                        goto validate;
                }
                ++k;
@@ -4365,6 +4325,7 @@ hammer2_base_insert(hammer2_trans_t *trans __unused, hammer2_chain_t *parent,
                                        core->live_zero = k + 1;
                        }
                        u = 2;
+       ++parent->ninserts;     /* DEBUG */
                        goto validate;
                }
        }
@@ -4576,6 +4537,7 @@ hammer2_chain_memory_wakeup(hammer2_pfsmount_t *pmp)
                        break;
                }
        }
+
        if (waiting & HAMMER2_DIRTYCHAIN_WAITING)
                wakeup(&pmp->inmem_dirty_chains);
 }
index ec62c81..558f0ab 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
index f07e8c2..7091aae 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -56,7 +56,6 @@ struct hammer2_flush_info {
        hammer2_trans_t *trans;
        int             depth;
        int             diddeferral;
-       int             pass;
        int             cache_index;
        int             domodify;
        struct h2_flush_deferral_list flush_list;
@@ -65,15 +64,16 @@ struct hammer2_flush_info {
 
 typedef struct hammer2_flush_info hammer2_flush_info_t;
 
-static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
-                               hammer2_chain_t **chainp);
-static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
-static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);
-static void hammer2_flush_core_update(hammer2_chain_core_t *core,
-                         hammer2_flush_info_t *info);
+static void hammer2_flush_core(hammer2_flush_info_t *info,
+                               hammer2_chain_t **chainp, int deleting);
+static int hammer2_flush_pass1(hammer2_chain_t *child, void *data);
+static int hammer2_flush_pass2(hammer2_chain_t *child, void *data);
+static int hammer2_flush_pass3(hammer2_chain_t *child, void *data);
+static int hammer2_flush_pass4(hammer2_chain_t *child, void *data);
 static void hammer2_rollup_stats(hammer2_chain_t *parent,
                                hammer2_chain_t *child, int how);
 
+
 /*
  * Can we ignore a chain for the purposes of flushing modifications
  * to the media?
@@ -130,11 +130,6 @@ hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
  * buffer-cache flush will run either before or after the current pending
  * flush depending on its state.
  *
- * sync_tid vs real_tid.  For flush transactions ONLY, the flush operation
- * actually uses two transaction ids, one for the flush operation itself,
- * and <N+1> for any freemap allocations made as a side-effect.  real_tid
- * is fixed at <N>, sync_tid is adjusted dynamically as-needed.
- *
  * NOTE: The sync_tid for a flush's freemap allocation will match the
  *      sync_tid of the following <concurrent_fs_ops> transaction(s).
  *      The freemap topology will be out-of-step by one transaction id
@@ -171,13 +166,14 @@ hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
                 * We queue ourselves and then wait to become the head
                 * of the queue, allowing all prior flushes to complete.
                 *
-                * A unique transaction id is required to avoid confusion
-                * when updating the block tables.
+                * Multiple normal transactions can share the current
+                * transaction id but a flush transaction needs its own
+                * unique TID for proper block table update accounting.
                 */
                ++hmp->flushcnt;
                ++hmp->voldata.alloc_tid;
                trans->sync_tid = hmp->voldata.alloc_tid;
-               trans->real_tid = trans->sync_tid;
+               trans->orig_tid = trans->sync_tid;
                ++hmp->voldata.alloc_tid;
                TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
                if (TAILQ_FIRST(&hmp->transq) != trans) {
@@ -193,7 +189,7 @@ hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
                 */
                TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
                trans->sync_tid = hmp->voldata.alloc_tid;
-               trans->real_tid = trans->sync_tid;
+               trans->orig_tid = trans->sync_tid;
 
                /* XXX improve/optimize inode allocation */
        } else {
@@ -210,11 +206,19 @@ hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
                }
                KKASSERT(head);
                TAILQ_INSERT_AFTER(&hmp->transq, head, trans, entry);
-               trans->sync_tid = head->real_tid + 1;
-               trans->real_tid = trans->sync_tid;
+               trans->sync_tid = head->orig_tid + 1;
+               trans->orig_tid = trans->sync_tid;
+               trans->flags |= HAMMER2_TRANS_CONCURRENT;
 
                if ((trans->flags & HAMMER2_TRANS_BUFCACHE) == 0) {
-                       if (TAILQ_FIRST(&hmp->transq) != head) {
+                       /*
+                        * If synchronous flush mode is enabled concurrent
+                        * frontend transactions during the flush are not
+                        * allowed (except we don't have a choice for buffer
+                        * cache ops).
+                        */
+                       if (hammer2_synchronous_flush > 0 ||
+                           TAILQ_FIRST(&hmp->transq) != head) {
                                trans->blocked = 1;
                                while (trans->blocked) {
                                        lksleep(&trans->sync_tid,
@@ -245,23 +249,44 @@ hammer2_trans_done(hammer2_trans_t *trans)
                hmp = trans->hmp_single;
 
        /*
-        * Remove and adjust flushcnt
+        * Remove.
         */
        hammer2_voldata_lock(hmp);
        TAILQ_REMOVE(&hmp->transq, trans, entry);
-       if (trans->flags & HAMMER2_TRANS_ISFLUSH)
+       head = TAILQ_FIRST(&hmp->transq);
+
+       /*
+        * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
+        * up through the next flush.  (If the head is a flush then we
+        * stop there, unlike the unblock code following this section).
+        */
+       if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
                --hmp->flushcnt;
+               scan = head;
+               while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
+                       atomic_clear_int(&scan->flags,
+                                        HAMMER2_TRANS_CONCURRENT);
+                       scan = TAILQ_NEXT(head, entry);
+               }
+       }
 
        /*
         * Unblock the head of the queue and any additional transactions
-        * up to the next flush.
+        * up to the next flush.  The head can be a flush and it will be
+        * unblocked along with the non-flush transactions following it
+        * (which are allowed to run concurrently with it).
+        *
+        * In synchronous flush mode we stop if the head transaction is
+        * a flush.
         */
-       head = TAILQ_FIRST(&hmp->transq);
        if (head && head->blocked) {
                head->blocked = 0;
                wakeup(&head->sync_tid);
 
-               scan = TAILQ_NEXT(head, entry);
+               if (hammer2_synchronous_flush > 0)
+                       scan = head;
+               else
+                       scan = TAILQ_NEXT(head, entry);
                while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                        if (scan->blocked) {
                                scan->blocked = 0;
@@ -289,19 +314,15 @@ hammer2_trans_done(hammer2_trans_t *trans)
  * these sorts of synchronization points.
  *
  * This routine can be called from several places but the most important
- * is from the hammer2_vop_reclaim() function.  We want to try to completely
- * clean out the inode structure to prevent disconnected inodes from
- * building up and blowing out the kmalloc pool.  However, it is not actually
- * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery
- * capability.
+ * is from VFS_SYNC.
  *
  * chain is locked on call and will remain locked on return.  If a flush
- * occured, the chain's MOVED bit will be set indicating that its parent
- * (which is not part of the flush) should be updated.  The chain may be
- * replaced by the call.
+ * occured, the chain's FLUSH_CREATE and/or FLUSH_DELETE bit will be set
+ * indicating that its parent (which is not part of the flush) should be
+ * updated.  The chain may be replaced by the call if it was modified.
  */
 void
-hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
+hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
 {
        hammer2_chain_t *chain = *chainp;
        hammer2_chain_t *scan;
@@ -324,9 +345,6 @@ hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
        info.cache_index = -1;
 
        core = chain->core;
-#if FLUSH_DEBUG
-       kprintf("CHAIN FLUSH trans %p.%016jx chain %p.%d mod %016jx upd %016jx\n", trans, trans->sync_tid, chain, chain->bref.type, chain->modify_tid, core->update_lo);
-#endif
 
        /*
         * Extra ref needed because flush_core expects it when replacing
@@ -338,8 +356,8 @@ hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
        for (;;) {
                /*
                 * Unwind deep recursions which had been deferred.  This
-                * can leave MOVED set for these chains, which will be
-                * handled when we [re]flush chain after the unwind.
+                * can leave the FLUSH_* bits set for these chains, which
+                * will be handled when we [re]flush chain after the unwind.
                 */
                while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
@@ -350,13 +368,13 @@ hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
                         * Now that we've popped back up we can do a secondary
                         * recursion on the deferred elements.
                         *
-                        * NOTE: hammer2_chain_flush() may replace scan.
+                        * NOTE: hammer2_flush() may replace scan.
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        hammer2_chain_drop(scan);       /* ref from deferral */
-                       hammer2_chain_flush(trans, &scan);
+                       hammer2_flush(trans, &scan);
                        hammer2_chain_unlock(scan);
                }
 
@@ -364,11 +382,7 @@ hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
                 * [re]flush chain.
                 */
                info.diddeferral = 0;
-               hammer2_chain_flush_core(&info, &chain);
-#if FLUSH_DEBUG
-               kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n",
-                       chain, chain->bref.type, chain->flags);
-#endif
+               hammer2_flush_core(&info, &chain, 0);
 
                /*
                 * Only loop if deep recursions have been deferred.
@@ -377,7 +391,7 @@ hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
                        break;
 
                if (++loops % 1000 == 0) {
-                       kprintf("hammer2_chain_flush: excessive loops on %p\n",
+                       kprintf("hammer2_flush: excessive loops on %p\n",
                                chain);
                        if (hammer2_debug & 0x100000)
                                Debugger("hell4");
@@ -390,76 +404,85 @@ hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp)
 /*
  * This is the core of the chain flushing code.  The chain is locked by the
  * caller and must also have an extra ref on it by the caller, and remains
- * locked and will have an extra ref on return.
+ * locked and will have an extra ref on return.  Upon return, the caller can
+ * test the FLUSH_CREATE and FLUSH_DELETE bits to determine what action must
+ * be taken on the parent.
  *
- * If the flush accomplished any work chain will be flagged MOVED
- * indicating a copy-on-write propagation back up is required.
- * Deep sub-nodes may also have been entered onto the deferral list.
- * MOVED is never set on the volume root.
+ * (1) Determine if this node is a candidate for the flush, return if it is
+ *     not.  fchain and vchain are always candidates for the flush.
  *
- * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
- *      only when a chain is specifically modified, and not updated
- *      for copy-on-write propagations.  MODIFIED is set on any modification
- *      including copy-on-write propagations.
+ * (2) If we recurse too deep the chain is entered onto the deferral list and
+ *     the current flush stack is aborted until after the deferral list is
+ *     run.
  *
- * NOTE: We are responsible for updating chain->bref.mirror_tid and
- *      core->update_lo  The caller is responsible for processing us into
- *      our parent (if any).
+ * (3) Recursively flush live children (rbtree).  This can create deferrals.
+ *     A successful flush clears the MODIFIED bit in the children.
+ *
+ * (4) Recursively flush deleted children (dbtree).  Deletions may be
+ *     considered 'live' if the delete_tid is beyond the flush_tid.  If
+ *     considered 'dead' the recursion is still needed in order to clean
+ *     up the chain.  This can create deferrals.
  *
- *      We are also responsible for updating chain->core->update_lo to
- *      prevent repeated recursions due to deferrals.
+ *     A successful flush clears the MODIFIED bit in the children.
  *
- * WARNING! bref.mirror_tid may only be updated if either the MODIFIED bit
- *         is already zero or if we clear it.
+ * (5) Calculate block table updates on chain based on the children scans
+ *     in (3) and (4) by testing the FLUSH_CREATE and FLUSH_DELETE bits,
+ *     modifying chain if necessary to perform the block table updates.
+ *     Deletions must be removed from dbtree when removed from the
+ *     chain's block table.
  *
+ *     If 'chain' itself is marked DELETED but treated as live, the block
+ *     table update(s) must be propagated to all contemporary chains.  In
+ *     fact, all contemporary chains must be locked and updated uninterrupted
+ *     to avoid lookup races.  Once MODIFIED and FLUSH_CREATE is cleared,
+ *     a chain can be unloaded from memory with the expectation that it can
+ *     be reloaded later via the block table at any time.
+ *
+ * NOTE: chain->bref.modify_tid is different from chain->modify_tid.  COW
+ *      propagations for block updates do not update chain->bref.modify_tid,
+ *      only chain->bref.mirror_tid.  The MODIFIED bit is set on any
+ *      modified chain, including COW propagations, but the flusher normally
+ *      just keys off of the FLUSH_* bits.  FLUSH_CREATE will also be set
+ *      in this situation.
+ *
+ * NOTE: We are responsible for updating chain->bref.mirror_tid and
+ *      chain->update_lo.  The caller is responsible for processing us into
+ *      our parent (if any).
  */
 static void
-hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
+hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp,
+                  int deleting)
 {
        hammer2_chain_t *chain = *chainp;
        hammer2_chain_t *saved_parent;
        hammer2_mount_t *hmp;
        hammer2_chain_core_t *core;
-#if 0
-       hammer2_blockref_t *bref;
-       char *bdata;
-       hammer2_io_t *dio;
-       int error;
-#endif
        int diddeferral;
+       int saved_domodify;
 
        hmp = chain->hmp;
        core = chain->core;
        diddeferral = info->diddeferral;
 
        /*
-        * Check if we even have any work to do.
-        *
-        * We do not update core->update_lo because there might be other
-        * paths to the core and we haven't actually checked it.
+        * (1) Check if we even have any work to do.
         *
         * This bit of code is capable of short-cutting entire sub-trees
-        * if they have not been touched.
+        * if they have not been touched or if they have already been
+        * flushed.
         */
-       if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
-           (core->update_lo >= info->sync_tid ||
-            chain->bref.mirror_tid >= info->sync_tid ||
-            chain->bref.mirror_tid >= core->update_hi)) {
-               KKASSERT(chain->modify_tid <= info->sync_tid);
-               /* don't update update_lo, there may be other paths to core */
-               /* don't update bref.mirror_tid, scan2 is not called */
+       if (/*(chain->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&*/
+           (chain->update_lo >= info->sync_tid ||      /* already synced */
+            chain->update_lo >= chain->update_hi)) {   /* old/unchanged */
+               /* update_lo/_hi already filters chain out, do not update */
+               /* don't update bref.mirror_tid, pass2 is not called */
                return;
        }
 
        /*
-        * Ignore chains which have already been flushed through the current
-        * synchronization point.
+        * mirror_tid should not be forward-indexed
         */
-       KKASSERT (chain->bref.mirror_tid <= info->sync_tid);
-       if (chain->bref.mirror_tid == info->sync_tid) {
-               /* do not update core->update_lo, there may be another path */
-               return;
-       }
+       KKASSERT(chain->bref.mirror_tid <= info->sync_tid);
 
        /*
         * Ignore chains modified beyond the current flush point.  These
@@ -472,442 +495,315 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t **chainp)
         * to 1 during inode flush tid 1 the blockrefs would only be partially
         * updated (and likely panic).
         *
-        * Do not update core->update_lo here, there might be other paths
-        * to the core and we haven't actually flushed it.
+        * We must update chain->update_lo here to prevent re-entry in this
+        * flush transaction.
         *
         * (vchain and fchain are exceptions since they cannot be duplicated)
         */
        if (chain->modify_tid > info->sync_tid &&
            chain != &hmp->fchain && chain != &hmp->vchain) {
-               /* do not update bref.mirror_tid, scan2 ignores chain */
-               /* do not update core->update_lo, there may be another path */
+               /* do not update bref.mirror_tid, pass2 ignores chain */
+               /* chain->update_lo = info->sync_tid; */
                return;
        }
 
-       saved_parent = info->parent;
-       info->parent = chain;
-retry:
-
        /*
-        * Chains deleted as-of the flush synchronization point require some
-        * special early handling to avoid double flushing because multiple
-        * deletions are sometimes forced within the same transaction.
-        * Allowing the flush to proceed through more than one path can wind
-        * up updating the chain's block table twice and cause an assertion.
+        * (2) Recurse downward and check recursion depth.
+        * (3) Flush live children
+        * (4) Flush deleted children
         *
-        * We don't check the 'same transaction' part but simply punt in this
-        * situation.  We must still check for multiple deletions, since any
-        * terminal (non-stale) deletion still requires processing to at
-        * least clean up the children, and also (for inodes) there might
-        * still be an open descriptor.
+        * We adjust update_lo if not deferring chain to prevent re-entry
+        * in this flush cycle, but it must be set AFTER the flush in case
+        * a deeper flush hits the chain.  Otherwise the deeper flush cannot
+        * complete.  We re-check the condition after finishing the flushes.
         *
-        * Clear MODIFIED but set MOVED to ensure that the parent still
-        * deals with it.
+        * update_hi was already checked and prevents initial recursions on
+        * subtrees which have not been modified.
         */
-       if (chain->delete_tid <= info->sync_tid &&
-           (chain->flags & HAMMER2_CHAIN_DUPLICATED)) {
-               if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
-#if 0
-                       /*
-                        * XXX should be able to invalidate the buffer here.
-                        * XXX problem if reused, snapshotted, or reactivated.
-                        */
-                       if (chain->dio) {
-                               hammer2_io_setinval(chain->dio, chain->bytes);
-                       }
-#endif
-                       if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-                               hammer2_chain_ref(chain);
-                               atomic_set_int(&chain->flags,
-                                              HAMMER2_CHAIN_MOVED);
-                       }
-                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
-                       hammer2_chain_memory_wakeup(chain->pmp);
-                       hammer2_chain_drop(chain);
+       saved_parent = info->parent;
+       saved_domodify = info->domodify;
+       info->parent = chain;
+       info->domodify = 0;
+
+       if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
+               ++info->diddeferral;
+       } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
+               if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
+                       hammer2_chain_ref(chain);
+                       TAILQ_INSERT_TAIL(&info->flush_list,
+                                         chain, flush_node);
+                       atomic_set_int(&chain->flags,
+                                      HAMMER2_CHAIN_DEFERRED);
                }
+               ++info->diddeferral;
+       } else {
+               hammer2_chain_t *scan;
 
                /*
-                * Update mirror_tid, indicating that chain is synchronized
-                * on its modification and block table.  This probably isn't
-                * needed since scan2 should ignore deleted chains anyway.
+                * The flush is queue-agnostic when running pass1, but order
+                * is important to catch any races where an existing
+                * flush-visible child is moved from rbtree->dbtree/dbq.
                 *
-                * NOTE: bref.mirror_tid cannot be updated
-                *       unless MODIFIED is cleared or already
-                *       clear.
+                * New children added by concurrent operations are not visible
+                * to the flush anyway so we don't care about those races.
+                * However, the flush itself can move a child from dbq to
+                * dbtree (rare in pass1 but it is possible).
+                *
+                * pass1 can handle re-execution of a child.
                 */
-               if (chain->bref.mirror_tid < info->sync_tid)
-                       chain->bref.mirror_tid = info->sync_tid;
-               /* do not update core->update_lo, there may be another path */
-               info->parent = saved_parent;
-               return;
+               spin_lock(&core->cst.spin);
+               KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
+               RB_SCAN(hammer2_chain_tree, &core->rbtree,
+                       NULL, hammer2_flush_pass1, info);
+               RB_SCAN(hammer2_chain_tree, &core->dbtree,
+                       NULL, hammer2_flush_pass1, info);
+               scan = TAILQ_FIRST(&core->dbq);
+               while (scan) {
+                       KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
+                       hammer2_flush_pass1(scan, info);
+                       if (scan->flags & HAMMER2_CHAIN_ONDBQ)
+                               scan = TAILQ_NEXT(scan, db_entry);
+                       else
+                               scan = TAILQ_FIRST(&core->dbq);
+               }
+               spin_unlock(&core->cst.spin);
        }
 
        /*
-        * Recurse if we are not up-to-date.  Once we are done we will
-        * update update_lo if there were no deferrals.  update_lo can become
-        * higher than update_hi and is used to prevent re-recursions during
-        * the same flush cycle.
-        *
-        * update_hi was already checked and prevents initial recursions on
-        * subtrees which have not been modified.
+        * Stop if deferred, do not update update_lo.
+        */
+       if (info->diddeferral)
+               goto done;
+
+       /*
+        * If a block table update is needed place the parent in a modified
+        * state, which might delete-duplicate it.
         *
-        * NOTE: We must recurse whether chain is flagged DELETED or not.
-        *       However, if it is flagged DELETED we limit sync_tid to
-        *       delete_tid to ensure that the chain's bref.mirror_tid is
-        *       not fully updated and causes it to miss the non-DELETED
-        *       path.
+        * - To prevent loops and other confusion, we synchronize update_lo
+        *   for the original chain.
         *
-        * NOTE: If a deferral occurs hammer2_chain_flush() will flush the
-        *       deferred chain independently which will update it's
-        *       bref.mirror_tid and prevent it from deferred again.
+        * - The original parent will not be used by the flush so we can
+        *   clear its MODIFIED bit.
         */
-       if (chain->bref.mirror_tid < info->sync_tid &&
-           chain->bref.mirror_tid < core->update_hi) {
-               hammer2_chain_layer_t *layer;
-               int saved_domodify;
-               int save_gen;
-
-               /*
-                * Races will bump update_hi above trans->sync_tid and should
-                * not affect this test.
-                *
-                * We don't want to set our chain to MODIFIED gratuitously.
-                *
-                * We need an extra ref on chain because we are going to
-                * release its lock temporarily in our child loop.
-                */
-
-               /*
-                * Run two passes.  The first pass handles MODIFIED and
-                * update_lo recursions while the second pass handles
-                * MOVED chains on the way back up.
-                *
-                * If the stack gets too deep we defer the chain.   Since
-                * hammer2_chain_core's can be shared at multiple levels
-                * in the tree, we may encounter a chain that we had already
-                * deferred.  We could undefer it but it will probably just
-                * defer again so it is best to leave it deferred.
-                *
-                * Scan1 is recursive.
-                *
-                * NOTE: The act of handling a modified/submodified chain can
-                *       cause the MOVED Flag to be set.  It can also be set
-                *       via hammer2_chain_delete() and in other situations.
-                *
-                * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
-                *       because children can be physically removed during
-                *       the scan.
-                *
-                * NOTE: We would normally not care about insertions except
-                *       that some insertions might occur from the flush
-                *       itself, so loop on generation number changes.
-                */
-               saved_domodify = info->domodify;
-               info->domodify = 0;
-
-               if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
-                       ++info->diddeferral;
-               } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
-                       if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
-                               hammer2_chain_ref(chain);
-                               TAILQ_INSERT_TAIL(&info->flush_list,
-                                                 chain, flush_node);
-                               atomic_set_int(&chain->flags,
-                                              HAMMER2_CHAIN_DEFERRED);
-                       }
-                       ++info->diddeferral;
-               } else {
-                       spin_lock(&core->cst.spin);
-                       KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
-                       do {
-                               save_gen = core->generation;
-                               TAILQ_FOREACH_REVERSE(layer, &core->layerq,
-                                                     h2_layer_list, entry) {
-                                       ++layer->refs;
-                                       KKASSERT(layer->good == 0xABCD);
-                                       RB_SCAN(hammer2_chain_tree,
-                                               &layer->rbtree,
-                                               NULL, hammer2_chain_flush_scan1,
-                                               info);
-                                       --layer->refs;
-                               }
-                       } while (core->generation != save_gen);
-                       spin_unlock(&core->cst.spin);
-               }
-
+       if (info->domodify) {
+               hammer2_chain_modify(info->trans, &info->parent,
+                                    HAMMER2_MODIFY_NO_MODIFY_TID);
                if (info->parent != chain) {
-                       kprintf("ZZZ\n");
+                       /*
+                        * chain        - old
+                        * info->parent - new
+                        *
+                        * NOTE: bref.mirror_tid cannot be updated
+                        *       unless MODIFIED is cleared or already
+                        *       clear.
+                        */
+                       chain->inode_reason += 0x10000000;
+                       info->parent->inode_reason += 0x100;
+                       KKASSERT(info->parent->core == chain->core);
+                       if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
+                               atomic_clear_int(&chain->flags,
+                                               HAMMER2_CHAIN_MODIFIED);
+                               hammer2_chain_memory_wakeup(chain->pmp);
+                               hammer2_chain_drop(chain);
+                       }
+#if 0
+                       if (chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) {
+                               atomic_clear_int(&chain->flags,
+                                               HAMMER2_CHAIN_FLUSH_CREATE);
+                               hammer2_chain_drop(chain);
+                       }
+                       if (info->parent->flags & HAMMER2_CHAIN_FLUSH_DELETE) {
+                               atomic_clear_int(&info->parent->flags,
+                                               HAMMER2_CHAIN_FLUSH_DELETE);
+                               hammer2_chain_drop(info->parent);
+                       }
+#endif
+#if 0
+                       if (chain->bref.mirror_tid < info->sync_tid)
+                               chain->bref.mirror_tid = info->sync_tid;
+#endif
+                       if (chain->update_lo < info->sync_tid)
+                               chain->update_lo = info->sync_tid;
+                       KKASSERT(info->parent->update_lo < info->sync_tid);
                        hammer2_chain_drop(chain);
                        hammer2_chain_ref(info->parent);
                }
                chain = info->parent;
+       }
 
-               /*
-                * chain was unlocked during the scan1 recursion and may
-                * have been deleted, destroyed, or even synchronously
-                * flushed due to aliasing.
-                *
-                * The flush continues normally in the first two places as
-                * the deletion or destruction does NOT affect the flush
-                * as-of the flush synchronization point.
-                *
-                * We must detect the last case and avoid flushing chain twice.
-                */
-#if 0
-               if (chain->delete_tid <= info->sync_tid &&
-                   (chain->flags & HAMMER2_CHAIN_DUPLICATED)) {
-                       kprintf("xxx\n");
-                       goto retry;
-               }
-#endif
-               if (chain->bref.mirror_tid >= info->sync_tid ||
-                   chain->bref.mirror_tid >= core->update_hi) {
-                       kprintf("yyy\n");
-                       goto retry;
-               }
+       /*
+        * If a blocktable update is needed determine if this is the last
+        * parent requiring modification (check all parents using the core).
+        *
+        * Set bit 1 (0x02) of domodify if this is the last parent,
+        * which will cause scan2 to clear FLUSH_CREATE and FLUSH_DELETE.
+        */
+       if (1) {
+               hammer2_chain_t *scan;
 
-               /*
-                * If any deferral occurred we must set domodify to 0 to avoid
-                * potentially modifying the parent twice (now and when we run
-                * the deferral list), as doing so could cause the blockref
-                * update to run on a block array which has already been
-                * updated.
-                */
-               if (info->domodify && diddeferral != info->diddeferral)
-                       info->domodify = 0;
+               spin_lock(&core->cst.spin);
+               TAILQ_FOREACH(scan, &core->ownerq, core_entry) {
+                       /*
+                        * Ignore chains which have already been updated
+                        * Ignore unmodified chains
+                        */
+                       if ((scan->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
+                           (scan->update_lo >= info->sync_tid ||
+                            scan->update_lo >= scan->update_hi)) {
+                               continue;
+                       }
 
-               /*
-                * THIS IS THE ONLY POINT IN THE FLUSH WHERE A PARENT IN THE
-                * NOMINAL TOPOLOGY, OTHER THAN FREEMAP ALLOCATIONS, IS
-                * MODIFIED.  FREEMAP ALLOCATIONS WILL MODIFY THE FREEMAP
-                * TOPOLOGY WITH SYNC_TID+1 AND DO NOT AFFECT THE CURRENT
-                * FLUSH.
-                *
-                * Modifying the parent can create issues if the current
-                * parent is already in a modified state with an earlier
-                * transaction id.  We want to avoid an endless flush loop
-                * on the original parent so we must clear its modified bit
-                * after creating the new parent, if they wind up being
-                * different.  Care must also be taken to avoid flushing the
-                * same parent twice.
-                *
-                * We are responsible for setting the parent into a modified
-                * state before we scan the children to update the parent's
-                * block table.  This must essentially be done as an atomic
-                * operation (the parent must remain locked throughout the
-                * operation), otherwise other transactions can squeeze a
-                * delete-duplicate in and create block table havoc.
-                *
-                * NOTE: Blockrefs are only updated on live chains.
-                *
-                * NOTE: Modifying the parent generally causes a
-                *       delete-duplicate to occur from within the flush
-                *       itself, with an allocation from the freemap occuring
-                *       as an additional side-effect.
-                *
-                * NOTE: If the parent was deleted our modified chain will
-                *       also be marked deleted, but since it inherits the
-                *       parent's delete_tid it will still appear to be
-                *       'live' for the purposes of the flush.
-                */
-               if (info->domodify && !h2ignore_deleted(info, chain)) {
-                       KKASSERT(chain->modify_tid < info->sync_tid);
+                       /*
+                        * Ignore the current parent being processed (we do
+                        * not adjust update_lo until after the fixup).
+                        */
+                       if (scan == chain)
+                               continue;
 
                        /*
-                        * The scan1 loop and/or flush_core is reentrant,
-                        * particularly when core->generation changes.  To
-                        * avoid havoc we have to prevent repetitive
-                        * delete-duplicates of the same chain.
-                        *
-                        * After executing the modify set the original chain's
-                        * bref.mirror_tid to prevent any reentrancy during
-                        * the current flush cycle.
+                        * Cannot exhaust all parents if one is not visible
+                        * to the flush.  The root chains are special-cased
+                        * because they cannot really be delete-duplicated.
                         */
-                       hammer2_chain_modify(info->trans, &info->parent,
-                                            HAMMER2_MODIFY_NO_MODIFY_TID);
-                       if (info->parent != chain) {
-                               /*
-                                * NOTE: bref.mirror_tid cannot be updated
-                                *       unless MODIFIED is cleared or already
-                                *       clear.
-                                */
-                               if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
-                                       atomic_clear_int(&chain->flags,
-                                                       HAMMER2_CHAIN_MODIFIED);
-                                       hammer2_chain_memory_wakeup(chain->pmp);
-                                       hammer2_chain_drop(chain);
-                               }
-                               if (chain->bref.mirror_tid < info->sync_tid)
-                                       chain->bref.mirror_tid = info->sync_tid;
-                               hammer2_chain_drop(chain);
-                               hammer2_chain_ref(info->parent);
+                       if (scan != &scan->hmp->fchain &&
+                           scan != &scan->hmp->vchain &&
+                           scan->modify_tid > info->sync_tid) {
+                               break;
                        }
-                       chain = info->parent;
+
+                       /*
+                        * Fail if update_lo has not been synchronized to
+                        * at least our sync_tid on any modified parent chain.
+                        */
+                       if (scan->update_lo < info->sync_tid)
+                               break;
                }
+               spin_unlock(&core->cst.spin);
+               if (scan == NULL)
+                       info->domodify |= 2;
+       }
 
-               KKASSERT(chain == info->parent);
+       /*
+        * (5) Calculate block table updates or child cleanups.
+        *     (this whole operation has to be atomic)
+        *
+        * domodify 0x01 - block table updates
+        *          0x02 - child cleanups
+        *
+        *      pass2 - Process deletions from dbtree and dbq.
+        *      pass3 - Process insertions from rbtree, dbtree, and dbq.
+        *      pass4 - Cleanup child flags on the last parent and
+        *              Adjust queues on the live parent.
+        */
+       if (info->domodify) {
+               hammer2_chain_t *scan;
 
-               /*
-                * Handle successfully flushed children who are in the MOVED
-                * state on the way back up the recursion.  This can have
-                * the side-effect of clearing MOVED.
-                *
-                * Scan2 may replace info->parent.  If it does it will also
-                * replace the extra ref we made.
-                *
-                * Scan2 is non-recursive.
-                */
-               if (diddeferral != info->diddeferral) {
-                       spin_lock(&core->cst.spin);
-               } else {
-                       KKASSERT(chain == info->parent);
-                       KKASSERT(info->domodify == 0 ||
-                                (chain->flags & HAMMER2_CHAIN_FLUSHED) == 0);
-                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSHED);
-                       spin_lock(&core->cst.spin);
-                       KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
-                       KKASSERT(info->parent->core == core);
-                       TAILQ_FOREACH_REVERSE(layer, &core->layerq,
-                                             h2_layer_list, entry) {
-                               info->pass = 1;
-                               ++layer->refs;
-                               KKASSERT(layer->good == 0xABCD);
-                               RB_SCAN(hammer2_chain_tree, &layer->rbtree,
-                                       NULL, hammer2_chain_flush_scan2, info);
-                               info->pass = 2;
-                               RB_SCAN(hammer2_chain_tree, &layer->rbtree,
-                                       NULL, hammer2_chain_flush_scan2, info);
-                               --layer->refs;
+               spin_lock(&core->cst.spin);
+
+               while ((info->domodify & 1) && info->parent) {
+                       /* PASS2 - Deletions */
+                       RB_SCAN(hammer2_chain_tree, &core->rbtree,
+                               NULL, hammer2_flush_pass2, info);
+                       RB_SCAN(hammer2_chain_tree, &core->dbtree,
+                               NULL, hammer2_flush_pass2, info);
+                       scan = TAILQ_FIRST(&core->dbq);
+                       TAILQ_FOREACH(scan, &core->dbq, db_entry) {
+                               KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
+                               hammer2_flush_pass2(scan, info);
                        }
-               }
-
-               /*
-                * info->parent must not have been replaced again
-                */
-               KKASSERT(info->parent == chain);
 
-               *chainp = chain;
+                       /* PASS3 - Insertions */
+                       RB_SCAN(hammer2_chain_tree, &core->rbtree,
+                               NULL, hammer2_flush_pass3, info);
+                       RB_SCAN(hammer2_chain_tree, &core->dbtree,
+                               NULL, hammer2_flush_pass3, info);
+                       TAILQ_FOREACH(scan, &core->dbq, db_entry) {
+                               KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
+                               hammer2_flush_pass3(scan, info);
+                       }
+                       info->parent = TAILQ_NEXT(info->parent, core_entry);
+                       if (info->parent)
+                               kprintf("FLUSH SPECIAL UPDATE (%p) %p.%d %08x\n",
+                                       chain, info->parent,
+                                       info->parent->bref.type,
+                                       info->parent->flags);
+               }
+               info->parent = chain;
+
+               /* PASS4 - Cleanup */
+               RB_SCAN(hammer2_chain_tree, &core->rbtree,
+                       NULL, hammer2_flush_pass4, info);
+               scan = TAILQ_FIRST(&core->dbq);
+               while (scan) {
+                       KKASSERT(scan->flags & HAMMER2_CHAIN_ONDBQ);
+                       hammer2_flush_pass4(scan, info);
+                       if (scan->flags & HAMMER2_CHAIN_ONDBQ)
+                               scan = TAILQ_NEXT(scan, db_entry);
+                       else
+                               scan = TAILQ_FIRST(&core->dbq);
+               }
+               RB_SCAN(hammer2_chain_tree, &core->dbtree,
+                       NULL, hammer2_flush_pass4, info);
 
-               hammer2_chain_layer_check_locked(chain->hmp, core);
                spin_unlock(&core->cst.spin);
-
-               /*
-                * Update the core only if no deferrals occurred.  Otherwise
-                * we could end up clearing the MOVED bit in the children
-                * prematurely.
-                */
-               if (diddeferral == info->diddeferral)
-                       hammer2_flush_core_update(core, info);
-
-               info->domodify = saved_domodify;
-               KKASSERT(chain->refs > 1);
-       } else {
-               /*
-                * Update the core, no deferrals occurred in this path.
-                */
-               hammer2_flush_core_update(core, info);
        }
-       info->parent = saved_parent;
-
-#if FLUSH_DEBUG
-       kprintf("POP    %p.%d defer=%d\n", chain, chain->bref.type, diddeferral);
-#endif
 
        /*
-        * Do not flush the chain if there were any deferrals.  It will be
-        * retried later after the deferrals are independently handled.
-        * Do not update update_lo or bref.mirror_tid.
+        * Synchronize update_lo to prevent reentrant block updates of this
+        * parent.
         */
-       if (diddeferral != info->diddeferral) {
-               if (hammer2_debug & 0x0008) {
-                       kprintf("%*.*s} %p/%d %04x (deferred)",
-                               info->depth, info->depth, "",
-                               chain, chain->refs, chain->flags);
-               }
-               /* do not update core->update_lo */
-               /* do not update bref.mirror_tid */
-               return;
-       }
+       chain->update_lo = info->sync_tid;
 
-       KKASSERT(chain->bref.mirror_tid < info->sync_tid);
+       /*
+        * Skip the flush if the chain was not placed in a modified state
+        * or was not already in a modified state.
+        */
+       if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
+               goto done;
 
        /*
-        * Non-deferral path, chain is now deterministically being flushed.
+        * FLUSH THE CHAIN (on the way back up the recursion)
+        *
+        * Chain is now deterministically being flushed and not being deferred.
         * We've finished running the recursion and the blockref update.
         *
         * update bref.mirror_tid.  update_lo has already been updated.
-        *
-        * After this point we MUST dipose of the MODIFIED bit on chain.
         */
        if (chain->bref.mirror_tid < info->sync_tid)
                chain->bref.mirror_tid = info->sync_tid;
 
        /*
-        * Deal with deleted and destroyed chains on the way back up.
-        *
-        * Otherwise a deleted chain can be optimized by clearing MODIFIED
-        * without bothering to write it out.
-        *
-        * NOTE: We optimize this by noting that only 'inode' chains require
-        *       this treatment.  When a file with an open descriptor is
-        *       deleted only its inode is marked deleted.  Other deletions,
-        *       such as indirect block deletions, will no longer be visible
-        *       to the live filesystem and do not need to be updated.
+        * Dispose of the modified bit.  FLUSH_CREATE should already be
+        * set.
         */
-       if (h2ignore_deleted(info, chain)) {
+       KKASSERT(chain->flags & HAMMER2_CHAIN_FLUSH_CREATE);
+       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
+       hammer2_chain_memory_wakeup(chain->pmp);
+
+       if ((chain->flags & HAMMER2_CHAIN_FLUSH_CREATE) ||
+           chain == &hmp->vchain ||
+           chain == &hmp->fchain) {
                /*
-                * At the moment we unconditionally set the MOVED bit because
-                * there are situations where it might not have been set due
-                * to similar delete-destroyed optimizations, and the parent
-                * of the parent still may need to be notified of the deletion.
+                * Drop the ref from the MODIFIED bit we cleared,
+                * net -1 ref.
                 */
-               if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-                       hammer2_chain_ref(chain);
-                       atomic_set_int(&chain->flags,
-                                      HAMMER2_CHAIN_MOVED);
-               }
-               if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
-#if 0
-                       /*
-                        * XXX should be able to invalidate the buffer here.
-                        * XXX problem if reused, snapshotted, or reactivated.
-                        */
-                       if (chain->dio) {
-                               hammer2_io_setinval(chain->dio, chain->bytes);
-                       }
-#endif
-                       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
-                       hammer2_chain_memory_wakeup(chain->pmp);
-                       hammer2_chain_drop(chain);
-               }
-               return;
+               hammer2_chain_drop(chain);
+       } else {
+               /*
+                * Drop the ref from the MODIFIED bit we cleared and
+                * set a ref for the FLUSH_CREATE bit we are setting.
+                * Net 0 refs.
+                */
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSH_CREATE);
        }
 
        /*
-        * A degenerate flush might not have flushed anything and thus not
-        * processed modified blocks on the way back up.  Detect the case.
-        *
-        * This case can occur when a create, modify, and rename (to a
-        * different part of the topology) occurs in the same flush,
-        * resulting in a parent which effectively needs no modification.
+        * Skip the actual flush operation if the chain has been deleted
+        * in our flus hview.  There will be no block table entry that
+        * references it.
         */
-       if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
-#if 0
-               kprintf("chain %p.%d %08x recursed but wasn't "
-                       "modified mirr=%016jx "
-                       "update_lo=%016jx synctid=%016jx\n",
-                       chain, chain->bref.type, chain->flags,
-                       chain->bref.mirror_tid,
-                       core->update_lo, info->sync_tid);
-#endif
-#if 0
-               if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
-                       hammer2_chain_ref(chain);
-                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
-               }
-#endif
-               return;
-       }
+       if (h2ignore_deleted(info, chain))
+               goto done;
 
        /*
         * Issue flush.
@@ -933,25 +829,6 @@ retry:
                Debugger("Flush hell");
        }
 
-       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
-       hammer2_chain_memory_wakeup(chain->pmp);
-
-       if ((chain->flags & HAMMER2_CHAIN_MOVED) ||
-           chain == &hmp->vchain ||
-           chain == &hmp->fchain) {
-               /*
-                * Drop the ref from the MODIFIED bit we cleared,
-                * net -1 ref.
-                */
-               hammer2_chain_drop(chain);
-       } else {
-               /*
-                * Drop the ref from the MODIFIED bit we cleared and
-                * set a ref for the MOVED bit we are setting.  Net 0 refs.
-                */
-               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
-       }
-
        /*
         * If this is part of a recursive flush we can go ahead and write
         * out the buffer cache buffer and pass a new bref back up the chain
@@ -1025,10 +902,6 @@ retry:
                 * Make sure any device buffer(s) have been flushed out here.
                 * (there aren't usually any to flush).
                 */
-#if 0
-               /* XXX */
-               /* chain and chain->bref, NOWAIT operation */
-#endif
                break;
 #if 0
        case HAMMER2_BREF_TYPE_INDIRECT:
@@ -1064,105 +937,59 @@ retry:
                panic("hammer2_chain_flush_core: unsupported embedded bref %d",
                      chain->bref.type);
                /* NOT REACHED */
-#if 0
-               /*
-                * Embedded elements have to be flushed out.
-                * (Basically just BREF_TYPE_INODE).
-                */
-               KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
-               KKASSERT(chain->data != NULL);
-               KKASSERT(chain->dio == NULL);
-               bref = &chain->bref;
-
-               KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
-               KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) ==
-                        HAMMER2_CHECK_ISCSI32 ||
-                        HAMMER2_DEC_CHECK(chain->bref.methods) ==
-                        HAMMER2_CHECK_FREEMAP);
-
-               /*
-                * The data is embedded, we have to acquire the
-                * buffer cache buffer and copy the data into it.
-                */
-               error = hammer2_io_bread(hmp, bref->data_off, chain->bytes,
-                                        &dio);
-               KKASSERT(error == 0);
-               bdata = hammer2_io_data(dio, bref->data_off);
+       }
 
-               /*
-                * Copy the data to the buffer, mark the buffer
-                * dirty, and convert the chain to unmodified.
-                */
-               bcopy(chain->data, bdata, chain->bytes);
-               hammer2_io_bdwrite(&dio);
+       /*
+        * Final cleanup after flush
+        */
+done:
+       KKASSERT(chain->refs > 1);
+       info->domodify = saved_domodify;
+       info->parent = saved_parent;
+       *chainp = chain;
 
-               switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
-               case HAMMER2_CHECK_FREEMAP:
-                       chain->bref.check.freemap.icrc32 =
-                               hammer2_icrc32(chain->data, chain->bytes);
-                       break;
-               case HAMMER2_CHECK_ISCSI32:
-                       chain->bref.check.iscsi32.value =
-                               hammer2_icrc32(chain->data, chain->bytes);
-                       break;
-               default:
-                       panic("hammer2_flush_core: bad crc type");
-                       break; /* NOT REACHED */
-               }
-               if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
-                       ++hammer2_iod_meta_write;
-               else
-                       ++hammer2_iod_indr_write;
-#endif
-       }
+       KKASSERT(chain->bref.mirror_tid <= info->sync_tid);
 }
 
 /*
- * Flush helper scan1 (recursive)
+ * Flush helper pass1 (recursive)
  *
- * Flushes the children of the caller's chain (parent) and updates
- * the blockref, restricted by sync_tid.
+ * Flushes the children of the caller's chain (info->parent), restricted
+ * by sync_tid.  Set info->domodify if the child's blockref must propagate
+ * back up to the parent.
  *
- * Ripouts during the loop should not cause any problems.  Because we are
- * flushing to a synchronization point, modification races will occur after
- * sync_tid and do not have to be flushed anyway.
+ * Ripouts can move child from rbtree to dbtree or dbq but the caller's
+ * flush scan order prevents any chains from being lost.  A child can be
+ * executes more than once (update_lo is used to prevent infinite recursions).
  *
- * It is also ok if the parent is chain_duplicate()'d while unlocked because
- * the delete/duplication will install a delete_tid that is still larger than
- * our current sync_tid.
+ * WARNING! If we do not call hammer2_flush_core() we must update
+ *         bref.mirror_tid ourselves to indicate that the flush has
+ *         processed the child.
  *
- * WARNING! If we do not call chain_flush_core we must update bref.mirror_tid
- *         ourselves.
+ * WARNING! parent->core spinlock is held on entry and return.
  */
 static int
-hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
+hammer2_flush_pass1(hammer2_chain_t *child, void *data)
 {
        hammer2_flush_info_t *info = data;
        hammer2_trans_t *trans = info->trans;
        hammer2_chain_t *parent = info->parent;
-       int     diddeferral;
-
-       if (hammer2_debug & 0x80000)
-               Debugger("hell3");
-       diddeferral = info->diddeferral;
 
        /*
-        * Child is beyond the flush synchronization zone, don't persue.
+        * Child modified in a later transactions, nothing to flush in this
+        * transaction.
+        *
         * Remember that modifications generally delete-duplicate so if the
         * sub-tree is dirty another child will get us there.  But not this
         * one.
         *
-        * Or MODIFIED is not set and child is already fully synchronized
-        * with its sub-tree.  Don't persue.
-        *
         * (child can never be fchain or vchain so a special check isn't
         *  needed).
         */
        if (child->modify_tid > trans->sync_tid) {
                KKASSERT(child->delete_tid >= child->modify_tid);
-               /* do not update child->core->update_lo, core not flushed */
-               /* do not update core->update_lo, there may be another path */
-               /* do not update mirror_tid, scan2 will ignore chain */
+               /*child->update_lo = info->sync_tid;*/
+               /* do not update mirror_tid, pass2 will ignore chain */
                return (0);
        }
 
@@ -1179,78 +1006,49 @@ hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
 
        /*
-        * No recursion needed if neither the child or anything under it
-        * was messed with.
+        * Recurse and collect deferral data.  We only recursively sync
+        * (basically) if update_lo has not been updated, indicating that
+        * the child has not already been processed.
         */
-       if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
-           child->core->update_lo >= info->sync_tid) {
-               KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
-               if (child->bref.mirror_tid < info->sync_tid)
-                       child->bref.mirror_tid = info->sync_tid;
-               goto skip;
+       if ((child->flags & HAMMER2_CHAIN_MODIFIED) ||
+           (child->update_lo < info->sync_tid &&
+            child->update_lo < child->update_hi)) {
+               ++info->depth;
+               hammer2_flush_core(info, &child, 0); /* XXX deleting */
+               --info->depth;
        }
 
        /*
-        * XXX delete child if parent is deleted.  Propagate deletion
-        *     downward.  TODO
-        */
-
-
-       /*
-        * Re-check original pre-lock conditions after locking.
+        * Determine if domodify should be set.  Do not otherwise adjust
+        * the child or pass2 will get confused.
+        *
+        * Insertion:
+        *      - child is flagged as possibly needing block table insertion.
+        *      - child not deleted or deletion is beyond transaction id
+        *      - child created beyond parent synchronization point
+        *      - parent not deleted as-of this transaction
         */
-       if (child->modify_tid > trans->sync_tid) {
-               hammer2_chain_unlock(child);
-               hammer2_chain_drop(child);
-               hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
-               spin_lock(&parent->core->cst.spin);
-               return (0);
-       }
-
-       if ((child->flags & HAMMER2_CHAIN_MODIFIED) == 0 &&
-           child->core->update_lo >= info->sync_tid) {
-               KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
-               if (child->bref.mirror_tid < info->sync_tid)
-                       child->bref.mirror_tid = info->sync_tid;
-               goto skip;
+       if ((child->flags & HAMMER2_CHAIN_FLUSH_CREATE) &&
+           child->delete_tid > trans->sync_tid &&
+           child->modify_tid > parent->update_lo &&
+           parent->delete_tid > trans->sync_tid) {
+               info->domodify = 1;
        }
 
        /*
-        * Recurse and collect deferral data.
-        */
-       ++info->depth;
-       hammer2_chain_flush_core(info, &child);
-       --info->depth;
-
-skip:
-       /*
-        * Check the conditions that could cause SCAN2 to modify the parent.
-        * Modify the parent here instead of in SCAN2, which would cause
-        * rollup chicken-and-egg races.
-        *
-        * Scan2 is expected to update bref.mirror_tid in the domodify case,
-        * but will skip the child otherwise giving us the responsibility to
-        * update bref.mirror_tid.
-        *
-        * WARNING!  Do NOT update the child's bref.mirror_tid right here,
-        *           even if there was no deferral.  Doing so would cause
-        *           confusion with the child's block array state in a
-        *           future flush.
+        * Removal:
+        *      - child is flagged as possibly needing block table removal.
+        *      - child deleted before or during this transaction
+        *      - child created prior or during parent synchronization point
+        *      - parent not yet synchronized to child deletion
+        *      - parent not deleted as-of this transaction
         */
-       if (h2ignore_deleted(info, parent)) {
-               /*
-                * Special optimization matching similar tests done in
-                * flush_core, scan1, and scan2.  Avoid updating the block
-                * table in the parent if the parent is no longer visible.
-                */
-               ;
-       } else if (child->delete_tid <= trans->sync_tid &&
-                  child->delete_tid > parent->bref.mirror_tid &&
-                  child->modify_tid <= parent->bref.mirror_tid) {
+       if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) &&
+           child->delete_tid <= trans->sync_tid &&
+           child->modify_tid <= parent->update_lo &&
+           child->delete_tid > parent->update_lo &&
+           parent->delete_tid > trans->sync_tid) {
                info->domodify = 1;
-       } else if (child->delete_tid > trans->sync_tid &&
-                  child->modify_tid > parent->bref.mirror_tid) {
-               info->domodify = 1;             /* base insertion */
        }
 
        /*
@@ -1266,128 +1064,157 @@ skip:
 }
 
 /*
- * Flush helper scan2 (non-recursive)
- *
- * This pass on a chain's children propagates any MOVED or DELETED
- * elements back up the chain towards the root after those elements have
- * been fully flushed.  Unlike scan1, this function is NOT recursive and
- * the parent remains locked across the entire scan.
- *
- * SCAN2 is called twice, once with pass set to 1 and once with it set to 2.
- * We have to do this so base[] elements can be deleted in pass 1 to make
- * room for adding new elements in pass 2.
- *
- * This function also rolls up storage statistics.
- *
- * NOTE!  A deletion is a visbility issue, there can still be references to
- *       deleted elements (for example, to an unlinked file which is still
- *       open), and there can also be multiple chains pointing to the same
- *       bref where some are deleted and some are not (for example due to
- *       a rename).   So a chain marked for deletion is basically considered
- *       to be live until it is explicitly destroyed or until its ref-count
- *       reaches zero (also implying that MOVED and MODIFIED are clear).
- *
- * NOTE!  Info->parent will be locked but will only be instantiated/modified
- *       if it is either MODIFIED or if scan1 determined that block table
- *       updates will occur.
- *
- * NOTE!  SCAN2 is responsible for updating child->bref.mirror_tid only in
- *       the case where it modifies the parent (does a base insertion
- *       or deletion).  SCAN1 handled all other cases.
+ * PASS2 - BLOCKTABLE DELETIONS
  */
 static int
-hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
+hammer2_flush_pass2(hammer2_chain_t *child, void *data)
 {
        hammer2_flush_info_t *info = data;
        hammer2_chain_t *parent = info->parent;
-       hammer2_chain_core_t *above = child->above;
        hammer2_mount_t *hmp = child->hmp;
        hammer2_trans_t *trans = info->trans;
        hammer2_blockref_t *base;
        int count;
-       int ok;
 
-#if FLUSH_DEBUG
-       kprintf("SCAN2 %p.%d %08x mod=%016jx del=%016jx trans=%016jx\n", child, child->bref.type, child->flags, child->modify_tid, child->delete_tid, info->trans->sync_tid);
-#endif
        /*
-        * Ignore children created after our flush point, treating them as
-        * if they did not exist).  These children will not cause the parent
-        * to be updated.
-        *
-        * Children deleted after our flush point are treated as having been
-        * created for the purposes of the flush.  The parent's update_hi
-        * will already be higher than our trans->sync_tid so the path for
-        * the next flush is left intact.
-        *
-        * When we encounter such children and the parent chain has not been
-        * deleted, delete/duplicated, or delete/duplicated-for-move, then
-        * the parent may be used to funnel through several flush points.
-        * These chains will still be visible to later flushes due to having
-        * a higher update_hi than we can set in the current flush.
+        * Prefilter - Ignore children not flagged as needing a parent
+        *             blocktable update.
+        */
+       if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) == 0)
+               return (0);
+
+       /*
+        * Prefilter - Ignore children created after our flush_tid (not
+        *             visible to our flush).
         */
        if (child->modify_tid > trans->sync_tid) {
                KKASSERT(child->delete_tid >= child->modify_tid);
-               goto finalize;
+               return 0;
        }
 
-#if 0
        /*
-        * Ignore children which have not changed.  The parent's block table
-        * is already correct.
+        * Prefilter - Don't bother updating the blockrefs for a deleted
+        *             parent (from the flush's perspective).  Otherwise,
+        *             we need to be COUNTEDBREFS synchronized for the
+        *             hammer2_base_*() functions.
         *
-        * XXX The MOVED bit is only cleared when all multi-homed parents
-        *     have flushed, creating a situation where a re-flush can occur
-        *     via a parent which has already flushed.  The hammer2_base_*()
-        *     functions currently have a hack to deal with this case but
-        *     we need something better.
+        * NOTE: This test must match the similar one in flush_core.
         */
-       if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
-               KKASSERT((child->flags & HAMMER2_CHAIN_MODIFIED) == 0);
-               goto finalize;
+       if (h2ignore_deleted(info, parent))
+               return 0;
+
+       /*
+        * Calculate blockmap pointer
+        */
+       switch(parent->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               /*
+                * Access the inode's block array.  However, there is no
+                * block array if the inode is flagged DIRECTDATA.  The
+                * DIRECTDATA case typicaly only occurs when a hardlink has
+                * been shifted up the tree and the original inode gets
+                * replaced with an OBJTYPE_HARDLINK placeholding inode.
+                */
+               if (parent->data &&
+                   (parent->data->ipdata.op_flags &
+                    HAMMER2_OPFLAG_DIRECTDATA) == 0) {
+                       base = &parent->data->ipdata.u.blockset.blockref[0];
+               } else {
+                       base = NULL;
+               }
+               count = HAMMER2_SET_COUNT;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+       case HAMMER2_BREF_TYPE_FREEMAP_NODE:
+               if (parent->data)
+                       base = &parent->data->npdata[0];
+               else
+                       base = NULL;
+               count = parent->bytes / sizeof(hammer2_blockref_t);
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               base = &hmp->voldata.sroot_blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       case HAMMER2_BREF_TYPE_FREEMAP:
+               base = &parent->data->npdata[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       default:
+               base = NULL;
+               count = 0;
+               panic("hammer2_chain_flush_pass2: "
+                     "unrecognized blockref type: %d",
+                     parent->bref.type);
        }
-#endif
 
        /*
-        * Make sure child is referenced before we unlock.
+        * Removal
+        *      - child is flagged for removal
+        *      - child deleted before or during this transaction
+        *      - child created prior or during parent synchronization point
+        *      - parent not yet synchronized to child's deletion
         */
-       hammer2_chain_ref(child);
-       spin_unlock(&above->cst.spin);
+       if (child->delete_tid <= trans->sync_tid &&
+           child->modify_tid <= parent->update_lo &&
+           child->delete_tid > parent->update_lo) {
+               /* can't assert BMAPPED because state adjustment may occur
+                * before we are done, and BMAPPED only applies to the live
+                * parent.
+                *KKASSERT(child->flags & HAMMER2_CHAIN_BMAPPED);*/
+               if (base) {
+                       hammer2_rollup_stats(parent, child, -1);
+                       hammer2_base_delete(trans, parent, base, count,
+                                           &info->cache_index, child);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * PASS3 - BLOCKTABLE INSERTIONS
+ */
+static int
+hammer2_flush_pass3(hammer2_chain_t *child, void *data)
+{
+       hammer2_flush_info_t *info = data;
+       hammer2_chain_t *parent = info->parent;
+       hammer2_mount_t *hmp = child->hmp;
+       hammer2_trans_t *trans = info->trans;
+       hammer2_blockref_t *base;
+       int count;
 
        /*
-        * Parent reflushed after the child has passed them by should skip
-        * due to the modify_tid test. XXX
+        * Prefilter - Ignore children not flagged as needing a parent
+        *             blocktable update.
         */
-       hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
-       KKASSERT(child->above == above);
-       KKASSERT(parent->core == above);
+       if ((child->flags & HAMMER2_CHAIN_FLUSH_CREATE) == 0)
+               return (0);
 
        /*
-        * The parent's blockref to the child must be deleted or updated.
-        *
-        * This point is not reached on successful DELETED optimizations
-        * but can be reached on recursive deletions and restricted flushes.
-        *
-        * The chain_modify here may delete-duplicate the block.  This can
-        * cause a multitude of issues if the block was already modified
-        * by a later (post-flush) transaction.  Primarily blockrefs in
-        * the later block can be out-of-date, so if the situation occurs
-        * we can't throw away the MOVED bit on the current blocks until
-        * the later blocks are flushed (so as to be able to regenerate all
-        * the changes that were made).
-        *
-        * Because flushes are ordered we do not have to make a
-        * modify/duplicate of indirect blocks.  That is, the flush
-        * code does not have to kmalloc or duplicate anything.  We
-        * can adjust the indirect block table in-place and reuse the
-        * chain.  It IS possible that the chain has already been duplicated
-        * or may wind up being duplicated on-the-fly by modifying code
-        * on the frontend.  We simply use the original and ignore such
-        * chains.  However, it does mean we can't clear the MOVED bit.
+        * Prefilter - Ignore children created after our flush_tid (not
+        *             visible to our flush).
+        */
+       if (child->modify_tid > trans->sync_tid) {
+               KKASSERT(child->delete_tid >= child->modify_tid);
+               return 0;
+       }
+
+       /*
+        * Prefilter - Don't bother updating the blockrefs for a deleted
+        *             parent (from the flush's perspective).  Otherwise,
+        *             we need to be COUNTEDBREFS synchronized for the
+        *             hammer2_base_*() functions.
         *
-        * XXX recursive deletions not optimized.
+        * NOTE: This test must match the similar one in flush_core.
         */
+       if (h2ignore_deleted(info, parent))
+               return 0;
 
+       /*
+        * Calculate blockmap pointer
+        */
        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
                /*
@@ -1425,212 +1252,237 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
        default:
                base = NULL;
                count = 0;
-               panic("hammer2_chain_flush_scan2: "
+               panic("hammer2_chain_flush_pass2: "
                      "unrecognized blockref type: %d",
                      parent->bref.type);
        }
 
        /*
-        * Don't bother updating a deleted parent's blockrefs.
-        *
-        * Otherwise, we need to be COUNTEDBREFS synchronized for the
-        * hammer2_base_*() functions.
-        *
-        * This test must match the similar one in flush_core.
+        * Insertion
+        *      - child is flagged as possibly needing block table insertion.
+        *      - child not deleted or deletion is beyond transaction id
+        *      - child created beyond parent synchronization point
         */
-#if FLUSH_DEBUG
-       kprintf("SCAN2 base=%p pass=%d PARENT %p.%d DTID=%016jx SYNC=%016jx\n",
-               base,
-               info->pass, parent, parent->bref.type, parent->delete_tid, trans->sync_tid);
-#endif
-       if (h2ignore_deleted(info, parent))
-               base = NULL;
+       if (child->delete_tid > trans->sync_tid &&
+           child->modify_tid > parent->update_lo) {
+               if (base) {
+                       hammer2_rollup_stats(parent, child, 1);
+                       hammer2_base_insert(trans, parent, base, count,
+                                           &info->cache_index, child);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * PASS4 - CLEANUP CHILDREN (non-recursive, but CAN be re-entrant)
+ *
+ * Adjust queues and set or clear BMAPPED appropriately if processing
+ * the live parent.
+ *
+ * Cleanup FLUSH_CREATE/FLUSH_DELETE on the last parent.
+ */
+static int
+hammer2_flush_pass4(hammer2_chain_t *child, void *data)
+{
+       hammer2_flush_info_t *info = data;
+       hammer2_chain_t *parent = info->parent;
+       hammer2_chain_t *xchain;
+       hammer2_chain_core_t *above = child->above;
+       hammer2_trans_t *trans = info->trans;
 
        /*
-        * Update the parent's blockref table and propagate mirror_tid.
-        *
-        * NOTE! Children with modify_tid's beyond our flush point are
-        *       considered to not exist for the purposes of updating the
-        *       parent's blockref array.
-        *
-        * NOTE! SCAN1 has already put the parent in a modified state
-        *       so if it isn't we panic.
-        *
-        * NOTE! chain->modify_tid vs chain->bref.modify_tid.  The chain's
-        *       internal modify_tid is always updated based on creation
-        *       or delete-duplicate.  However, the bref.modify_tid is NOT
-        *       updated due to simple blockref updates.
+        * Prefilter - Ignore children created after our flush_tid (not
+        *             visible to our flush).
         */
-#if FLUSH_DEBUG
-       kprintf("chain %p->%p pass %d trans %016jx sync %p.%d %016jx/%d C=%016jx D=%016jx PMIRROR %016jx\n",
-               parent, child,
-               info->pass, trans->sync_tid,
-               child, child->bref.type,
-               child->bref.key, child->bref.keybits,
-               child->modify_tid, child->delete_tid, parent->bref.mirror_tid);
-#endif
+       if (child->modify_tid > trans->sync_tid) {
+               KKASSERT(child->delete_tid >= child->modify_tid);
+               return 0;
+       }
 
-       if (info->pass == 1 && child->delete_tid <= trans->sync_tid) {
+       /*
+        * Ref and lock child for operation, spinlock must be temporarily
+        * Make sure child is referenced before we unlock.
+        */
+       hammer2_chain_ref(child);
+       spin_unlock(&above->cst.spin);
+       hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
+       KKASSERT(child->above == above);
+       KKASSERT(parent->core == above);
+
+       /*
+        * Adjust BMAPPED state and rbtree/queue only when we hit the
+        * actual live parent.
+        */
+       if ((parent->flags & HAMMER2_CHAIN_DELETED) == 0) {
                /*
-                * Deleting.  The block array is expected to contain the
-                * child's entry if:
-                *
-                * (1) The deletion occurred after the parent's block table
-                *     was last synchronized (delete_tid), and
-                *
-                * (2) The creation occurred before or during the parent's
-                *     last block table synchronization.
+                * Deleting from blockmap, move child out of dbtree
+                * and clear BMAPPED.  Child should not be on RBTREE.
                 */
-#if FLUSH_DEBUG
-               kprintf("S2A %p.%d b=%p d/b=%016jx/%016jx m/b=%016jx/%016jx\n",
-                       child, child->bref.type,
-                       base, child->delete_tid, parent->bref.mirror_tid,
-                       child->modify_tid, parent->bref.mirror_tid);
-#endif
-               if (base &&
-                   child->delete_tid > parent->bref.mirror_tid &&
-                   child->modify_tid <= parent->bref.mirror_tid) {
-                       KKASSERT(child->flags & HAMMER2_CHAIN_MOVED);
-                       KKASSERT(parent->modify_tid == trans->sync_tid ||
-                                (parent == &hmp->vchain ||
-                                 parent == &hmp->fchain));
-                       hammer2_rollup_stats(parent, child, -1);
-                       spin_lock(&above->cst.spin);
-#if FLUSH_DEBUG
-                       kprintf("trans %jx parent %p.%d child %p.%d m/d %016jx/%016jx "
-                               "flg=%08x %016jx/%d delete\n",
-                               trans->sync_tid,
-                               parent, parent->bref.type,
-                               child, child->bref.type,
-                               child->modify_tid, child->delete_tid,
-                               child->flags,
-                               child->bref.key, child->bref.keybits);
-#endif
-                       hammer2_base_delete(trans, parent, base, count,
-                                           &info->cache_index, child);
-                       spin_unlock(&above->cst.spin);
+               spin_lock(&above->cst.spin);
+               if (child->delete_tid <= trans->sync_tid &&
+                   child->modify_tid <= parent->update_lo &&
+                   child->delete_tid > parent->update_lo &&
+                   (child->flags & HAMMER2_CHAIN_BMAPPED)) {
+                       KKASSERT(child->flags & HAMMER2_CHAIN_ONDBTREE);
+                       RB_REMOVE(hammer2_chain_tree, &above->dbtree, child);
+                       atomic_clear_int(&child->flags, HAMMER2_CHAIN_ONDBTREE);
+                       atomic_clear_int(&child->flags, HAMMER2_CHAIN_BMAPPED);
                }
-       } else if (info->pass == 2 && child->delete_tid > trans->sync_tid) {
+
                /*
-                * Inserting.  The block array is expected to NOT contain
-                * the child's entry if:
-                *
-                * (1) The creation occurred after the parent's block table
-                *     was last synchronized (modify_tid), and
-                *
-                * (2) The child is not being deleted in the same
-                *     transaction.
+                * Inserting into blockmap, place child in rbtree or dbtree.
                 */
-#if FLUSH_DEBUG
-               kprintf("S2B %p.%d b=%p d/b=%016jx/%016jx m/b=%016jx/%016jx\n",
-                       child, child->bref.type,
-                       base, child->delete_tid, parent->bref.mirror_tid,
-                       child->modify_tid, parent->bref.mirror_tid);
-#endif
-               if (base &&
-                   child->modify_tid > parent->bref.mirror_tid) {
-                       KKASSERT(child->flags & HAMMER2_CHAIN_MOVED);
-                       KKASSERT(parent->modify_tid == trans->sync_tid ||
-                                (parent == &hmp->vchain ||
-                                 parent == &hmp->fchain));
-                       hammer2_rollup_stats(parent, child, 1);
+               if (child->delete_tid > trans->sync_tid &&
+                   child->modify_tid > parent->update_lo &&
+                   (child->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
+                       if (child->flags & HAMMER2_CHAIN_ONDBQ) {
+                               TAILQ_REMOVE(&above->dbq, child, db_entry);
+                               atomic_clear_int(&child->flags,
+                                                HAMMER2_CHAIN_ONDBQ);
+                       }
+                       if ((child->flags & HAMMER2_CHAIN_DELETED) == 0 &&
+                           (child->flags & HAMMER2_CHAIN_ONRBTREE) == 0) {
+                               KKASSERT((child->flags &
+                                         (HAMMER2_CHAIN_ONDBTREE |
+                                          HAMMER2_CHAIN_ONDBQ)) == 0);
+                               xchain = RB_INSERT(hammer2_chain_tree,
+                                                  &above->rbtree, child);
+                               KKASSERT(xchain == NULL);
+                               atomic_set_int(&child->flags,
+                                              HAMMER2_CHAIN_ONRBTREE);
+                       } else
+                       if ((child->flags & HAMMER2_CHAIN_DELETED) &&
+                           (child->flags & HAMMER2_CHAIN_ONDBTREE) == 0) {
+                               KKASSERT((child->flags &
+                                         (HAMMER2_CHAIN_ONRBTREE |
+                                          HAMMER2_CHAIN_ONDBQ)) == 0);
+                               xchain = RB_INSERT(hammer2_chain_tree,
+                                                  &above->dbtree, child);
+                               KKASSERT(xchain == NULL);
+                               atomic_set_int(&child->flags,
+                                              HAMMER2_CHAIN_ONDBTREE);
+                       }
+                       atomic_set_int(&child->flags, HAMMER2_CHAIN_BMAPPED);
+                       KKASSERT(child->flags &
+                                (HAMMER2_CHAIN_ONRBTREE |
+                                 HAMMER2_CHAIN_ONDBTREE |
+                                 HAMMER2_CHAIN_ONDBQ));
+               }
+
+               /*
+                * Not on any list, place child on DBQ
+                */
+               if ((child->flags & (HAMMER2_CHAIN_ONRBTREE |
+                                    HAMMER2_CHAIN_ONDBTREE |
+                                    HAMMER2_CHAIN_ONDBQ)) == 0) {
+                       KKASSERT((child->flags & HAMMER2_CHAIN_BMAPPED) == 0);
+                       TAILQ_INSERT_TAIL(&above->dbq, child, db_entry);
+                       atomic_set_int(&child->flags, HAMMER2_CHAIN_ONDBQ);
+               }
+               spin_unlock(&above->cst.spin);
+       }
+
+#if 0
+       /*
+        * Adjust queues and adjust BMAPPED on the live parent only
+        * (there should be only one).
+        *
+        * First handle deletions
+        */
+       if (parent->delete_tid > info->sync_tid &&
+           child->delete_tid <= trans->sync_tid &&
+           child->modify_tid <= parent->update_lo &&
+           child->delete_tid > parent->update_lo) {
+               KKASSERT(child->flags & HAMMER2_CHAIN_FLUSH_DELETE);
+               if (child->flags & HAMMER2_CHAIN_ONDBTREE) {
+                       KKASSERT(child->flags & HAMMER2_CHAIN_FLUSH_DELETE);
+                       KKASSERT(child->flags & HAMMER2_CHAIN_BMAPPED);
                        spin_lock(&above->cst.spin);
-#if FLUSH_DEBUG
-                       kprintf("trans %jx parent %p.%d child %p.%d m/d %016jx/%016jx "
-                               "flg=%08x %016jx/%d insert\n",
-                               trans->sync_tid,
-                               parent, parent->bref.type,
-                               child, child->bref.type,
-                               child->modify_tid, child->delete_tid,
-                               child->flags,
-                               child->bref.key, child->bref.keybits);
-#endif
-                       hammer2_base_insert(trans, parent, base, count,
-                                           &info->cache_index, child);
+                       RB_REMOVE(hammer2_chain_tree, &above->dbtree, child);
+                       atomic_clear_int(&child->flags, HAMMER2_CHAIN_ONDBTREE);
+                       atomic_clear_int(&child->flags, HAMMER2_CHAIN_BMAPPED);
+                       TAILQ_INSERT_TAIL(&above->dbq, child, db_entry);
+                       atomic_set_int(&child->flags, HAMMER2_CHAIN_ONDBQ);
                        spin_unlock(&above->cst.spin);
+               } else {
+                       KKASSERT(child->flags & HAMMER2_CHAIN_ONDBQ);
                }
-       } else if (info->pass == 3 &&
-                  (child->delete_tid == HAMMER2_MAX_TID ||
-                   child->delete_tid <= trans->sync_tid) &&
-                  (child->flags & HAMMER2_CHAIN_MOVED)) {
+       }
+
+       /*
+        * Adjust child state when updating a live parent.
+        * Adjust child state on the last parent.
+        */
+       if (parent->delete_tid > info->sync_tid &&
+           child->delete_tid > trans->sync_tid &&
+           child->modify_tid > parent->update_lo) {
                /*
-                * We can't clear the MOVED bit on children whos modify_tid
-                * is beyond our current trans (was tested at top of scan2),
-                * or on deleted children which have not yet been flushed
-                * (handled above).
-                *
-                * Scan all parents of this child and determine if any of
-                * them still need the child's MOVED bit.
+                * NOTE: FLUSH_CREATE may have already been cleared and
+                *       BMAPPED may have already been set due to
+                *       reentrancy due to the queue movement below.
                 */
-               hammer2_chain_t *scan;
-
-               if (hammer2_debug & 0x4000)
-                       kprintf("CHECKMOVED %p (parent=%p)", child, parent);
+               atomic_set_int(&child->flags, HAMMER2_CHAIN_BMAPPED);
 
-               ok = 1;
-               spin_lock(&above->cst.spin);
-               TAILQ_FOREACH(scan, &above->ownerq, core_entry) {
+               if ((child->flags & HAMMER2_CHAIN_DELETED) &&
+                   (child->flags & HAMMER2_CHAIN_ONDBQ)) {
                        /*
-                        * Can't clear child's MOVED until all parent's have
-                        * synchronized with it.
+                        * A deleted child that is on DBQ which has been
+                        * inserted must be moved to DBTREE.  However,
+                        * temporary deleted children are only applicable to
+                        * the current flush and must not have any visibility
+                        * beyond it, so temporary children are left on DBQ.
                         *
-                        * Ignore deleted parents as-of this flush TID.
-                        * Ignore the current parent being flushed.
+                        * We run DBQ before DBTREE so this can cause pass4
+                        * to be called twice on this child.
                         */
-                       if (h2ignore_deleted(info, scan))
-                               continue;
-                       if (scan == parent)
-                               continue;
-
+                       KKASSERT((child->flags & HAMMER2_CHAIN_ONRBTREE) == 0);
+                       if ((child->flags & (HAMMER2_CHAIN_ONDBQ |
+                                            HAMMER2_CHAIN_FLUSH_TEMPORARY)) ==
+                           HAMMER2_CHAIN_ONDBQ) {
+                               spin_lock(&above->cst.spin);
+
+                               KKASSERT(child->flags &
+                                        HAMMER2_CHAIN_FLUSH_CREATE);
+                               TAILQ_REMOVE(&above->dbq, child, db_entry);
+                               atomic_clear_int(&child->flags,
+                                                HAMMER2_CHAIN_ONDBQ);
+                               xchain = RB_INSERT(hammer2_chain_tree,
+                                                  &above->dbtree, child);
+                               KASSERT(xchain == NULL, ("pass4: collision with %p moving %p to dbtree", xchain, child));
+                               atomic_set_int(&child->flags,
+                                              HAMMER2_CHAIN_ONDBTREE);
+                               spin_unlock(&above->cst.spin);
+                       }
+               } else if (child->flags & HAMMER2_CHAIN_DELETED) {
+                       KKASSERT(child->flags & HAMMER2_CHAIN_ONDBTREE);
+               } else {
                        /*
-                        * For parents not already synchronized check to see
-                        * if the flush has gotten past them yet or not.
-                        *
-                        * This must roughly mimic the tests that
-                        * hammer2_chain_flush_core() runs or we could leave
-                        * children hanging around with MOVED set and cause
-                        * a memory leak.
+                        * If the child is not deleted it must be on
+                        * the rbtree.
                         */
-                       if (scan->bref.mirror_tid >= trans->sync_tid)
-                               continue;
-                       if (scan->bref.mirror_tid >= above->update_hi)
-                               continue;
-
-                       if (hammer2_debug & 0x4000) {
-                               kprintf("(fail scan %p %016jx/%016jx)",
-                                       scan, scan->bref.mirror_tid,
-                                       child->modify_tid);
-                       }
-                       ok = 0;
-                       break;
+                       KKASSERT(child->flags & HAMMER2_CHAIN_ONRBTREE);
                }
-               if (hammer2_debug & 0x4000)
-                       kprintf("\n");
-               spin_unlock(&above->cst.spin);
+       }
+#endif
 
-               /*
-                * Can we finally clear MOVED?
-                */
-               if (ok) {
-                       if (hammer2_debug & 0x4000)
-                               kprintf("clear moved %p.%d %016jx/%d\n",
-                                       child, child->bref.type,
-                                       child->bref.key, child->bref.keybits);
-                       atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
-                       if (child->flags & HAMMER2_CHAIN_MODIFIED) {
-                               kprintf("modified child %p all parents updated\n",
-                                       child);
-                               atomic_clear_int(&child->flags,
-                                                HAMMER2_CHAIN_MODIFIED);
-                               hammer2_chain_memory_wakeup(child->pmp);
-                               hammer2_chain_drop(child);/* cleared MODIFIED */
-                       }
-                       hammer2_chain_drop(child);      /* cleared MOVED */
-               } else {
-                       if (hammer2_debug & 0x4000)
-                               kprintf("keep  moved %p.%d %016jx/%d\n",
-                                       child, child->bref.type,
-                                       child->bref.key, child->bref.keybits);
+       /*
+        * Cleanup flags on last parent iterated for flush.
+        */
+       if (info->domodify & 2) {
+               if (child->flags & HAMMER2_CHAIN_FLUSH_CREATE) {
+                       atomic_clear_int(&child->flags,
+                                        HAMMER2_CHAIN_FLUSH_CREATE);
+                       hammer2_chain_drop(child);
+               }
+               if ((child->flags & HAMMER2_CHAIN_FLUSH_DELETE) &&
+                   child->delete_tid <= trans->sync_tid) {
+                       KKASSERT((child->flags & HAMMER2_CHAIN_ONDBTREE) == 0);
+                       KKASSERT((child->flags & HAMMER2_CHAIN_BMAPPED) == 0);
+                       atomic_clear_int(&child->flags,
+                                        HAMMER2_CHAIN_FLUSH_DELETE);
+                       hammer2_chain_drop(child);
                }
        }
 
@@ -1647,43 +1499,9 @@ hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
        /*
         * The parent may have been delete-duplicated.
         */
-       info->parent = parent;
-finalize:
        return (0);
 }
 
-/*
- * Update core->update_lo and attempt to clear the MOVED bit
- * for its children.
- *
- * This routine is only called after a sub-tree has been fully flushed
- * up to the current flush synchronization point.  Calling it under any
- * other condition will blow up flush tracking.
- */
-static
-void
-hammer2_flush_core_update(hammer2_chain_core_t *core,
-                         hammer2_flush_info_t *info)
-{
-       hammer2_chain_layer_t *layer;
-
-       spin_lock(&core->cst.spin);
-       if (core->update_lo < info->sync_tid)
-               core->update_lo = info->sync_tid;
-       TAILQ_FOREACH_REVERSE(layer, &core->layerq,
-                             h2_layer_list, entry) {
-               info->pass = 3;
-               ++layer->refs;
-               KKASSERT(layer->good == 0xABCD);
-               RB_SCAN(hammer2_chain_tree, &layer->rbtree,
-                       NULL, hammer2_chain_flush_scan2, info);
-               --layer->refs;
-               KKASSERT(info->parent->core == core);
-       }
-       spin_unlock(&core->cst.spin);
-}
-
-static
 void
 hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
 {
index d10bf9e..cfd7d65 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -45,6 +45,8 @@
 
 #include "hammer2.h"
 
+#define FREEMAP_DEBUG  0
+
 struct hammer2_fiterate {
        hammer2_off_t   bpref;
        hammer2_off_t   bnext;
@@ -203,8 +205,9 @@ hammer2_freemap_reserve(hammer2_trans_t *trans, hammer2_chain_t *chain,
                break;
        }
        bref->data_off = off | radix;
-#if 0
-       kprintf("-> %016jx\n", bref->data_off);
+#if FREEMAP_DEBUG
+       kprintf("FREEMAP BLOCK TYPE %d %016jx/%d DATA_OFF=%016jx\n",
+               bref->type, bref->key, bref->keybits, bref->data_off);
 #endif
        return (0);
 }
@@ -894,6 +897,12 @@ hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                /* XXX handle error */
        }
 
+#if FREEMAP_DEBUG
+       kprintf("FREEMAP ADJUST TYPE %d %016jx/%d DATA_OFF=%016jx\n",
+               chain->bref.type, chain->bref.key,
+               chain->bref.keybits, chain->bref.data_off);
+#endif
+
        /*
         * Calculate the bitmask (runs in 2-bit pairs).
         */
@@ -917,12 +926,15 @@ hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp,
        /*
         * [re]load the bmap and bitmap pointers.  Each bmap entry covers
         * a 2MB swath.  The bmap itself (LEVEL1) covers 2GB.
+        *
+        * Be sure to reset the linear iterator to ensure that the adjustment
+        * is not ignored.
         */
 again:
        bmap = &chain->data->bmdata[(int)(data_off >> HAMMER2_SEGRADIX) &
                                    (HAMMER2_FREEMAP_COUNT - 1)];
        bitmap = &bmap->bitmap[(int)(data_off >> (HAMMER2_SEGRADIX - 3)) & 7];
-
+       bmap->linear = 0;
 
        while (count) {
                KKASSERT(bmmask11);
index d295947..37f7dd3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -41,6 +41,8 @@
 
 #include "hammer2.h"
 
+#define INODE_DEBUG    0
+
 static void hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
                                         hammer2_chain_t **chainp,
                                         hammer2_tid_t inum);
@@ -619,6 +621,10 @@ retry:
                                             HAMMER2_BREF_TYPE_INODE,
                                             HAMMER2_INODE_BYTES);
        }
+#if INODE_DEBUG
+       kprintf("CREATE INODE %*.*s chain=%p\n",
+               (int)name_len, (int)name_len, name, chain);
+#endif
 
        /*
         * Cleanup and handle retries.
index f2e2db6..1b52ed5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
index 76c9724..05bedfc 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
index dafa4f3..82b8778 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
index 23a1692..e3c4a73 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
index 7d9cabd..e7e7042 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2012-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
index 6aa9614..5c934a4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
index 0edeacd..e94eb3d 100644 (file)
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
@@ -82,6 +82,7 @@ int hammer2_debug;
 int hammer2_cluster_enable = 1;
 int hammer2_hardlink_enable = 1;
 int hammer2_flush_pipe = 100;
+int hammer2_synchronous_flush = 0;
 long hammer2_limit_dirty_chains;
 long hammer2_iod_file_read;
 long hammer2_iod_meta_read;
@@ -120,6 +121,8 @@ SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
           &hammer2_hardlink_enable, 0, "");
 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
           &hammer2_flush_pipe, 0, "");
+SYSCTL_INT(_vfs_hammer2, OID_AUTO, synchronous_flush, CTLFLAG_RW,
+          &hammer2_synchronous_flush, 0, "");
 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
           &hammer2_limit_dirty_chains, 0, "");
 
@@ -532,10 +535,16 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                        return error;
                }
 
+               /*
+                * Really important to get these right or flush will get
+                * confused.
+                */
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
                hmp->vchain.modify_tid = hmp->voldata.mirror_tid;
+               hmp->vchain.update_lo = hmp->voldata.mirror_tid;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.modify_tid = hmp->voldata.freemap_tid;
+               hmp->fchain.update_lo = hmp->voldata.freemap_tid;
 
                /*
                 * First locate the super-root inode, which is key 0
@@ -1519,8 +1528,8 @@ hammer2_vfs_unmount_hmp1(struct mount *mp, hammer2_mount_t *hmp)
        hammer2_voldata_lock(hmp);
        if (((hmp->vchain.flags | hmp->fchain.flags) &
             HAMMER2_CHAIN_MODIFIED) ||
-           hmp->vchain.core->update_hi > hmp->voldata.mirror_tid ||
-           hmp->fchain.core->update_hi > hmp->voldata.freemap_tid) {
+           hmp->vchain.update_hi > hmp->voldata.mirror_tid ||
+           hmp->fchain.update_hi > hmp->voldata.freemap_tid) {
                hammer2_voldata_unlock(hmp, 0);
                hammer2_vfs_sync(mp, MNT_WAIT);
                /*hammer2_vfs_sync(mp, MNT_WAIT);*/
@@ -1530,20 +1539,20 @@ hammer2_vfs_unmount_hmp1(struct mount *mp, hammer2_mount_t *hmp)
        if (hmp->pmp_count == 0) {
                if (((hmp->vchain.flags | hmp->fchain.flags) &
                     HAMMER2_CHAIN_MODIFIED) ||
-                   (hmp->vchain.core->update_hi >
+                   (hmp->vchain.update_hi >
                     hmp->voldata.mirror_tid) ||
-                   (hmp->fchain.core->update_hi >
+                   (hmp->fchain.update_hi >
                     hmp->voldata.freemap_tid)) {
                        kprintf("hammer2_unmount: chains left over "
                                "after final sync\n");
                        kprintf("    vchain %08x update_hi %jx/%jx\n",
                                hmp->vchain.flags,
                                hmp->voldata.mirror_tid,
-                               hmp->vchain.core->update_hi);
+                               hmp->vchain.update_hi);
                        kprintf("    fchain %08x update_hi %jx/%jx\n",
                                hmp->fchain.flags,
                                hmp->voldata.freemap_tid,
-                               hmp->fchain.core->update_hi);
+                               hmp->fchain.update_hi);
 
                        if (hammer2_debug & 0x0010)
                                Debugger("entered debugger");
@@ -1576,8 +1585,7 @@ hammer2_vfs_unmount_hmp2(struct mount *mp, hammer2_mount_t *hmp)
                        vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                        vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
                        hmp->devvp = NULL;
-                       VOP_CLOSE(devvp,
-                                 (ronly ? FREAD : FREAD|FWRITE));
+                       VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
                        vn_unlock(devvp);
                        vrele(devvp);
                        devvp = NULL;
@@ -1598,9 +1606,9 @@ hammer2_vfs_unmount_hmp2(struct mount *mp, hammer2_mount_t *hmp)
                 * rot).
                 */
                dumpcnt = 50;
-               hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt);
+               hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v');
                dumpcnt = 50;
-               hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt);
+               hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f');
                hammer2_mount_unlock(hmp);
                hammer2_chain_drop(&hmp->vchain);
 
@@ -1838,7 +1846,7 @@ hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                                   HAMMER2_LOOKUP_NODATA);
        while (chain) {
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
-               if (chain->bref.mirror_tid >= hmp->voldata.mirror_tid) {
+               if (chain->bref.mirror_tid >= hmp->voldata.alloc_tid - 1) {
                        error = hammer2_recovery_scan(trans, hmp, chain,
                                                      list, depth + 1);
                        if (error)
@@ -1936,21 +1944,25 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
                 */
 #if 1
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
+               kprintf("sync tid test fmap %016jx %016jx\n",
+                       hmp->fchain.update_hi, hmp->voldata.freemap_tid);
                if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->fchain.core->update_hi > hmp->voldata.freemap_tid) {
+                   hmp->fchain.update_hi > hmp->voldata.freemap_tid) {
                        /* this will also modify vchain as a side effect */
                        chain = &hmp->fchain;
-                       hammer2_chain_flush(&info.trans, &chain);
+                       hammer2_flush(&info.trans, &chain);
                        KKASSERT(chain == &hmp->fchain);
                }
                hammer2_chain_unlock(&hmp->fchain);
 #endif
 
                hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
+               kprintf("sync tid test vmap %016jx %016jx\n",
+                       hmp->vchain.update_hi, hmp->voldata.mirror_tid);
                if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->vchain.core->update_hi > hmp->voldata.mirror_tid) {
+                   hmp->vchain.update_hi > hmp->voldata.mirror_tid) {
                        chain = &hmp->vchain;
-                       hammer2_chain_flush(&info.trans, &chain);
+                       hammer2_flush(&info.trans, &chain);
                        KKASSERT(chain == &hmp->vchain);
                        force_fchain = 1;
                } else {
@@ -1961,11 +1973,11 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
 #if 0
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
-                   hmp->fchain.core->update_hi > hmp->voldata.freemap_tid ||
+                   hmp->fchain.update_hi > hmp->voldata.freemap_tid ||
                    force_fchain) {
                        /* this will also modify vchain as a side effect */
                        chain = &hmp->fchain;
-                       hammer2_chain_flush(&info.trans, &chain);
+                       hammer2_flush(&info.trans, &chain);
                        KKASSERT(chain == &hmp->fchain);
                }
                hammer2_chain_unlock(&hmp->fchain);
@@ -2039,12 +2051,7 @@ hammer2_vfs_sync(struct mount *mp, int waitfor)
 
 /*
  * Sync passes.
- *
- * NOTE: We don't test update_lo/update_hi or MOVED here because the fsync
- *      code won't flush on those flags.  The syncer code above will do a
- *      general meta-data flush globally that will catch these flags.
  */
-
 static int
 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
 {
@@ -2091,7 +2098,7 @@ hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
         *     below.
         */
        parent = hammer2_inode_lock_ex(ip);
-       hammer2_chain_flush(&info->trans, &parent);
+       hammer2_flush(&info->trans, &parent);
        hammer2_inode_unlock_ex(ip, parent);
 #endif
        hammer2_inode_drop(ip);
@@ -2460,9 +2467,8 @@ hammer2_lwinprog_wait(hammer2_pfsmount_t *pmp)
 }
 
 void
-hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp)
+hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
 {
-       hammer2_chain_layer_t *layer;
        hammer2_chain_t *scan;
        hammer2_chain_t *first_parent;
 
@@ -2474,44 +2480,49 @@ hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp)
        if (*countp < 0)
                return;
        first_parent = chain->core ? TAILQ_FIRST(&chain->core->ownerq) : NULL;
-       kprintf("%*.*schain %p.%d %016jx/%d mir=%016jx\n",
-               tab, tab, "",
+       kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n",
+               tab, tab, "", pfx,
                chain, chain->bref.type,
                chain->bref.key, chain->bref.keybits,
                chain->bref.mirror_tid);
 
-       kprintf("%*.*s      [%08x] (%s) dt=%016jx refs=%d\n",
+       kprintf("%*.*s      [%08x] (%s) mod=%016jx del=%016jx "
+               "lo=%08jx hi=%08jx refs=%d\n",
                tab, tab, "",
                chain->flags,
                ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
                chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
+               chain->modify_tid,
                chain->delete_tid,
+               chain->update_lo,
+               chain->update_hi,
                chain->refs);
 
-       kprintf("%*.*s      core %p [%08x] lo=%08jx hi=%08jx fp=%p np=%p",
+       kprintf("%*.*s      core %p [%08x]",
                tab, tab, "",
-               chain->core, (chain->core ? chain->core->flags : 0),
-               (chain->core ? chain->core->update_lo : -1),
-               (chain->core ? chain->core->update_hi : -1),
-               first_parent,
-               (first_parent ? TAILQ_NEXT(chain, core_entry) : NULL));
+               chain->core, (chain->core ? chain->core->flags : 0));
 
        if (first_parent)
-               kprintf(" [fpflags %08x fprefs %d\n",
+               kprintf("\n%*.*s      fp=%p np=%p [fpflags %08x fprefs %d",
+                       tab, tab, "",
+                       first_parent,
+                       (first_parent ? TAILQ_NEXT(first_parent, core_entry) :
+                                       NULL),
                        first_parent->flags,
                        first_parent->refs);
-       if (chain->core == NULL || TAILQ_EMPTY(&chain->core->layerq))
+       if (chain->core == NULL || RB_EMPTY(&chain->core->rbtree))
                kprintf("\n");
        else
                kprintf(" {\n");
        if (chain->core) {
-               TAILQ_FOREACH(layer, &chain->core->layerq, entry) {
-                       RB_FOREACH(scan, hammer2_chain_tree, &layer->rbtree) {
-                               hammer2_dump_chain(scan, tab + 4, countp);
-                       }
-               }
+               RB_FOREACH(scan, hammer2_chain_tree, &chain->core->rbtree)
+                       hammer2_dump_chain(scan, tab + 4, countp, 'a');
+               RB_FOREACH(scan, hammer2_chain_tree, &chain->core->dbtree)
+                       hammer2_dump_chain(scan, tab + 4, countp, 'r');
+               TAILQ_FOREACH(scan, &chain->core->dbq, db_entry)
+                       hammer2_dump_chain(scan, tab + 4, countp, 'd');
        }
-       if (chain->core && !TAILQ_EMPTY(&chain->core->layerq)) {
+       if (chain->core && !RB_EMPTY(&chain->core->rbtree)) {
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
                        kprintf("%*.*s}(%s)\n", tab, tab, "",
                                chain->data->ipdata.filename);
index 38cfb1c..1d0b321 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@dragonflybsd.org>
@@ -305,9 +305,6 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
                return(0);
 
        /*
-        * Set update_hi so we can detect and propagate the DELETED
-        * bit in the flush code.
-        *
         * ip->chain might be stale, correct it before checking as older
         * versions of the chain are likely marked deleted even if the
         * file hasn't been.  XXX ip->chain should never be stale on
@@ -327,11 +324,7 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
         *
         * HAMMER2 usually does not try to optimize the freemap by returning
         * deleted blocks to it as it does not usually know how many snapshots
-        * might be referencing portions of the file/dir.  XXX TODO.
-        *
-        * XXX TODO - However, any modified file as-of when a snapshot is made
-        *            cannot use this optimization as some of the modifications
-        *            may wind up being part of the snapshot.
+        * might be referencing portions of the file/dir.
         */
        vp->v_data = NULL;
        ip->vp = NULL;
@@ -341,11 +334,6 @@ hammer2_vop_reclaim(struct vop_reclaim_args *ap)
                hammer2_trans_init(&trans, ip->pmp, NULL,
                                   HAMMER2_TRANS_BUFCACHE);
                hammer2_chain_delete(&trans, chain, 0);
-               hammer2_chain_setsubmod(&trans, chain);
-               spin_lock(&chain->core->cst.spin);
-               if (chain->core->update_hi < trans.sync_tid)
-                       chain->core->update_hi = trans.sync_tid; /* needed? */
-               spin_unlock(&chain->core->cst.spin);
                hammer2_trans_done(&trans);
        }
 
@@ -408,7 +396,7 @@ hammer2_vop_fsync(struct vop_fsync_args *ap)
         * XXX creates discontinuity w/modify_tid
         */
        if (ap->a_flags & VOP_FSYNC_SYSCALL) {
-               hammer2_chain_flush(&trans, &chain);
+               hammer2_flush(&trans, &chain);
        }
 #endif
        hammer2_inode_unlock_ex(ip, chain);
diff --git a/sys/vfs/hammer2/mkvntest b/sys/vfs/hammer2/mkvntest
deleted file mode 100755 (executable)
index fad5923..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/csh
-#
-
-vnconfig -u vn0 >& /dev/null
-rm -f /usr/obj/hammer2.img
-truncate -s 1G /usr/obj/hammer2.img
-newfs_hammer2 -L ROOT /usr/obj/hammer2.img
-vnconfig -c vn0 /usr/obj/hammer2.img
-
-echo "hammer2.img on /dev/vn0"